From 4b4aad7217d3292650e77eec2cf4c198ea9c3b4b Mon Sep 17 00:00:00 2001 From: Jiyoung Yun Date: Wed, 23 Nov 2016 19:09:09 +0900 Subject: Imported Upstream version 1.1.0 --- src/vm/i386/.gitmirror | 1 + src/vm/i386/CLRErrorReporting.vrg | 5 + src/vm/i386/RedirectedHandledJITCase.asm | 136 + src/vm/i386/asmconstants.h | 485 +++ src/vm/i386/asmhelpers.asm | 2400 +++++++++++ src/vm/i386/cgencpu.h | 573 +++ src/vm/i386/cgenx86.cpp | 2257 ++++++++++ src/vm/i386/excepcpu.h | 87 + src/vm/i386/excepx86.cpp | 3734 ++++++++++++++++ src/vm/i386/fptext.asm | 277 ++ src/vm/i386/gmsasm.asm | 37 + src/vm/i386/gmscpu.h | 140 + src/vm/i386/gmsx86.cpp | 1245 ++++++ src/vm/i386/jithelp.asm | 2574 +++++++++++ src/vm/i386/jitinterfacex86.cpp | 1922 +++++++++ src/vm/i386/profiler.cpp | 336 ++ src/vm/i386/remotingx86.cpp | 225 + src/vm/i386/stublinkerx86.cpp | 6806 ++++++++++++++++++++++++++++++ src/vm/i386/stublinkerx86.h | 781 ++++ src/vm/i386/virtualcallstubcpu.hpp | 1077 +++++ 20 files changed, 25098 insertions(+) create mode 100644 src/vm/i386/.gitmirror create mode 100644 src/vm/i386/CLRErrorReporting.vrg create mode 100644 src/vm/i386/RedirectedHandledJITCase.asm create mode 100644 src/vm/i386/asmconstants.h create mode 100644 src/vm/i386/asmhelpers.asm create mode 100644 src/vm/i386/cgencpu.h create mode 100644 src/vm/i386/cgenx86.cpp create mode 100644 src/vm/i386/excepcpu.h create mode 100644 src/vm/i386/excepx86.cpp create mode 100644 src/vm/i386/fptext.asm create mode 100644 src/vm/i386/gmsasm.asm create mode 100644 src/vm/i386/gmscpu.h create mode 100644 src/vm/i386/gmsx86.cpp create mode 100644 src/vm/i386/jithelp.asm create mode 100644 src/vm/i386/jitinterfacex86.cpp create mode 100644 src/vm/i386/profiler.cpp create mode 100644 src/vm/i386/remotingx86.cpp create mode 100644 src/vm/i386/stublinkerx86.cpp create mode 100644 src/vm/i386/stublinkerx86.h create mode 100644 src/vm/i386/virtualcallstubcpu.hpp (limited to 'src/vm/i386') diff --git a/src/vm/i386/.gitmirror b/src/vm/i386/.gitmirror new file mode 100644 index 0000000000..f507630f94 --- /dev/null +++ b/src/vm/i386/.gitmirror @@ -0,0 +1 @@ +Only contents of this folder, excluding subfolders, will be mirrored by the Git-TFS Mirror. \ No newline at end of file diff --git a/src/vm/i386/CLRErrorReporting.vrg b/src/vm/i386/CLRErrorReporting.vrg new file mode 100644 index 0000000000..6e45ba967c --- /dev/null +++ b/src/vm/i386/CLRErrorReporting.vrg @@ -0,0 +1,5 @@ +VSREG 7 + +[HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\Eventlog\Application\.NET Runtime 4.0 Error Reporting] +"EventMessageFile"="[DWFolder.D0DF3458_A845_11D3_8D0A_0050046416B9]DW20.EXE" +"TypesSupported"=dword:00000007 diff --git a/src/vm/i386/RedirectedHandledJITCase.asm b/src/vm/i386/RedirectedHandledJITCase.asm new file mode 100644 index 0000000000..80345623e7 --- /dev/null +++ b/src/vm/i386/RedirectedHandledJITCase.asm @@ -0,0 +1,136 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. 
+ +; ==++== +; + +; +; ==--== +; *********************************************************************** +; File: RedirectedHandledJITCase.asm +; +; *********************************************************************** +; + +; This contains thread-redirecting helper routines that are 100% x86 assembly + + .586 + .model flat + + include asmconstants.inc + + option casemap:none + .code + +EXTERN _GetCurrentSavedRedirectContext@0:PROC + +; +; WARNING!! These functions immediately ruin thread unwindability. This is +; WARNING!! OK as long as there is a mechanism for saving the thread context +; WARNING!! prior to running these functions as well as a mechanism for +; WARNING!! restoring the context prior to any stackwalk. This means that +; WARNING!! we need to ensure that no GC can occur while the stack is +; WARNING!! unwalkable. This further means that we cannot allow any exception +; WARNING!! to occure when the stack is unwalkable +; + + +; If you edit this macro, make sure you update GetCONTEXTFromRedirectedStubStackFrame. +; This function is used by both the personality routine and the debugger to retrieve the original CONTEXT. +GenerateRedirectedHandledJITCaseStub MACRO reason + +EXTERN ?RedirectedHandledJITCaseFor&reason&@Thread@@CGXXZ:proc + + ALIGN 4 +_RedirectedHandledJITCaseFor&reason&_Stub@0 PROC PUBLIC + + push eax ; where to stuff the fake return address + push ebp ; save interrupted ebp for stack walk + mov ebp, esp + sub esp, 4 ; stack slot to save the CONTEXT * + + ; + ; Save a copy of the redirect CONTEXT*. + ; This is needed for the debugger to unwind the stack. + ; + call _GetCurrentSavedRedirectContext@0 + + mov [ebp-4], eax +.errnz REDIRECTSTUB_EBP_OFFSET_CONTEXT + 4, REDIRECTSTUB_EBP_OFFSET_CONTEXT has changed - update asm stubs + + ; + ; Fetch the interrupted eip and save it as our return address. + ; + mov eax, [eax + CONTEXT_Eip] + mov [ebp+4], eax + + ; + ; Call target, which will do whatever we needed to do in the context + ; of the target thread, and will RtlRestoreContext when it is done. + ; + call ?RedirectedHandledJITCaseFor&reason&@Thread@@CGXXZ + + int 3 ; target shouldn't return. + +; Put a label here to tell the debugger where the end of this function is. +PUBLIC _RedirectedHandledJITCaseFor&reason&_StubEnd@0 +_RedirectedHandledJITCaseFor&reason&_StubEnd@0: + +_RedirectedHandledJITCaseFor&reason&_Stub@0 ENDP + +ENDM + +; HijackFunctionStart and HijackFunctionEnd are used to tell BBT to keep the hijacking functions together. +; Debugger uses range to check whether IP falls into one of them (see code:Debugger::s_hijackFunction). + +_HijackFunctionStart@0 proc public +ret +_HijackFunctionStart@0 endp + +GenerateRedirectedHandledJITCaseStub +GenerateRedirectedHandledJITCaseStub +GenerateRedirectedHandledJITCaseStub +GenerateRedirectedHandledJITCaseStub + +; Hijack for exceptions. +; This can be used to hijack at a 2nd-chance exception and execute the UEF + +EXTERN _ExceptionHijackWorker@16:PROC + +_ExceptionHijack@0 PROC PUBLIC + + ; This is where we land when we're hijacked from an IP by the debugger. + ; The debugger has already pushed the args: + ; - a CONTEXT + ; - a EXCEPTION_RECORD onto the stack + ; - an DWORD to use to mulitplex the hijack + ; - an arbitrary void* data parameter + call _ExceptionHijackWorker@16 + + ; Don't expect to return from here. Debugger will unhijack us. It has the full + ; context and can properly restore us. + int 3 + +; Put a label here to tell the debugger where the end of this function is. 
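The worker called above is ordinary C++; the @16 decoration on _ExceptionHijackWorker@16 means the __stdcall callee pops 16 bytes of arguments, i.e. the four values the comment says the debugger pushes. A rough C-level sketch of that contract, assuming the arguments arrive in the order the comment lists them (parameter names here are illustrative, not taken from the runtime headers):

#include <windows.h>

// Sketch only: the four stack arguments set up by the debugger before it
// redirects the thread to _ExceptionHijack@0.
extern "C" void __stdcall ExceptionHijackWorker(
    CONTEXT          *pContext,         // thread context at the hijack point
    EXCEPTION_RECORD *pExceptionRecord, // the 2nd-chance exception record
    DWORD             hijackCode,       // multiplexes which kind of hijack this is
    void             *pData);           // arbitrary data supplied by the debugger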
+public _ExceptionHijackEnd@0 +_ExceptionHijackEnd@0: + +_ExceptionHijack@0 ENDP + +; It is very important to have a dummy function here. +; Without it, the image has two labels without any instruction in between: +; One for the last label in this function, and one for the first function in the image following this asm file. +; Then the linker is free to remove from PDB the function symbol for the function +; immediately following this, and replace the reference with the last label in this file. +; When this happens, BBT loses info about function, moves pieces within the function to random place, and generates bad code. +_HijackFunctionLast@0 proc public +ret +_HijackFunctionLast@0 endp + +; This is the first function outside the "keep together range". Used by BBT scripts. +_HijackFunctionEnd@0 proc public +ret +_HijackFunctionEnd@0 endp + +END diff --git a/src/vm/i386/asmconstants.h b/src/vm/i386/asmconstants.h new file mode 100644 index 0000000000..5fd39d6897 --- /dev/null +++ b/src/vm/i386/asmconstants.h @@ -0,0 +1,485 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// asmconstants.h - +// +// This header defines field offsets and constants used by assembly code +// Be sure to rebuild clr/src/vm/ceemain.cpp after changing this file, to +// ensure that the constants match the expected C/C++ values + +// +// If you need to figure out a constant that has changed and is causing +// a compile-time assert, check out USE_COMPILE_TIME_CONSTANT_FINDER. +// TODO: put the constant finder in a common place so other platforms can use it. + +#ifndef _TARGET_X86_ +#error this file should only be used on an X86 platform +#endif + +#include "../../inc/switches.h" + +#ifndef ASMCONSTANTS_C_ASSERT +#define ASMCONSTANTS_C_ASSERT(cond) +#endif + +#ifndef ASMCONSTANTS_RUNTIME_ASSERT +#define ASMCONSTANTS_RUNTIME_ASSERT(cond) +#endif + +// Some contants are different in _DEBUG builds. This macro factors out ifdefs from below. 
+#ifdef _DEBUG +#define DBG_FRE(dbg,fre) dbg +#else +#define DBG_FRE(dbg,fre) fre +#endif + +//*************************************************************************** +#if defined(_DEBUG) && defined(_TARGET_X86_) && !defined(FEATURE_CORECLR) + #define HAS_TRACK_CXX_EXCEPTION_CODE_HACK 1 + #define TRACK_CXX_EXCEPTION_CODE_HACK +#else + #define HAS_TRACK_CXX_EXCEPTION_CODE_HACK 0 +#endif + +#define INITIAL_SUCCESS_COUNT 0x100 + +#define DynamicHelperFrameFlags_Default 0 +#define DynamicHelperFrameFlags_ObjectArg 1 +#define DynamicHelperFrameFlags_ObjectArg2 2 + +#ifdef FEATURE_REMOTING +#define TransparentProxyObject___stubData 0x8 +ASMCONSTANTS_C_ASSERT(TransparentProxyObject___stubData == offsetof(TransparentProxyObject, _stubData)) + +#define TransparentProxyObject___stub 0x14 +ASMCONSTANTS_C_ASSERT(TransparentProxyObject___stub == offsetof(TransparentProxyObject, _stub)) + +#define TransparentProxyObject___pMT 0xc +ASMCONSTANTS_C_ASSERT(TransparentProxyObject___pMT == offsetof(TransparentProxyObject, _pMT)) +#endif // FEATURE_REMOTING + +// CONTEXT from rotor_pal.h +#define CONTEXT_Edi 0x9c +ASMCONSTANTS_C_ASSERT(CONTEXT_Edi == offsetof(CONTEXT,Edi)) + +#define CONTEXT_Esi 0xa0 +ASMCONSTANTS_C_ASSERT(CONTEXT_Esi == offsetof(CONTEXT,Esi)) + +#define CONTEXT_Ebx 0xa4 +ASMCONSTANTS_C_ASSERT(CONTEXT_Ebx == offsetof(CONTEXT,Ebx)) + +#define CONTEXT_Edx 0xa8 +ASMCONSTANTS_C_ASSERT(CONTEXT_Edx == offsetof(CONTEXT,Edx)) + +#define CONTEXT_Eax 0xb0 +ASMCONSTANTS_C_ASSERT(CONTEXT_Eax == offsetof(CONTEXT,Eax)) + +#define CONTEXT_Ebp 0xb4 +ASMCONSTANTS_C_ASSERT(CONTEXT_Ebp == offsetof(CONTEXT,Ebp)) + +#define CONTEXT_Eip 0xb8 +ASMCONSTANTS_C_ASSERT(CONTEXT_Eip == offsetof(CONTEXT,Eip)) + +#define CONTEXT_Esp 0xc4 +ASMCONSTANTS_C_ASSERT(CONTEXT_Esp == offsetof(CONTEXT,Esp)) + +// SYSTEM_INFO from rotor_pal.h +#define SYSTEM_INFO_dwNumberOfProcessors 20 +ASMCONSTANTS_C_ASSERT(SYSTEM_INFO_dwNumberOfProcessors == offsetof(SYSTEM_INFO,dwNumberOfProcessors)) + +// SpinConstants from clr/src/vars.h +#define SpinConstants_dwInitialDuration 0 +ASMCONSTANTS_C_ASSERT(SpinConstants_dwInitialDuration == offsetof(SpinConstants,dwInitialDuration)) + +#define SpinConstants_dwMaximumDuration 4 +ASMCONSTANTS_C_ASSERT(SpinConstants_dwMaximumDuration == offsetof(SpinConstants,dwMaximumDuration)) + +#define SpinConstants_dwBackoffFactor 8 +ASMCONSTANTS_C_ASSERT(SpinConstants_dwBackoffFactor == offsetof(SpinConstants,dwBackoffFactor)) + +// EHContext from clr/src/vm/i386/cgencpu.h +#define EHContext_Eax 0x00 +ASMCONSTANTS_C_ASSERT(EHContext_Eax == offsetof(EHContext,Eax)) + +#define EHContext_Ebx 0x04 +ASMCONSTANTS_C_ASSERT(EHContext_Ebx == offsetof(EHContext,Ebx)) + +#define EHContext_Ecx 0x08 +ASMCONSTANTS_C_ASSERT(EHContext_Ecx == offsetof(EHContext,Ecx)) + +#define EHContext_Edx 0x0c +ASMCONSTANTS_C_ASSERT(EHContext_Edx == offsetof(EHContext,Edx)) + +#define EHContext_Esi 0x10 +ASMCONSTANTS_C_ASSERT(EHContext_Esi == offsetof(EHContext,Esi)) + +#define EHContext_Edi 0x14 +ASMCONSTANTS_C_ASSERT(EHContext_Edi == offsetof(EHContext,Edi)) + +#define EHContext_Ebp 0x18 +ASMCONSTANTS_C_ASSERT(EHContext_Ebp == offsetof(EHContext,Ebp)) + +#define EHContext_Esp 0x1c +ASMCONSTANTS_C_ASSERT(EHContext_Esp == offsetof(EHContext,Esp)) + +#define EHContext_Eip 0x20 +ASMCONSTANTS_C_ASSERT(EHContext_Eip == offsetof(EHContext,Eip)) + + +// from clr/src/fjit/helperframe.h +#define SIZEOF_MachState 40 +ASMCONSTANTS_C_ASSERT(SIZEOF_MachState == sizeof(MachState)) + +#define MachState__pEdi 0 +ASMCONSTANTS_C_ASSERT(MachState__pEdi == 
offsetof(MachState, _pEdi)) + +#define MachState__edi 4 +ASMCONSTANTS_C_ASSERT(MachState__edi == offsetof(MachState, _edi)) + +#define MachState__pEsi 8 +ASMCONSTANTS_C_ASSERT(MachState__pEsi == offsetof(MachState, _pEsi)) + +#define MachState__esi 12 +ASMCONSTANTS_C_ASSERT(MachState__esi == offsetof(MachState, _esi)) + +#define MachState__pEbx 16 +ASMCONSTANTS_C_ASSERT(MachState__pEbx == offsetof(MachState, _pEbx)) + +#define MachState__ebx 20 +ASMCONSTANTS_C_ASSERT(MachState__ebx == offsetof(MachState, _ebx)) + +#define MachState__pEbp 24 +ASMCONSTANTS_C_ASSERT(MachState__pEbp == offsetof(MachState, _pEbp)) + +#define MachState__ebp 28 +ASMCONSTANTS_C_ASSERT(MachState__ebp == offsetof(MachState, _ebp)) + +#define MachState__esp 32 +ASMCONSTANTS_C_ASSERT(MachState__esp == offsetof(MachState, _esp)) + +#define MachState__pRetAddr 36 +ASMCONSTANTS_C_ASSERT(MachState__pRetAddr == offsetof(MachState, _pRetAddr)) + +#define LazyMachState_captureEbp 40 +ASMCONSTANTS_C_ASSERT(LazyMachState_captureEbp == offsetof(LazyMachState, captureEbp)) + +#define LazyMachState_captureEsp 44 +ASMCONSTANTS_C_ASSERT(LazyMachState_captureEsp == offsetof(LazyMachState, captureEsp)) + +#define LazyMachState_captureEip 48 +ASMCONSTANTS_C_ASSERT(LazyMachState_captureEip == offsetof(LazyMachState, captureEip)) + + +#define VASigCookie__StubOffset 4 +ASMCONSTANTS_C_ASSERT(VASigCookie__StubOffset == offsetof(VASigCookie, pNDirectILStub)) + +#define SIZEOF_TailCallFrame 32 +ASMCONSTANTS_C_ASSERT(SIZEOF_TailCallFrame == sizeof(TailCallFrame)) + +#define SIZEOF_GSCookie 4 + +// ICodeManager::SHADOW_SP_IN_FILTER from clr/src/inc/eetwain.h +#define SHADOW_SP_IN_FILTER_ASM 0x1 +ASMCONSTANTS_C_ASSERT(SHADOW_SP_IN_FILTER_ASM == ICodeManager::SHADOW_SP_IN_FILTER) + +// from clr/src/inc/corinfo.h +#define CORINFO_NullReferenceException_ASM 0 +ASMCONSTANTS_C_ASSERT(CORINFO_NullReferenceException_ASM == CORINFO_NullReferenceException) + +#define CORINFO_IndexOutOfRangeException_ASM 3 +ASMCONSTANTS_C_ASSERT(CORINFO_IndexOutOfRangeException_ASM == CORINFO_IndexOutOfRangeException) + +#define CORINFO_OverflowException_ASM 4 +ASMCONSTANTS_C_ASSERT(CORINFO_OverflowException_ASM == CORINFO_OverflowException) + +#define CORINFO_SynchronizationLockException_ASM 5 +ASMCONSTANTS_C_ASSERT(CORINFO_SynchronizationLockException_ASM == CORINFO_SynchronizationLockException) + +#define CORINFO_ArrayTypeMismatchException_ASM 6 +ASMCONSTANTS_C_ASSERT(CORINFO_ArrayTypeMismatchException_ASM == CORINFO_ArrayTypeMismatchException) + +#define CORINFO_ArgumentNullException_ASM 8 +ASMCONSTANTS_C_ASSERT(CORINFO_ArgumentNullException_ASM == CORINFO_ArgumentNullException) + +#define CORINFO_ArgumentException_ASM 9 +ASMCONSTANTS_C_ASSERT(CORINFO_ArgumentException_ASM == CORINFO_ArgumentException) + + +#ifndef CROSSGEN_COMPILE + +// from clr/src/vm/threads.h +#if defined(TRACK_CXX_EXCEPTION_CODE_HACK) // Is C++ exception code tracking turned on? 
+ #define Thread_m_LastCxxSEHExceptionCode 0x20 + ASMCONSTANTS_C_ASSERT(Thread_m_LastCxxSEHExceptionCode == offsetof(Thread, m_LastCxxSEHExceptionCode)) + + #define Thread_m_Context 0x3C +#else + #define Thread_m_Context 0x38 +#endif // TRACK_CXX_EXCEPTION_CODE_HACK +ASMCONSTANTS_C_ASSERT(Thread_m_Context == offsetof(Thread, m_Context)) + +#define Thread_m_State 0x04 +ASMCONSTANTS_C_ASSERT(Thread_m_State == offsetof(Thread, m_State)) +#endif // CROSSGEN_COMPILE + +#define Thread_m_fPreemptiveGCDisabled 0x08 +#ifndef CROSSGEN_COMPILE +ASMCONSTANTS_C_ASSERT(Thread_m_fPreemptiveGCDisabled == offsetof(Thread, m_fPreemptiveGCDisabled)) +#endif // CROSSGEN_COMPILE + +#define Thread_m_pFrame 0x0C +#ifndef CROSSGEN_COMPILE +ASMCONSTANTS_C_ASSERT(Thread_m_pFrame == offsetof(Thread, m_pFrame)) +#endif // CROSSGEN_COMPILE + +#ifndef CROSSGEN_COMPILE +#define Thread_m_dwLockCount 0x18 +ASMCONSTANTS_C_ASSERT(Thread_m_dwLockCount == offsetof(Thread, m_dwLockCount)) + +#define Thread_m_ThreadId 0x1C +ASMCONSTANTS_C_ASSERT(Thread_m_ThreadId == offsetof(Thread, m_ThreadId)) + +#define TS_CatchAtSafePoint_ASM 0x5F +ASMCONSTANTS_C_ASSERT(Thread::TS_CatchAtSafePoint == TS_CatchAtSafePoint_ASM) + +#ifdef FEATURE_HIJACK +#define TS_Hijacked_ASM 0x80 +ASMCONSTANTS_C_ASSERT(Thread::TS_Hijacked == TS_Hijacked_ASM) +#endif + +#endif // CROSSGEN_COMPILE + + +// from clr/src/vm/appdomain.hpp + +#define AppDomain__m_dwId 0x4 +ASMCONSTANTS_C_ASSERT(AppDomain__m_dwId == offsetof(AppDomain, m_dwId)); + +// from clr/src/vm/ceeload.cpp +#ifdef FEATURE_MIXEDMODE +#define IJWNOADThunk__m_cache 0x1C +ASMCONSTANTS_C_ASSERT(IJWNOADThunk__m_cache == offsetof(IJWNOADThunk, m_cache)) + +#define IJWNOADThunk__NextCacheOffset 0x8 +ASMCONSTANTS_C_ASSERT(IJWNOADThunk__NextCacheOffset == sizeof(IJWNOADThunkStubCache)) + +#define IJWNOADThunk__CodeAddrOffsetFromADID 0x4 +ASMCONSTANTS_C_ASSERT(IJWNOADThunk__CodeAddrOffsetFromADID == offsetof(IJWNOADThunkStubCache, m_CodeAddr)) +#endif //FEATURE_MIXEDMODE + +// from clr/src/vm/syncblk.h +#define SizeOfSyncTableEntry_ASM 8 +ASMCONSTANTS_C_ASSERT(sizeof(SyncTableEntry) == SizeOfSyncTableEntry_ASM) + +#define SyncBlockIndexOffset_ASM 4 +ASMCONSTANTS_C_ASSERT(sizeof(ObjHeader) - offsetof(ObjHeader, m_SyncBlockValue) == SyncBlockIndexOffset_ASM) + +#ifndef __GNUC__ +#define SyncTableEntry_m_SyncBlock 0 +ASMCONSTANTS_C_ASSERT(offsetof(SyncTableEntry, m_SyncBlock) == SyncTableEntry_m_SyncBlock) + +#define SyncBlock_m_Monitor 0 +ASMCONSTANTS_C_ASSERT(offsetof(SyncBlock, m_Monitor) == SyncBlock_m_Monitor) + +#define AwareLock_m_MonitorHeld 0 +ASMCONSTANTS_C_ASSERT(offsetof(AwareLock, m_MonitorHeld) == AwareLock_m_MonitorHeld) +#else +// The following 3 offsets have value of 0, and must be +// defined to be an empty string. Otherwise, gas may generate assembly +// code with 0 displacement if 0 is left in the displacement field +// of an instruction. 
+#define SyncTableEntry_m_SyncBlock // 0 +ASMCONSTANTS_C_ASSERT(offsetof(SyncTableEntry, m_SyncBlock) == 0) + +#define SyncBlock_m_Monitor // 0 +ASMCONSTANTS_C_ASSERT(offsetof(SyncBlock, m_Monitor) == 0) + +#define AwareLock_m_MonitorHeld // 0 +ASMCONSTANTS_C_ASSERT(offsetof(AwareLock, m_MonitorHeld) == 0) +#endif // !__GNUC__ + +#define AwareLock_m_HoldingThread 8 +ASMCONSTANTS_C_ASSERT(offsetof(AwareLock, m_HoldingThread) == AwareLock_m_HoldingThread) + +#define AwareLock_m_Recursion 4 +ASMCONSTANTS_C_ASSERT(offsetof(AwareLock, m_Recursion) == AwareLock_m_Recursion) + +#define BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM 0x08000000 +ASMCONSTANTS_C_ASSERT(BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM == BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) + +#define BIT_SBLK_SPIN_LOCK_ASM 0x10000000 +ASMCONSTANTS_C_ASSERT(BIT_SBLK_SPIN_LOCK_ASM == BIT_SBLK_SPIN_LOCK) + +#define SBLK_MASK_LOCK_THREADID_ASM 0x000003FF // special value of 0 + 1023 thread ids +ASMCONSTANTS_C_ASSERT(SBLK_MASK_LOCK_THREADID_ASM == SBLK_MASK_LOCK_THREADID) + +#define SBLK_MASK_LOCK_RECLEVEL_ASM 0x0000FC00 // 64 recursion levels +ASMCONSTANTS_C_ASSERT(SBLK_MASK_LOCK_RECLEVEL_ASM == SBLK_MASK_LOCK_RECLEVEL) + +#define SBLK_LOCK_RECLEVEL_INC_ASM 0x00000400 // each level is this much higher than the previous one +ASMCONSTANTS_C_ASSERT(SBLK_LOCK_RECLEVEL_INC_ASM == SBLK_LOCK_RECLEVEL_INC) + +#define BIT_SBLK_IS_HASHCODE_ASM 0x04000000 +ASMCONSTANTS_C_ASSERT(BIT_SBLK_IS_HASHCODE_ASM == BIT_SBLK_IS_HASHCODE) + +#define MASK_SYNCBLOCKINDEX_ASM 0x03ffffff // ((1<::FindCompileTimeConstant' : cannot access private member declared in class 'FindCompileTimeConstant' +// with +// [ +// N=1520 +// ] +// d:\dd\clr\src\ndp\clr\src\vm\i386\asmconstants.h(321) : see declaration of 'FindCompileTimeConstant::FindCompileTimeConstant' +// with +// [ +// N=1520 +// ] +template +class FindCompileTimeConstant +{ +private: + FindCompileTimeConstant(); +}; + +void BogusFunction() +{ + // Sample usage to generate the error + FindCompileTimeConstant bogus_variable; +} +#endif // defined(__cplusplus) && defined(USE_COMPILE_TIME_CONSTANT_FINDER) diff --git a/src/vm/i386/asmhelpers.asm b/src/vm/i386/asmhelpers.asm new file mode 100644 index 0000000000..66a22b7962 --- /dev/null +++ b/src/vm/i386/asmhelpers.asm @@ -0,0 +1,2400 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. 
+ +; ==++== +; + +; +; ==--== +; +; FILE: asmhelpers.asm +; +; *** NOTE: If you make changes to this file, propagate the changes to +; asmhelpers.s in this directory +; + +; +; ====================================================================================== + + .586 + .model flat + +include asmconstants.inc + + assume fs: nothing + option casemap:none + .code + +EXTERN __imp__RtlUnwind@16:DWORD +ifdef _DEBUG +EXTERN _HelperMethodFrameConfirmState@20:PROC +endif +ifdef FEATURE_MIXEDMODE +EXTERN _IJWNOADThunkJumpTargetHelper@4:PROC +endif +EXTERN _StubRareEnableWorker@4:PROC +ifdef FEATURE_COMINTEROP +EXTERN _StubRareDisableHRWorker@4:PROC +endif ; FEATURE_COMINTEROP +EXTERN _StubRareDisableTHROWWorker@4:PROC +EXTERN __imp__TlsGetValue@4:DWORD +TlsGetValue PROTO stdcall +ifdef FEATURE_HIJACK +EXTERN _OnHijackWorker@4:PROC +endif ;FEATURE_HIJACK +EXTERN _COMPlusEndCatch@20:PROC +EXTERN _COMPlusFrameHandler:PROC +ifdef FEATURE_COMINTEROP +EXTERN _COMPlusFrameHandlerRevCom:PROC +endif ; FEATURE_COMINTEROP +EXTERN __alloca_probe:PROC +EXTERN _NDirectImportWorker@4:PROC +EXTERN _UMThunkStubRareDisableWorker@8:PROC +ifndef FEATURE_IMPLICIT_TLS +ifdef ENABLE_GET_THREAD_GENERIC_FULL_CHECK +; This is defined in C (threads.cpp) and enforces EE_THREAD_NOT_REQUIRED contracts +GetThreadGenericFullCheck EQU ?GetThreadGenericFullCheck@@YGPAVThread@@XZ +EXTERN GetThreadGenericFullCheck:PROC +endif ; ENABLE_GET_THREAD_GENERIC_FULL_CHECK + +EXTERN _gThreadTLSIndex:DWORD +EXTERN _gAppDomainTLSIndex:DWORD +endif ; FEATURE_IMPLICIT_TLS + +EXTERN _VarargPInvokeStubWorker@12:PROC +EXTERN _GenericPInvokeCalliStubWorker@12:PROC + +; To debug that LastThrownObjectException really is EXCEPTION_COMPLUS +ifdef TRACK_CXX_EXCEPTION_CODE_HACK +EXTERN __imp____CxxFrameHandler:PROC +endif + +EXTERN _GetThread@0:PROC +EXTERN _GetAppDomain@0:PROC + +ifdef MDA_SUPPORTED +EXTERN _PInvokeStackImbalanceWorker@8:PROC +endif + +ifndef FEATURE_CORECLR +EXTERN _CopyCtorCallStubWorker@4:PROC +endif + +EXTERN _PreStubWorker@8:PROC + +ifdef FEATURE_COMINTEROP +EXTERN _CLRToCOMWorker@8:PROC +endif + +ifdef FEATURE_REMOTING +EXTERN _TransparentProxyStubWorker@8:PROC +endif + +ifdef FEATURE_PREJIT +EXTERN _ExternalMethodFixupWorker@16:PROC +EXTERN _VirtualMethodFixupWorker@8:PROC +EXTERN _StubDispatchFixupWorker@16:PROC +endif + +ifdef FEATURE_COMINTEROP +EXTERN _ComPreStubWorker@8:PROC +endif + +ifdef FEATURE_READYTORUN +EXTERN _DynamicHelperWorker@20:PROC +endif + +ifdef FEATURE_REMOTING +EXTERN _InContextTPQuickDispatchAsmStub@0:PROC +endif + +EXTERN @JIT_InternalThrow@4:PROC + +EXTERN @ProfileEnter@8:PROC +EXTERN @ProfileLeave@8:PROC +EXTERN @ProfileTailcall@8:PROC + +UNREFERENCED macro arg + local unref + unref equ size arg +endm + +FASTCALL_FUNC macro FuncName,cbArgs +FuncNameReal EQU @&FuncName&@&cbArgs +FuncNameReal proc public +endm + +FASTCALL_ENDFUNC macro +FuncNameReal endp +endm + +ifdef FEATURE_COMINTEROP +ifdef _DEBUG + CPFH_STACK_SIZE equ SIZEOF_FrameHandlerExRecord + STACK_OVERWRITE_BARRIER_SIZE*4 +else ; _DEBUG + CPFH_STACK_SIZE equ SIZEOF_FrameHandlerExRecord +endif ; _DEBUG + +PUSH_CPFH_FOR_COM macro trashReg, pFrameBaseReg, pFrameOffset + + ; + ; Setup the FrameHandlerExRecord + ; + push dword ptr [pFrameBaseReg + pFrameOffset] + push _COMPlusFrameHandlerRevCom + mov trashReg, fs:[0] + push trashReg + mov fs:[0], esp + +ifdef _DEBUG + mov trashReg, STACK_OVERWRITE_BARRIER_SIZE +@@: + push STACK_OVERWRITE_BARRIER_VALUE + dec trashReg + jnz @B +endif ; _DEBUG + +endm ; PUSH_CPFH_FOR_COM + + +POP_CPFH_FOR_COM macro 
trashReg + + ; + ; Unlink FrameHandlerExRecord from FS:0 chain + ; +ifdef _DEBUG + add esp, STACK_OVERWRITE_BARRIER_SIZE*4 +endif + mov trashReg, [esp + OFFSETOF__FrameHandlerExRecord__m_ExReg__Next] + mov fs:[0], trashReg + add esp, SIZEOF_FrameHandlerExRecord + +endm ; POP_CPFH_FOR_COM +endif ; FEATURE_COMINTEROP + +; +; FramedMethodFrame prolog +; +STUB_PROLOG macro + ; push ebp-frame + push ebp + mov ebp,esp + + ; save CalleeSavedRegisters + push ebx + push esi + push edi + + ; push ArgumentRegisters + push ecx + push edx +endm + +; +; FramedMethodFrame epilog +; +STUB_EPILOG macro + ; pop ArgumentRegisters + pop edx + pop ecx + + ; pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp +endm + +; +; FramedMethodFrame epilog +; +STUB_EPILOG_RETURN macro + ; pop ArgumentRegisters + add esp, 8 + + ; pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp +endm + +STUB_PROLOG_2_HIDDEN_ARGS macro + + ; + ; The stub arguments are where we want to setup the TransitionBlock. We will + ; setup the TransitionBlock later once we can trash them + ; + ; push ebp-frame + ; push ebp + ; mov ebp,esp + + ; save CalleeSavedRegisters + ; push ebx + + push esi + push edi + + ; push ArgumentRegisters + push ecx + push edx + + mov ecx, [esp + 4*4] + mov edx, [esp + 5*4] + + ; Setup up proper EBP frame now that the stub arguments can be trashed + mov [esp + 4*4],ebx + mov [esp + 5*4],ebp + lea ebp, [esp + 5*4] +endm + +ResetCurrentContext PROC stdcall public + LOCAL ctrlWord:WORD + + ; Clear the direction flag (used for rep instructions) + cld + + fnstcw ctrlWord + fninit ; reset FPU + and ctrlWord, 0f00h ; preserve precision and rounding control + or ctrlWord, 007fh ; mask all exceptions + fldcw ctrlWord ; preserve precision control + RET +ResetCurrentContext ENDP + +;Incoming: +; ESP+4: Pointer to buffer to which FPU state should be saved +_CaptureFPUContext@4 PROC public + + mov ecx, [esp+4] + fnstenv [ecx] + retn 4 + +_CaptureFPUContext@4 ENDP + +; Incoming: +; ESP+4: Pointer to buffer from which FPU state should be restored +_RestoreFPUContext@4 PROC public + + mov ecx, [esp+4] + fldenv [ecx] + retn 4 + +_RestoreFPUContext@4 ENDP + +ifndef FEATURE_CORECLR +ifdef _DEBUG +; For C++ exceptions, we desperately need to know the SEH code. This allows us to properly +; distinguish managed exceptions from C++ exceptions from standard SEH like hard stack overflow. +; We do this by providing our own handler that squirrels away the exception code and then +; defers to the C++ service. Fortunately, two symbols exist for the C++ symbol. +___CxxFrameHandler3 PROC public + + ; We don't know what arguments are passed to us (except for the first arg on stack) + ; It turns out that EAX is part of the non-standard calling convention of this + ; function. + + push eax + push edx + + cmp dword ptr [_gThreadTLSIndex], -1 + je Chain ; CLR is not initialized yet + + call _GetThread@0 + + test eax, eax ; not a managed thread + jz Chain + + mov edx, [esp + 0ch] ; grab the first argument + mov edx, [edx] ; grab the SEH exception code + + mov dword ptr [eax + Thread_m_LastCxxSEHExceptionCode], edx + +Chain: + + pop edx + + ; [esp] contains the value of EAX we must restore. We would like + ; [esp] to contain the address of the real imported CxxFrameHandler + ; so we can chain to it. + + mov eax, [__imp____CxxFrameHandler] + mov eax, [eax] + xchg [esp], eax + + ret + +___CxxFrameHandler3 ENDP +endif ; _DEBUG +endif ; FEATURE_CORECLR + +; Register CLR exception handlers defined on the C++ side with SAFESEH. 
+; Note that these directives must be in a file that defines symbols that will be used during linking, +; otherwise it's possible that the resulting .obj will completly be ignored by the linker and these +; directives will have no effect. +COMPlusFrameHandler proto c +.safeseh COMPlusFrameHandler + +COMPlusNestedExceptionHandler proto c +.safeseh COMPlusNestedExceptionHandler + +FastNExportExceptHandler proto c +.safeseh FastNExportExceptHandler + +UMThunkPrestubHandler proto c +.safeseh UMThunkPrestubHandler + +ifdef FEATURE_COMINTEROP +COMPlusFrameHandlerRevCom proto c +.safeseh COMPlusFrameHandlerRevCom +endif + +; Note that RtlUnwind trashes EBX, ESI and EDI, so this wrapper preserves them +CallRtlUnwind PROC stdcall public USES ebx esi edi, pEstablisherFrame :DWORD, callback :DWORD, pExceptionRecord :DWORD, retVal :DWORD + + push retVal + push pExceptionRecord + push callback + push pEstablisherFrame + call dword ptr [__imp__RtlUnwind@16] + + ; return 1 + push 1 + pop eax + + RET +CallRtlUnwind ENDP + +_ResumeAtJitEHHelper@4 PROC public + mov edx, [esp+4] ; edx = pContext (EHContext*) + + mov ebx, [edx+EHContext_Ebx] + mov esi, [edx+EHContext_Esi] + mov edi, [edx+EHContext_Edi] + mov ebp, [edx+EHContext_Ebp] + mov ecx, [edx+EHContext_Esp] + mov eax, [edx+EHContext_Eip] + mov [ecx-4], eax + mov eax, [edx+EHContext_Eax] + mov [ecx-8], eax + mov eax, [edx+EHContext_Ecx] + mov [ecx-0Ch], eax + mov eax, [edx+EHContext_Edx] + mov [ecx-10h], eax + lea esp, [ecx-10h] + pop edx + pop ecx + pop eax + ret +_ResumeAtJitEHHelper@4 ENDP + +; int __stdcall CallJitEHFilterHelper(size_t *pShadowSP, EHContext *pContext); +; on entry, only the pContext->Esp, Ebx, Esi, Edi, Ebp, and Eip are initialized +_CallJitEHFilterHelper@8 PROC public + push ebp + mov ebp, esp + push ebx + push esi + push edi + + pShadowSP equ [ebp+8] + pContext equ [ebp+12] + + mov eax, pShadowSP ; Write esp-4 to the shadowSP slot + test eax, eax + jz DONE_SHADOWSP_FILTER + mov ebx, esp + sub ebx, 4 + or ebx, SHADOW_SP_IN_FILTER_ASM + mov [eax], ebx + DONE_SHADOWSP_FILTER: + + mov edx, [pContext] + mov eax, [edx+EHContext_Eax] + mov ebx, [edx+EHContext_Ebx] + mov esi, [edx+EHContext_Esi] + mov edi, [edx+EHContext_Edi] + mov ebp, [edx+EHContext_Ebp] + + call dword ptr [edx+EHContext_Eip] +ifdef _DEBUG + nop ; Indicate that it is OK to call managed code directly from here +endif + + pop edi + pop esi + pop ebx + pop ebp ; don't use 'leave' here, as ebp as been trashed + retn 8 +_CallJitEHFilterHelper@8 ENDP + + +; void __stdcall CallJITEHFinallyHelper(size_t *pShadowSP, EHContext *pContext); +; on entry, only the pContext->Esp, Ebx, Esi, Edi, Ebp, and Eip are initialized +_CallJitEHFinallyHelper@8 PROC public + push ebp + mov ebp, esp + push ebx + push esi + push edi + + pShadowSP equ [ebp+8] + pContext equ [ebp+12] + + mov eax, pShadowSP ; Write esp-4 to the shadowSP slot + test eax, eax + jz DONE_SHADOWSP_FINALLY + mov ebx, esp + sub ebx, 4 + mov [eax], ebx + DONE_SHADOWSP_FINALLY: + + mov edx, [pContext] + mov eax, [edx+EHContext_Eax] + mov ebx, [edx+EHContext_Ebx] + mov esi, [edx+EHContext_Esi] + mov edi, [edx+EHContext_Edi] + mov ebp, [edx+EHContext_Ebp] + call dword ptr [edx+EHContext_Eip] +ifdef _DEBUG + nop ; Indicate that it is OK to call managed code directly from here +endif + + ; Reflect the changes to the context and only update non-volatile registers. 
+ ; This will be used later to update REGDISPLAY + mov edx, [esp+12+12] + mov [edx+EHContext_Ebx], ebx + mov [edx+EHContext_Esi], esi + mov [edx+EHContext_Edi], edi + mov [edx+EHContext_Ebp], ebp + + pop edi + pop esi + pop ebx + pop ebp ; don't use 'leave' here, as ebp as been trashed + retn 8 +_CallJitEHFinallyHelper@8 ENDP + + +_GetSpecificCpuTypeAsm@0 PROC public + push ebx ; ebx is trashed by the cpuid calls + + ; See if the chip supports CPUID + pushfd + pop ecx ; Get the EFLAGS + mov eax, ecx ; Save for later testing + xor ecx, 200000h ; Invert the ID bit. + push ecx + popfd ; Save the updated flags. + pushfd + pop ecx ; Retrieve the updated flags + xor ecx, eax ; Test if it actually changed (bit set means yes) + push eax + popfd ; Restore the flags + + test ecx, 200000h + jz Assume486 + + xor eax, eax + cpuid + + test eax, eax + jz Assume486 ; brif CPUID1 not allowed + + mov eax, 1 + cpuid + + ; filter out everything except family and model + ; Note that some multi-procs have different stepping number for each proc + and eax, 0ff0h + + jmp CpuTypeDone + +Assume486: + mov eax, 0400h ; report 486 +CpuTypeDone: + pop ebx + retn +_GetSpecificCpuTypeAsm@0 ENDP + +; DWORD __stdcall GetSpecificCpuFeaturesAsm(DWORD *pInfo); +_GetSpecificCpuFeaturesAsm@4 PROC public + push ebx ; ebx is trashed by the cpuid calls + + ; See if the chip supports CPUID + pushfd + pop ecx ; Get the EFLAGS + mov eax, ecx ; Save for later testing + xor ecx, 200000h ; Invert the ID bit. + push ecx + popfd ; Save the updated flags. + pushfd + pop ecx ; Retrieve the updated flags + xor ecx, eax ; Test if it actually changed (bit set means yes) + push eax + popfd ; Restore the flags + + test ecx, 200000h + jz CpuFeaturesFail + + xor eax, eax + cpuid + + test eax, eax + jz CpuFeaturesDone ; br if CPUID1 not allowed + + mov eax, 1 + cpuid + mov eax, edx ; return all feature flags + mov edx, [esp+8] + test edx, edx + jz CpuFeaturesDone + mov [edx],ebx ; return additional useful information + jmp CpuFeaturesDone + +CpuFeaturesFail: + xor eax, eax ; Nothing to report +CpuFeaturesDone: + pop ebx + retn 4 +_GetSpecificCpuFeaturesAsm@4 ENDP + + +;----------------------------------------------------------------------- +; The out-of-line portion of the code to enable preemptive GC. +; After the work is done, the code jumps back to the "pRejoinPoint" +; which should be emitted right after the inline part is generated. +; +; Assumptions: +; ebx = Thread +; Preserves +; all registers except ecx. +; +;----------------------------------------------------------------------- +_StubRareEnable proc public + push eax + push edx + + push ebx + call _StubRareEnableWorker@4 + + pop edx + pop eax + retn +_StubRareEnable ENDP + +ifdef FEATURE_COMINTEROP +_StubRareDisableHR proc public + push edx + + push ebx ; Thread + call _StubRareDisableHRWorker@4 + + pop edx + retn +_StubRareDisableHR ENDP +endif ; FEATURE_COMINTEROP + +_StubRareDisableTHROW proc public + push eax + push edx + + push ebx ; Thread + call _StubRareDisableTHROWWorker@4 + + pop edx + pop eax + retn +_StubRareDisableTHROW endp + + +ifdef FEATURE_MIXEDMODE +; VOID __stdcall IJWNOADThunkJumpTarget(void); +; This routine is used by the IJWNOADThunk to determine the callsite of the domain-specific stub to call. +_IJWNOADThunkJumpTarget@0 proc public + + push ebp + mov ebp, esp + + ; EAX contains IJWNOADThunk* + ; Must retain ebx, ecx, edx, esi, edi. + + ; save ebx - holds the IJWNOADThunk* + ; save ecx - holds the current AppDomain ID. + ; save edx - holds the cached AppDomain ID. 
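Expressed at C level, the cache probe performed by the body below looks roughly like the sketch that follows; the struct and function names are illustrative, with the entry size (8 bytes) and the code-address offset (4) taken from the IJWNOADThunk__* constants in asmconstants.h.

#include <windows.h>

// Fast path: probe a fixed four-slot cache mapping an AppDomain ID to the
// per-domain code address, matching the four compares in the assembly below.
// A miss (or a null code address) falls back to IJWNOADThunkJumpTargetHelper.
struct IJWNOADThunkStubCacheSketch { DWORD adid; void *codeAddr; };  // 8 bytes per entry

void *LookupTargetSketch(IJWNOADThunkStubCacheSketch *cache /* m_cache */,
                         DWORD currentAdid /* AppDomain::m_dwId */)
{
    for (int i = 0; i < 4; i++)
    {
        if (cache[i].adid == currentAdid && cache[i].codeAddr != NULL)
            return cache[i].codeAddr;   // cache hit: jump straight to the callsite
    }
    return NULL;                        // miss: take the helper path
}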
+ push ebx + push ecx + + ; put the IJWNOADThunk into ebx for safe keeping + mov ebx, eax + + ; get thread - assumes registers are preserved + call _GetThread@0 + + ; if thread is null, go down un-optimized path + test eax,eax + jz cachemiss + + ; get current domain - assumes registers are preserved + call _GetAppDomain@0 + + ; if domain is null, go down un-optimized path + test eax,eax + jz cachemiss + + ; get the current appdomain id + mov ecx, [eax + AppDomain__m_dwId] + + ; test it against each cache location + mov eax, ebx + add eax, IJWNOADThunk__m_cache + cmp ecx, [eax] + je cachehit + + add eax, IJWNOADThunk__NextCacheOffset + cmp ecx, [eax] + je cachehit + + add eax, IJWNOADThunk__NextCacheOffset + cmp ecx, [eax] + je cachehit + + add eax, IJWNOADThunk__NextCacheOffset + cmp ecx, [eax] + je cachehit + +cachemiss: + ; save extra registers + push edx + push esi + push edi + + ; call unoptimized path + push ebx ; only arg is IJWNOADThunk* + call _IJWNOADThunkJumpTargetHelper@4 + + ; restore extra registers + pop edi + pop esi + pop edx + + ; jump back up to the epilog + jmp complete + +cachehit: + ; found a matching ADID, get the code addr. + mov eax, [eax + IJWNOADThunk__CodeAddrOffsetFromADID] + + ; if the callsite is null, go down the un-optimized path + test eax, eax + jz cachemiss + +complete: + ; restore regs + pop ecx + pop ebx + + mov esp, ebp + pop ebp + + ; Jump to callsite + jmp eax + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret +_IJWNOADThunkJumpTarget@0 endp + +endif + +InternalExceptionWorker proc public + pop edx ; recover RETADDR + add esp, eax ; release caller's args + push edx ; restore RETADDR + jmp @JIT_InternalThrow@4 +InternalExceptionWorker endp + +; EAX -> number of caller arg bytes on the stack that we must remove before going +; to the throw helper, which assumes the stack is clean. +_ArrayOpStubNullException proc public + ; kFactorReg and kTotalReg could not have been modified, but let's pop + ; them anyway for consistency and to avoid future bugs. + pop esi + pop edi + mov ecx, CORINFO_NullReferenceException_ASM + jmp InternalExceptionWorker +_ArrayOpStubNullException endp + +; EAX -> number of caller arg bytes on the stack that we must remove before going +; to the throw helper, which assumes the stack is clean. +_ArrayOpStubRangeException proc public + ; kFactorReg and kTotalReg could not have been modified, but let's pop + ; them anyway for consistency and to avoid future bugs. + pop esi + pop edi + mov ecx, CORINFO_IndexOutOfRangeException_ASM + jmp InternalExceptionWorker +_ArrayOpStubRangeException endp + +; EAX -> number of caller arg bytes on the stack that we must remove before going +; to the throw helper, which assumes the stack is clean. +_ArrayOpStubTypeMismatchException proc public + ; kFactorReg and kTotalReg could not have been modified, but let's pop + ; them anyway for consistency and to avoid future bugs. + pop esi + pop edi + mov ecx, CORINFO_ArrayTypeMismatchException_ASM + jmp InternalExceptionWorker +_ArrayOpStubTypeMismatchException endp + +;------------------------------------------------------------------------------ +; This helper routine enregisters the appropriate arguments and makes the +; actual call. 
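The fields the stub reads are described by the CallDescrData__* offsets it uses; a sketch of that view of the descriptor follows. Only the field meanings are taken from the code, the ordering shown here is illustrative.

#include <windows.h>

// What CallDescrWorkerInternal consumes: copy numStackSlots DWORDs from pSrc
// onto the stack (last slot first), load EDX/ECX from pArgumentRegisters[0]/[1],
// call pTarget, then store the return value according to fpReturnSize.
struct CallDescrDataSketch
{
    const DWORD *pSrc;               // outgoing stack arguments
    DWORD        numStackSlots;      // number of 4-byte slots to copy
    const DWORD *pArgumentRegisters; // [0] -> EDX, [1] -> ECX
    DWORD        fpReturnSize;       // 0 = int/void, 4 = float, 8 = double
    void        *pTarget;            // code address to call
    UINT64       returnValue;        // EAX:EDX, or the spilled x87 result
};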
+;------------------------------------------------------------------------------ +; void STDCALL CallDescrWorkerInternal(CallDescrWorkerParams * pParams) +CallDescrWorkerInternal PROC stdcall public USES EBX, + pParams: DWORD + + mov ebx, pParams + + mov ecx, [ebx+CallDescrData__numStackSlots] + mov eax, [ebx+CallDescrData__pSrc] ; copy the stack + test ecx, ecx + jz donestack + lea eax, [eax+4*ecx-4] ; last argument + push dword ptr [eax] + dec ecx + jz donestack + sub eax, 4 + push dword ptr [eax] + dec ecx + jz donestack +stackloop: + sub eax, 4 + push dword ptr [eax] + dec ecx + jnz stackloop +donestack: + + ; now we must push each field of the ArgumentRegister structure + mov eax, [ebx+CallDescrData__pArgumentRegisters] + mov edx, dword ptr [eax] + mov ecx, dword ptr [eax+4] + + call [ebx+CallDescrData__pTarget] +ifdef _DEBUG + nop ; This is a tag that we use in an assert. Fcalls expect to + ; be called from Jitted code or from certain blessed call sites like + ; this one. (See HelperMethodFrame::InsureInit) +endif + + ; Save FP return value if necessary + mov ecx, [ebx+CallDescrData__fpReturnSize] + cmp ecx, 0 + je ReturnsInt + + cmp ecx, 4 + je ReturnsFloat + cmp ecx, 8 + je ReturnsDouble + ; unexpected + jmp Epilog + +ReturnsInt: + mov [ebx+CallDescrData__returnValue], eax + mov [ebx+CallDescrData__returnValue+4], edx + +Epilog: + RET + +ReturnsFloat: + fstp dword ptr [ebx+CallDescrData__returnValue] ; Spill the Float return value + jmp Epilog + +ReturnsDouble: + fstp qword ptr [ebx+CallDescrData__returnValue] ; Spill the Double return value + jmp Epilog + +CallDescrWorkerInternal endp + +ifdef _DEBUG +; int __fastcall HelperMethodFrameRestoreState(HelperMethodFrame*, struct MachState *) +FASTCALL_FUNC HelperMethodFrameRestoreState,8 + mov eax, edx ; eax = MachState* +else +; int __fastcall HelperMethodFrameRestoreState(struct MachState *) +FASTCALL_FUNC HelperMethodFrameRestoreState,4 + mov eax, ecx ; eax = MachState* +endif + ; restore the registers from the m_MachState stucture. Note that + ; we only do this for register that where not saved on the stack + ; at the time the machine state snapshot was taken. + + cmp [eax+MachState__pRetAddr], 0 + +ifdef _DEBUG + jnz noConfirm + push ebp + push ebx + push edi + push esi + push ecx ; HelperFrame* + call _HelperMethodFrameConfirmState@20 + ; on return, eax = MachState* + cmp [eax+MachState__pRetAddr], 0 +noConfirm: +endif + + jz doRet + + lea edx, [eax+MachState__esi] ; Did we have to spill ESI + cmp [eax+MachState__pEsi], edx + jnz SkipESI + mov esi, [edx] ; Then restore it +SkipESI: + + lea edx, [eax+MachState__edi] ; Did we have to spill EDI + cmp [eax+MachState__pEdi], edx + jnz SkipEDI + mov edi, [edx] ; Then restore it +SkipEDI: + + lea edx, [eax+MachState__ebx] ; Did we have to spill EBX + cmp [eax+MachState__pEbx], edx + jnz SkipEBX + mov ebx, [edx] ; Then restore it +SkipEBX: + + lea edx, [eax+MachState__ebp] ; Did we have to spill EBP + cmp [eax+MachState__pEbp], edx + jnz SkipEBP + mov ebp, [edx] ; Then restore it +SkipEBP: + +doRet: + xor eax, eax + retn +FASTCALL_ENDFUNC HelperMethodFrameRestoreState + + +ifndef FEATURE_IMPLICIT_TLS +;--------------------------------------------------------------------------- +; Portable GetThread() function: used if no platform-specific optimizations apply. +; This is in assembly code because we count on edx not getting trashed on calls +; to this function. 
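A C equivalent of the portable lookup below: the Thread* lives in an ordinary TLS slot whose index is published in gThreadTLSIndex, and the only reason this stays in assembly is the register-preservation guarantee noted above. A minimal sketch:

#include <windows.h>

extern "C" DWORD gThreadTLSIndex;   // set to -1 until the runtime initializes it

// The same TlsGetValue-based lookup the stub performs; preserving ECX/EDX
// across the call is the part that still has to be done in assembly.
void *GetThreadGenericSketch()
{
    return TlsGetValue(gThreadTLSIndex);
}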
+;--------------------------------------------------------------------------- +; Thread* __stdcall GetThreadGeneric(void); +GetThreadGeneric PROC stdcall public USES ecx edx + +ifdef _DEBUG + cmp dword ptr [_gThreadTLSIndex], -1 + jnz @F + int 3 +@@: +endif +ifdef ENABLE_GET_THREAD_GENERIC_FULL_CHECK + ; non-PAL, debug-only GetThreadGeneric should defer to GetThreadGenericFullCheck + ; to do extra contract enforcement. (See GetThreadGenericFullCheck for details.) + ; This code is intentionally not added to asmhelper.s, as this enforcement is only + ; implemented for non-PAL builds. + call GetThreadGenericFullCheck +else + push dword ptr [_gThreadTLSIndex] + call dword ptr [__imp__TlsGetValue@4] +endif + ret +GetThreadGeneric ENDP + +;--------------------------------------------------------------------------- +; Portable GetAppdomain() function: used if no platform-specific optimizations apply. +; This is in assembly code because we count on edx not getting trashed on calls +; to this function. +;--------------------------------------------------------------------------- +; Appdomain* __stdcall GetAppDomainGeneric(void); +GetAppDomainGeneric PROC stdcall public USES ecx edx + +ifdef _DEBUG + cmp dword ptr [_gAppDomainTLSIndex], -1 + jnz @F + int 3 +@@: +endif + + push dword ptr [_gAppDomainTLSIndex] + call dword ptr [__imp__TlsGetValue@4] + ret +GetAppDomainGeneric ENDP +endif + + +ifdef FEATURE_HIJACK + +; A JITted method's return address was hijacked to return to us here. +; VOID OnHijackTripThread() +OnHijackTripThread PROC stdcall public + + ; Don't fiddle with this unless you change HijackFrame::UpdateRegDisplay + ; and HijackArgs + push eax ; make room for the real return address (Eip) + push ebp + push eax + push ecx + push edx + push ebx + push esi + push edi + + ; unused space for floating point state + sub esp,12 + + push esp + call _OnHijackWorker@4 + + ; unused space for floating point state + add esp,12 + + pop edi + pop esi + pop ebx + pop edx + pop ecx + pop eax + pop ebp + retn ; return to the correct place, adjusted by our caller +OnHijackTripThread ENDP + +; VOID OnHijackFPTripThread() +OnHijackFPTripThread PROC stdcall public + + ; Don't fiddle with this unless you change HijackFrame::UpdateRegDisplay + ; and HijackArgs + push eax ; make room for the real return address (Eip) + push ebp + push eax + push ecx + push edx + push ebx + push esi + push edi + + sub esp,12 + + ; save top of the floating point stack (there is return value passed in it) + ; save full 10 bytes to avoid precision loss + fstp tbyte ptr [esp] + + push esp + call _OnHijackWorker@4 + + ; restore top of the floating point stack + fld tbyte ptr [esp] + + add esp,12 + + pop edi + pop esi + pop ebx + pop edx + pop ecx + pop eax + pop ebp + retn ; return to the correct place, adjusted by our caller +OnHijackFPTripThread ENDP + +endif ; FEATURE_HIJACK + + +; Note that the debugger skips this entirely when doing SetIP, +; since COMPlusCheckForAbort should always return 0. Excep.cpp:LeaveCatch +; asserts that to be true. If this ends up doing more work, then the +; debugger may need additional support. 
+; void __stdcall JIT_EndCatch(); +JIT_EndCatch PROC stdcall public + + ; make temp storage for return address, and push the address of that + ; as the last arg to COMPlusEndCatch + mov ecx, [esp] + push ecx; + push esp; + + ; push the rest of COMPlusEndCatch's args, right-to-left + push esi + push edi + push ebx + push ebp + + call _COMPlusEndCatch@20 ; returns old esp value in eax, stores jump address + ; now eax = new esp, [esp] = new eip + + pop edx ; edx = new eip + mov esp, eax ; esp = new esp + jmp edx ; eip = new eip + +JIT_EndCatch ENDP + +;========================================================================== +; This function is reached only via the embedded ImportThunkGlue code inside +; an NDirectMethodDesc. It's purpose is to load the DLL associated with an +; N/Direct method, then backpatch the DLL target into the methoddesc. +; +; Initial state: +; +; Preemptive GC is *enabled*: we are actually in an unmanaged state. +; +; +; [esp+...] - The *unmanaged* parameters to the DLL target. +; [esp+4] - Return address back into the JIT'ted code that made +; the DLL call. +; [esp] - Contains the "return address." Because we got here +; thru a call embedded inside a MD, this "return address" +; gives us an easy to way to find the MD (which was the +; whole purpose of the embedded call manuever.) +; +; +; +;========================================================================== +_NDirectImportThunk@0 proc public + + ; Preserve argument registers + push ecx + push edx + + ; Invoke the function that does the real work. + push eax + call _NDirectImportWorker@4 + + ; Restore argument registers + pop edx + pop ecx + + ; If we got back from NDirectImportWorker, the MD has been successfully + ; linked and "eax" contains the DLL target. Proceed to execute the + ; original DLL call. + jmp eax ; Jump to DLL target +_NDirectImportThunk@0 endp + +;========================================================================== +; The call in fixup precode initally points to this function. +; The pupose of this function is to load the MethodDesc and forward the call the prestub. +_PrecodeFixupThunk@0 proc public + + pop eax ; Pop the return address. It points right after the call instruction in the precode. 
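The address arithmetic in the next few instructions recovers the MethodDesc from the precode; transliterated into C (with retAddr being the value just popped into EAX) it reads roughly as follows. The helper name is illustrative; the byte offsets are exactly the ones used below.

#include <windows.h>

// retAddr points just past the call instruction inside the FixupPrecode.
BYTE *GetMethodDescFromPrecodeSketch(BYTE *retAddr)
{
    BYTE precodeChunkIndex    = retAddr[2];            // m_PrecodeChunkIndex
    BYTE methodDescChunkIndex = retAddr[1];            // m_MethodDescChunkIndex
    BYTE *chunkBase = *(BYTE **)(retAddr + precodeChunkIndex * 8 + 3);
    return chunkBase + methodDescChunkIndex * 4;       // the MethodDesc*
}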
+ push esi + push edi + + ; Inline computation done by FixupPrecode::GetMethodDesc() + movzx esi,byte ptr [eax+2] ; m_PrecodeChunkIndex + movzx edi,byte ptr [eax+1] ; m_MethodDescChunkIndex + mov eax,dword ptr [eax+esi*8+3] + lea eax,[eax+edi*4] + + pop edi + pop esi + jmp _ThePreStub@0 + +_PrecodeFixupThunk@0 endp + +; LPVOID __stdcall CTPMethodTable__CallTargetHelper2( +; const void *pTarget, +; LPVOID pvFirst, +; LPVOID pvSecond) +CTPMethodTable__CallTargetHelper2 proc stdcall public, + pTarget : DWORD, + pvFirst : DWORD, + pvSecond : DWORD + mov ecx, pvFirst + mov edx, pvSecond + + call pTarget +ifdef _DEBUG + nop ; Mark this as a special call site that can + ; directly call unmanaged code +endif + ret +CTPMethodTable__CallTargetHelper2 endp + +; LPVOID __stdcall CTPMethodTable__CallTargetHelper3( +; const void *pTarget, +; LPVOID pvFirst, +; LPVOID pvSecond, +; LPVOID pvThird) +CTPMethodTable__CallTargetHelper3 proc stdcall public, + pTarget : DWORD, + pvFirst : DWORD, + pvSecond : DWORD, + pvThird : DWORD + push pvThird + + mov ecx, pvFirst + mov edx, pvSecond + + call pTarget +ifdef _DEBUG + nop ; Mark this as a special call site that can + ; directly call unmanaged code +endif + ret +CTPMethodTable__CallTargetHelper3 endp + + +; void __stdcall setFPReturn(int fpSize, INT64 retVal) +_setFPReturn@12 proc public + mov ecx, [esp+4] + + ; leave the return value in eax:edx if it is not the floating point case + mov eax, [esp+8] + mov edx, [esp+12] + + cmp ecx, 4 + jz setFPReturn4 + + cmp ecx, 8 + jnz setFPReturnNot8 + fld qword ptr [esp+8] +setFPReturnNot8: + retn 12 + +setFPReturn4: + fld dword ptr [esp+8] + retn 12 +_setFPReturn@12 endp + +; void __stdcall getFPReturn(int fpSize, INT64 *pretVal) +_getFPReturn@8 proc public + mov ecx, [esp+4] + mov eax, [esp+8] + cmp ecx, 4 + jz getFPReturn4 + + cmp ecx, 8 + jnz getFPReturnNot8 + fstp qword ptr [eax] +getFPReturnNot8: + retn 8 + +getFPReturn4: + fstp dword ptr [eax] + retn 8 +_getFPReturn@8 endp + +; void __stdcall UM2MThunk_WrapperHelper(void *pThunkArgs, +; int argLen, +; void *pAddr, +; UMEntryThunk *pEntryThunk, +; Thread *pThread) +UM2MThunk_WrapperHelper proc stdcall public, + pThunkArgs : DWORD, + argLen : DWORD, + pAddr : DWORD, + pEntryThunk : DWORD, + pThread : DWORD + UNREFERENCED argLen + + push ebx + + mov eax, pEntryThunk + mov ecx, pThread + mov ebx, pThunkArgs + call pAddr + + pop ebx + + ret +UM2MThunk_WrapperHelper endp + +; VOID __cdecl UMThunkStubRareDisable() +; +; @todo: this is very similar to StubRareDisable +; +_UMThunkStubRareDisable proc public + push eax + push ecx + + push eax ; Push the UMEntryThunk + push ecx ; Push thread + call _UMThunkStubRareDisableWorker@8 + + pop ecx + pop eax + retn +_UMThunkStubRareDisable endp + + +;+---------------------------------------------------------------------------- +; +; Method: CRemotingServices::CheckForContextMatch public +; +; Synopsis: This code generates a check to see if the current context and +; the context of the proxy match. 
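In C terms the check that follows boils down to a subtraction, so a zero result means the proxy's context is the current thread's context. A sketch with illustrative names (the boxed stub data is read at offset 4, just past the object's method-table pointer, exactly as the assembly does):

#include <windows.h>

// pStubData: the boxed stub data passed in EAX.  pCurrentContext: the value of
// Thread::m_Context for the current thread.  Returns 0 iff the contexts match.
UINT_PTR CheckForContextMatchSketch(void *pStubData, void *pCurrentContext)
{
    UINT_PTR proxyContext = *(UINT_PTR *)((BYTE *)pStubData + 4);
    return (UINT_PTR)pCurrentContext - proxyContext;
}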
+; +;+---------------------------------------------------------------------------- +; +; returns zero if contexts match +; returns non-zero if contexts do not match +; +; UINT_PTR __stdcall CRemotingServices__CheckForContextMatch(Object* pStubData) +ifdef FEATURE_REMOTING +_CRemotingServices__CheckForContextMatch@4 proc public + push ebx ; spill ebx + mov ebx, [eax+4] ; Get the internal context id by unboxing + ; the stub data + call _GetThread@0 ; Get the current thread, assumes that the + ; registers are preserved + mov eax, [eax+Thread_m_Context] ; Get the current context from the + ; thread + sub eax, ebx ; Get the pointer to the context from the + ; proxy and compare with the current context + pop ebx ; restore the value of ebx + retn +_CRemotingServices__CheckForContextMatch@4 endp +endif ; FEATURE_REMOTING + +;+---------------------------------------------------------------------------- +; +; Method: CRemotingServices::DispatchInterfaceCall public +; +; Synopsis: +; Push that method desc on the stack and jump to the +; transparent proxy stub to execute the call. +; WARNING!! This MethodDesc is not the methoddesc in the vtable +; of the object instead it is the methoddesc in the vtable of +; the interface class. Since we use the MethodDesc only to probe +; the stack via the signature of the method call we are safe. +; If we want to get any object vtable/class specific +; information this is not safe. +; +; +;+---------------------------------------------------------------------------- +; void __stdcall CRemotingServices__DispatchInterfaceCall() +ifdef FEATURE_REMOTING +_CRemotingServices__DispatchInterfaceCall@0 proc public + ; push MethodDesc* passed in eax by precode and forward to the worker + push eax + + ; NOTE: At this point the stack looks like + ; + ; esp---> saved MethodDesc of Interface method + ; return addr of calling function + ; + mov eax, [ecx + TransparentProxyObject___stubData] + call [ecx + TransparentProxyObject___stub] +ifdef _DEBUG + nop ; Mark this as a special call site that can directly + ; call managed code +endif + test eax, eax + jnz CtxMismatch + jmp _InContextTPQuickDispatchAsmStub@0 + +CtxMismatch: + pop eax ; restore MethodDesc * + jmp _TransparentProxyStub_CrossContext@0 ; jump to slow TP stub +_CRemotingServices__DispatchInterfaceCall@0 endp +endif ; FEATURE_REMOTING + + +;+---------------------------------------------------------------------------- +; +; Method: CRemotingServices::CallFieldGetter private +; +; Synopsis: Calls the field getter function (Object::__FieldGetter) in +; managed code by setting up the stack and calling the target +; +; +;+---------------------------------------------------------------------------- +; void __stdcall CRemotingServices__CallFieldGetter( +; MethodDesc *pMD, +; LPVOID pThis, +; LPVOID pFirst, +; LPVOID pSecond, +; LPVOID pThird) +ifdef FEATURE_REMOTING +CRemotingServices__CallFieldGetter proc stdcall public, + pMD : DWORD, + pThis : DWORD, + pFirst : DWORD, + pSecond : DWORD, + pThird : DWORD + + push [pSecond] ; push the second argument on the stack + push [pThird] ; push the third argument on the stack + + mov ecx, [pThis] ; enregister pThis, the 'this' pointer + mov edx, [pFirst] ; enregister pFirst, the first argument + + mov eax, [pMD] ; load MethodDesc of object::__FieldGetter + call _TransparentProxyStub_CrossContext@0 ; call the TP stub + + ret +CRemotingServices__CallFieldGetter endp +endif ; FEATURE_REMOTING + +;+---------------------------------------------------------------------------- +; +; Method: 
CRemotingServices::CallFieldSetter private +; +; Synopsis: Calls the field setter function (Object::__FieldSetter) in +; managed code by setting up the stack and calling the target +; +; +;+---------------------------------------------------------------------------- +; void __stdcall CRemotingServices__CallFieldSetter( +; MethodDesc *pMD, +; LPVOID pThis, +; LPVOID pFirst, +; LPVOID pSecond, +; LPVOID pThird) +ifdef FEATURE_REMOTING +CRemotingServices__CallFieldSetter proc stdcall public, + pMD : DWORD, + pThis : DWORD, + pFirst : DWORD, + pSecond : DWORD, + pThird : DWORD + + push [pSecond] ; push the field name (second arg) + push [pThird] ; push the object (third arg) on the stack + + mov ecx, [pThis] ; enregister pThis, the 'this' pointer + mov edx, [pFirst] ; enregister the first argument + + mov eax, [pMD] ; load MethodDesc of object::__FieldGetter + call _TransparentProxyStub_CrossContext@0 ; call the TP stub + + ret +CRemotingServices__CallFieldSetter endp +endif ; FEATURE_REMOTING + +;+---------------------------------------------------------------------------- +; +; Method: CTPMethodTable::GenericCheckForContextMatch private +; +; Synopsis: Calls the stub in the TP & returns TRUE if the contexts +; match, FALSE otherwise. +; +; Note: 1. Called during FieldSet/Get, used for proxy extensibility +; +;+---------------------------------------------------------------------------- +; BOOL __stdcall CTPMethodTable__GenericCheckForContextMatch(Object* orTP) +ifdef FEATURE_REMOTING +CTPMethodTable__GenericCheckForContextMatch proc stdcall public uses ecx, tp : DWORD + + mov ecx, [tp] + mov eax, [ecx + TransparentProxyObject___stubData] + call [ecx + TransparentProxyObject___stub] +ifdef _DEBUG + nop ; Mark this as a special call site that can directly + ; call managed code +endif + test eax, eax + mov eax, 0 + setz al + ; NOTE: In the CheckForXXXMatch stubs (for URT ctx/ Ole32 ctx) eax is + ; non-zero if contexts *do not* match & zero if they do. + ret +CTPMethodTable__GenericCheckForContextMatch endp +endif ; FEATURE_REMOTING + + +; void __stdcall JIT_ProfilerEnterLeaveTailcallStub(UINT_PTR ProfilerHandle) +_JIT_ProfilerEnterLeaveTailcallStub@4 proc public + ; this function must preserve all registers, including scratch + retn 4 +_JIT_ProfilerEnterLeaveTailcallStub@4 endp + +; +; Used to get the current instruction pointer value +; +; UINT_PTR __stdcall GetCurrentIP(void); +_GetCurrentIP@0 proc public + mov eax, [esp] + retn +_GetCurrentIP@0 endp + +; LPVOID __stdcall GetCurrentSP(void); +_GetCurrentSP@0 proc public + mov eax, esp + retn +_GetCurrentSP@0 endp + + +; void __stdcall ProfileEnterNaked(FunctionIDOrClientID functionIDOrClientID); +_ProfileEnterNaked@4 proc public + push esi + push edi + + ; + ; Push in reverse order the fields of ProfilePlatformSpecificData + ; + push dword ptr [esp+8] ; EIP of the managed code that we return to. -- struct ip field + push ebp ; Methods are always EBP framed + add [esp], 8 ; Skip past the return IP, straight to the stack args that were passed to our caller + ; Skip past saved EBP value: 4 bytes + ; - plus return address from caller's caller: 4 bytes + ; + ; Assuming Foo() calls Bar(), and Bar() calls ProfileEnterNake() as illustrated (stack + ; grows up). We want to get what Foo() passed on the stack to Bar(), so we need to pass + ; the return address from caller's caller which is Foo() in this example. + ; + ; ProfileEnterNaked() + ; Bar() + ; Foo() + ; + ; [ESP] is now the ESP of caller's caller pointing to the arguments to the caller. 
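The 40-byte block that this push sequence builds is what ProfileEnter/ProfileLeave/ProfileTailcall receive through EDX. Reading the pushes bottom-up (the last value pushed sits at the lowest address) gives the layout sketched here; field names are taken from the "-- struct ... field" annotations, and probeSp is an illustrative name for the one slot the annotations leave unnamed.

#include <windows.h>

// Layout of the ProfilePlatformSpecificData block as these stubs build it.
struct ProfilePlatformSpecificDataSketch
{
    UINT_PTR functionId;                 // filled in by the C++ side
    DWORD    doubleBuffer1;              // x87 return value as a double, low half
    DWORD    doubleBuffer2;              // x87 return value as a double, high half
    DWORD    floatBuffer;                // x87 return value as a float
    DWORD    floatingPointValuePresent;  // 1 if the FP buffers are valid
    UINT_PTR eax;                        // integer return value / argument registers
    UINT_PTR edx;
    UINT_PTR ecx;
    UINT_PTR probeSp;                    // caller's caller ESP (the adjusted EBP push)
    UINT_PTR ip;                         // EIP in the managed code we return to
};  // 10 * 4 = 40 bytes, matching the "Skip past ... 40 bytes" comments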
+ + push ecx ; -- struct ecx field + push edx ; -- struct edx field + push eax ; -- struct eax field + push 0 ; Create buffer space in the structure -- struct floatingPointValuePresent field + push 0 ; Create buffer space in the structure -- struct floatBuffer field + push 0 ; Create buffer space in the structure -- struct doubleBuffer2 field + push 0 ; Create buffer space in the structure -- struct doubleBuffer1 field + push 0 ; Create buffer space in the structure -- struct functionId field + + mov edx, esp ; the address of the Platform structure + mov ecx, [esp+52]; The functionIDOrClientID parameter that was pushed to FunctionEnter + ; Skip past ProfilePlatformSpecificData we pushed: 40 bytes + ; - plus saved edi, esi : 8 bytes + ; - plus return address from caller: 4 bytes + + call @ProfileEnter@8 + + add esp, 20 ; Remove buffer space + pop eax + pop edx + pop ecx + add esp, 8 ; Remove buffer space + pop edi + pop esi + + retn 4 +_ProfileEnterNaked@4 endp + +; void __stdcall ProfileLeaveNaked(FunctionIDOrClientID functionIDOrClientID); +_ProfileLeaveNaked@4 proc public + push ecx ; We do not strictly need to save ECX, however + ; emitNoGChelper(CORINFO_HELP_PROF_FCN_LEAVE) returns true in the JITcompiler + push edx ; Return value may be in EAX:EDX + + ; + ; Push in reverse order the fields of ProfilePlatformSpecificData + ; + push dword ptr [esp+8] ; EIP of the managed code that we return to. -- struct ip field + push ebp ; Methods are always EBP framed + add [esp], 8 ; Skip past the return IP, straight to the stack args that were passed to our caller + ; Skip past saved EBP value: 4 bytes + ; - plus return address from caller's caller: 4 bytes + ; + ; Assuming Foo() calls Bar(), and Bar() calls ProfileEnterNake() as illustrated (stack + ; grows up). We want to get what Foo() passed on the stack to Bar(), so we need to pass + ; the return address from caller's caller which is Foo() in this example. + ; + ; ProfileEnterNaked() + ; Bar() + ; Foo() + ; + ; [ESP] is now the ESP of caller's caller pointing to the arguments to the caller. 
+ + push ecx ; -- struct ecx field + push edx ; -- struct edx field + push eax ; -- struct eax field + + ; Check if we need to save off any floating point registers + fstsw ax + and ax, 3800h ; Check the top-of-fp-stack bits + cmp ax, 0 ; If non-zero, we have something to save + jnz SaveFPReg + + push 0 ; Create buffer space in the structure -- struct floatingPointValuePresent field + push 0 ; Create buffer space in the structure -- struct floatBuffer field + push 0 ; Create buffer space in the structure -- struct doubleBuffer2 field + push 0 ; Create buffer space in the structure -- struct doubleBuffer1 field + jmp Continue + +SaveFPReg: + push 1 ; mark that a float value is present -- struct floatingPointValuePresent field + sub esp, 4 ; Make room for the FP value + fst dword ptr [esp] ; Copy the FP value to the buffer as a float -- struct floatBuffer field + sub esp, 8 ; Make room for the FP value + fstp qword ptr [esp] ; Copy FP values to the buffer as a double -- struct doubleBuffer1 and doubleBuffer2 fields + +Continue: + push 0 ; Create buffer space in the structure -- struct functionId field + + mov edx, esp ; the address of the Platform structure + mov ecx, [esp+52]; The clientData that was pushed to FunctionEnter + ; Skip past ProfilePlatformSpecificData we pushed: 40 bytes + ; - plus saved edx, ecx : 8 bytes + ; - plus return address from caller: 4 bytes + + call @ProfileLeave@8 + + ; + ; Now see if we have to restore and floating point registers + ; + + cmp [esp + 16], 0 + jz NoRestore + + fld qword ptr [esp + 4] + +NoRestore: + + add esp, 20 ; Remove buffer space + pop eax + add esp, 16 ; Remove buffer space + pop edx + pop ecx + retn 4 +_ProfileLeaveNaked@4 endp + + +; void __stdcall ProfileTailcallNaked(FunctionIDOrClientID functionIDOrClientID); +_ProfileTailcallNaked@4 proc public + push ecx + push edx + + ; + ; Push in reverse order the fields of ProfilePlatformSpecificData + ; + push dword ptr [esp+8] ; EIP of the managed code that we return to. -- struct ip field + push ebp ; Methods are always EBP framed + add [esp], 8 ; Skip past the return IP, straight to the stack args that were passed to our caller + ; Skip past saved EBP value: 4 bytes + ; - plus return address from caller's caller: 4 bytes + ; + ; Assuming Foo() calls Bar(), and Bar() calls ProfileEnterNake() as illustrated (stack + ; grows up). We want to get what Foo() passed on the stack to Bar(), so we need to pass + ; the return address from caller's caller which is Foo() in this example. + ; + ; ProfileEnterNaked() + ; Bar() + ; Foo() + ; + ; [ESP] is now the ESP of caller's caller pointing to the arguments to the caller. 
+ + push ecx ; -- struct ecx field + push edx ; -- struct edx field + push eax ; -- struct eax field + push 0 ; Create buffer space in the structure -- struct floatingPointValuePresent field + push 0 ; Create buffer space in the structure -- struct floatBuffer field + push 0 ; Create buffer space in the structure -- struct doubleBuffer2 field + push 0 ; Create buffer space in the structure -- struct doubleBuffer1 field + push 0 ; Create buffer space in the structure -- struct functionId field + + mov edx, esp ; the address of the Platform structure + mov ecx, [esp+52]; The clientData that was pushed to FunctionEnter + ; Skip past ProfilePlatformSpecificData we pushed: 40 bytes + ; - plus saved edx, ecx : 8 bytes + ; - plus return address from caller: 4 bytes + + call @ProfileTailcall@8 + + add esp, 40 ; Remove buffer space + pop edx + pop ecx + retn 4 +_ProfileTailcallNaked@4 endp + +;========================================================================== +; Invoked for vararg forward P/Invoke calls as a stub. +; Except for secret return buffer, arguments come on the stack so EDX is available as scratch. +; EAX - the NDirectMethodDesc +; ECX - may be return buffer address +; [ESP + 4] - the VASigCookie +; +_VarargPInvokeStub@0 proc public + ; EDX <- VASigCookie + mov edx, [esp + 4] ; skip retaddr + + mov edx, [edx + VASigCookie__StubOffset] + test edx, edx + + jz GoCallVarargWorker + ; --------------------------------------- + + ; EAX contains MD ptr for the IL stub + jmp edx + +GoCallVarargWorker: + ; + ; MD ptr in EAX, VASigCookie ptr at [esp+4] + ; + + STUB_PROLOG + + mov esi, esp + + ; save pMD + push eax + + push eax ; pMD + push dword ptr [esi + 4*7] ; pVaSigCookie + push esi ; pTransitionBlock + + call _VarargPInvokeStubWorker@12 + + ; restore pMD + pop eax + + STUB_EPILOG + + ; jump back to the helper - this time it won't come back here as the stub already exists + jmp _VarargPInvokeStub@0 + +_VarargPInvokeStub@0 endp + +;========================================================================== +; Invoked for marshaling-required unmanaged CALLI calls as a stub. +; EAX - the unmanaged target +; ECX, EDX - arguments +; [ESP + 4] - the VASigCookie +; +_GenericPInvokeCalliHelper@0 proc public + ; save the target + push eax + + ; EAX <- VASigCookie + mov eax, [esp + 8] ; skip target and retaddr + + mov eax, [eax + VASigCookie__StubOffset] + test eax, eax + + jz GoCallCalliWorker + ; --------------------------------------- + + push eax + + ; stack layout at this point: + ; + ; | ... | + ; | stack arguments | ESP + 16 + ; +----------------------+ + ; | VASigCookie* | ESP + 12 + ; +----------------------+ + ; | return address | ESP + 8 + ; +----------------------+ + ; | CALLI target address | ESP + 4 + ; +----------------------+ + ; | stub entry point | ESP + 0 + ; ------------------------ + + ; remove VASigCookie from the stack + mov eax, [esp + 8] + mov [esp + 12], eax + + ; move stub entry point below the RA + mov eax, [esp] + mov [esp + 8], eax + + ; load EAX with the target address + pop eax + pop eax + + ; stack layout at this point: + ; + ; | ... 
| + ; | stack arguments | ESP + 8 + ; +----------------------+ + ; | return address | ESP + 4 + ; +----------------------+ + ; | stub entry point | ESP + 0 + ; ------------------------ + + ; CALLI target address is in EAX + ret + +GoCallCalliWorker: + ; the target is on the stack and will become m_Datum of PInvokeCalliFrame + ; call the stub generating worker + pop eax + + ; + ; target ptr in EAX, VASigCookie ptr in EDX + ; + + STUB_PROLOG + + mov esi, esp + + ; save target + push eax + + push eax ; unmanaged target + push dword ptr [esi + 4*7] ; pVaSigCookie (first stack argument) + push esi ; pTransitionBlock + + call _GenericPInvokeCalliStubWorker@12 + + ; restore target + pop eax + + STUB_EPILOG + + ; jump back to the helper - this time it won't come back here as the stub already exists + jmp _GenericPInvokeCalliHelper@0 + +_GenericPInvokeCalliHelper@0 endp + +ifdef MDA_SUPPORTED + +;========================================================================== +; Invoked from on-the-fly generated stubs when the stack imbalance MDA is +; enabled. The common low-level work for both direct P/Invoke and unmanaged +; delegate P/Invoke happens here. PInvokeStackImbalanceWorker is where the +; actual imbalance check is implemented. +; [ESP + 4] - the StackImbalanceCookie +; [EBP + 8] - stack arguments (EBP frame pushed by the calling stub) +; +_PInvokeStackImbalanceHelper@0 proc public + ; StackImbalanceCookie to EBX + push ebx + lea ebx, [esp + 8] + + push esi + push edi + + ; copy stack args + mov edx, ecx + mov ecx, [ebx + StackImbalanceCookie__m_dwStackArgSize] + sub esp, ecx + + shr ecx, 2 + lea edi, [esp] + lea esi, [ebp + 8] + + cld + rep movsd + + ; record pre-call ESP + mov [ebx + StackImbalanceCookie__m_dwSavedEsp], esp + + ; call the target (restore ECX in case it's a thiscall) + mov ecx, edx + call [ebx + StackImbalanceCookie__m_pTarget] + + ; record post-call ESP and restore ESP to pre-pushed state + mov ecx, esp + lea esp, [ebp - SIZEOF_StackImbalanceCookie - 16] ; 4 DWORDs and the cookie have been pushed + + ; save return value + push eax + push edx + sub esp, 12 + +.errnz (StackImbalanceCookie__HAS_FP_RETURN_VALUE AND 00ffffffh), HAS_FP_RETURN_VALUE has changed - update asm code + + ; save top of the floating point stack if the target has FP retval + test byte ptr [ebx + StackImbalanceCookie__m_callConv + 3], (StackImbalanceCookie__HAS_FP_RETURN_VALUE SHR 24) + jz noFPURetVal + fstp tbyte ptr [esp] ; save full 10 bytes to avoid precision loss +noFPURetVal: + + ; call PInvokeStackImbalanceWorker(StackImbalanceCookie *pSICookie, DWORD dwPostESP) + push ecx + push ebx + call _PInvokeStackImbalanceWorker@8 + + ; restore return value + test byte ptr [ebx + StackImbalanceCookie__m_callConv + 3], (StackImbalanceCookie__HAS_FP_RETURN_VALUE SHR 24) + jz noFPURetValToRestore + fld tbyte ptr [esp] +noFPURetValToRestore: + + add esp, 12 + pop edx + pop eax + + ; restore registers + pop edi + pop esi + + pop ebx + + ; EBP frame and original stack arguments will be removed by the caller + ret +_PInvokeStackImbalanceHelper@0 endp + +endif ; MDA_SUPPORTED + +ifdef FEATURE_COMINTEROP + +;========================================================================== +; This is a fast alternative to CallDescr* tailored specifically for +; COM to CLR calls. Stack arguments don't come in a continuous buffer +; and secret argument can be passed in EAX. 
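The non-trivial part of the helper below is CopyStackLoop: pOutputStackOffsets maps each outgoing stack slot to a byte offset in the incoming COM frame, and slots are pushed from the last one down to slot 0 so that slot 0 ends up nearest ESP, where the managed callee expects its first stack argument. A C++ sketch of that copy, using the Windows typedefs the surrounding code already relies on (illustration only, not the actual implementation):

// Sketch of the CopyStackLoop semantics in COMToCLRDispatchHelper.
void CopyStackArgsSketch(INT_PTR* pOutputStackTop,           // simulated ESP, grows downward
                         const INT_PTR* pInputStack,
                         const UINT16* pOutputStackOffsets,
                         WORD wOutputStackSlots)
{
    for (int i = (int)wOutputStackSlots - 1; i >= 0; i--)
    {
        // "movzx ebx, word ptr [edi + 2*eax]" -- load the byte offset for slot i
        const BYTE* pSrc = (const BYTE*)pInputStack + pOutputStackOffsets[i];
        // "push [esi + ebx]" -- copy the incoming value into the outgoing slot
        *--pOutputStackTop = *(const INT_PTR*)pSrc;
    }
}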
+; + +; extern "C" ARG_SLOT __fastcall COMToCLRDispatchHelper( +; INT_PTR dwArgECX, ; ecx +; INT_PTR dwArgEDX, ; edx +; PCODE pTarget, ; [esp + 4] +; PCODE pSecretArg, ; [esp + 8] +; INT_PTR *pInputStack, ; [esp + c] +; WORD wOutputStackSlots, ; [esp +10] +; UINT16 *pOutputStackOffsets, ; [esp +14] +; Frame *pCurFrame); ; [esp +18] + +FASTCALL_FUNC COMToCLRDispatchHelper, 32 + + ; ecx: dwArgECX + ; edx: dwArgEDX + + offset_pTarget equ 4 + offset_pSecretArg equ 8 + offset_pInputStack equ 0Ch + offset_wOutputStackSlots equ 10h + offset_pOutputStackOffsets equ 14h + offset_pCurFrame equ 18h + + movzx eax, word ptr [esp + offset_wOutputStackSlots] + test eax, eax + jnz CopyStackArgs + + ; There are no stack args to copy and ECX and EDX are already setup + ; with the correct arguments for the callee, so we just have to + ; push the CPFH and make the call. + + PUSH_CPFH_FOR_COM eax, esp, offset_pCurFrame ; trashes eax + + mov eax, [esp + offset_pSecretArg + CPFH_STACK_SIZE] + call [esp + offset_pTarget + CPFH_STACK_SIZE] +ifdef _DEBUG + nop ; This is a tag that we use in an assert. +endif + + POP_CPFH_FOR_COM ecx ; trashes ecx + + ret 18h + + +CopyStackArgs: + ; eax: num stack slots + ; ecx: dwArgECX + ; edx: dwArgEDX + + push ebp + mov ebp, esp + push ebx + push esi + push edi + + ebpFrame_adjust equ 4h + ebp_offset_pCurFrame equ ebpFrame_adjust + offset_pCurFrame + + PUSH_CPFH_FOR_COM ebx, ebp, ebp_offset_pCurFrame ; trashes ebx + + mov edi, [ebp + ebpFrame_adjust + offset_pOutputStackOffsets] + mov esi, [ebp + ebpFrame_adjust + offset_pInputStack] + + ; eax: num stack slots + ; ecx: dwArgECX + ; edx: dwArgEDX + ; edi: pOutputStackOffsets + ; esi: pInputStack + +CopyStackLoop: + dec eax + movzx ebx, word ptr [edi + 2 * eax] ; ebx <- input stack offset + push [esi + ebx] ; stack <- value on the input stack + jnz CopyStackLoop + + ; ECX and EDX are setup with the correct arguments for the callee, + ; and we've copied the stack arguments over as well, so now it's + ; time to make the call. + + mov eax, [ebp + ebpFrame_adjust + offset_pSecretArg] + call [ebp + ebpFrame_adjust + offset_pTarget] +ifdef _DEBUG + nop ; This is a tag that we use in an assert. +endif + + POP_CPFH_FOR_COM ecx ; trashes ecx + + pop edi + pop esi + pop ebx + pop ebp + + ret 18h + +FASTCALL_ENDFUNC + +endif ; FEATURE_COMINTEROP + +ifndef FEATURE_CORECLR + +;========================================================================== +; This is small stub whose purpose is to record current stack pointer and +; call CopyCtorCallStubWorker to invoke copy constructors and destructors +; as appropriate. This stub operates on arguments already pushed to the +; stack by JITted IL stub and must not create a new frame, i.e. it must tail +; call to the target for it to see the arguments that copy ctors have been +; called on. 
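Put differently, the worker receives a pointer to the argument area the caller already pushed, runs the pending copy constructors and destructors in place, and returns the real target, which the stub then reaches with a jmp so the target sees the untouched stack. A rough C++ rendering of that contract; the worker's signature here is inferred from the @4 stdcall decoration and the "jmp eax" below, not taken from its actual declaration:

// Inferred contract for the stub below (sketch, not the real declaration):
// fix up the already-pushed arguments, then return the address to tail-call.
extern "C" LPVOID __stdcall CopyCtorCallStubWorker(LPVOID pStackArgs);

// Pseudo-flow of _CopyCtorCallStub:
//   1. preserve ECX (it may carry a thiscall 'this' pointer)
//   2. LPVOID target = CopyCtorCallStubWorker(&first pushed argument);
//   3. restore ECX and jmp target   // no new frame, arguments left in place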
+; +_CopyCtorCallStub@0 proc public + ; there may be an argument in ecx - save it + push ecx + + ; push pointer to arguments + lea edx, [esp + 8] + push edx + + call _CopyCtorCallStubWorker@4 + + ; restore ecx and tail call to the target + pop ecx + jmp eax +_CopyCtorCallStub@0 endp + +endif ; !FEATURE_CORECLR + +ifdef FEATURE_PREJIT + +;========================================================================== +_StubDispatchFixupStub@0 proc public + + STUB_PROLOG + + mov esi, esp + + push 0 + push 0 + + push eax ; siteAddrForRegisterIndirect (for tailcalls) + push esi ; pTransitionBlock + + call _StubDispatchFixupWorker@16 + + STUB_EPILOG + +_StubDispatchFixupPatchLabel@0: +public _StubDispatchFixupPatchLabel@0 + + ; Tailcall target + jmp eax + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret + +_StubDispatchFixupStub@0 endp + +;========================================================================== +_ExternalMethodFixupStub@0 proc public + + pop eax ; pop off the return address to the stub + ; leaving the actual caller's return address on top of the stack + + STUB_PROLOG + + mov esi, esp + + ; EAX is return address into CORCOMPILE_EXTERNAL_METHOD_THUNK. Subtract 5 to get start address. + sub eax, 5 + + push 0 + push 0 + + push eax + + ; pTransitionBlock + push esi + + call _ExternalMethodFixupWorker@16 + + ; eax now contains replacement stub. PreStubWorker will never return + ; NULL (it throws an exception if stub creation fails.) + + ; From here on, mustn't trash eax + + STUB_EPILOG + +_ExternalMethodFixupPatchLabel@0: +public _ExternalMethodFixupPatchLabel@0 + + ; Tailcall target + jmp eax + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret + +_ExternalMethodFixupStub@0 endp + +ifdef FEATURE_READYTORUN +;========================================================================== +_DelayLoad_MethodCall@0 proc public + + STUB_PROLOG_2_HIDDEN_ARGS + + mov esi, esp + + push ecx + push edx + + push eax + + ; pTransitionBlock + push esi + + call _ExternalMethodFixupWorker@16 + + ; eax now contains replacement stub. PreStubWorker will never return + ; NULL (it throws an exception if stub creation fails.) + + ; From here on, mustn't trash eax + + STUB_EPILOG + + ; Share the patch label + jmp _ExternalMethodFixupPatchLabel@0 + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret + +_DelayLoad_MethodCall@0 endp +endif + +;======================================================================================= +; The call in softbound vtable slots initially points to this function. +; The pupose of this function is to transfer the control to right target and +; to optionally patch the target of the jump so that we do not take this slow path again. +; +_VirtualMethodFixupStub@0 proc public + + pop eax ; Pop the return address. It points right after the call instruction in the thunk. 
+ sub eax,5 ; Calculate the address of the thunk + + ; Push ebp frame to get good callstack under debugger + push ebp + mov ebp, esp + + ; Preserve argument registers + push ecx + push edx + + push eax ; address of the thunk + push ecx ; this ptr + call _VirtualMethodFixupWorker@8 + + ; Restore argument registers + pop edx + pop ecx + + ; Pop ebp frame + pop ebp + +_VirtualMethodFixupPatchLabel@0: +public _VirtualMethodFixupPatchLabel@0 + + ; Proceed to execute the actual method. + jmp eax + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret + +_VirtualMethodFixupStub@0 endp + +endif ; FEATURE_PREJIT + +;========================================================================== +; The prestub +_ThePreStub@0 proc public + + STUB_PROLOG + + mov esi, esp + + ; EAX contains MethodDesc* from the precode. Push it here as argument + ; for PreStubWorker + push eax + + push esi + + call _PreStubWorker@8 + + ; eax now contains replacement stub. PreStubWorker will never return + ; NULL (it throws an exception if stub creation fails.) + + ; From here on, mustn't trash eax + + STUB_EPILOG + + ; Tailcall target + jmp eax + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. + ret + +_ThePreStub@0 endp + +; This method does nothing. It's just a fixed function for the debugger to put a breakpoint +; on so that it can trace a call target. +_ThePreStubPatch@0 proc public + ; make sure that the basic block is unique + test eax,34 +_ThePreStubPatchLabel@0: +public _ThePreStubPatchLabel@0 + ret +_ThePreStubPatch@0 endp + +ifdef FEATURE_COMINTEROP +;========================================================================== +; CLR -> COM generic or late-bound call +_GenericComPlusCallStub@0 proc public + + STUB_PROLOG + + ; pTransitionBlock + mov esi, esp + + ; return value + sub esp, 8 + + ; save pMD + mov ebx, eax + + push eax ; pMD + push esi ; pTransitionBlock + call _CLRToCOMWorker@8 + + push eax + call _setFPReturn@12 ; pop & set the return value + + ; From here on, mustn't trash eax:edx + + ; Get pComPlusCallInfo for return thunk + mov ecx, [ebx + ComPlusCallMethodDesc__m_pComPlusCallInfo] + + STUB_EPILOG_RETURN + + ; Tailcall return thunk + jmp [ecx + ComPlusCallInfo__m_pRetThunk] + + ; This will never be executed. It is just to help out stack-walking logic + ; which disassembles the epilog to unwind the stack. 
+ ret + +_GenericComPlusCallStub@0 endp +endif ; FEATURE_COMINTEROP + +ifdef FEATURE_REMOTING +_TransparentProxyStub@0 proc public + ; push slot passed in eax + push eax + + ; Move into eax the stub data and call the stub + mov eax, [ecx + TransparentProxyObject___stubData] + call [ecx + TransparentProxyObject___stub] +ifdef _DEBUG + nop ; Mark this as a special call site that can directly + ; call managed code +endif + test eax, eax + jnz CtxMismatch2 + + mov eax, [ecx + TransparentProxyObject___pMT] + + push ebx ; spill EBX + + ; Convert the slot number into the code address + ; See MethodTable.h for details on vtable layout + + mov ebx, [esp + 4] ; Reload the slot + shr ebx, ASM__VTABLE_SLOTS_PER_CHUNK_LOG2 ; indirectionSlotNumber + + mov eax,[eax + ebx*4 + SIZEOF_MethodTable] + + mov ebx, [esp + 4] ; use unchanged slot from above + and ebx, ASM__VTABLE_SLOTS_PER_CHUNK-1 ; offsetInChunk + mov eax, [eax + ebx*4] + + ; At this point, eax contains the code address + + ; Restore EBX + pop ebx + + ; Remove the slot number from the stack + lea esp, [esp+4] + + jmp eax + + ; CONTEXT MISMATCH CASE, call out to the real proxy to dispatch + +CtxMismatch2: + pop eax ; restore MethodDesc * + jmp _TransparentProxyStub_CrossContext@0 ; jump to slow TP stub + +_TransparentProxyStub@0 endp + +_TransparentProxyStub_CrossContext@0 proc public + + STUB_PROLOG + + ; pTransitionBlock + mov esi, esp + + ; return value + sub esp, 3*4 ; 64-bit return value + cb stack pop + + push eax ; pMD + push esi ; pTransitionBlock + call _TransparentProxyStubWorker@8 + + pop ebx ; cbStackPop + + push eax + call _setFPReturn@12 ; pop & set the return value + + ; From here on, mustn't trash eax:edx + mov ecx, ebx ; cbStackPop + + mov ebx, [esp+6*4] ; get retaddr + mov [esp+6*4+ecx], ebx ; put it where it belongs + + STUB_EPILOG_RETURN + + add esp, ecx ; pop all the args + ret + +_TransparentProxyStub_CrossContext@0 endp + +; This method does nothing. It's just a fixed function for the debugger to put a breakpoint +; on so that it can trace a call target. +_TransparentProxyStubPatch@0 proc public + ; make sure that the basic block is unique + test eax,12 +_TransparentProxyStubPatchLabel@0: +public _TransparentProxyStubPatchLabel@0 + ret +_TransparentProxyStubPatch@0 endp + +endif ; FEATURE_REMOTING + +ifdef FEATURE_COMINTEROP +;-------------------------------------------------------------------------- +; This is the code that all com call method stubs run initially. +; Most of the real work occurs in ComStubWorker(), a C++ routine. +; The template only does the part that absolutely has to be in assembly +; language. +;-------------------------------------------------------------------------- +_ComCallPreStub@0 proc public + pop eax ;ComCallMethodDesc* + + ; push ebp-frame + push ebp + mov ebp,esp + + ; save CalleeSavedRegisters + push ebx + push esi + push edi + + push eax ; ComCallMethodDesc* + sub esp, 5*4 ; next, vtable, gscookie, 64-bit error return + + lea edi, [esp] + lea esi, [esp+3*4] + + push edi ; pErrorReturn + push esi ; pFrame + call _ComPreStubWorker@8 + + ; eax now contains replacement stub. ComStubWorker will return NULL if stub creation fails + cmp eax, 0 + je nostub ; oops we could not create a stub + + add esp, 6*4 + + ; pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp + + jmp eax ; Reexecute with replacement stub. + ; We will never get here. 
This "ret" is just so that code-disassembling + ; profilers know to stop disassembling any further + ret + +nostub: + + ; Even though the ComPreStubWorker sets a 64 bit value as the error return code. + ; Only the lower 32 bits contain usefula data. The reason for this is that the + ; possible error return types are: failure HRESULT, 0 and floating point 0. + ; In each case, the data fits in 32 bits. Instead, we use the upper half of + ; the return value to store number of bytes to pop + mov eax, [edi] + mov edx, [edi+4] + + add esp, 6*4 + + ; pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp + + pop ecx ; return address + add esp, edx ; pop bytes of the stack + push ecx ; return address + + ; We need to deal with the case where the method is PreserveSig=true and has an 8 + ; byte return type. There are 2 types of 8 byte return types: integer and floating point. + ; For integer 8 byte return types, we always return 0 in case of failure. For floating + ; point return types, we return the value in the floating point register. In both cases + ; edx should be 0. + xor edx, edx ; edx <-- 0 + + ret + +_ComCallPreStub@0 endp +endif ; FEATURE_COMINTEROP + +ifdef FEATURE_READYTORUN +;========================================================================== +; Define helpers for delay loading of readytorun helpers + +DYNAMICHELPER macro frameFlags, suffix + +_DelayLoad_Helper&suffix&@0 proc public + + STUB_PROLOG_2_HIDDEN_ARGS + + mov esi, esp + + push frameFlags + push ecx ; module + push edx ; section index + + push eax ; indirection cell address. + push esi ; pTransitionBlock + + call _DynamicHelperWorker@20 + test eax,eax + jnz @F + + mov eax, [esi] ; The result is stored in the argument area of the transition block + STUB_EPILOG_RETURN + ret + +@@: + STUB_EPILOG + jmp eax + +_DelayLoad_Helper&suffix&@0 endp + + endm + +DYNAMICHELPER DynamicHelperFrameFlags_Default +DYNAMICHELPER DynamicHelperFrameFlags_ObjectArg, _Obj +DYNAMICHELPER , _ObjObj + +endif ; FEATURE_READYTORUN + + end diff --git a/src/vm/i386/cgencpu.h b/src/vm/i386/cgencpu.h new file mode 100644 index 0000000000..2da98821bc --- /dev/null +++ b/src/vm/i386/cgencpu.h @@ -0,0 +1,573 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// CGENX86.H - +// +// Various helper routines for generating x86 assembly code. +// +// DO NOT INCLUDE THIS FILE DIRECTLY - ALWAYS USE CGENSYS.H INSTEAD +// + + + +#ifndef _TARGET_X86_ +#error Should only include "cgenx86.h" for X86 builds +#endif // _TARGET_X86_ + +#ifndef __cgenx86_h__ +#define __cgenx86_h__ + +#include "utilcode.h" + +// Given a return address retrieved during stackwalk, +// this is the offset by which it should be decremented to lend somewhere in a call instruction. 
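A return address produced by a call points at the first byte after the call instruction, so subtracting a single byte is always enough to land back inside the call itself, which is what range-based lookups during a stack walk want. A sketch of how a stack walker applies the constant defined just below (the helper name is illustrative, not an actual VM function):

// Sketch only: map a return address back into the calling instruction so that
// address-range lookups attribute the frame to the call site rather than to
// whatever instruction happens to follow it.
inline TADDR AdjustControlPCForStackwalk(TADDR returnAddress)
{
    return returnAddress - STACKWALK_CONTROLPC_ADJUST_OFFSET;   // returnAddress - 1 on x86
}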
+#define STACKWALK_CONTROLPC_ADJUST_OFFSET 1 + +// preferred alignment for data +#define DATA_ALIGNMENT 4 + +class MethodDesc; +class FramedMethodFrame; +class Module; +class ComCallMethodDesc; +class BaseDomain; + +// CPU-dependent functions +Stub * GenerateInitPInvokeFrameHelper(); + +#ifdef MDA_SUPPORTED +EXTERN_C void STDCALL PInvokeStackImbalanceHelper(void); +#endif // MDA_SUPPORTED + +#ifndef FEATURE_CORECLR +EXTERN_C void STDCALL CopyCtorCallStub(void); +#endif // !FEATURE_CORECLR + +BOOL Runtime_Test_For_SSE2(); + +#ifdef CROSSGEN_COMPILE +#define GetEEFuncEntryPoint(pfn) 0x1001 +#else +#define GetEEFuncEntryPoint(pfn) GFN_TADDR(pfn) +#endif + +//********************************************************************** +// To be used with GetSpecificCpuInfo() + +#define CPU_X86_FAMILY(cpuType) (((cpuType) & 0x0F00) >> 8) +#define CPU_X86_MODEL(cpuType) (((cpuType) & 0x00F0) >> 4) +// Stepping is masked out by GetSpecificCpuInfo() +// #define CPU_X86_STEPPING(cpuType) (((cpuType) & 0x000F) ) + +#define CPU_X86_USE_CMOV(cpuFeat) ((cpuFeat & 0x00008001) == 0x00008001) +#define CPU_X86_USE_SSE2(cpuFeat) (((cpuFeat & 0x04000000) == 0x04000000) && Runtime_Test_For_SSE2()) + +// Values for CPU_X86_FAMILY(cpuType) +#define CPU_X86_486 4 +#define CPU_X86_PENTIUM 5 +#define CPU_X86_PENTIUM_PRO 6 +#define CPU_X86_PENTIUM_4 0xF + +// Values for CPU_X86_MODEL(cpuType) for CPU_X86_PENTIUM_PRO +#define CPU_X86_MODEL_PENTIUM_PRO_BANIAS 9 // Pentium M (Mobile PPro with P4 feautres) + +#define COMMETHOD_PREPAD 8 // # extra bytes to allocate in addition to sizeof(ComCallMethodDesc) +#ifdef FEATURE_COMINTEROP +#define COMMETHOD_CALL_PRESTUB_SIZE 5 // x86: CALL(E8) xx xx xx xx +#define COMMETHOD_CALL_PRESTUB_ADDRESS_OFFSET 1 // the offset of the call target address inside the prestub +#endif // FEATURE_COMINTEROP + +#define STACK_ALIGN_SIZE 4 + +#define JUMP_ALLOCATE_SIZE 8 // # bytes to allocate for a jump instruction +#define BACK_TO_BACK_JUMP_ALLOCATE_SIZE 8 // # bytes to allocate for a back to back jump instruction + +#define HAS_COMPACT_ENTRYPOINTS 1 + +// Needed for PInvoke inlining in ngened images +#define HAS_NDIRECT_IMPORT_PRECODE 1 + +#ifdef FEATURE_REMOTING +#define HAS_REMOTING_PRECODE 1 +#endif +#ifdef FEATURE_PREJIT +#define HAS_FIXUP_PRECODE 1 +#define HAS_FIXUP_PRECODE_CHUNKS 1 +#endif + +// ThisPtrRetBufPrecode one is necessary for closed delegates over static methods with return buffer +#define HAS_THISPTR_RETBUF_PRECODE 1 + +#define CODE_SIZE_ALIGN 4 +#define CACHE_LINE_SIZE 32 // As per Intel Optimization Manual the cache line size is 32 bytes +#define LOG2SLOT LOG2_PTRSIZE + +#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 +#define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 4 +#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter + +// Max size of patched TLS helpers +#ifdef _DEBUG +// Debug build needs extra space for last error trashing +#define TLS_GETTER_MAX_SIZE 0x20 +#else +#define TLS_GETTER_MAX_SIZE 0x10 +#endif + +//======================================================================= +// IMPORTANT: This value is used to figure out how much to allocate +// for a fixed array of FieldMarshaler's. That means it must be at least +// as large as the largest FieldMarshaler subclass. This requirement +// is guarded by an assert. 
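The guard takes roughly the following shape; the subclass name is only a stand-in, and the real checks sit next to the FieldMarshaler definitions rather than in this header:

// Illustrative only; "SomeFieldMarshalerSubclass" stands in for each concrete subclass.
static_assert(sizeof(SomeFieldMarshalerSubclass) <= MAXFIELDMARSHALERSIZE,
              "enlarge MAXFIELDMARSHALERSIZE if a FieldMarshaler subclass grows");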
+//======================================================================= +#define MAXFIELDMARSHALERSIZE 24 + +//********************************************************************** +// Parameter size +//********************************************************************** + +typedef INT32 StackElemType; +#define STACK_ELEM_SIZE sizeof(StackElemType) + + + +#include "stublinkerx86.h" + + + +// !! This expression assumes STACK_ELEM_SIZE is a power of 2. +#define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1))) + + +//********************************************************************** +// Frames +//********************************************************************** +//-------------------------------------------------------------------- +// This represents some of the FramedMethodFrame fields that are +// stored at negative offsets. +//-------------------------------------------------------------------- +typedef DPTR(struct CalleeSavedRegisters) PTR_CalleeSavedRegisters; +struct CalleeSavedRegisters { + INT32 edi; + INT32 esi; + INT32 ebx; + INT32 ebp; +}; + +//-------------------------------------------------------------------- +// This represents the arguments that are stored in volatile registers. +// This should not overlap the CalleeSavedRegisters since those are already +// saved separately and it would be wasteful to save the same register twice. +// If we do use a non-volatile register as an argument, then the ArgIterator +// will probably have to communicate this back to the PromoteCallerStack +// routine to avoid a double promotion. +//-------------------------------------------------------------------- +#define ENUM_ARGUMENT_REGISTERS() \ + ARGUMENT_REGISTER(ECX) \ + ARGUMENT_REGISTER(EDX) + +#define ENUM_ARGUMENT_REGISTERS_BACKWARD() \ + ARGUMENT_REGISTER(EDX) \ + ARGUMENT_REGISTER(ECX) + +typedef DPTR(struct ArgumentRegisters) PTR_ArgumentRegisters; +struct ArgumentRegisters { + #define ARGUMENT_REGISTER(regname) INT32 regname; + ENUM_ARGUMENT_REGISTERS_BACKWARD() + #undef ARGUMENT_REGISTER +}; +#define NUM_ARGUMENT_REGISTERS 2 + +#define SCRATCH_REGISTER_X86REG kEAX + +#define THIS_REG ECX +#define THIS_kREG kECX + +#define ARGUMENT_REG1 ECX +#define ARGUMENT_REG2 EDX + +// forward decl +struct REGDISPLAY; +typedef REGDISPLAY *PREGDISPLAY; + +// Sufficient context for Try/Catch restoration. 
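Concretely, the structure that follows carries all eight general-purpose registers plus EIP, which is enough to resume execution in a catch or filter handler: Setup captures the state from a REGDISPLAY at the resume point, and SetArg stores a value (such as the exception object) in the EAX slot for the handler to pick up. A usage sketch with an illustrative function and variable names; the real call sites are in the x86 exception handling code:

// Sketch of how an EHContext is typically prepared before resuming into a handler.
inline void PrepareResumeContextSketch(EHContext& ctx, PCODE resumePC,
                                       PREGDISPLAY pRegDisplay, LPVOID pThrowable)
{
    ctx.Init();                        // zero all register slots
    ctx.Setup(resumePC, pRegDisplay);  // capture ESP/EBX/ESI/EDI/EBP, set EIP = resumePC
    ctx.SetArg(pThrowable);            // the handler finds this value in EAX
}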
+struct EHContext { + INT32 Eax; + INT32 Ebx; + INT32 Ecx; + INT32 Edx; + INT32 Esi; + INT32 Edi; + INT32 Ebp; + INT32 Esp; + INT32 Eip; + + void Setup(PCODE resumePC, PREGDISPLAY regs); + void UpdateFrame(PREGDISPLAY regs); + + inline TADDR GetSP() { + LIMITED_METHOD_CONTRACT; + return (TADDR)Esp; + } + inline void SetSP(LPVOID esp) { + LIMITED_METHOD_CONTRACT; + Esp = (INT32)(size_t)esp; + } + + inline LPVOID GetFP() { + LIMITED_METHOD_CONTRACT; + return (LPVOID)(UINT_PTR)Ebp; + } + + inline void SetArg(LPVOID arg) { + LIMITED_METHOD_CONTRACT; + Eax = (INT32)(size_t)arg; + } + + inline void Init() + { + LIMITED_METHOD_CONTRACT; + Eax = 0; + Ebx = 0; + Ecx = 0; + Edx = 0; + Esi = 0; + Edi = 0; + Ebp = 0; + Esp = 0; + Eip = 0; + } +}; + +#define ARGUMENTREGISTERS_SIZE sizeof(ArgumentRegisters) + +//********************************************************************** +// Exception handling +//********************************************************************** + +inline PCODE GetIP(const CONTEXT * context) { + LIMITED_METHOD_DAC_CONTRACT; + + return PCODE(context->Eip); +} + +inline void SetIP(CONTEXT *context, PCODE eip) { + LIMITED_METHOD_DAC_CONTRACT; + + context->Eip = (DWORD)eip; +} + +inline TADDR GetSP(const CONTEXT * context) { + LIMITED_METHOD_DAC_CONTRACT; + + return (TADDR)(context->Esp); +} + +EXTERN_C LPVOID STDCALL GetCurrentSP(); + +inline void SetSP(CONTEXT *context, TADDR esp) { + LIMITED_METHOD_DAC_CONTRACT; + + context->Esp = (DWORD)esp; +} + +inline void SetFP(CONTEXT *context, TADDR ebp) { + LIMITED_METHOD_DAC_CONTRACT; + + context->Ebp = (INT32)ebp; +} + +inline TADDR GetFP(const CONTEXT * context) +{ + LIMITED_METHOD_DAC_CONTRACT; + + return (TADDR)context->Ebp; +} + +// Get Rel32 destination, emit jumpStub if necessary +inline INT32 rel32UsingJumpStub(INT32 UNALIGNED * pRel32, PCODE target, MethodDesc *pMethod = NULL, LoaderAllocator *pLoaderAllocator = NULL) +{ + // We do not need jump stubs on i386 + LIMITED_METHOD_CONTRACT; + + TADDR baseAddr = (TADDR)pRel32 + 4; + return (INT32)(target - baseAddr); +} + +#ifdef FEATURE_COMINTEROP +inline void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target) +{ + WRAPPER_NO_CONTRACT; + + BYTE *pBuffer = (BYTE*)pCOMMethod - COMMETHOD_CALL_PRESTUB_SIZE; + + pBuffer[0] = X86_INSTR_CALL_REL32; //CALLNEAR32 + *((LPVOID*)(1+pBuffer)) = (LPVOID) (((LPBYTE)target) - (pBuffer+5)); + + _ASSERTE(IS_ALIGNED(pBuffer + COMMETHOD_CALL_PRESTUB_ADDRESS_OFFSET, sizeof(void*)) && + *((SSIZE_T*)(pBuffer + COMMETHOD_CALL_PRESTUB_ADDRESS_OFFSET)) == ((LPBYTE)target - (LPBYTE)pCOMMethod)); +} +#endif // FEATURE_COMINTEROP + +//------------------------------------------------------------------------ +WORD GetUnpatchedCodeData(LPCBYTE pAddr); + +//------------------------------------------------------------------------ +inline WORD GetUnpatchedOpcodeWORD(LPCBYTE pAddr) +{ + WRAPPER_NO_CONTRACT; + if (CORDebuggerAttached()) + { + return GetUnpatchedCodeData(pAddr); + } + else + { + return *((WORD *)pAddr); + } +} + +//------------------------------------------------------------------------ +inline BYTE GetUnpatchedOpcodeBYTE(LPCBYTE pAddr) +{ + WRAPPER_NO_CONTRACT; + if (CORDebuggerAttached()) + { + return (BYTE) GetUnpatchedCodeData(pAddr); + } + else + { + return *pAddr; + } +} + + //------------------------------------------------------------------------ +// The following must be a distinguishable set of instruction sequences for +// various stub dispatch calls. 
+// +// An x86 JIT which uses full stub dispatch must generate only +// the following stub dispatch calls: +// +// (1) isCallRelativeIndirect: +// call dword ptr [rel32] ; FF 15 ---rel32---- +// (2) isCallRelative: +// call abc ; E8 ---rel32---- +// (3) isCallRegisterIndirect: +// 3-byte nop ; +// call dword ptr [eax] ; FF 10 +// +// NOTE: You must be sure that pRetAddr is a true return address for +// a stub dispatch call. + +BOOL isCallRelativeIndirect(const BYTE *pRetAddr); +BOOL isCallRelative(const BYTE *pRetAddr); +BOOL isCallRegisterIndirect(const BYTE *pRetAddr); + +inline BOOL isCallRelativeIndirect(const BYTE *pRetAddr) +{ + LIMITED_METHOD_CONTRACT; + + BOOL fRet = (GetUnpatchedOpcodeWORD(&pRetAddr[-6]) == X86_INSTR_CALL_IND); + _ASSERTE(!fRet || !isCallRelative(pRetAddr)); + _ASSERTE(!fRet || !isCallRegisterIndirect(pRetAddr)); + return fRet; +} + +inline BOOL isCallRelative(const BYTE *pRetAddr) +{ + LIMITED_METHOD_CONTRACT; + + BOOL fRet = (GetUnpatchedOpcodeBYTE(&pRetAddr[-5]) == X86_INSTR_CALL_REL32); + _ASSERTE(!fRet || !isCallRelativeIndirect(pRetAddr)); + _ASSERTE(!fRet || !isCallRegisterIndirect(pRetAddr)); + return fRet; +} + +inline BOOL isCallRegisterIndirect(const BYTE *pRetAddr) +{ + LIMITED_METHOD_CONTRACT; + + BOOL fRet = (GetUnpatchedOpcodeWORD(&pRetAddr[-5]) == X86_INSTR_NOP3_1) + && (GetUnpatchedOpcodeBYTE(&pRetAddr[-3]) == X86_INSTR_NOP3_3) + && (GetUnpatchedOpcodeWORD(&pRetAddr[-2]) == X86_INSTR_CALL_IND_EAX); + _ASSERTE(!fRet || !isCallRelative(pRetAddr)); + _ASSERTE(!fRet || !isCallRelativeIndirect(pRetAddr)); + return fRet; +} + +//------------------------------------------------------------------------ +inline void emitJump(LPBYTE pBuffer, LPVOID target) +{ + LIMITED_METHOD_CONTRACT; + + pBuffer[0] = X86_INSTR_JMP_REL32; //JUMPNEAR32 + *((LPVOID*)(1+pBuffer)) = (LPVOID) (((LPBYTE)target) - (pBuffer+5)); +} + +//------------------------------------------------------------------------ +inline void emitJumpInd(LPBYTE pBuffer, LPVOID target) +{ + LIMITED_METHOD_CONTRACT; + + *((WORD*)pBuffer) = X86_INSTR_JMP_IND; // 0x25FF jmp dword ptr[addr32] + *((LPVOID*)(2+pBuffer)) = target; +} + +//------------------------------------------------------------------------ +inline PCODE isJump(PCODE pCode) +{ + LIMITED_METHOD_DAC_CONTRACT; + return *PTR_BYTE(pCode) == X86_INSTR_JMP_REL32; +} + +//------------------------------------------------------------------------ +// Given the same pBuffer that was used by emitJump this method +// decodes the instructions and returns the jump target +inline PCODE decodeJump(PCODE pCode) +{ + LIMITED_METHOD_DAC_CONTRACT; + CONSISTENCY_CHECK(*PTR_BYTE(pCode) == X86_INSTR_JMP_REL32); + return rel32Decode(pCode+1); +} + +// +// On IA64 back to back jumps should be separated by a nop bundle to get +// the best performance from the hardware's branch prediction logic. 
+// For all other platforms back to back jumps don't require anything special +// That is why we have these two wrapper functions that call emitJump and decodeJump +// + +//------------------------------------------------------------------------ +inline void emitBackToBackJump(LPBYTE pBuffer, LPVOID target) +{ + WRAPPER_NO_CONTRACT; + emitJump(pBuffer, target); +} + +//------------------------------------------------------------------------ +inline PCODE isBackToBackJump(PCODE pBuffer) +{ + WRAPPER_NO_CONTRACT; + SUPPORTS_DAC; + return isJump(pBuffer); +} + +//------------------------------------------------------------------------ +inline PCODE decodeBackToBackJump(PCODE pBuffer) +{ + WRAPPER_NO_CONTRACT; + SUPPORTS_DAC; + return decodeJump(pBuffer); +} + +EXTERN_C void __stdcall setFPReturn(int fpSize, INT64 retVal); +EXTERN_C void __stdcall getFPReturn(int fpSize, INT64 *pretval); + + +// SEH info forward declarations + +inline BOOL IsUnmanagedValueTypeReturnedByRef(UINT sizeofvaluetype) +{ + LIMITED_METHOD_CONTRACT; + + // odd-sized small structures are not + // enregistered e.g. struct { char a,b,c; } + return (sizeofvaluetype > 8) || + (sizeofvaluetype & (sizeofvaluetype - 1)); // check that the size is power of two +} + +#include +DECLSPEC_ALIGN(4) struct UMEntryThunkCode +{ + BYTE m_alignpad[2]; // used to guarantee alignment of backpactched portion + BYTE m_movEAX; //MOV EAX,imm32 + LPVOID m_uet; // pointer to start of this structure + BYTE m_jmp; //JMP NEAR32 + const BYTE * m_execstub; // pointer to destination code // make sure the backpatched portion is dword aligned. + + void Encode(BYTE* pTargetCode, void* pvSecretParam); + + LPCBYTE GetEntryPoint() const + { + LIMITED_METHOD_CONTRACT; + + return (LPCBYTE)&m_movEAX; + } + + static int GetEntryPointOffset() + { + LIMITED_METHOD_CONTRACT; + + return 2; + } +}; +#include + +struct HijackArgs +{ + DWORD FPUState[3]; // 12 bytes for FPU state (10 bytes for FP top-of-stack + 2 bytes padding) + DWORD Edi; + DWORD Esi; + DWORD Ebx; + DWORD Edx; + DWORD Ecx; + union + { + DWORD Eax; + size_t ReturnValue[1]; + }; + DWORD Ebp; + union + { + DWORD Eip; + size_t ReturnAddress; + }; +}; + +// ClrFlushInstructionCache is used when we want to call FlushInstructionCache +// for a specific architecture in the common code, but not for other architectures. +// On IA64 ClrFlushInstructionCache calls the Kernel FlushInstructionCache function +// to flush the instruction cache. +// We call ClrFlushInstructionCache whenever we create or modify code in the heap. +// Currently ClrFlushInstructionCache has no effect on X86 +// + +inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode) +{ + // FlushInstructionCache(GetCurrentProcess(), pCodeAddr, sizeOfCode); + MemoryBarrier(); + return TRUE; +} + +#ifndef FEATURE_IMPLICIT_TLS +// +// JIT HELPER ALIASING FOR PORTABILITY. 
+// +// Create alias for optimized implementations of helpers provided on this platform +// + +#define JIT_MonEnter JIT_MonEnterWorker +#define JIT_MonEnterWorker JIT_MonEnterWorker +#define JIT_MonReliableEnter JIT_MonReliableEnter +#define JIT_MonTryEnter JIT_MonTryEnter +#define JIT_MonExit JIT_MonExitWorker +#define JIT_MonExitWorker JIT_MonExitWorker +#define JIT_MonEnterStatic JIT_MonEnterStatic +#define JIT_MonExitStatic JIT_MonExitStatic + +#endif + +// optimized static helpers generated dynamically at runtime +// #define JIT_GetSharedGCStaticBase +// #define JIT_GetSharedNonGCStaticBase +// #define JIT_GetSharedGCStaticBaseNoCtor +// #define JIT_GetSharedNonGCStaticBaseNoCtor + +#define JIT_ChkCastClass JIT_ChkCastClass +#define JIT_ChkCastClassSpecial JIT_ChkCastClassSpecial +#define JIT_IsInstanceOfClass JIT_IsInstanceOfClass +#define JIT_ChkCastInterface JIT_ChkCastInterface +#define JIT_IsInstanceOfInterface JIT_IsInstanceOfInterface +#define JIT_NewCrossContext JIT_NewCrossContext +#define JIT_Stelem_Ref JIT_Stelem_Ref + +#endif // __cgenx86_h__ diff --git a/src/vm/i386/cgenx86.cpp b/src/vm/i386/cgenx86.cpp new file mode 100644 index 0000000000..ff2f2df5a3 --- /dev/null +++ b/src/vm/i386/cgenx86.cpp @@ -0,0 +1,2257 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// CGENX86.CPP - +// +// Various helper routines for generating x86 assembly code. +// +// + +// Precompiled Header + +#include "common.h" + +#include "field.h" +#include "stublink.h" +#include "cgensys.h" +#include "frames.h" +#include "excep.h" +#include "dllimport.h" +#include "comdelegate.h" +#include "log.h" +#include "security.h" +#include "comdelegate.h" +#include "array.h" +#include "jitinterface.h" +#include "codeman.h" +#ifdef FEATURE_REMOTING +#include "remoting.h" +#endif +#include "dbginterface.h" +#include "eeprofinterfaces.h" +#include "eeconfig.h" +#include "asmconstants.h" +#include "class.h" +#include "virtualcallstub.h" +#include "mdaassistants.h" +#include "jitinterface.h" + +#ifdef FEATURE_COMINTEROP +#include "comtoclrcall.h" +#include "runtimecallablewrapper.h" +#include "comcache.h" +#include "olevariant.h" +#endif // FEATURE_COMINTEROP + +#ifdef FEATURE_PREJIT +#include "compile.h" +#endif + +#include "stublink.inl" + +extern "C" DWORD STDCALL GetSpecificCpuTypeAsm(void); +extern "C" DWORD STDCALL GetSpecificCpuFeaturesAsm(DWORD *pInfo); + +// NOTE on Frame Size C_ASSERT usage in this file +// if the frame size changes then the stubs have to be revisited for correctness +// kindly revist the logic and then update the constants so that the C_ASSERT will again fire +// if someone changes the frame size. 
You are expected to keep this hard coded constant +// up to date so that changes in the frame size trigger errors at compile time if the code is not altered + +void generate_noref_copy (unsigned nbytes, StubLinkerCPU* sl); + +#ifndef DACCESS_COMPILE + +//============================================================================= +// Runtime test to see if the OS has enabled support for the SSE2 instructions +// +// +BOOL Runtime_Test_For_SSE2() +{ +#ifdef FEATURE_CORESYSTEM + return TRUE; +#else + + BOOL result = IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE); + + if (result == FALSE) + return FALSE; + + // ********************************************************************** + // *** *** + // *** IMPORTANT NOTE: *** + // *** *** + // *** All of these RunningOnXXX APIs return true when *** + // *** the OS that you are running on is that OS or later. *** + // *** For example RunningOnWin2003() will return true *** + // *** when you are running on Win2k3, Vista, Win7 or later. *** + // *** *** + // ********************************************************************** + + + // Windows 7 and later should alwys be using SSE2 instructions + // this is true for both for native and Wow64 + // + if (RunningOnWin7()) + return TRUE; + + if (RunningInWow64()) + { + // There is an issue with saving/restoring the SSE2 registers under wow64 + // So we figure out if we are running on an impacted OS and Service Pack level + // See DevDiv Bugs 89587 for the wow64 bug. + // + + _ASSERTE(ExOSInfoAvailable()); // This is always available on Vista and later + + // + // The issue is fixed in Windows Server 2008 or Vista/SP1 + // + // It is not fixed in Vista/RTM, so check for that case + // + if ((ExOSInfoRunningOnServer() == FALSE)) + { + OSVERSIONINFOEX osvi; + + ZeroMemory(&osvi, sizeof(OSVERSIONINFOEX)); + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + osvi.wServicePackMajor = 0; + + DWORDLONG dwlConditionMask = 0; + VER_SET_CONDITION( dwlConditionMask, CLR_VER_SERVICEPACKMAJOR, VER_EQUAL); + + if (VerifyVersionInfo(&osvi, CLR_VER_SERVICEPACKMAJOR, dwlConditionMask)) + result = FALSE; + } + } + + return result; +#endif +} + +//--------------------------------------------------------------- +// Returns the type of CPU (the value of x of x86) +// (Please note, that it returns 6 for P5-II) +//--------------------------------------------------------------- +void GetSpecificCpuInfo(CORINFO_CPU * cpuInfo) +{ + LIMITED_METHOD_CONTRACT; + + static CORINFO_CPU val = { 0, 0, 0 }; + + if (val.dwCPUType) + { + *cpuInfo = val; + return; + } + + CORINFO_CPU tempVal; + tempVal.dwCPUType = GetSpecificCpuTypeAsm(); // written in ASM & doesn't participate in contracts + _ASSERTE(tempVal.dwCPUType); + +#ifdef _DEBUG + { + SO_NOT_MAINLINE_REGION(); + + /* Set Family+Model+Stepping string (eg., x690 for Banias, or xF30 for P4 Prescott) + * instead of Family only + */ + + const DWORD cpuDefault = 0xFFFFFFFF; + static ConfigDWORD cpuFamily; + DWORD configCpuFamily = cpuFamily.val_DontUse_(CLRConfig::INTERNAL_CPUFamily, cpuDefault); + if (configCpuFamily != cpuDefault) + { + assert((configCpuFamily & 0xFFF) == configCpuFamily); + tempVal.dwCPUType = (tempVal.dwCPUType & 0xFFFF0000) | configCpuFamily; + } + } +#endif + + tempVal.dwFeatures = GetSpecificCpuFeaturesAsm(&tempVal.dwExtendedFeatures); // written in ASM & doesn't participate in contracts + +#ifdef _DEBUG + { + SO_NOT_MAINLINE_REGION(); + + /* Set the 32-bit feature mask + */ + + const DWORD cpuFeaturesDefault = 0xFFFFFFFF; + static ConfigDWORD 
cpuFeatures; + DWORD configCpuFeatures = cpuFeatures.val_DontUse_(CLRConfig::INTERNAL_CPUFeatures, cpuFeaturesDefault); + if (configCpuFeatures != cpuFeaturesDefault) + { + tempVal.dwFeatures = configCpuFeatures; + } + } +#endif + + val = *cpuInfo = tempVal; +} + +#endif // #ifndef DACCESS_COMPILE + + +//--------------------------------------------------------------------------------------- +// +// Initialize the EHContext using the resume PC and the REGDISPLAY. The EHContext is currently used in two +// scenarios: to store the register state before calling an EH clause, and to retrieve the ambient SP of a +// particular stack frame. resumePC means different things in the two scenarios. In the former case, it +// is the IP at which we are going to resume execution when we call an EH clause. In the latter case, it +// is just the current IP. +// +// Arguments: +// resumePC - refer to the comment above +// regs - This is the REGDISPLAY obtained from the CrawlFrame used in the stackwalk. It represents the +// stack frame of the method containing the EH clause we are about to call. For getting the +// ambient SP, this is the stack frame we are interested in. +// + +void EHContext::Setup(PCODE resumePC, PREGDISPLAY regs) +{ + LIMITED_METHOD_DAC_CONTRACT; + + // EAX ECX EDX are scratch + this->Esp = regs->Esp; + this->Ebx = *regs->pEbx; + this->Esi = *regs->pEsi; + this->Edi = *regs->pEdi; + this->Ebp = *regs->pEbp; + + this->Eip = (ULONG)(size_t)resumePC; +} + +// +// Update the registers using new context +// +// This is necessary to reflect GC pointer changes during the middle of a unwind inside a +// finally clause, because: +// 1. GC won't see the part of stack inside try (which has thrown an exception) that is already +// unwinded and thus GC won't update GC pointers for this portion of the stack, but rather the +// call stack in finally. +// 2. upon return of finally, the unwind process continues and unwinds stack based on the part +// of stack inside try and won't see the updated values in finally. +// As a result, we need to manually update the context using register values upon return of finally +// +// Note that we only update the registers for finally clause because +// 1. For filter handlers, stack walker is able to see the whole stack (including the try part) +// with the help of ExceptionFilterFrame as filter handlers are called in first pass +// 2. For catch handlers, the current unwinding is already finished +// +void EHContext::UpdateFrame(PREGDISPLAY regs) +{ + LIMITED_METHOD_CONTRACT; + + // EAX ECX EDX are scratch. 
+ // No need to update ESP as unwinder takes care of that for us + + LOG((LF_EH, LL_INFO1000, "Updating saved EBX: *%p= %p\n", regs->pEbx, this->Ebx)); + LOG((LF_EH, LL_INFO1000, "Updating saved ESI: *%p= %p\n", regs->pEsi, this->Esi)); + LOG((LF_EH, LL_INFO1000, "Updating saved EDI: *%p= %p\n", regs->pEdi, this->Edi)); + LOG((LF_EH, LL_INFO1000, "Updating saved EBP: *%p= %p\n", regs->pEbp, this->Ebp)); + + *regs->pEbx = this->Ebx; + *regs->pEsi = this->Esi; + *regs->pEdi = this->Edi; + *regs->pEbp = this->Ebp; +} + +void TransitionFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + ENABLE_FORBID_GC_LOADER_USE_IN_THIS_SCOPE(); + + MethodDesc * pFunc = GetFunction(); + _ASSERTE(pFunc != NULL); + UpdateRegDisplayHelper(pRD, pFunc->CbStackPop()); + + RETURN; +} + +void TransitionFrame::UpdateRegDisplayHelper(const PREGDISPLAY pRD, UINT cbStackPop) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + CalleeSavedRegisters* regs = GetCalleeSavedRegisters(); + + // reset pContext; it's only valid for active (top-most) frame + + pRD->pContext = NULL; + + pRD->pEdi = (DWORD*) ®s->edi; + pRD->pEsi = (DWORD*) ®s->esi; + pRD->pEbx = (DWORD*) ®s->ebx; + pRD->pEbp = (DWORD*) ®s->ebp; + pRD->PCTAddr = GetReturnAddressPtr(); + pRD->ControlPC = *PTR_PCODE(pRD->PCTAddr); + pRD->Esp = (DWORD)(pRD->PCTAddr + sizeof(TADDR) + cbStackPop); + + RETURN; +} + +void HelperMethodFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + PRECONDITION(m_MachState.isValid()); // InsureInit has been called + SUPPORTS_DAC; + } + CONTRACT_END; + + ENABLE_FORBID_GC_LOADER_USE_IN_THIS_SCOPE(); + + // reset pContext; it's only valid for active (top-most) frame + pRD->pContext = NULL; + +#ifdef DACCESS_COMPILE + + // + // In the dac case we may have gotten here + // without the frame being initialized, so + // try and initialize on the fly. + // + + if (!m_MachState.isValid()) + { + MachState unwindState; + + InsureInit(false, &unwindState); + pRD->PCTAddr = dac_cast(unwindState.pRetAddr()); + pRD->ControlPC = unwindState.GetRetAddr(); + pRD->Esp = unwindState._esp; + + // Get some special host instance memory + // so we have a place to point to. + // This host memory has no target address + // and so won't be looked up or used for + // anything else. + MachState* thisState = (MachState*) + DacAllocHostOnlyInstance(sizeof(*thisState), true); + + thisState->_edi = unwindState._edi; + pRD->pEdi = (DWORD *)&thisState->_edi; + thisState->_esi = unwindState._esi; + pRD->pEsi = (DWORD *)&thisState->_esi; + thisState->_ebx = unwindState._ebx; + pRD->pEbx = (DWORD *)&thisState->_ebx; + thisState->_ebp = unwindState._ebp; + pRD->pEbp = (DWORD *)&thisState->_ebp; + + // InsureInit always sets m_RegArgs to zero + // in the real code. I'm not sure exactly + // what should happen in the on-the-fly case, + // but go with what would happen from an InsureInit. + RETURN; + } + +#endif // #ifdef DACCESS_COMPILE + + // DACCESS: The MachState pointers are kept as PTR_TADDR so + // the host pointers here refer to the appropriate size and + // these casts are not a problem. 
+ pRD->pEdi = (DWORD*) m_MachState.pEdi(); + pRD->pEsi = (DWORD*) m_MachState.pEsi(); + pRD->pEbx = (DWORD*) m_MachState.pEbx(); + pRD->pEbp = (DWORD*) m_MachState.pEbp(); + pRD->PCTAddr = dac_cast(m_MachState.pRetAddr()); + pRD->ControlPC = m_MachState.GetRetAddr(); + pRD->Esp = (DWORD) m_MachState.esp(); + + RETURN; +} + +#ifdef _DEBUG_IMPL +// Confirm that if the machine state was not initialized, then +// any unspilled callee saved registers did not change +EXTERN_C MachState* STDCALL HelperMethodFrameConfirmState(HelperMethodFrame* frame, void* esiVal, void* ediVal, void* ebxVal, void* ebpVal) + { + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + DEBUG_ONLY; + } + CONTRACTL_END; + + MachState* state = frame->MachineState(); + + // if we've already executed this check once for this helper method frame then + // we don't do the check again because it is very expensive. + if (frame->HaveDoneConfirmStateCheck()) + { + return state; + } + + // probe to avoid a kazillion violations in the code that follows. + BEGIN_DEBUG_ONLY_CODE; + if (!state->isValid()) + { + frame->InsureInit(false, NULL); + _ASSERTE(state->_pEsi != &state->_esi || state->_esi == (TADDR)esiVal); + _ASSERTE(state->_pEdi != &state->_edi || state->_edi == (TADDR)ediVal); + _ASSERTE(state->_pEbx != &state->_ebx || state->_ebx == (TADDR)ebxVal); + _ASSERTE(state->_pEbp != &state->_ebp || state->_ebp == (TADDR)ebpVal); + } + END_DEBUG_ONLY_CODE; + + // set that we have executed this check once for this helper method frame. + frame->SetHaveDoneConfirmStateCheck(); + + return state; +} +#endif + +void ExternalMethodFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + UpdateRegDisplayHelper(pRD, CbStackPopUsingGCRefMap(GetGCRefMap())); + + RETURN; +} + + +void StubDispatchFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + PTR_BYTE pGCRefMap = GetGCRefMap(); + if (pGCRefMap != NULL) + { + UpdateRegDisplayHelper(pRD, CbStackPopUsingGCRefMap(pGCRefMap)); + } + else + if (GetFunction() != NULL) + { + FramedMethodFrame::UpdateRegDisplay(pRD); + } + else + { + UpdateRegDisplayHelper(pRD, 0); + + // If we do not have owning MethodDesc, we need to pretend that + // the call happened on the call instruction to get the ESP unwound properly. 
+ // + // This path is hit when we are throwing null reference exception from + // code:VSD_ResolveWorker or code:StubDispatchFixupWorker + pRD->ControlPC = GetAdjustedCallAddress(pRD->ControlPC); + } + + RETURN; +} + +PCODE StubDispatchFrame::GetReturnAddress() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + } + CONTRACTL_END; + + PCODE retAddress = FramedMethodFrame::GetReturnAddress(); + if (GetFunction() == NULL && GetGCRefMap() == NULL) + { + // See comment in code:StubDispatchFrame::UpdateRegDisplay + retAddress = GetAdjustedCallAddress(retAddress); + } + return retAddress; +} + +void FaultingExceptionFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + CalleeSavedRegisters* regs = GetCalleeSavedRegisters(); + + // reset pContext; it's only valid for active (top-most) frame + pRD->pContext = NULL; + + pRD->pEdi = (DWORD*) ®s->edi; + pRD->pEsi = (DWORD*) ®s->esi; + pRD->pEbx = (DWORD*) ®s->ebx; + pRD->pEbp = (DWORD*) ®s->ebp; + pRD->PCTAddr = GetReturnAddressPtr(); + pRD->ControlPC = *PTR_PCODE(pRD->PCTAddr); + pRD->Esp = m_Esp; + RETURN; +} + +void InlinedCallFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + // We should skip over InlinedCallFrame if it is not active. + // It will be part of a JITed method's frame, and the stack-walker + // can handle such a case. +#ifdef PROFILING_SUPPORTED + PRECONDITION(CORProfilerStackSnapshotEnabled() || InlinedCallFrame::FrameHasActiveCall(this)); +#endif + HOST_NOCALLS; + MODE_ANY; + SUPPORTS_DAC; + } + CONTRACT_END; + + // @TODO: Remove this after the debugger is fixed to avoid stack-walks from bad places + // @TODO: This may be still needed for sampling profilers + if (!InlinedCallFrame::FrameHasActiveCall(this)) + { + LOG((LF_CORDB, LL_ERROR, "WARNING: InlinedCallFrame::UpdateRegDisplay called on inactive frame %p\n", this)); + return; + } + + DWORD stackArgSize = (DWORD) dac_cast(m_Datum); + + if (stackArgSize & ~0xFFFF) + { + NDirectMethodDesc * pMD = PTR_NDirectMethodDesc(m_Datum); + + /* if this is not an NDirect frame, something is really wrong */ + + _ASSERTE(pMD->SanityCheck() && pMD->IsNDirect()); + + stackArgSize = pMD->GetStackArgumentSize(); + } + + // reset pContext; it's only valid for active (top-most) frame + pRD->pContext = NULL; + + + pRD->pEbp = (DWORD*) &m_pCalleeSavedFP; + + /* The return address is just above the "ESP" */ + pRD->PCTAddr = PTR_HOST_MEMBER_TADDR(InlinedCallFrame, this, + m_pCallerReturnAddress); + pRD->ControlPC = *PTR_PCODE(pRD->PCTAddr); + + /* Now we need to pop off the outgoing arguments */ + pRD->Esp = (DWORD) dac_cast(m_pCallSiteSP) + stackArgSize; + RETURN; +} + +#ifdef FEATURE_HIJACK +//========================== +// Resumable Exception Frame +// +TADDR ResumableFrame::GetReturnAddressPtr() +{ + LIMITED_METHOD_DAC_CONTRACT; + return dac_cast(m_Regs) + offsetof(CONTEXT, Eip); +} + +void ResumableFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + // reset pContext; it's only valid for active (top-most) frame + pRD->pContext = NULL; + + CONTEXT* pUnwoundContext = m_Regs; + +#if !defined(DACCESS_COMPILE) + // "pContextForUnwind" field is only used on X86 since not only is it initialized just for it, + // but its used only under the confines of STACKWALKER_MAY_POP_FRAMES preprocessor define, + // which is defined for x86 only (refer 
to its definition in stackwalk.cpp). + if (pRD->pContextForUnwind != NULL) + { + pUnwoundContext = pRD->pContextForUnwind; + + pUnwoundContext->Eax = m_Regs->Eax; + pUnwoundContext->Ecx = m_Regs->Ecx; + pUnwoundContext->Edx = m_Regs->Edx; + + pUnwoundContext->Edi = m_Regs->Edi; + pUnwoundContext->Esi = m_Regs->Esi; + pUnwoundContext->Ebx = m_Regs->Ebx; + pUnwoundContext->Ebp = m_Regs->Ebp; + pUnwoundContext->Eip = m_Regs->Eip; + } +#endif // !defined(DACCESS_COMPILE) + + pRD->pEax = &pUnwoundContext->Eax; + pRD->pEcx = &pUnwoundContext->Ecx; + pRD->pEdx = &pUnwoundContext->Edx; + + pRD->pEdi = &pUnwoundContext->Edi; + pRD->pEsi = &pUnwoundContext->Esi; + pRD->pEbx = &pUnwoundContext->Ebx; + pRD->pEbp = &pUnwoundContext->Ebp; + + pRD->ControlPC = pUnwoundContext->Eip; + pRD->PCTAddr = dac_cast(m_Regs) + offsetof(CONTEXT, Eip); + + pRD->Esp = m_Regs->Esp; + + RETURN; +} + +// The HijackFrame has to know the registers that are pushed by OnHijackTripThread +void HijackFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACTL_END; + + // This only describes the top-most frame + pRD->pContext = NULL; + + pRD->pEdi = &m_Args->Edi; + pRD->pEsi = &m_Args->Esi; + pRD->pEbx = &m_Args->Ebx; + pRD->pEdx = &m_Args->Edx; + pRD->pEcx = &m_Args->Ecx; + pRD->pEax = &m_Args->Eax; + + pRD->pEbp = &m_Args->Ebp; + pRD->PCTAddr = dac_cast(m_Args) + offsetof(HijackArgs, Eip); + pRD->ControlPC = *PTR_PCODE(pRD->PCTAddr); + pRD->Esp = (DWORD)(pRD->PCTAddr + sizeof(TADDR)); +} + +#endif // FEATURE_HIJACK + +void PInvokeCalliFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + VASigCookie *pVASigCookie = GetVASigCookie(); + UpdateRegDisplayHelper(pRD, pVASigCookie->sizeOfArgs+sizeof(int)); + + RETURN; +} + +void TailCallFrame::UpdateRegDisplay(const PREGDISPLAY pRD) +{ + CONTRACT_VOID + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + HOST_NOCALLS; + SUPPORTS_DAC; + } + CONTRACT_END; + + // reset pContext; it's only valid for active (top-most) frame + pRD->pContext = NULL; + + pRD->pEdi = (DWORD*)&m_regs.edi; + pRD->pEsi = (DWORD*)&m_regs.esi; + pRD->pEbx = (DWORD*)&m_regs.ebx; + pRD->pEbp = (DWORD*)&m_regs.ebp; + + pRD->PCTAddr = GetReturnAddressPtr(); + pRD->ControlPC = *PTR_PCODE(pRD->PCTAddr); + pRD->Esp = (DWORD)(pRD->PCTAddr + sizeof(TADDR)); + + RETURN; +} + +//------------------------------------------------------------------------ +// This is declared as returning WORD instead of PRD_TYPE because of +// header issues with cgencpu.h including dbginterface.h. +WORD GetUnpatchedCodeData(LPCBYTE pAddr) +{ +#ifndef _TARGET_X86_ +#error Make sure this works before porting to platforms other than x86. +#endif + CONTRACT(WORD) { + NOTHROW; + GC_NOTRIGGER; + PRECONDITION(CORDebuggerAttached()); + PRECONDITION(CheckPointer(pAddr)); + SO_TOLERANT; + } CONTRACT_END; + + // Ordering is because x86 is little-endien. + BYTE bLow = pAddr[0]; + BYTE bHigh = pAddr[1]; + +#ifndef DACCESS_COMPILE + // Need to make sure that the code we're reading is free of breakpoint patches. + PRD_TYPE unpatchedOpcode; + if (g_pDebugInterface->CheckGetPatchedOpcode((CORDB_ADDRESS_TYPE *)pAddr, + &unpatchedOpcode)) + { + // PRD_TYPE is supposed to be an opaque debugger structure representing data to remove a patch. + // Although PRD_TYPE is currently typedef'ed to be a DWORD_PTR, it's actually semantically just a BYTE. 
+ // (since a patch on x86 is just an 0xCC instruction).
+ // Ideally, the debugger subsystem would expose a patch-code stripper that returns BYTE/WORD/etc, and
+ // not force us to crack it ourselves here.
+ bLow = (BYTE) unpatchedOpcode;
+ }
+ //
+#endif
+
+ WORD w = bLow + (bHigh << 8);
+ RETURN w;
+}
+
+
+#ifndef DACCESS_COMPILE
+
+//-------------------------------------------------------------------------
+// One-time creation of special prestub to initialize UMEntryThunks.
+//-------------------------------------------------------------------------
+Stub *GenerateUMThunkPrestub()
+{
+ CONTRACT(Stub*)
+ {
+ STANDARD_VM_CHECK;
+ POSTCONDITION(CheckPointer(RETVAL));
+ }
+ CONTRACT_END;
+
+ CPUSTUBLINKER sl;
+ CPUSTUBLINKER *psl = &sl;
+
+ CodeLabel* rgRareLabels[] = { psl->NewCodeLabel(),
+ psl->NewCodeLabel(),
+ psl->NewCodeLabel()
+ };
+
+
+ CodeLabel* rgRejoinLabels[] = { psl->NewCodeLabel(),
+ psl->NewCodeLabel(),
+ psl->NewCodeLabel()
+ };
+
+ // emit the initial prolog
+ psl->EmitComMethodStubProlog(UMThkCallFrame::GetMethodFrameVPtr(), rgRareLabels, rgRejoinLabels, FALSE /*Don't profile*/);
+
+ // mov ecx, [esi+UMThkCallFrame.pUMEntryThunk]
+ psl->X86EmitIndexRegLoad(kECX, kESI, UMThkCallFrame::GetOffsetOfUMEntryThunk());
+
+ // The call conv is __stdcall
+ psl->X86EmitPushReg(kECX);
+
+ // call UMEntryThunk::DoRunTimeInit
+ psl->X86EmitCall(psl->NewExternalCodeLabel((LPVOID)UMEntryThunk::DoRunTimeInit), 4);
+
+ // mov eax, [esi+UMThkCallFrame.pUMEntryThunk]
+ psl->X86EmitIndexRegLoad(kEAX, kESI, UMThkCallFrame::GetOffsetOfUMEntryThunk());
+
+ // lea eax, [eax + UMEntryThunk.m_code] // point to fixed-up UMEntryThunk
+ psl->X86EmitOp(0x8d, kEAX, kEAX,
+ UMEntryThunk::GetCodeOffset() + UMEntryThunkCode::GetEntryPointOffset());
+
+ psl->EmitComMethodStubEpilog(UMThkCallFrame::GetMethodFrameVPtr(), rgRareLabels, rgRejoinLabels, FALSE /*Don't profile*/);
+
+ RETURN psl->Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap());
+}
+
+Stub *GenerateInitPInvokeFrameHelper()
+{
+ CONTRACT(Stub*)
+ {
+ STANDARD_VM_CHECK;
+ POSTCONDITION(CheckPointer(RETVAL));
+ }
+ CONTRACT_END;
+
+ CPUSTUBLINKER sl;
+ CPUSTUBLINKER *psl = &sl;
+
+ CORINFO_EE_INFO::InlinedCallFrameInfo FrameInfo;
+ InlinedCallFrame::GetEEInfo(&FrameInfo);
+
+ // EDI contains address of the frame on stack (the frame ptr, not its negspace)
+ unsigned negSpace = FrameInfo.offsetOfFrameVptr;
+
+ // mov esi, GetThread()
+ psl->X86EmitCurrentThreadFetch(kESI, (1 << kEDI) | (1 << kEBX));
+
+ // mov [edi + FrameInfo.offsetOfGSCookie], GetProcessGSCookie()
+ psl->X86EmitOffsetModRM(0xc7, (X86Reg)0x0, kEDI, FrameInfo.offsetOfGSCookie - negSpace);
+ psl->Emit32(GetProcessGSCookie());
+
+ // mov [edi + FrameInfo.offsetOfFrameVptr], InlinedCallFrame::GetFrameVtable()
+ psl->X86EmitOffsetModRM(0xc7, (X86Reg)0x0, kEDI, FrameInfo.offsetOfFrameVptr - negSpace);
+ psl->Emit32(InlinedCallFrame::GetMethodFrameVPtr());
+
+ // mov eax, [esi + offsetof(Thread, m_pFrame)]
+ // mov [edi + FrameInfo.offsetOfFrameLink], eax
+ psl->X86EmitIndexRegLoad(kEAX, kESI, offsetof(Thread, m_pFrame));
+ psl->X86EmitIndexRegStore(kEDI, FrameInfo.offsetOfFrameLink - negSpace, kEAX);
+
+ // mov [edi + FrameInfo.offsetOfCalleeSavedEbp], ebp
+ psl->X86EmitIndexRegStore(kEDI, FrameInfo.offsetOfCalleeSavedFP - negSpace, kEBP);
+
+ // mov [edi + FrameInfo.offsetOfReturnAddress], 0
+ psl->X86EmitOffsetModRM(0xc7, (X86Reg)0x0, kEDI, FrameInfo.offsetOfReturnAddress - negSpace);
+ psl->Emit32(0);
+
+ // mov [esi + offsetof(Thread, m_pFrame)], edi
+ psl->X86EmitIndexRegStore(kESI, offsetof(Thread, m_pFrame), kEDI);
+
+ // leave current Thread in ESI
+ 
psl->X86EmitReturn(0); + + // A single process-wide stub that will never unload + RETURN psl->Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); +} + +#ifdef FEATURE_INCLUDE_ALL_INTERFACES + +static void STDCALL LeaveRuntimeHelperWithFrame (Thread *pThread, size_t target, Frame *pFrame) +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + MODE_PREEMPTIVE; + ENTRY_POINT; + } + CONTRACTL_END; + + Thread::LeaveRuntimeThrowComplus(target); + GCX_COOP_THREAD_EXISTS(pThread); + pFrame->Push(pThread); + +} + +static void STDCALL EnterRuntimeHelperWithFrame (Thread *pThread, Frame *pFrame) +{ + // make sure we restore the original Win32 last error before leaving this function - we are + // called right after returning from the P/Invoke target and the error has not been saved yet + BEGIN_PRESERVE_LAST_ERROR; + + CONTRACTL + { + THROWS; + GC_TRIGGERS; + MODE_PREEMPTIVE; + ENTRY_POINT; + } + CONTRACTL_END; + + { + HRESULT hr = Thread::EnterRuntimeNoThrow(); + GCX_COOP_THREAD_EXISTS(pThread); + if (FAILED(hr)) + { + INSTALL_UNWIND_AND_CONTINUE_HANDLER; + ThrowHR (hr); + UNINSTALL_UNWIND_AND_CONTINUE_HANDLER; + } + + pFrame->Pop(pThread); + } + + END_PRESERVE_LAST_ERROR; +} + +// "ip" is the return address +// This function disassembles the code at the return address to determine +// how many arguments to pop off. +// Returns the number of DWORDs that should be popped off on return. + +static int STDCALL GetStackSizeForVarArgCall(BYTE* ip) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + } + CONTRACTL_END; + + int retValue = 0; + //BEGIN_ENTRYPOINT_VOIDRET; + + // The instruction immediately following the call may be a move into esp used for + // P/Invoke stack resilience. For caller-pop calls it's always mov esp, [ebp-n]. + if (ip[0] == 0x8b) + { + if (ip[1] == 0x65) + { + // mov esp, [ebp+disp8] + ip += 3; + } + else if (ip[1] == 0xa5) + { + // mov esp, [ebp+disp32] + ip += 6; + } + } + + if (ip[0] == 0x81 && ip[1] == 0xc4) + { + // add esp, imm32 + retValue = (*(int*)&ip[2])/4; + } + else if (ip[0] == 0x83 && ip[1] == 0xc4) + { + // add esp, imm8 + retValue = ip[2]/4; + } + else if (ip[0] == 0x59) + { + // pop ecx + retValue = 1; + } + else + { + retValue = 0; + } + //END_ENTRYPOINT_VOIDRET; + return retValue; +} + +void LeaveRuntimeStackProbeOnly() +{ + CONTRACTL { + THROWS; + GC_TRIGGERS; + ENTRY_POINT; + } + CONTRACTL_END; + +#ifdef FEATURE_STACK_PROBE + RetailStackProbe(ADJUST_PROBE(DEFAULT_ENTRY_PROBE_AMOUNT)); +#endif +} + +//----------------------------------------------------------------------------- +// Hosting stub for calls from CLR code to unmanaged code +// +// We push a LeaveRuntimeFrame, and then re-push all the arguments. +// Note that we have to support all the different native calling conventions +// viz. 
stdcall, thiscall, cdecl, varargs + +#if 0 + +This is a diagramatic description of what the stub does: + + (lower addresses) + + | | + +----------------+ <--- ESP + | | + | copied | + | arguments | + | | + | | + +----------------+ + | EDX | + | ECX | + +----------------+ +| | | GSCookie | +| | +----------------+ <--- ESI +| | | vptr | +| | +----------------+ +| | | m_Next | +| | +----------------+ +| | | EDI | Scratch register +| | | ESI | For LeaveRuntimeFrame* +| | | EBX | For Thread* +| | +----------------+ <--- EBP +| | | EBP | ++----------------+ <---ESP +----------------+ +| ret addr | | ret addr | ++----------------+ +----------------+ +| | | | +| arguments | | arguments | +| | | | +| | | | ++----------------+ +----------------+ +| | | | +| caller's frame | | caller's frame | +| | | | + + (higher addresses) + + Stack on entry Stack before the call + to this stub. to unmanaged code. + +#endif + +//----------------------------------------------------------------------------- +// This the layout of the frame of the stub + +struct StubForHostStackFrame +{ + LPVOID m_outgingArgs[1]; + ArgumentRegisters m_argumentRegisters; + GSCookie m_gsCookie; + LeaveRuntimeFrame m_LeaveRuntimeFrame; + CalleeSavedRegisters m_calleeSavedRegisters; + LPVOID m_retAddr; + LPVOID m_incomingArgs[1]; + +public: + + // Where does the FP/EBP point to? + static INT32 GetFPpositionOffset() + { + LIMITED_METHOD_CONTRACT; + return offsetof(StubForHostStackFrame, m_calleeSavedRegisters) + + offsetof(CalleeSavedRegisters, ebp); + } + + static INT32 GetFPrelOffsOfArgumentRegisters() + { + LIMITED_METHOD_CONTRACT; + return offsetof(StubForHostStackFrame, m_argumentRegisters) - GetFPpositionOffset(); + } + + static INT32 GetFPrelOffsOfCalleeSavedRegisters() + { + LIMITED_METHOD_CONTRACT; + return offsetof(StubForHostStackFrame, m_calleeSavedRegisters) - GetFPpositionOffset(); + } + + static INT32 GetFPrelOffsOfRetAddr() + { + LIMITED_METHOD_CONTRACT; + return offsetof(StubForHostStackFrame, m_retAddr) - GetFPpositionOffset(); + } + + static INT32 GetFPrelOffsOfIncomingArgs() + { + LIMITED_METHOD_CONTRACT; + return offsetof(StubForHostStackFrame, m_incomingArgs) - GetFPpositionOffset(); + } +}; + +static Stub *GenerateStubForHostWorker(LoaderHeap *pHeap, + LPVOID pNativeTarget, // NULL to fetch from the last pushed argument (COM) + Stub *pInnerStub, // stub to call instead of pNativeTarget, or NULL + LONG dwComSlot, // only valid if pNativeTarget is NULL + WORD wStackArgumentSize, // -1 for varargs + WORD wStackPopSize) // 0 for cdecl +{ + STANDARD_VM_CONTRACT; + + // We need to call LeaveRuntime before the target, and EnterRuntime after the target + CPUSTUBLINKER sl; + + sl.X86EmitPushEBPframe(); + + // save EBX, ESI, EDI + sl.X86EmitPushReg(kEBX); + sl.X86EmitPushReg(kESI); + sl.X86EmitPushReg(kEDI); + + // Frame + sl.X86EmitPushReg(kDummyPushReg); // m_Next + sl.X86EmitPushImm32((UINT)(size_t)LeaveRuntimeFrame::GetMethodFrameVPtr()); + + // mov esi, esp; esi is Frame + sl.X86EmitMovRegSP(kESI); + + sl.X86EmitPushImmPtr((LPVOID)GetProcessGSCookie()); + + // Save outgoing arguments on the stack + sl.X86EmitPushReg(kECX); + sl.X86EmitPushReg(kEDX); + + INT32 offs = 0; + if (wStackArgumentSize == (WORD)-1) + { + // Re-push the return address as an argument to GetStackSizeForVarArgCall() + // This will return the number of stack arguments (in DWORDs) + sl.X86EmitIndexPush(kEBP, StubForHostStackFrame::GetFPrelOffsOfRetAddr()); + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID)GetStackSizeForVarArgCall), 4); + + // We 
generate the following code sequence to re-push all the arguments + // + // Note that we cannot use "sub ESP, EAX" as ESP might jump past the + // stack guard-page. + // + // cmp EAX, 0 + // LoopTop: + // jz LoopDone + // push dword ptr[EBP + EAX*4 + 4] + // sub EAX, 1 + // jmp LoopTop + // LoopDone: + // ... + + sl.X86EmitCmpRegImm32(kEAX, 0); + CodeLabel * pLoopTop = sl.EmitNewCodeLabel(); + CodeLabel * pLoopDone = sl.NewCodeLabel(); + sl.X86EmitCondJump(pLoopDone, X86CondCode::kJZ); + sl.X86EmitBaseIndexPush(kEBP, kEAX, 4, StubForHostStackFrame::GetFPrelOffsOfIncomingArgs() - sizeof(LPVOID)); + sl.X86EmitSubReg(kEAX, 1); + sl.X86EmitNearJump(pLoopTop); + sl.EmitLabel(pLoopDone); + } + else + { + offs = StubForHostStackFrame::GetFPrelOffsOfIncomingArgs() + wStackArgumentSize; + + int numStackSlots = wStackArgumentSize / sizeof(LPVOID); + for (int i = 0; i < numStackSlots; i++) { + offs -= sizeof(LPVOID); + sl.X86EmitIndexPush(kEBP, offs); + } + } + + //------------------------------------------------------------------------- + + // EBX has Thread* + // X86TLSFetch_TRASHABLE_REGS will get trashed + sl.X86EmitCurrentThreadFetch(kEBX, 0); + + if (pNativeTarget != NULL) + { + // push Frame + sl.X86EmitPushReg(kESI); + + // push target + if (pNativeTarget == (LPVOID)-1) + { + // target comes right above arguments + sl.X86EmitIndexPush(kEBP, StubForHostStackFrame::GetFPrelOffsOfIncomingArgs() + wStackArgumentSize); + } + else + { + // target is fixed + sl.X86EmitPushImm32((UINT)(size_t)pNativeTarget); + } + } + else + { + // mov eax, [first_arg] + // mov eax, [eax] + // push [eax + slot_offset] + sl.X86EmitIndexRegLoad(kEAX, kEBP, offs); + sl.X86EmitIndexRegLoad(kEAX, kEAX, 0); + sl.X86EmitIndexPush(kEAX, sizeof(LPVOID) * dwComSlot); + + // push Frame + sl.X86EmitPushReg(kESI); + // push [esp + 4] + sl.X86EmitEspOffset(0xff, (X86Reg)6, 4); + } + + // push Thread + sl.X86EmitPushReg(kEBX); + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID)LeaveRuntimeHelperWithFrame), 0xc); + + //------------------------------------------------------------------------- + // call NDirect + // See diagram above to see what the stack looks like at this point + + // Restore outgoing arguments + unsigned offsToArgRegs = StubForHostStackFrame::GetFPrelOffsOfArgumentRegisters(); + sl.X86EmitIndexRegLoad(kECX, kEBP, offsToArgRegs + offsetof(ArgumentRegisters, ECX)); + sl.X86EmitIndexRegLoad(kEDX, kEBP, offsToArgRegs + offsetof(ArgumentRegisters, EDX)); + + if (pNativeTarget != NULL || pInnerStub != NULL) + { + if (pNativeTarget == (LPVOID)-1) + { + // mov eax, target + sl.X86EmitIndexRegLoad(kEAX, kEBP, StubForHostStackFrame::GetFPrelOffsOfIncomingArgs() + wStackArgumentSize); + // call eax + sl.Emit16(X86_INSTR_CALL_EAX); + } + else + { + if (pNativeTarget == NULL) + { + // pop target and discard it (we go to the inner stub) + _ASSERTE(pInnerStub != NULL); + sl.X86EmitPopReg(kEAX); + } + + LPVOID pTarget = (pInnerStub != NULL ? 
(LPVOID)pInnerStub->GetEntryPoint() : pNativeTarget); + sl.X86EmitCall(sl.NewExternalCodeLabel(pTarget), wStackPopSize / 4); + } + } + else + { + // pop target + sl.X86EmitPopReg(kEAX); + // call eax + sl.Emit16(X86_INSTR_CALL_EAX); + } + + //------------------------------------------------------------------------- + // Save return value registers and call EnterRuntimeHelperWithFrame + // + + sl.X86EmitPushReg(kEAX); + sl.X86EmitPushReg(kEDX); + + // push Frame + sl.X86EmitPushReg(kESI); + // push Thread + sl.X86EmitPushReg(kEBX); + // call EnterRuntime + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID)EnterRuntimeHelperWithFrame), 8); + + sl.X86EmitPopReg(kEDX); + sl.X86EmitPopReg(kEAX); + + //------------------------------------------------------------------------- + // Tear down the frame + // + + sl.EmitCheckGSCookie(kESI, LeaveRuntimeFrame::GetOffsetOfGSCookie()); + + // lea esp, [ebp - offsToCalleeSavedRegs] + unsigned offsToCalleeSavedRegs = StubForHostStackFrame::GetFPrelOffsOfCalleeSavedRegisters(); + sl.X86EmitIndexLea((X86Reg)kESP_Unsafe, kEBP, offsToCalleeSavedRegs); + + sl.X86EmitPopReg(kEDI); + sl.X86EmitPopReg(kESI); + sl.X86EmitPopReg(kEBX); + + sl.X86EmitPopReg(kEBP); + + // ret [wStackPopSize] + sl.X86EmitReturn(wStackPopSize); + + if (pInnerStub != NULL) + { + // this stub calls another stub + return sl.LinkInterceptor(pHeap, pInnerStub, pNativeTarget); + } + else + { + return sl.Link(pHeap); + } +} + + +//----------------------------------------------------------------------------- +Stub *NDirectMethodDesc::GenerateStubForHost(LPVOID pNativeTarget, Stub *pInnerStub) +{ + STANDARD_VM_CONTRACT; + + // We need to call LeaveRuntime before the target, and EnterRuntime after the target + + if (IsQCall()) + { + // We need just the stack probe for QCalls + CPUSTUBLINKER sl; + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID)LeaveRuntimeStackProbeOnly), 0); + + sl.X86EmitNearJump(sl.NewExternalCodeLabel((LPVOID)pNativeTarget)); + + return sl.Link(GetLoaderAllocator()->GetStubHeap()); + } + + WORD wArgSize = (IsVarArgs() ? (WORD)-1 : GetStackArgumentSize()); + WORD wPopSize = ((IsStdCall() || IsThisCall()) ? GetStackArgumentSize() : 0); + + return GenerateStubForHostWorker(GetDomain()->GetLoaderAllocator()->GetStubHeap(), + pNativeTarget, + pInnerStub, + 0, + wArgSize, + wPopSize); +} + + +#ifdef FEATURE_COMINTEROP + +//----------------------------------------------------------------------------- +Stub *ComPlusCallInfo::GenerateStubForHost(LoaderHeap *pHeap, Stub *pInnerStub) +{ + STANDARD_VM_CONTRACT; + + WORD wArgSize = GetStackArgumentSize(); + + return GenerateStubForHostWorker(pHeap, + NULL, + pInnerStub, + m_cachedComSlot, + wArgSize, + wArgSize); // always stdcall +} + +#endif // FEATURE_COMINTEROP + +//----------------------------------------------------------------------------- +// static +Stub *COMDelegate::GenerateStubForHost(MethodDesc *pInvokeMD, MethodDesc *pStubMD, LPVOID pNativeTarget, Stub *pInnerStub) +{ + STANDARD_VM_CONTRACT; + + // get unmanaged calling convention from pInvokeMD's metadata + PInvokeStaticSigInfo sigInfo(pInvokeMD); + CorPinvokeMap callConv = sigInfo.GetCallConv(); + + WORD wArgSize = pStubMD->AsDynamicMethodDesc()->GetNativeStackArgSize(); + WORD wPopSize = (callConv == pmCallConvCdecl ? 
0 : wArgSize); + + return GenerateStubForHostWorker(NULL, // we want to free this stub when the delegate dies + pNativeTarget, + pInnerStub, + 0, + wArgSize, + wPopSize); +} + +//----------------------------------------------------------------------------- +// static +Stub *NDirect::GenerateStubForHost(Module *pModule, CorUnmanagedCallingConvention callConv, WORD wArgSize) +{ + STANDARD_VM_CONTRACT; + + // This one is for unmanaged CALLI where the target is passed as last argument + // (first pushed to stack) + + WORD wPopSize = (callConv == IMAGE_CEE_CS_CALLCONV_C ? 0 : (wArgSize + STACK_ELEM_SIZE)); + + return GenerateStubForHostWorker(pModule->GetDomain()->GetLoaderAllocator()->GetStubHeap(), + (LPVOID)-1, + NULL, + 0, + wArgSize, + wPopSize); +} + +#endif // FEATURE_INCLUDE_ALL_INTERFACES + + +#ifdef MDA_SUPPORTED + +//----------------------------------------------------------------------------- +Stub *NDirectMethodDesc::GenerateStubForMDA(LPVOID pNativeTarget, Stub *pInnerStub, BOOL fCalledByStub) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + sl.X86EmitPushEBPframe(); + + DWORD callConv = (DWORD)(IsThisCall() ? pmCallConvThiscall : (IsStdCall() ? pmCallConvStdcall : pmCallConvCdecl)); + _ASSERTE((callConv & StackImbalanceCookie::HAS_FP_RETURN_VALUE) == 0); + + MetaSig msig(this); + if (msig.HasFPReturn()) + { + // check for the HRESULT swapping impl flag + DWORD dwImplFlags; + IfFailThrow(GetMDImport()->GetMethodImplProps(GetMemberDef(), NULL, &dwImplFlags)); + + if (dwImplFlags & miPreserveSig) + { + // pass a flag to PInvokeStackImbalanceHelper that it should save & restore FPU return value + callConv |= StackImbalanceCookie::HAS_FP_RETURN_VALUE; + } + } + + // init StackImbalanceCookie + sl.X86EmitPushReg(kEAX); // m_dwSavedEsp (just making space) + sl.X86EmitPushImm32(callConv); // m_callConv + + if (IsVarArgs()) + { + // Re-push the return address as an argument to GetStackSizeForVarArgCall() + if (fCalledByStub) + { + // We will be called by another stub that doesn't know the stack size, + // so we need to skip a frame to get to the managed caller. + sl.X86EmitIndexRegLoad(kEAX, kEBP, 0); + sl.X86EmitIndexPush(kEAX, 4); + } + else + { + sl.X86EmitIndexPush(kEBP, 4); + } + + // This will return the number of stack arguments (in DWORDs) + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID)GetStackSizeForVarArgCall), 4); + + // shl eax,2 + sl.Emit16(0xe0c1); + sl.Emit8(0x02); + + sl.X86EmitPushReg(kEAX); // m_dwStackArgSize + } + else + { + sl.X86EmitPushImm32(GetStackArgumentSize()); // m_dwStackArgSize + } + + LPVOID pTarget = (pInnerStub != NULL ? (LPVOID)pInnerStub->GetEntryPoint() : pNativeTarget); + sl.X86EmitPushImmPtr(pTarget); // m_pTarget + sl.X86EmitPushImmPtr(this); // m_pMD + + // stack layout at this point + + // | ... | + // | stack arguments | EBP + 8 + // +-----------------------+ + // | return address | EBP + 4 + // +-----------------------+ + // | saved EBP | EBP + 0 + // +-----------------------+ + // | SIC::m_dwSavedEsp | + // | SIC::m_callConv | + // | SIC::m_dwStackArgSize | + // | SIC::m_pTarget | + // | SIC::m_pMD | EBP - 20 + // ------------------------ + + // call the helper + sl.X86EmitCall(sl.NewExternalCodeLabel(PInvokeStackImbalanceHelper), sizeof(StackImbalanceCookie)); + + // pop StackImbalanceCookie + sl.X86EmitMovSPReg(kEBP); + + sl.X86EmitPopReg(kEBP); + sl.X86EmitReturn((IsStdCall() || IsThisCall()) ? 
GetStackArgumentSize() : 0); + + if (pInnerStub) + { + return sl.LinkInterceptor(GetLoaderAllocator()->GetStubHeap(), pInnerStub, pNativeTarget); + } + else + { + return sl.Link(GetLoaderAllocator()->GetStubHeap()); + } +} + +//----------------------------------------------------------------------------- +// static +Stub *COMDelegate::GenerateStubForMDA(MethodDesc *pInvokeMD, MethodDesc *pStubMD, LPVOID pNativeTarget, Stub *pInnerStub) +{ + STANDARD_VM_CONTRACT; + + WORD wStackArgSize = pStubMD->AsDynamicMethodDesc()->GetNativeStackArgSize(); + + // get unmanaged calling convention from pInvokeMD's metadata + PInvokeStaticSigInfo sigInfo(pInvokeMD); + DWORD callConv = (DWORD)sigInfo.GetCallConv(); + _ASSERTE((callConv & StackImbalanceCookie::HAS_FP_RETURN_VALUE) == 0); + + MetaSig msig(pInvokeMD); + if (msig.HasFPReturn()) + { + // pass a flag to PInvokeStackImbalanceHelper that it should save & restore FPU return value + callConv |= StackImbalanceCookie::HAS_FP_RETURN_VALUE; + } + + CPUSTUBLINKER sl; + sl.X86EmitPushEBPframe(); + + LPVOID pTarget = (pInnerStub != NULL ? (LPVOID)pInnerStub->GetEntryPoint() : pNativeTarget); + + // init StackImbalanceCookie + sl.X86EmitPushReg(kEAX); // m_dwSavedEsp (just making space) + sl.X86EmitPushImm32(callConv); // m_callConv + sl.X86EmitPushImm32(wStackArgSize); // m_dwStackArgSize + sl.X86EmitPushImmPtr(pTarget); // m_pTarget + sl.X86EmitPushImmPtr(pInvokeMD); // m_pMD + + // stack layout at this point + + // | ... | + // | stack arguments | EBP + 8 + // +-----------------------+ + // | return address | EBP + 4 + // +-----------------------+ + // | saved EBP | EBP + 0 + // +-----------------------+ + // | SIC::m_dwSavedEsp | + // | SIC::m_callConv | + // | SIC::m_dwStackArgSize | + // | SIC::m_pTarget | + // | SIC::m_pMD | EBP - 20 + // ------------------------ + + // call the helper + sl.X86EmitCall(sl.NewExternalCodeLabel(PInvokeStackImbalanceHelper), sizeof(StackImbalanceCookie)); + + // pop StackImbalanceCookie + sl.X86EmitMovSPReg(kEBP); + + sl.X86EmitPopReg(kEBP); + sl.X86EmitReturn(callConv == pmCallConvCdecl ? 0 : wStackArgSize); + + if (pInnerStub != NULL) + { + return sl.LinkInterceptor(pInnerStub, pNativeTarget); + } + else + { + return sl.Link(); // don't use loader heap as we want to be able to free the stub + } +} + +#endif // MDA_SUPPORTED + +extern "C" VOID STDCALL StubRareEnableWorker(Thread *pThread) +{ + WRAPPER_NO_CONTRACT; + + //printf("RareEnable\n"); + pThread->RareEnablePreemptiveGC(); +} + + + + +// Disable when calling into managed code from a place that fails via Exceptions +extern "C" VOID STDCALL StubRareDisableTHROWWorker(Thread *pThread) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + + // Do not add a CONTRACT here. We haven't set up SEH. We rely + // on HandleThreadAbort and COMPlusThrowBoot dealing with this situation properly. + + // WARNING!!!! + // when we start executing here, we are actually in cooperative mode. But we + // haven't synchronized with the barrier to reentry yet. So we are in a highly + // dangerous mode. If we call managed code, we will potentially be active in + // the GC heap, even as GC's are occuring! + + // Check for ShutDown scenario. This happens only when we have initiated shutdown + // and someone is trying to call in after the CLR is suspended. In that case, we + // must either raise an unmanaged exception or return an HRESULT, depending on the + // expectations of our caller. + if (!CanRunManagedCode()) + { + // DO NOT IMPROVE THIS EXCEPTION! 
It cannot be a managed exception. It + // cannot be a real exception object because we cannot execute any managed + // code here. + pThread->m_fPreemptiveGCDisabled = 0; + COMPlusThrowBoot(E_PROCESS_SHUTDOWN_REENTRY); + } + + // We must do the following in this order, because otherwise we would be constructing + // the exception for the abort without synchronizing with the GC. Also, we have no + // CLR SEH set up, despite the fact that we may throw a ThreadAbortException. + pThread->RareDisablePreemptiveGC(); + pThread->HandleThreadAbort(); +} + +// Note that this logic is copied below, in PopSEHRecords +__declspec(naked) +VOID __cdecl PopSEHRecords(LPVOID pTargetSP) +{ + // No CONTRACT possible on naked functions + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + + __asm{ + mov ecx, [esp+4] ;; ecx <- pTargetSP + mov eax, fs:[0] ;; get current SEH record + poploop: + cmp eax, ecx + jge done + mov eax, [eax] ;; get next SEH record + jmp poploop + done: + mov fs:[0], eax + retn + } +} + +////////////////////////////////////////////////////////////////////////////// +// +// JITInterface +// +////////////////////////////////////////////////////////////////////////////// + +/*********************************************************************/ +#ifdef EnC_SUPPORTED +#pragma warning (disable : 4731) +void ResumeAtJit(PCONTEXT pContext, LPVOID oldESP) +{ + // No CONTRACT here, because we can't run the risk of it pushing any SEH into the + // current method. + + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + +#ifdef _DEBUG + DWORD curESP; + __asm mov curESP, esp +#endif + + if (oldESP) + { + _ASSERTE(curESP < (DWORD)(size_t)oldESP); + // should have popped the SEH records by now as stack has been overwritten + _ASSERTE(GetCurrentSEHRecord() > oldESP); + } + + // For the "push Eip, ..., ret" + _ASSERTE(curESP < pContext->Esp - sizeof(DWORD)); + pContext->Esp -= sizeof(DWORD); + + __asm { + mov ebp, pContext + + // Push Eip onto the targetESP, so that the final "ret" will consume it + mov ecx, [ebp]CONTEXT.Esp + mov edx, [ebp]CONTEXT.Eip + mov [ecx], edx + + // Restore all registers except Esp, Ebp, Eip + mov eax, [ebp]CONTEXT.Eax + mov ebx, [ebp]CONTEXT.Ebx + mov ecx, [ebp]CONTEXT.Ecx + mov edx, [ebp]CONTEXT.Edx + mov esi, [ebp]CONTEXT.Esi + mov edi, [ebp]CONTEXT.Edi + + push [ebp]CONTEXT.Esp // pContext->Esp is (targetESP-sizeof(DWORD)) + push [ebp]CONTEXT.Ebp + pop ebp + pop esp + + // esp is (targetESP-sizeof(DWORD)), and [esp] is the targetEIP. + // The ret will set eip to targetEIP and esp will be automatically + // incremented to targetESP + + ret + } +} +#pragma warning (default : 4731) +#endif // !EnC_SUPPORTED + + +#pragma warning(push) +#pragma warning(disable: 4035) +DWORD getcpuid(DWORD arg, unsigned char result[16]) +{ + LIMITED_METHOD_CONTRACT + + __asm + { + push ebx + push esi + mov eax, arg + cpuid + mov esi, result + mov [esi+ 0], eax + mov [esi+ 4], ebx + mov [esi+ 8], ecx + mov [esi+12], edx + pop esi + pop ebx + } +} + +// The following function uses Deterministic Cache Parameter leafs to determine the cache hierarchy information on Prescott & Above platforms. +// This function takes 3 arguments: +// Arg1 is an input to ECX. Used as index to specify which cache level to return infoformation on by CPUID. +// Arg2 is an input to EAX. For deterministic code enumeration, we pass in 4H in arg2. +// Arg3 is a pointer to the return buffer +// No need to check whether or not CPUID is supported because we have already called CPUID with success to come here. 
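+// A rough usage sketch of the two helpers above (illustrative only, not part of
+// the imported source; the concrete ECX index depends on the processor's cache
+// topology): to query sub-leaf 1 of the deterministic cache parameter leaf 04H,
+// one would call
+//
+//   unsigned char buffer[16];
+//   getextcpuid(1, 4, buffer);        // ECX = 1, EAX = 4H
+//   DWORD* regs = (DWORD*)buffer;     // regs[0..3] = EAX, EBX, ECX, EDX
+//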
+ +DWORD getextcpuid(DWORD arg1, DWORD arg2, unsigned char result[16]) +{ + LIMITED_METHOD_CONTRACT + + __asm + { + push ebx + push esi + mov ecx, arg1 + mov eax, arg2 + cpuid + mov esi, result + mov [esi+ 0], eax + mov [esi+ 4], ebx + mov [esi+ 8], ecx + mov [esi+12], edx + pop esi + pop ebx + } +} + +#pragma warning(pop) + + +// This function returns the number of logical processors on a given physical chip. If it cannot +// determine the number of logical cpus, or the machine is not populated uniformly with the same +// type of processors, this function returns 1. +DWORD GetLogicalCpuCount() +{ + // No CONTRACT possible because GetLogicalCpuCount uses SEH + + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_NOTRIGGER; + + static DWORD val = 0; + + // cache value for later re-use + if (val) + { + return val; + } + + struct Param : DefaultCatchFilterParam + { + DWORD retVal; + } param; + param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER; + param.retVal = 1; + + PAL_TRY(Param *, pParam, ¶m) + { + unsigned char buffer[16]; + + DWORD maxCpuId = getcpuid(0, buffer); + + if (maxCpuId < 1) + goto lDone; + + DWORD* dwBuffer = (DWORD*)buffer; + + if (dwBuffer[1] == 'uneG') { + if (dwBuffer[3] == 'Ieni') { + if (dwBuffer[2] == 'letn') { // get SMT/multicore enumeration for Intel EM64T + + // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on + // multi-core processor, but we never call into those two functions since we don't halve the + // gen0size when it's prescott and above processor. We keep the old version here for earlier + // generation system(Northwood based), perf data suggests on those systems, halve gen0 size + // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) + // based, we still go ahead and halve gen0 size. The logic in GetLogicalCpuCountFromOS() + // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. + // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 + // size at all gives us overall better performance. + // This is going to be fixed with a new version in orcas time frame. 
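+ // Note on the vendor test above: getcpuid(0, buffer) stores EAX/EBX/ECX/EDX at
+ // offsets 0/4/8/12, so dwBuffer[1], dwBuffer[3] and dwBuffer[2] hold the ASCII
+ // chunks "Genu", "ineI" and "ntel" of "GenuineIntel". Read back as little-endian
+ // DWORDs, those chunks equal the multi-character constants 'uneG', 'Ieni' and
+ // 'letn' compared above.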
+ + if( (maxCpuId > 3) && (maxCpuId < 0x80000000) ) + goto lDone; + + val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API + if (val ) + { + pParam->retVal = val; // OS API HT enumeration successful, we are Done + goto lDone; + } + + val = GetLogicalCpuCountFallback(); // OS API failed, Fallback to HT enumeration using CPUID + if( val ) + pParam->retVal = val; + } + } + } +lDone: ; + } + PAL_EXCEPT_FILTER(DefaultCatchFilter) + { + } + PAL_ENDTRY + + if (val == 0) + { + val = param.retVal; + } + + return param.retVal; +} + +void UMEntryThunkCode::Encode(BYTE* pTargetCode, void* pvSecretParam) +{ + LIMITED_METHOD_CONTRACT; + +#ifdef _DEBUG + m_alignpad[0] = X86_INSTR_INT3; + m_alignpad[1] = X86_INSTR_INT3; +#endif // _DEBUG + m_movEAX = X86_INSTR_MOV_EAX_IMM32; + m_uet = pvSecretParam; + m_jmp = X86_INSTR_JMP_REL32; + m_execstub = (BYTE*) ((pTargetCode) - (4+((BYTE*)&m_execstub))); + + FlushInstructionCache(GetCurrentProcess(),GetEntryPoint(),sizeof(UMEntryThunkCode)); +} + +UMEntryThunk* UMEntryThunk::Decode(LPVOID pCallback) +{ + LIMITED_METHOD_CONTRACT; + + if (*((BYTE*)pCallback) != X86_INSTR_MOV_EAX_IMM32 || + ( ((size_t)pCallback) & 3) != 2) { + return NULL; + } + return *(UMEntryThunk**)( 1 + (BYTE*)pCallback ); +} + +BOOL DoesSlotCallPrestub(PCODE pCode) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + PRECONDITION(pCode != NULL); + PRECONDITION(pCode != GetPreStubEntryPoint()); + } CONTRACTL_END; + + // x86 has the following possible sequences for prestub logic: + // 1. slot -> temporary entrypoint -> prestub + // 2. slot -> precode -> prestub + // 3. slot -> precode -> jumprel32 (NGEN case) -> prestub + +#ifdef HAS_COMPACT_ENTRYPOINTS + if (MethodDescChunk::GetMethodDescFromCompactEntryPoint(pCode, TRUE) != NULL) + { + return TRUE; + } +#endif // HAS_COMPACT_ENTRYPOINTS + + if (!IS_ALIGNED(pCode, PRECODE_ALIGNMENT)) + { + return FALSE; + } + +#ifdef HAS_FIXUP_PRECODE + if (*PTR_BYTE(pCode) == X86_INSTR_CALL_REL32) + { + // Note that call could have been patched to jmp in the meantime + pCode = rel32Decode(pCode+1); + + // NGEN case + if (*PTR_BYTE(pCode) == X86_INSTR_JMP_REL32) { + pCode = rel32Decode(pCode+1); + } + + return pCode == (TADDR)PrecodeFixupThunk; + } +#endif + + if (*PTR_BYTE(pCode) != X86_INSTR_MOV_EAX_IMM32 || + *PTR_BYTE(pCode+5) != X86_INSTR_MOV_RM_R || + *PTR_BYTE(pCode+7) != X86_INSTR_JMP_REL32) + { + return FALSE; + } + pCode = rel32Decode(pCode+8); + + // NGEN case + if (*PTR_BYTE(pCode) == X86_INSTR_JMP_REL32) { + pCode = rel32Decode(pCode+1); + } + + return pCode == GetPreStubEntryPoint(); +} + +//========================================================================================== +// In NGen image, virtual slots inherited from cross-module dependencies point to jump thunks. +// These jump thunk initially point to VirtualMethodFixupStub which transfers control here. +// This method 'VirtualMethodFixupWorker' will patch the jump thunk to point to the actual +// inherited method body after we have execute the precode and a stable entry point. 
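+// In byte terms, the fixup below rewrites the thunk's 5-byte "call rel32"
+// (opcode E8, targeting VirtualMethodFixupStub) into a "jmp rel32" (opcode E9)
+// aimed at the resolved method body. The rewrite is done with an aligned 8-byte
+// FastInterlockCompareExchangeLong, so racing callers observe either the old
+// call or the complete new jump, never a torn instruction.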
+// +EXTERN_C PVOID STDCALL VirtualMethodFixupWorker(Object * pThisPtr, CORCOMPILE_VIRTUAL_IMPORT_THUNK *pThunk) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + ENTRY_POINT; + } + CONTRACTL_END; + + _ASSERTE(pThisPtr != NULL); + VALIDATEOBJECT(pThisPtr); + + MethodTable * pMT = pThisPtr->GetTrueMethodTable(); + + WORD slotNumber = pThunk->slotNum; + _ASSERTE(slotNumber != (WORD)-1); + + PCODE pCode = pMT->GetRestoredSlot(slotNumber); + + if (!DoesSlotCallPrestub(pCode)) + { + // Skip fixup precode jump for better perf + PCODE pDirectTarget = Precode::TryToSkipFixupPrecode(pCode); + if (pDirectTarget != NULL) + pCode = pDirectTarget; + + INT64 oldValue = *(INT64*)pThunk; + BYTE* pOldValue = (BYTE*)&oldValue; + + if (pOldValue[0] == X86_INSTR_CALL_REL32) + { + INT64 newValue = oldValue; + BYTE* pNewValue = (BYTE*)&newValue; + pNewValue[0] = X86_INSTR_JMP_REL32; + + INT_PTR pcRelOffset = (BYTE*)pCode - &pThunk->callJmp[5]; + *(INT32 *)(&pNewValue[1]) = (INT32) pcRelOffset; + + _ASSERTE(IS_ALIGNED(pThunk, sizeof(INT64))); + if (EnsureWritableExecutablePagesNoThrow(pThunk, sizeof(INT64))) + FastInterlockCompareExchangeLong((INT64*)pThunk, newValue, oldValue); + + FlushInstructionCache(GetCurrentProcess(), pThunk, 8); + } + } + + return PVOID(pCode); +} + + +#ifdef FEATURE_READYTORUN + +// +// Allocation of dynamic helpers +// + +#define DYNAMIC_HELPER_ALIGNMENT sizeof(TADDR) + +#define BEGIN_DYNAMIC_HELPER_EMIT(size) \ + SIZE_T cb = size; \ + SIZE_T cbAligned = ALIGN_UP(cb, DYNAMIC_HELPER_ALIGNMENT); \ + BYTE * pStart = (BYTE *)(void *)pAllocator->GetDynamicHelpersHeap()->AllocAlignedMem(cbAligned, DYNAMIC_HELPER_ALIGNMENT); \ + BYTE * p = pStart; + +#define END_DYNAMIC_HELPER_EMIT() \ + _ASSERTE(pStart + cb == p); \ + while (p < pStart + cbAligned) *p++ = X86_INSTR_INT3; \ + ClrFlushInstructionCache(pStart, cbAligned); \ + return (PCODE)pStart + +PCODE DynamicHelpers::CreateHelper(LoaderAllocator * pAllocator, TADDR arg, PCODE target) +{ + STANDARD_VM_CONTRACT; + + BEGIN_DYNAMIC_HELPER_EMIT(10); + + *p++ = 0xB9; // mov ecx, XXXXXX + *(INT32 *)p = (INT32)arg; + p += 4; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; + + END_DYNAMIC_HELPER_EMIT(); +} + +void DynamicHelpers::EmitHelperWithArg(BYTE*& p, LoaderAllocator * pAllocator, TADDR arg, PCODE target) +{ + CONTRACTL + { + GC_NOTRIGGER; + PRECONDITION(p != NULL && target != NULL); + } + CONTRACTL_END; + + // Move an an argument into the second argument register and jump to a target function. 
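+ // The ten bytes emitted below decode as:
+ //   BA imm32   mov edx, arg     (EDX being the second argument register here)
+ //   E9 rel32   jmp target       (rel32 comes from rel32UsingJumpStub, so the
+ //                                jump may be routed through a jump stub)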
+ + *p++ = 0xBA; // mov edx, XXXXXX + *(INT32 *)p = (INT32)arg; + p += 4; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; +} + +PCODE DynamicHelpers::CreateHelperWithArg(LoaderAllocator * pAllocator, TADDR arg, PCODE target) +{ + BEGIN_DYNAMIC_HELPER_EMIT(10); + + EmitHelperWithArg(p, pAllocator, arg, target); + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateHelper(LoaderAllocator * pAllocator, TADDR arg, TADDR arg2, PCODE target) +{ + BEGIN_DYNAMIC_HELPER_EMIT(15); + + *p++ = 0xB9; // mov ecx, XXXXXX + *(INT32 *)p = (INT32)arg; + p += 4; + + *p++ = 0xBA; // mov edx, XXXXXX + *(INT32 *)p = (INT32)arg2; + p += 4; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateHelperArgMove(LoaderAllocator * pAllocator, TADDR arg, PCODE target) +{ + BEGIN_DYNAMIC_HELPER_EMIT(12); + + *(UINT16 *)p = 0xD18B; // mov edx, ecx + p += 2; + + *p++ = 0xB9; // mov ecx, XXXXXX + *(INT32 *)p = (INT32)arg; + p += 4; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateReturn(LoaderAllocator * pAllocator) +{ + BEGIN_DYNAMIC_HELPER_EMIT(1); + + *p++ = 0xC3; // ret + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateReturnConst(LoaderAllocator * pAllocator, TADDR arg) +{ + BEGIN_DYNAMIC_HELPER_EMIT(6); + + *p++ = 0xB8; // mov eax, XXXXXX + *(INT32 *)p = (INT32)arg; + p += 4; + + *p++ = 0xC3; // ret + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateReturnIndirConst(LoaderAllocator * pAllocator, TADDR arg, INT8 offset) +{ + BEGIN_DYNAMIC_HELPER_EMIT((offset != 0) ? 9 : 6); + + *p++ = 0xA1; // mov eax, [XXXXXX] + *(INT32 *)p = (INT32)arg; + p += 4; + + if (offset != 0) + { + // add eax, + *p++ = 0x83; + *p++ = 0xC0; + *p++ = offset; + } + + *p++ = 0xC3; // ret + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateHelperWithTwoArgs(LoaderAllocator * pAllocator, TADDR arg, PCODE target) +{ + BEGIN_DYNAMIC_HELPER_EMIT(12); + + // pop eax + *p++ = 0x58; + + // push arg + *p++ = 0x68; + *(INT32 *)p = arg; + p += 4; + + // push eax + *p++ = 0x50; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateHelperWithTwoArgs(LoaderAllocator * pAllocator, TADDR arg, TADDR arg2, PCODE target) +{ + BEGIN_DYNAMIC_HELPER_EMIT(17); + + // pop eax + *p++ = 0x58; + + // push arg + *p++ = 0x68; + *(INT32 *)p = arg; + p += 4; + + // push arg2 + *p++ = 0x68; + *(INT32 *)p = arg2; + p += 4; + + // push eax + *p++ = 0x50; + + *p++ = X86_INSTR_JMP_REL32; // jmp rel32 + *(INT32 *)p = rel32UsingJumpStub((INT32 *)p, target); + p += 4; + + END_DYNAMIC_HELPER_EMIT(); +} + +PCODE DynamicHelpers::CreateDictionaryLookupHelper(LoaderAllocator * pAllocator, CORINFO_RUNTIME_LOOKUP * pLookup, DWORD dictionaryIndexAndSlot, Module * pModule) +{ + STANDARD_VM_CONTRACT; + + PCODE helperAddress = (pLookup->helper == CORINFO_HELP_RUNTIMEHANDLE_METHOD ? 
+ GetEEFuncEntryPoint(JIT_GenericHandleMethodWithSlotAndModule) : + GetEEFuncEntryPoint(JIT_GenericHandleClassWithSlotAndModule)); + + GenericHandleArgs * pArgs = (GenericHandleArgs *)(void *)pAllocator->GetDynamicHelpersHeap()->AllocAlignedMem(sizeof(GenericHandleArgs), DYNAMIC_HELPER_ALIGNMENT); + pArgs->dictionaryIndexAndSlot = dictionaryIndexAndSlot; + pArgs->signature = pLookup->signature; + pArgs->module = (CORINFO_MODULE_HANDLE)pModule; + + // It's available only via the run-time helper function + if (pLookup->indirections == CORINFO_USEHELPER) + { + BEGIN_DYNAMIC_HELPER_EMIT(10); + + // ecx contains the generic context parameter + // mov edx,pArgs + // jmp helperAddress + EmitHelperWithArg(p, pAllocator, (TADDR)pArgs, helperAddress); + + END_DYNAMIC_HELPER_EMIT(); + } + else + { + int indirectionsSize = 0; + for (WORD i = 0; i < pLookup->indirections; i++) + indirectionsSize += (pLookup->offsets[i] >= 0x80 ? 6 : 3); + + int codeSize = indirectionsSize + (pLookup->testForNull ? 21 : 3); + + BEGIN_DYNAMIC_HELPER_EMIT(codeSize); + + if (pLookup->testForNull) + { + // ecx contains the generic context parameter. Save a copy of it in the eax register + // mov eax,ecx + *(UINT16*)p = 0xc889; p += 2; + } + + for (WORD i = 0; i < pLookup->indirections; i++) + { + // mov ecx,qword ptr [ecx+offset] + if (pLookup->offsets[i] >= 0x80) + { + *(UINT16*)p = 0x898b; p += 2; + *(UINT32*)p = (UINT32)pLookup->offsets[i]; p += 4; + } + else + { + *(UINT16*)p = 0x498b; p += 2; + *p++ = (BYTE)pLookup->offsets[i]; + } + } + + // No null test required + if (!pLookup->testForNull) + { + // No fixups needed for R2R + + // mov eax,ecx + *(UINT16*)p = 0xc889; p += 2; + *p++ = 0xC3; // ret + } + else + { + // ecx contains the value of the dictionary slot entry + + _ASSERTE(pLookup->indirections != 0); + + // test ecx,ecx + *(UINT16*)p = 0xc985; p += 2; + + // je 'HELPER_CALL' (a jump of 3 bytes) + *(UINT16*)p = 0x0374; p += 2; + + // mov eax,ecx + *(UINT16*)p = 0xc889; p += 2; + *p++ = 0xC3; // ret + + // 'HELPER_CALL' + { + // Put the generic context back into rcx (was previously saved in eax) + // mov ecx,eax + *(UINT16*)p = 0xc189; p += 2; + + // mov edx,pArgs + // jmp helperAddress + EmitHelperWithArg(p, pAllocator, (TADDR)pArgs, helperAddress); + } + } + + END_DYNAMIC_HELPER_EMIT(); + } +} + +#endif // FEATURE_READYTORUN + + +#endif // DACCESS_COMPILE diff --git a/src/vm/i386/excepcpu.h b/src/vm/i386/excepcpu.h new file mode 100644 index 0000000000..3f2f0810a7 --- /dev/null +++ b/src/vm/i386/excepcpu.h @@ -0,0 +1,87 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +// +// EXCEPX86.H - +// +// This header file is optionally included from Excep.h if the target platform is x86 +// + + +#ifndef __excepx86_h__ +#define __excepx86_h__ + +#include "corerror.h" // HResults for the COM+ Runtime + +#include "../dlls/mscorrc/resource.h" + +#define THROW_CONTROL_FOR_THREAD_FUNCTION ThrowControlForThread + +#define STATUS_CLR_GCCOVER_CODE STATUS_PRIVILEGED_INSTRUCTION + +class Thread; + +#if defined(_MSC_VER) +#pragma warning(disable:4733) // Inline asm assigning to `FS:0` : handler not registered as safe handler + // Actually, the handler getting set is properly registered +#endif + +#define INSTALL_EXCEPTION_HANDLING_RECORD(record) \ + { \ + PEXCEPTION_REGISTRATION_RECORD __record = (record); \ + _ASSERTE(__record < GetCurrentSEHRecord()); \ + __record->Next = (PEXCEPTION_REGISTRATION_RECORD)__readfsdword(0); \ + __writefsdword(0, (DWORD)__record); \ + } + +// +// Note: this only pops a handler from the top of the stack. It will not remove a record from the middle of the +// chain, and I can assure you that you don't want to do that anyway. +// +#define UNINSTALL_EXCEPTION_HANDLING_RECORD(record) \ + { \ + PEXCEPTION_REGISTRATION_RECORD __record = (record); \ + _ASSERTE(__record == GetCurrentSEHRecord()); \ + __writefsdword(0, (DWORD)__record->Next); \ + } + +// stackOverwriteBarrier is used to detect overwriting of stack which will mess up handler registration +#if defined(_DEBUG) +#define DECLARE_CPFH_EH_RECORD(pCurThread) \ + FrameHandlerExRecordWithBarrier *___pExRecordWithBarrier = (FrameHandlerExRecordWithBarrier *)_alloca(sizeof(FrameHandlerExRecordWithBarrier)); \ + for (int ___i =0; ___i < STACK_OVERWRITE_BARRIER_SIZE; ___i++) \ + ___pExRecordWithBarrier->m_StackOverwriteBarrier[___i] = STACK_OVERWRITE_BARRIER_VALUE; \ + FrameHandlerExRecord *___pExRecord = &(___pExRecordWithBarrier->m_ExRecord); \ + ___pExRecord->m_ExReg.Handler = (PEXCEPTION_ROUTINE)COMPlusFrameHandler; \ + ___pExRecord->m_pEntryFrame = (pCurThread)->GetFrame(); + +#else +#define DECLARE_CPFH_EH_RECORD(pCurThread) \ + FrameHandlerExRecord *___pExRecord = (FrameHandlerExRecord *)_alloca(sizeof(FrameHandlerExRecord)); \ + ___pExRecord->m_ExReg.Handler = (PEXCEPTION_ROUTINE)COMPlusFrameHandler; \ + ___pExRecord->m_pEntryFrame = (pCurThread)->GetFrame(); + +#endif + +// +// Retrieves the redirected CONTEXT* from the stack frame of one of the +// RedirectedHandledJITCaseForXXX_Stub's. +// +PTR_CONTEXT GetCONTEXTFromRedirectedStubStackFrame(CONTEXT * pContext); + +PEXCEPTION_REGISTRATION_RECORD GetCurrentSEHRecord(); +PEXCEPTION_REGISTRATION_RECORD GetFirstCOMPlusSEHRecord(Thread*); + +// Determine the address of the instruction that made the current call. +inline +PCODE GetAdjustedCallAddress(PCODE returnAddress) +{ + LIMITED_METHOD_CONTRACT; + return returnAddress - 5; +} + +BOOL AdjustContextForVirtualStub(EXCEPTION_RECORD *pExceptionRecord, CONTEXT *pContext); + +#endif // __excepx86_h__ diff --git a/src/vm/i386/excepx86.cpp b/src/vm/i386/excepx86.cpp new file mode 100644 index 0000000000..27c923b749 --- /dev/null +++ b/src/vm/i386/excepx86.cpp @@ -0,0 +1,3734 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +// + +/* EXCEP.CPP: + * + */ +#include "common.h" + +#include "frames.h" +#include "excep.h" +#include "object.h" +#include "field.h" +#include "dbginterface.h" +#include "cgensys.h" +#include "comutilnative.h" +#include "sigformat.h" +#include "siginfo.hpp" +#include "gc.h" +#include "eedbginterfaceimpl.h" //so we can clearexception in COMPlusThrow +#include "perfcounters.h" +#include "eventtrace.h" +#include "eetoprofinterfacewrapper.inl" +#include "eedbginterfaceimpl.inl" +#include "dllimportcallback.h" +#include "threads.h" +#ifdef FEATURE_REMOTING +#include "appdomainhelper.h" +#endif +#include "eeconfig.h" +#include "vars.hpp" +#include "generics.h" +#include "securityprincipal.h" + +#include "asmconstants.h" +#include "virtualcallstub.h" + +MethodDesc * GetUserMethodForILStub(Thread * pThread, UINT_PTR uStubSP, MethodDesc * pILStubMD, Frame ** ppFrameOut); + +#if !defined(DACCESS_COMPILE) + +#define FORMAT_MESSAGE_BUFFER_LENGTH 1024 + +BOOL ComPlusFrameSEH(EXCEPTION_REGISTRATION_RECORD*); +PEXCEPTION_REGISTRATION_RECORD GetPrevSEHRecord(EXCEPTION_REGISTRATION_RECORD*); + +extern "C" { +// in asmhelpers.asm: +VOID STDCALL ResumeAtJitEHHelper(EHContext *pContext); +int STDCALL CallJitEHFilterHelper(size_t *pShadowSP, EHContext *pContext); +VOID STDCALL CallJitEHFinallyHelper(size_t *pShadowSP, EHContext *pContext); + +BOOL CallRtlUnwind(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + void *callback, + EXCEPTION_RECORD *pExceptionRecord, + void *retval); + +BOOL CallRtlUnwindSafe(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + void *callback, + EXCEPTION_RECORD *pExceptionRecord, + void *retval); +} + +static inline BOOL +CPFH_ShouldUnwindStack(const EXCEPTION_RECORD * pCER) { + + LIMITED_METHOD_CONTRACT; + + _ASSERTE(pCER != NULL); + + // We can only unwind those exceptions whose context/record we don't need for a + // rethrow. This is complus, and stack overflow. For all the others, we + // need to keep the context around for a rethrow, which means they can't + // be unwound. + if (IsComPlusException(pCER) || pCER->ExceptionCode == STATUS_STACK_OVERFLOW) + return TRUE; + else + return FALSE; +} + +static inline BOOL IsComPlusNestedExceptionRecord(EXCEPTION_REGISTRATION_RECORD* pEHR) +{ + LIMITED_METHOD_CONTRACT; + if (pEHR->Handler == (PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler) + return TRUE; + return FALSE; +} + +EXCEPTION_REGISTRATION_RECORD *TryFindNestedEstablisherFrame(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame) +{ + LIMITED_METHOD_CONTRACT; + while (pEstablisherFrame->Handler != (PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler) { + pEstablisherFrame = pEstablisherFrame->Next; + if (pEstablisherFrame == EXCEPTION_CHAIN_END) return 0; + } + return pEstablisherFrame; +} + +#ifdef _DEBUG +// stores last handler we went to in case we didn't get an endcatch and stack is +// corrupted we can figure out who did it. 
+static MethodDesc *gLastResumedExceptionFunc = NULL; +static DWORD gLastResumedExceptionHandler = 0; +#endif + +//--------------------------------------------------------------------- +// void RtlUnwindCallback() +// call back function after global unwind, rtlunwind calls this function +//--------------------------------------------------------------------- +static void RtlUnwindCallback() +{ + LIMITED_METHOD_CONTRACT; + _ASSERTE(!"Should never get here"); +} + +BOOL NExportSEH(EXCEPTION_REGISTRATION_RECORD* pEHR) +{ + LIMITED_METHOD_CONTRACT; + + if ((LPVOID)pEHR->Handler == (LPVOID)UMThunkPrestubHandler) + { + return TRUE; + } + return FALSE; +} + +BOOL FastNExportSEH(EXCEPTION_REGISTRATION_RECORD* pEHR) +{ + LIMITED_METHOD_CONTRACT; + + if ((LPVOID)pEHR->Handler == (LPVOID)FastNExportExceptHandler) + return TRUE; + return FALSE; +} + +BOOL ReverseCOMSEH(EXCEPTION_REGISTRATION_RECORD* pEHR) +{ + LIMITED_METHOD_CONTRACT; + +#ifdef FEATURE_COMINTEROP + if ((LPVOID)pEHR->Handler == (LPVOID)COMPlusFrameHandlerRevCom) + return TRUE; +#endif // FEATURE_COMINTEROP + return FALSE; +} + + +// +// Returns true if the given SEH handler is one of our SEH handlers that is responsible for managing exceptions in +// regions of managed code. +// +BOOL IsUnmanagedToManagedSEHHandler(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame) +{ + WRAPPER_NO_CONTRACT; + + // + // ComPlusFrameSEH() is for COMPlusFrameHandler & COMPlusNestedExceptionHandler. + // FastNExportSEH() is for FastNExportExceptHandler. + // NExportSEH() is for UMThunkPrestubHandler. + // + return (ComPlusFrameSEH(pEstablisherFrame) || FastNExportSEH(pEstablisherFrame) || NExportSEH(pEstablisherFrame) || ReverseCOMSEH(pEstablisherFrame)); +} + +Frame *GetCurrFrame(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame) +{ + Frame *pFrame; + WRAPPER_NO_CONTRACT; + _ASSERTE(IsUnmanagedToManagedSEHHandler(pEstablisherFrame)); + if (NExportSEH(pEstablisherFrame)) + pFrame = ((ComToManagedExRecord *)pEstablisherFrame)->GetCurrFrame(); + else + pFrame = ((FrameHandlerExRecord *)pEstablisherFrame)->GetCurrFrame(); + + _ASSERTE(GetThread() == NULL || GetThread()->GetFrame() <= pFrame); + + return pFrame; +} + +EXCEPTION_REGISTRATION_RECORD* GetNextCOMPlusSEHRecord(EXCEPTION_REGISTRATION_RECORD* pRec) { + WRAPPER_NO_CONTRACT; + if (pRec == EXCEPTION_CHAIN_END) + return EXCEPTION_CHAIN_END; + + do { + _ASSERTE(pRec != 0); + pRec = pRec->Next; + } while (pRec != EXCEPTION_CHAIN_END && !IsUnmanagedToManagedSEHHandler(pRec)); + + _ASSERTE(pRec == EXCEPTION_CHAIN_END || IsUnmanagedToManagedSEHHandler(pRec)); + return pRec; +} + + +/* + * GetClrSEHRecordServicingStackPointer + * + * This function searchs all the Frame SEH records, and finds the one that is + * currently signed up to do all exception handling for the given stack pointer + * on the given thread. + * + * Parameters: + * pThread - The thread to search on. + * pStackPointer - The stack location that we are finding the Frame SEH Record for. + * + * Returns + * A pointer to the SEH record, or EXCEPTION_CHAIN_END if none was found. + * + */ + +PEXCEPTION_REGISTRATION_RECORD +GetClrSEHRecordServicingStackPointer(Thread *pThread, + void *pStackPointer) +{ + ThreadExceptionState* pExState = pThread->GetExceptionState(); + + // + // We can only do this if there is a context in the pExInfo. There are cases (most notably the + // EEPolicy::HandleFatalError case) where we don't have that. 
In these cases we will return + // no enclosing handler since we cannot accurately determine the FS:0 entry which services + // this stack address. + // + // The side effect of this is that for these cases, the debugger cannot intercept + // the exception + // + CONTEXT* pContextRecord = pExState->GetContextRecord(); + if (pContextRecord == NULL) + { + return EXCEPTION_CHAIN_END; + } + + void *exceptionSP = dac_cast(GetSP(pContextRecord)); + + + // + // Now set the establishing frame. What this means in English is that we need to find + // the fs:0 entry that handles exceptions for the place on the stack given in stackPointer. + // + PEXCEPTION_REGISTRATION_RECORD pSEHRecord = GetFirstCOMPlusSEHRecord(pThread); + + while (pSEHRecord != EXCEPTION_CHAIN_END) + { + + // + // Skip any SEHRecord which is not a CLR record or was pushed after the exception + // on this thread occurred. + // + if (IsUnmanagedToManagedSEHHandler(pSEHRecord) && (exceptionSP <= (void *)pSEHRecord)) + { + Frame *pFrame = GetCurrFrame(pSEHRecord); + // + // Arcane knowledge here. All Frame records are stored on the stack by the runtime + // in ever decreasing address space. So, we merely have to search back until + // we find the first frame record with a higher stack value to find the + // establishing frame for the given stack address. + // + if (((void *)pFrame) >= pStackPointer) + { + break; + } + + } + + pSEHRecord = GetNextCOMPlusSEHRecord(pSEHRecord); + } + + return pSEHRecord; +} + +#ifdef _DEBUG +// We've deteremined during a stack walk that managed code is transitioning to unamanaged (EE) code. Check that the +// state of the EH chain is correct. +// +// For x86, check that we do INSTALL_COMPLUS_EXCEPTION_HANDLER before calling managed code. This check should be +// done for all managed code sites, not just transistions. But this will catch most problem cases. +void VerifyValidTransitionFromManagedCode(Thread *pThread, CrawlFrame *pCF) +{ + WRAPPER_NO_CONTRACT; + + _ASSERTE(ExecutionManager::IsManagedCode(GetControlPC(pCF->GetRegisterSet()))); + + // Cannot get to the TEB of other threads. So ignore them. + if (pThread != GetThread()) + { + return; + } + + // Find the EH record guarding the current region of managed code, based on the CrawlFrame passed in. + PEXCEPTION_REGISTRATION_RECORD pEHR = GetCurrentSEHRecord(); + + while ((pEHR != EXCEPTION_CHAIN_END) && ((ULONG_PTR)pEHR < GetRegdisplaySP(pCF->GetRegisterSet()))) + { + pEHR = pEHR->Next; + } + + // VerifyValidTransitionFromManagedCode can be called before the CrawlFrame's MethodDesc is initialized. + // Fix that if necessary for the consistency check. + MethodDesc * pFunction = pCF->GetFunction(); + if ((!IsUnmanagedToManagedSEHHandler(pEHR)) && // Will the assert fire? If not, don't waste our time. + (pFunction == NULL)) + { + _ASSERTE(pCF->GetRegisterSet()); + PCODE ip = GetControlPC(pCF->GetRegisterSet()); + pFunction = ExecutionManager::GetCodeMethodDesc(ip); + _ASSERTE(pFunction); + } + + // Great, we've got the EH record that's next up the stack from the current SP (which is in managed code). That + // had better be a record for one of our handlers responsible for handling exceptions in managed code. If its + // not, then someone made it into managed code without setting up one of our EH handlers, and that's really + // bad. + CONSISTENCY_CHECK_MSGF(IsUnmanagedToManagedSEHHandler(pEHR), + ("Invalid transition into managed code!\n\n" + "We're walking this thread's stack and we've reached a managed frame at Esp=0x%p. 
" + "(The method is %s::%s) " + "The very next FS:0 record (0x%p) up from this point on the stack should be one of " + "our 'unmanaged to managed SEH handlers', but its not... its something else, and " + "that's very bad. It indicates that someone managed to call into managed code without " + "setting up the proper exception handling.\n\n" + "Get a good unmanaged stack trace for this thread. All FS:0 records are on the stack, " + "so you can see who installed the last handler. Somewhere between that function and " + "where the thread is now is where the bad transition occurred.\n\n" + "A little extra info: FS:0 = 0x%p, pEHR->Handler = 0x%p\n", + GetRegdisplaySP(pCF->GetRegisterSet()), + pFunction ->m_pszDebugClassName, + pFunction ->m_pszDebugMethodName, + pEHR, + GetCurrentSEHRecord(), + pEHR->Handler)); +} + +#endif + +//================================================================================ + +// There are some things that should never be true when handling an +// exception. This function checks for them. Will assert or trap +// if it finds an error. +static inline void +CPFH_VerifyThreadIsInValidState(Thread* pThread, DWORD exceptionCode, EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame) { + WRAPPER_NO_CONTRACT; + + if ( exceptionCode == STATUS_BREAKPOINT + || exceptionCode == STATUS_SINGLE_STEP) { + return; + } + +#ifdef _DEBUG + // check for overwriting of stack + CheckStackBarrier(pEstablisherFrame); + // trigger check for bad fs:0 chain + GetCurrentSEHRecord(); +#endif + + if (!g_fEEShutDown) { + // An exception on the GC thread, or while holding the thread store lock, will likely lock out the entire process. + if (::IsGCThread() || ThreadStore::HoldingThreadStore()) + { + _ASSERTE(!"Exception during garbage collection or while holding thread store"); + EEPOLICY_HANDLE_FATAL_ERROR(COR_E_EXECUTIONENGINE); + } + } +} + + +#ifdef FEATURE_HIJACK +void +CPFH_AdjustContextForThreadSuspensionRace(CONTEXT *pContext, Thread *pThread) +{ + WRAPPER_NO_CONTRACT; + + PCODE f_IP = GetIP(pContext); + if (Thread::IsAddrOfRedirectFunc((PVOID)f_IP)) { + + // This is a very rare case where we tried to redirect a thread that was + // just about to dispatch an exception, and our update of EIP took, but + // the thread continued dispatching the exception. + // + // If this should happen (very rare) then we fix it up here. + // + _ASSERTE(pThread->GetSavedRedirectContext()); + SetIP(pContext, GetIP(pThread->GetSavedRedirectContext())); + STRESS_LOG1(LF_EH, LL_INFO100, "CPFH_AdjustContextForThreadSuspensionRace: Case 1 setting IP = %x\n", pContext->Eip); + } + + if (f_IP == GetEEFuncEntryPoint(THROW_CONTROL_FOR_THREAD_FUNCTION)) { + + // This is a very rare case where we tried to redirect a thread that was + // just about to dispatch an exception, and our update of EIP took, but + // the thread continued dispatching the exception. + // + // If this should happen (very rare) then we fix it up here. + // + SetIP(pContext, GetIP(pThread->m_OSContext)); + STRESS_LOG1(LF_EH, LL_INFO100, "CPFH_AdjustContextForThreadSuspensionRace: Case 2 setting IP = %x\n", pContext->Eip); + } + +// We have another even rarer race condition: +// - A) On thread A, Debugger puts an int 3 in the code stream at address X +// - A) We hit it and the begin an exception. The eip will be X + 1 (int3 is special) +// - B) Meanwhile, thread B redirects A's eip to Y. 
(Although A is really somewhere +// in the kernel, it looks like it's still in user code, so it can fall under the +// HandledJitCase and can be redirected) +// - A) The OS, trying to be nice, expects we have a breakpoint exception at X+1, +// but does -1 on the address since it knows int3 will leave the eip +1. +// So the context structure it will pass to the Handler is ideally (X+1)-1 = X +// +// ** Here's the race: Since thread B redirected A, the eip is actually Y (not X+1), +// but the kernel still touches it up to Y-1. So there's a window between when we hit a +// bp and when the handler gets called that this can happen. +// This causes an unhandled BP (since the debugger doesn't recognize the bp at Y-1) +// +// So what to do: If we land at Y-1 (ie, if f_IP+1 is the addr of a Redirected Func), +// then restore the EIP back to X. This will skip the redirection. +// Fortunately, this only occurs in cases where it's ok +// to skip. The debugger will recognize the patch and handle it. + + if (Thread::IsAddrOfRedirectFunc((PVOID)(f_IP + 1))) { + _ASSERTE(pThread->GetSavedRedirectContext()); + SetIP(pContext, GetIP(pThread->GetSavedRedirectContext()) - 1); + STRESS_LOG1(LF_EH, LL_INFO100, "CPFH_AdjustContextForThreadSuspensionRace: Case 3 setting IP = %x\n", pContext->Eip); + } + + if (f_IP + 1 == GetEEFuncEntryPoint(THROW_CONTROL_FOR_THREAD_FUNCTION)) { + SetIP(pContext, GetIP(pThread->m_OSContext) - 1); + STRESS_LOG1(LF_EH, LL_INFO100, "CPFH_AdjustContextForThreadSuspensionRace: Case 4 setting IP = %x\n", pContext->Eip); + } +} +#endif // FEATURE_HIJACK + + +// We want to leave true null reference exceptions alone. But if we are +// trashing memory, we don't want the application to swallow it. The 0x100 +// below will give us false positives for debugging, if the app is accessing +// a field more than 256 bytes down an object, where the reference is null. +// +// Removed use of the IgnoreUnmanagedExceptions reg key...simply return false now. +// +static inline BOOL +CPFH_ShouldIgnoreException(EXCEPTION_RECORD *pExceptionRecord) { + LIMITED_METHOD_CONTRACT; + return FALSE; +} + +static inline void +CPFH_UpdatePerformanceCounters() { + WRAPPER_NO_CONTRACT; + COUNTER_ONLY(GetPerfCounters().m_Excep.cThrown++); +} + + +//****************************************************************************** +EXCEPTION_DISPOSITION COMPlusAfterUnwind( + EXCEPTION_RECORD *pExceptionRecord, + EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + ThrowCallbackType& tct) +{ + WRAPPER_NO_CONTRACT; + + // Note: we've completed the unwind pass up to the establisher frame, and we're headed off to finish our + // cleanup and end up back in jitted code. Any more FS0 handlers pushed from this point on out will _not_ be + // unwound. We go ahead and assert right here that indeed there are no handlers below the establisher frame + // before we go any further. + _ASSERTE(pEstablisherFrame == GetCurrentSEHRecord()); + + Thread* pThread = GetThread(); + + _ASSERTE(tct.pCurrentExceptionRecord == pEstablisherFrame); + + NestedHandlerExRecord nestedHandlerExRecord; + nestedHandlerExRecord.Init((PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler, GetCurrFrame(pEstablisherFrame)); + + // ... and now, put the nested record back on. 
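+    // For illustration only: on x86, installing an SEH record amounts to linking a
+    // stack-allocated EXCEPTION_REGISTRATION_RECORD at the head of the FS:[0] chain.
+    // A minimal sketch of that idea (not the actual expansion of the macro below):
+    //
+    //   EXCEPTION_REGISTRATION_RECORD *pNewHead = &nestedHandlerExRecord.m_ExReg;
+    //   pNewHead->Next    = (EXCEPTION_REGISTRATION_RECORD *)__readfsdword(0); // current chain head
+    //   pNewHead->Handler = (PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler;
+    //   __writefsdword(0, (DWORD)pNewHead);                                    // this record is the new head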
+ INSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + + // We entered COMPlusAfterUnwind in PREEMP, but we need to be in COOP from here on out + GCX_COOP_NO_DTOR(); + + tct.bIsUnwind = TRUE; + tct.pProfilerNotify = NULL; + + LOG((LF_EH, LL_INFO100, "COMPlusFrameHandler: unwinding\n")); + + tct.bUnwindStack = CPFH_ShouldUnwindStack(pExceptionRecord); + + LOG((LF_EH, LL_INFO1000, "COMPlusAfterUnwind: going to: pFunc:%#X, pStack:%#X\n", + tct.pFunc, tct.pStack)); + + // TODO: UnwindFrames ends up calling into StackWalkFrames which is SO_INTOLERANT + // as is UnwindFrames, etc... Should we make COMPlusAfterUnwind SO_INTOLERANT??? + ANNOTATION_VIOLATION(SOToleranceViolation); + + UnwindFrames(pThread, &tct); + +#ifdef DEBUGGING_SUPPORTED + ExInfo* pExInfo = pThread->GetExceptionState()->GetCurrentExceptionTracker(); + if (pExInfo->m_ValidInterceptionContext) + { + // By now we should have all unknown FS:[0] handlers unwinded along with the managed Frames until + // the interception point. We can now pop nested exception handlers and resume at interception context. + EHContext context = pExInfo->m_InterceptionContext; + pExInfo->m_InterceptionContext.Init(); + pExInfo->m_ValidInterceptionContext = FALSE; + + UnwindExceptionTrackerAndResumeInInterceptionFrame(pExInfo, &context); + } +#endif // DEBUGGING_SUPPORTED + + _ASSERTE(!"Should not get here"); + return ExceptionContinueSearch; +} // EXCEPTION_DISPOSITION COMPlusAfterUnwind() + +#ifdef DEBUGGING_SUPPORTED + +//--------------------------------------------------------------------------------------- +// +// This function is called to intercept an exception and start an unwind. +// +// Arguments: +// pCurrentEstablisherFrame - the exception registration record covering the stack range +// containing the interception point +// pExceptionRecord - EXCEPTION_RECORD of the exception being intercepted +// +// Return Value: +// ExceptionContinueSearch if the exception cannot be intercepted +// +// Notes: +// If the exception is intercepted, this function never returns. +// + +EXCEPTION_DISPOSITION ClrDebuggerDoUnwindAndIntercept(EXCEPTION_REGISTRATION_RECORD *pCurrentEstablisherFrame, + EXCEPTION_RECORD *pExceptionRecord) +{ + WRAPPER_NO_CONTRACT; + + if (!CheckThreadExceptionStateForInterception()) + { + return ExceptionContinueSearch; + } + + Thread* pThread = GetThread(); + ThreadExceptionState* pExState = pThread->GetExceptionState(); + + EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame; + ThrowCallbackType tct; + tct.Init(); + + pExState->GetDebuggerState()->GetDebuggerInterceptInfo(&pEstablisherFrame, + &(tct.pFunc), + &(tct.dHandler), + &(tct.pStack), + NULL, + &(tct.pBottomFrame) + ); + + // + // If the handler that we've selected as the handler for the target frame of the unwind is in fact above the + // handler that we're currently executing in, then use the current handler instead. Why? Our handlers for + // nested exceptions actually process managed frames that live above them, up to the COMPlusFrameHanlder that + // pushed the nested handler. If the user selectes a frame above the nested handler, then we will have selected + // the COMPlusFrameHandler above the current nested handler. But we don't want to ask RtlUnwind to unwind past + // the nested handler that we're currently executing in. + // + if (pEstablisherFrame > pCurrentEstablisherFrame) + { + // This should only happen if we're in a COMPlusNestedExceptionHandler. 
+ _ASSERTE(IsComPlusNestedExceptionRecord(pCurrentEstablisherFrame)); + + pEstablisherFrame = pCurrentEstablisherFrame; + } + +#ifdef _DEBUG + tct.pCurrentExceptionRecord = pEstablisherFrame; +#endif + + LOG((LF_EH|LF_CORDB, LL_INFO100, "ClrDebuggerDoUnwindAndIntercept: Intercepting at %s\n", tct.pFunc->m_pszDebugMethodName)); + LOG((LF_EH|LF_CORDB, LL_INFO100, "\t\t: pFunc is 0x%X\n", tct.pFunc)); + LOG((LF_EH|LF_CORDB, LL_INFO100, "\t\t: pStack is 0x%X\n", tct.pStack)); + + CallRtlUnwindSafe(pEstablisherFrame, RtlUnwindCallback, pExceptionRecord, 0); + + ExInfo* pExInfo = pThread->GetExceptionState()->GetCurrentExceptionTracker(); + if (pExInfo->m_ValidInterceptionContext) + { + // By now we should have all unknown FS:[0] handlers unwinded along with the managed Frames until + // the interception point. We can now pop nested exception handlers and resume at interception context. + GCX_COOP(); + EHContext context = pExInfo->m_InterceptionContext; + pExInfo->m_InterceptionContext.Init(); + pExInfo->m_ValidInterceptionContext = FALSE; + + UnwindExceptionTrackerAndResumeInInterceptionFrame(pExInfo, &context); + } + + // on x86 at least, RtlUnwind always returns + + // Note: we've completed the unwind pass up to the establisher frame, and we're headed off to finish our + // cleanup and end up back in jitted code. Any more FS0 handlers pushed from this point on out will _not_ be + // unwound. + return COMPlusAfterUnwind(pExState->GetExceptionRecord(), pEstablisherFrame, tct); +} // EXCEPTION_DISPOSITION ClrDebuggerDoUnwindAndIntercept() + +#endif // DEBUGGING_SUPPORTED + +// This is a wrapper around the assembly routine that invokes RtlUnwind in the OS. +// When we invoke RtlUnwind, the OS will modify the ExceptionFlags field in the +// exception record to reflect unwind. Since we call RtlUnwind in the first pass +// with a valid exception record when we find an exception handler AND because RtlUnwind +// returns on x86, the OS would have flagged the exception record for unwind. +// +// Incase the exception is rethrown from the catch/filter-handler AND it's a non-COMPLUS +// exception, the runtime will use the reference to the saved exception record to reraise +// the exception, as part of rethrow fixup. Since the OS would have modified the exception record +// to reflect unwind, this wrapper will "reset" the ExceptionFlags field when RtlUnwind returns. +// Otherwise, the rethrow will result in second pass, as opposed to first, since the ExceptionFlags +// would indicate an unwind. +// +// This rethrow issue does not affect COMPLUS exceptions since we always create a brand new exception +// record for them in RaiseTheExceptionInternalOnly. +BOOL CallRtlUnwindSafe(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + void *callback, + EXCEPTION_RECORD *pExceptionRecord, + void *retval) +{ + LIMITED_METHOD_CONTRACT; + + // Save the ExceptionFlags value before invoking RtlUnwind. + DWORD dwExceptionFlags = pExceptionRecord->ExceptionFlags; + + BOOL fRetVal = CallRtlUnwind(pEstablisherFrame, callback, pExceptionRecord, retval); + + // Reset ExceptionFlags field, if applicable + if (pExceptionRecord->ExceptionFlags != dwExceptionFlags) + { + // We would expect the 32bit OS to have set the unwind flag at this point. 
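+        // (EXCEPTION_UNWINDING is the flag that RtlUnwind ORs into ExceptionFlags while it
+        // drives the second, unwind, pass over the FS:0 chain.)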
+ _ASSERTE(pExceptionRecord->ExceptionFlags & EXCEPTION_UNWINDING); + LOG((LF_EH, LL_INFO100, "CallRtlUnwindSafe: Resetting ExceptionFlags from %lu to %lu\n", pExceptionRecord->ExceptionFlags, dwExceptionFlags)); + pExceptionRecord->ExceptionFlags = dwExceptionFlags; + } + + return fRetVal; +} + +//****************************************************************************** +// The essence of the first pass handler (after we've decided to actually do +// the first pass handling). +//****************************************************************************** +inline EXCEPTION_DISPOSITION __cdecl +CPFH_RealFirstPassHandler( // ExceptionContinueSearch, etc. + EXCEPTION_RECORD *pExceptionRecord, // The exception record, with exception type. + EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, // Exception frame on whose behalf this is called. + CONTEXT *pContext, // Context from the exception. + void *pDispatcherContext, // @todo + BOOL bAsynchronousThreadStop, // @todo + BOOL fPGCDisabledOnEntry) // @todo +{ + // We don't want to use a runtime contract here since this codepath is used during + // the processing of a hard SO. Contracts use a significant amount of stack + // which we can't afford for those cases. + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + STATIC_CONTRACT_SO_TOLERANT; + +#ifdef _DEBUG + static int breakOnFirstPass = -1; + + if (breakOnFirstPass == -1) + breakOnFirstPass = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_BreakOnFirstPass); + + if (breakOnFirstPass != 0) + { + _ASSERTE(!"First pass exception handler"); + } +#endif + + EXCEPTION_DISPOSITION retval; + DWORD exceptionCode = pExceptionRecord->ExceptionCode; + Thread *pThread = GetThread(); + +#ifdef _DEBUG + static int breakOnSO = -1; + + if (breakOnSO == -1) + breakOnSO = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_BreakOnSO); + + if (breakOnSO != 0 && exceptionCode == STATUS_STACK_OVERFLOW) + { + DebugBreak(); // ASSERTing will overwrite the guard region + } +#endif + + // We always want to be in co-operative mode when we run this function and whenever we return + // from it, want to go to pre-emptive mode because are returning to OS. + _ASSERTE(pThread->PreemptiveGCDisabled()); + + BOOL bPopNestedHandlerExRecord = FALSE; + LFH found = LFH_NOT_FOUND; // Result of calling LookForHandler. + BOOL bRethrownException = FALSE; + BOOL bNestedException = FALSE; + +#if defined(USE_FEF) + BOOL bPopFaultingExceptionFrame = FALSE; + FrameWithCookie faultingExceptionFrame; +#endif // USE_FEF + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + + ThrowCallbackType tct; + tct.Init(); + + tct.pTopFrame = GetCurrFrame(pEstablisherFrame); // highest frame to search to + +#ifdef _DEBUG + tct.pCurrentExceptionRecord = pEstablisherFrame; + tct.pPrevExceptionRecord = GetPrevSEHRecord(pEstablisherFrame); +#endif // _DEBUG + + BOOL fIsManagedCode = pContext ? ExecutionManager::IsManagedCode(GetIP(pContext)) : FALSE; + + + // this establishes a marker so can determine if are processing a nested exception + // don't want to use the current frame to limit search as it could have been unwound by + // the time get to nested handler (ie if find an exception, unwind to the call point and + // then resume in the catch and then get another exception) so make the nested handler + // have the same boundary as this one. 
If nested handler can't find a handler, we won't + // end up searching this frame list twice because the nested handler will set the search + // boundary in the thread and so if get back to this handler it will have a range that starts + // and ends at the same place. + + NestedHandlerExRecord nestedHandlerExRecord; + nestedHandlerExRecord.Init((PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler, GetCurrFrame(pEstablisherFrame)); + + INSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + bPopNestedHandlerExRecord = TRUE; + +#if defined(USE_FEF) + // Note: don't attempt to push a FEF for an exception in managed code if we weren't in cooperative mode when + // the exception was received. If preemptive GC was enabled when we received the exception, then it means the + // exception was rethrown from unmangaed code (including EE impl), and we shouldn't push a FEF. + if (fIsManagedCode && + fPGCDisabledOnEntry && + (pThread->m_pFrame == FRAME_TOP || + pThread->m_pFrame->GetVTablePtr() != FaultingExceptionFrame::GetMethodFrameVPtr() || + (size_t)pThread->m_pFrame > (size_t)pEstablisherFrame)) + { + // setup interrupted frame so that GC during calls to init won't collect the frames + // only need it for non COM+ exceptions in managed code when haven't already + // got one on the stack (will have one already if we have called rtlunwind because + // the instantiation that called unwind would have installed one) + faultingExceptionFrame.InitAndLink(pContext); + bPopFaultingExceptionFrame = TRUE; + } +#endif // USE_FEF + + OBJECTREF e; + e = pThread->LastThrownObject(); + + STRESS_LOG7(LF_EH, LL_INFO10, "CPFH_RealFirstPassHandler: code:%X, LastThrownObject:%p, MT:%pT" + ", IP:%p, SP:%p, pContext:%p, pEstablisherFrame:%p\n", + exceptionCode, OBJECTREFToObject(e), (e!=0)?e->GetMethodTable():0, + pContext ? GetIP(pContext) : 0, pContext ? GetSP(pContext) : 0, + pContext, pEstablisherFrame); + +#ifdef LOGGING + // If it is a complus exception, and there is a thrown object, get its name, for better logging. + if (IsComPlusException(pExceptionRecord)) + { + const char * eClsName = "!EXCEPTION_COMPLUS"; + if (e != 0) + { + eClsName = e->GetTrueMethodTable()->GetDebugClassName(); + } + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: exception: 0x%08X, class: '%s', IP: 0x%p\n", + exceptionCode, eClsName, pContext ? GetIP(pContext) : NULL)); + } +#endif + + EXCEPTION_POINTERS exceptionPointers = {pExceptionRecord, pContext}; + + STRESS_LOG4(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: setting boundaries: Exinfo: 0x%p, BottomMostHandler:0x%p, SearchBoundary:0x%p, TopFrame:0x%p\n", + pExInfo, pExInfo->m_pBottomMostHandler, pExInfo->m_pSearchBoundary, tct.pTopFrame); + + // Here we are trying to decide if we are coming in as: + // 1) first handler in a brand new exception + // 2) a subsequent handler in an exception + // 3) a nested exception + // m_pBottomMostHandler is the registration structure (establisher frame) for the most recent (ie lowest in + // memory) non-nested handler that was installed and pEstablisher frame is what the current handler + // was registered with. + // The OS calls each registered handler in the chain, passing its establisher frame to it. + if (pExInfo->m_pBottomMostHandler != NULL && pEstablisherFrame > pExInfo->m_pBottomMostHandler) + { + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: detected subsequent handler. 
ExInfo:0x%p, BottomMost:0x%p SearchBoundary:0x%p\n", + pExInfo, pExInfo->m_pBottomMostHandler, pExInfo->m_pSearchBoundary); + + // If the establisher frame of this handler is greater than the bottommost then it must have been + // installed earlier and therefore we are case 2 + if (pThread->GetThrowable() == NULL) + { + // Bottommost didn't setup a throwable, so not exception not for us + retval = ExceptionContinueSearch; + goto exit; + } + + // setup search start point + tct.pBottomFrame = pExInfo->m_pSearchBoundary; + + if (tct.pTopFrame == tct.pBottomFrame) + { + // this will happen if our nested handler already searched for us so we don't want + // to search again + retval = ExceptionContinueSearch; + goto exit; + } + } + else + { // we are either case 1 or case 3 +#if defined(_DEBUG_IMPL) + //@todo: merge frames, context, handlers + if (pThread->GetFrame() != FRAME_TOP) + pThread->GetFrame()->LogFrameChain(LF_EH, LL_INFO1000); +#endif // _DEBUG_IMPL + + // If the exception was rethrown, we'll create a new ExInfo, which will represent the rethrown exception. + // The original exception is not the rethrown one. + if (pExInfo->m_ExceptionFlags.IsRethrown() && pThread->LastThrownObject() != NULL) + { + pExInfo->m_ExceptionFlags.ResetIsRethrown(); + bRethrownException = TRUE; + +#if defined(USE_FEF) + if (bPopFaultingExceptionFrame) + { + // if we added a FEF, it will refer to the frame at the point of the original exception which is + // already unwound so don't want it. + // If we rethrew the exception we have already added a helper frame for the rethrow, so don't + // need this one. If we didn't rethrow it, (ie rethrow from native) then there the topmost frame will + // be a transition to native frame in which case we don't need it either + faultingExceptionFrame.Pop(); + bPopFaultingExceptionFrame = FALSE; + } +#endif + } + + // If the establisher frame is less than the bottommost handler, then this is nested because the + // establisher frame was installed after the bottommost. + if (pEstablisherFrame < pExInfo->m_pBottomMostHandler + /* || IsComPlusNestedExceptionRecord(pEstablisherFrame) */ ) + { + bNestedException = TRUE; + + // case 3: this is a nested exception. Need to save and restore the thread info + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: ExInfo:0x%p detected nested exception 0x%p < 0x%p\n", + pExInfo, pEstablisherFrame, pExInfo->m_pBottomMostHandler); + + EXCEPTION_REGISTRATION_RECORD* pNestedER = TryFindNestedEstablisherFrame(pEstablisherFrame); + ExInfo *pNestedExInfo; + + if (!pNestedER || pNestedER >= pExInfo->m_pBottomMostHandler ) + { + // RARE CASE. We've re-entered the EE from an unmanaged filter. + // + // OR + // + // We can be here if we dont find a nested exception handler. This is exemplified using + // call chain of scenario 2 explained further below. + // + // Assuming __try of NativeB throws an exception E1 and it gets caught in ManagedA2, then + // bottom-most handler (BMH) is going to be CPFH_A. The catch will trigger an unwind + // and invoke __finally in NativeB. Let the __finally throw a new exception E2. + // + // Assuming ManagedB2 has a catch block to catch E2, when we enter CPFH_B looking for a + // handler for E2, our establisher frame will be that of CPFH_B, which will be lower + // in stack than current BMH (which is CPFH_A). Thus, we will come here, determining + // E2 to be nested exception correctly but not find a nested exception handler. 
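+                // For illustration only, the NativeB __try/__finally described just above
+                // might look roughly like this (the exception codes are hypothetical placeholders):
+                //
+                //   void NativeB()
+                //   {
+                //       __try
+                //       {
+                //           RaiseException(E1_CODE, 0, 0, NULL);  // E1: caught far up the stack, in ManagedA2
+                //       }
+                //       __finally
+                //       {
+                //           RaiseException(E2_CODE, 0, 0, NULL);  // E2: raised during E1's unwind, caught in ManagedB2
+                //       }
+                //   }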
+ void *limit = (void *) GetPrevSEHRecord(pExInfo->m_pBottomMostHandler); + + pNestedExInfo = new (nothrow) ExInfo(); // Very rare failure here; need robust allocator. + if (pNestedExInfo == NULL) + { // if we can't allocate memory, we can't correctly continue. + #if defined(_DEBUG) + if (CLRConfig::GetConfigValue(CLRConfig::INTERNAL_NestedEhOom)) + _ASSERTE(!"OOM in callback from unmanaged filter."); + #endif // _DEBUG + + EEPOLICY_HANDLE_FATAL_ERROR(COR_E_OUTOFMEMORY); + } + + + pNestedExInfo->m_StackAddress = limit; // Note: this is also the flag that tells us this + // ExInfo was stack allocated. + } + else + { + pNestedExInfo = &((NestedHandlerExRecord*)pNestedER)->m_handlerInfo; + } + + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: PushExInfo() current: 0x%p previous: 0x%p\n", + pExInfo->m_StackAddress, pNestedExInfo->m_StackAddress)); + + _ASSERTE(pNestedExInfo); + pNestedExInfo->m_hThrowable = NULL; // pNestedExInfo may be stack allocated, and as such full of + // garbage. m_hThrowable must be sane, so set it to NULL. (We could + // zero the entire record, but this is cheaper.) + + pNestedExInfo->CopyAndClearSource(pExInfo); + + pExInfo->m_pPrevNestedInfo = pNestedExInfo; // Save at head of nested info chain + +#if 0 +/* the following code was introduced in Whidbey as part of the Faulting Exception Frame removal (12/03). + However it isn't correct. If any nested exceptions occur while processing a rethrow, we would + incorrectly consider the nested exception to be a rethrow. See VSWhidbey 349379 for an example. + + Therefore I am disabling this code until we see a failure that explains why it was added in the first + place. cwb 9/04. +*/ + // If we're here as a result of a rethrown exception, set the rethrown flag on the new ExInfo. + if (bRethrownException) + { + pExInfo->m_ExceptionFlags.SetIsRethrown(); + } +#endif + } + else + { + // At this point, either: + // + // 1) the bottom-most handler is NULL, implying this is a new exception for which we are getting ready, OR + // 2) the bottom-most handler is not-NULL, implying that a there is already an existing exception in progress. + // + // Scenario 1 is that of a new throw and is easy to understand. Scenario 2 is the interesting one. + // + // ManagedA1 -> ManagedA2 -> ManagedA3 -> NativeCodeA -> ManagedB1 -> ManagedB2 -> ManagedB3 -> NativeCodeB + // + // On x86, each block of managed code is protected by one COMPlusFrameHandler [CPFH] (CLR's exception handler + // for managed code), unlike 64bit where each frame has a personality routine attached to it. Thus, + // for the example above, assume CPFH_A protects ManagedA* blocks and is setup just before the call to + // ManagedA1. Likewise, CPFH_B protects ManagedB* blocks and is setup just before the call to ManagedB1. + // + // When ManagedB3 throws an exception, CPFH_B is invoked to look for a handler in all of the ManagedB* blocks. + // At this point, it is setup as the "bottom-most-handler" (BMH). If no handler is found and exception reaches + // ManagedA* blocks, CPFH_A is invoked to look for a handler and thus, becomes BMH. + // + // Thus, in the first pass on x86 for a given exception, a particular CPFH will be invoked only once when looking + // for a handler and thus, registered as BMH only once. Either the exception goes unhandled and the process will + // terminate or a handler will be found and second pass will commence. + // + // However, assume NativeCodeB had a __try/__finally and raised an exception [E1] within the __try. 
Let's assume + // it gets caught in ManagedB1 and thus, unwind is triggered. At this point, the active exception tracker + // has context about the exception thrown out of __try and CPFH_B is registered as BMH. + // + // If the __finally throws a new exception [E2], CPFH_B will be invoked again for first pass while looking for + // a handler for the thrown exception. Since BMH is already non-NULL, we will come here since EstablisherFrame will be + // the same as BMH (because EstablisherFrame will be that of CPFH_B). We will proceed to overwrite the "required" parts + // of the existing exception tracker with the details of E2 (see setting of exception record and context below), erasing + // any artifact of E1. + // + // This is unlike Scenario 1 when exception tracker is completely initialized to default values. This is also + // unlike 64bit which will detect that E1 and E2 are different exceptions and hence, will setup a new tracker + // to track E2, effectively behaving like Scenario 1 above. X86 cannot do this since there is no nested exception + // tracker setup that gets to see the new exception. + // + // Thus, if E1 was a CSE and E2 isn't, we will come here and treat E2 as a CSE as well since corruption severity + // is initialized as part of exception tracker initialization. Thus, E2 will start to be treated as CSE, which is + // incorrect. Similar argument applies to delivery of First chance exception notification delivery. + // + // Another example why we should unify EH systems :) + // + // To address this issue, we will need to reset exception tracker here, just like the overwriting of "required" + // parts of exception tracker. + + // If the current establisher frame is the same as the bottom-most-handler and we are here + // in the first pass, assert that current exception and the one tracked by active exception tracker + // are indeed different exceptions. In such a case, we must reset the exception tracker so that it can be + // setup correctly further down when CEHelper::SetupCorruptionSeverityForActiveException is invoked. + + if ((pExInfo->m_pBottomMostHandler != NULL) && + (pEstablisherFrame == pExInfo->m_pBottomMostHandler)) + { + // Current exception should be different from the one exception tracker is already tracking. + _ASSERTE(pExceptionRecord != pExInfo->m_pExceptionRecord); + + // This cannot be nested exceptions - they are handled earlier (see above). + _ASSERTE(!bNestedException); + + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: Bottom-most handler (0x%p) is the same as EstablisherFrame.\n", + pExInfo->m_pBottomMostHandler)); + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: Exception record in exception tracker is 0x%p, while that of new exception is 0x%p.\n", + pExInfo->m_pExceptionRecord, pExceptionRecord)); + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: Resetting exception tracker (0x%p).\n", pExInfo)); + + // This will reset the exception tracker state, including the corruption severity. + pExInfo->Init(); + } + } + + // If we are handling a fault from managed code, we need to set the Thread->ExInfo->pContext to + // the current fault context, which is used in the stack walk to get back into the managed + // stack with the correct registers. (Previously, this was done by linking in a FaultingExceptionFrame + // record.) + // We are about to create the managed exception object, which may trigger a GC, so set this up now. 
+ + pExInfo->m_pExceptionRecord = pExceptionRecord; + pExInfo->m_pContext = pContext; + if (pContext && ShouldHandleManagedFault(pExceptionRecord, pContext, pEstablisherFrame, pThread)) + { // If this was a fault in managed code, rather than create a Frame for stackwalking, + // we can use this exinfo (after all, it has all the register info.) + pExInfo->m_ExceptionFlags.SetUseExInfoForStackwalk(); + } + + // It should now be safe for a GC to happen. + + // case 1 & 3: this is the first time through of a new, nested, or rethrown exception, so see if we can + // find a handler. Only setup throwable if are bottommost handler + if (IsComPlusException(pExceptionRecord) && (!bAsynchronousThreadStop)) + { + + // Update the throwable from the last thrown object. Note: this may cause OOM, in which case we replace + // both throwables with the preallocated OOM exception. + pThread->SafeSetThrowables(pThread->LastThrownObject()); + + // now we've got a COM+ exception, fall through to so see if we handle it + + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: fall through ExInfo:0x%p setting m_pBottomMostHandler to 0x%p from 0x%p\n", + pExInfo, pEstablisherFrame, pExInfo->m_pBottomMostHandler); + pExInfo->m_pBottomMostHandler = pEstablisherFrame; + } + else if (bRethrownException) + { + // If it was rethrown and not COM+, will still be the last one thrown. Either we threw it last and + // stashed it here or someone else caught it and rethrew it, in which case it will still have been + // originally stashed here. + + // Update the throwable from the last thrown object. Note: this may cause OOM, in which case we replace + // both throwables with the preallocated OOM exception. + pThread->SafeSetThrowables(pThread->LastThrownObject()); + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: rethrow non-COM+ ExInfo:0x%p setting m_pBottomMostHandler to 0x%p from 0x%p\n", + pExInfo, pEstablisherFrame, pExInfo->m_pBottomMostHandler); + pExInfo->m_pBottomMostHandler = pEstablisherFrame; + } + else + { + if (!fIsManagedCode) + { + tct.bDontCatch = false; + } + + if (exceptionCode == STATUS_BREAKPOINT) + { + // don't catch int 3 + retval = ExceptionContinueSearch; + goto exit; + } + + // We need to set m_pBottomMostHandler here, Thread::IsExceptionInProgress returns 1. + // This is a necessary part of suppressing thread abort exceptions in the constructor + // of any exception object we might create. + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_RealFirstPassHandler: setting ExInfo:0x%p m_pBottomMostHandler for IsExceptionInProgress to 0x%p from 0x%p\n", + pExInfo, pEstablisherFrame, pExInfo->m_pBottomMostHandler); + pExInfo->m_pBottomMostHandler = pEstablisherFrame; + + // Create the managed exception object. + OBJECTREF throwable = CreateCOMPlusExceptionObject(pThread, pExceptionRecord, bAsynchronousThreadStop); + + // Set the throwables on the thread to the newly created object. If this fails, it will return a + // preallocated exception object instead. This also updates the last thrown exception, for rethrows. + throwable = pThread->SafeSetThrowables(throwable); + + // Set the exception code and pointers. We set these after setting the throwables on the thread, + // because if the proper exception is replaced by an OOM exception, we still want the exception code + // and pointers set in the OOM exception. 
+ EXCEPTIONREF exceptionRef = (EXCEPTIONREF)throwable; + exceptionRef->SetXCode(pExceptionRecord->ExceptionCode); + exceptionRef->SetXPtrs(&exceptionPointers); + } + + tct.pBottomFrame = NULL; + + EEToProfilerExceptionInterfaceWrapper::ExceptionThrown(pThread); + + CPFH_UpdatePerformanceCounters(); + } // End of case-1-or-3 + + { + // Allocate storage for the stack trace. + OBJECTREF throwable = NULL; + GCPROTECT_BEGIN(throwable); + throwable = pThread->GetThrowable(); + +#ifdef FEATURE_CORRUPTING_EXCEPTIONS + { + BEGIN_SO_INTOLERANT_CODE(GetThread()); + // Setup the state in current exception tracker indicating the corruption severity + // of the active exception. + CEHelper::SetupCorruptionSeverityForActiveException(bRethrownException, bNestedException, + CEHelper::ShouldTreatActiveExceptionAsNonCorrupting()); + END_SO_INTOLERANT_CODE; + } +#endif // FEATURE_CORRUPTING_EXCEPTIONS + +#ifdef FEATURE_CORECLR + // Check if we are dealing with AV or not and if we are, + // ensure that this is a real AV and not managed AV exception + BOOL fIsThrownExceptionAV = FALSE; + if ((pExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION) && + (MscorlibBinder::GetException(kAccessViolationException) == throwable->GetMethodTable())) + { + // Its an AV - set the flag + fIsThrownExceptionAV = TRUE; + } + + // Did we get an AV? + if (fIsThrownExceptionAV == TRUE) + { + // Get the escalation policy action for handling AV + EPolicyAction actionAV = GetEEPolicy()->GetActionOnFailure(FAIL_AccessViolation); + + // Valid actions are: eNoAction (default behviour) or eRudeExitProcess + _ASSERTE(((actionAV == eNoAction) || (actionAV == eRudeExitProcess))); + if (actionAV == eRudeExitProcess) + { + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: AccessViolation handler found and doing RudeExitProcess due to escalation policy (eRudeExitProcess)\n")); + + // EEPolicy::HandleFatalError will help us RudeExit the process. + // RudeExitProcess due to AV is to prevent a security risk - we are ripping + // at the boundary, without looking for the handlers. + EEPOLICY_HANDLE_FATAL_ERROR(COR_E_SECURITY); + } + } +#endif // FEATURE_CORECLR + + // If we're out of memory, then we figure there's probably not memory to maintain a stack trace, so we skip it. + // If we've got a stack overflow, then we figure the stack will be so huge as to make tracking the stack trace + // impracticle, so we skip it. + if ((throwable == CLRException::GetPreallocatedOutOfMemoryException()) || + (throwable == CLRException::GetPreallocatedStackOverflowException())) + { + tct.bAllowAllocMem = FALSE; + } + else + { + pExInfo->m_StackTraceInfo.AllocateStackTrace(); + } + + GCPROTECT_END(); + } + + // Set up information for GetExceptionPointers()/GetExceptionCode() callback. + pExInfo->SetExceptionCode(pExceptionRecord); + + pExInfo->m_pExceptionPointers = &exceptionPointers; + + if (bRethrownException || bNestedException) + { + _ASSERTE(pExInfo->m_pPrevNestedInfo != NULL); + + BEGIN_SO_INTOLERANT_CODE(GetThread()); + SetStateForWatsonBucketing(bRethrownException, pExInfo->GetPreviousExceptionTracker()->GetThrowableAsHandle()); + END_SO_INTOLERANT_CODE; + } + +#ifdef DEBUGGING_SUPPORTED + // + // At this point the exception is still fresh to us, so assert that + // there should be nothing from the debugger on it. 
+ // + _ASSERTE(!pExInfo->m_ExceptionFlags.DebuggerInterceptInfo()); +#endif + + if (pThread->IsRudeAbort()) + { + OBJECTREF rudeAbortThrowable = CLRException::GetPreallocatedRudeThreadAbortException(); + + if (pThread->GetThrowable() != rudeAbortThrowable) + { + // Neither of these sets will throw because the throwable that we're setting is a preallocated + // exception. This also updates the last thrown exception, for rethrows. + pThread->SafeSetThrowables(rudeAbortThrowable); + } + + if (!pThread->IsRudeAbortInitiated()) + { + pThread->PreWorkForThreadAbort(); + } + } + + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: looking for handler bottom %x, top %x\n", + tct.pBottomFrame, tct.pTopFrame)); + tct.bReplaceStack = pExInfo->m_pBottomMostHandler == pEstablisherFrame && !bRethrownException; + tct.bSkipLastElement = bRethrownException && bNestedException; + found = LookForHandler(&exceptionPointers, + pThread, + &tct); + + // We have searched this far. + pExInfo->m_pSearchBoundary = tct.pTopFrame; + LOG((LF_EH, LL_INFO1000, "CPFH_RealFirstPassHandler: set pSearchBoundary to 0x%p\n", pExInfo->m_pSearchBoundary)); + + if ((found == LFH_NOT_FOUND) +#ifdef DEBUGGING_SUPPORTED + && !pExInfo->m_ExceptionFlags.DebuggerInterceptInfo() +#endif + ) + { + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: NOT_FOUND\n")); + + if (tct.pTopFrame == FRAME_TOP) + { + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: NOT_FOUND at FRAME_TOP\n")); + } + + retval = ExceptionContinueSearch; + goto exit; + } + else + { + // so we are going to handle the exception + + // Remove the nested exception record -- before calling RtlUnwind. + // The second-pass callback for a NestedExceptionRecord assumes that if it's + // being unwound, it should pop one exception from the pExInfo chain. This is + // true for any older NestedRecords that might be unwound -- but not for the + // new one we're about to add. To avoid this, we remove the new record + // before calling Unwind. + // + // @NICE: This can probably be a little cleaner -- the nested record currently + // is also used to guard the running of the filter code. When we clean up the + // behaviour of exceptions within filters, we should be able to get rid of this + // PUSH/POP/PUSH behaviour. + _ASSERTE(bPopNestedHandlerExRecord); + + UNINSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + + // Since we are going to handle the exception we switch into preemptive mode + GCX_PREEMP_NO_DTOR(); + +#ifdef DEBUGGING_SUPPORTED + // + // Check if the debugger wants to intercept this frame at a different point than where we are. + // + if (pExInfo->m_ExceptionFlags.DebuggerInterceptInfo()) + { + ClrDebuggerDoUnwindAndIntercept(pEstablisherFrame, pExceptionRecord); + + // + // If this returns, then the debugger couldn't do it's stuff and we default to the found handler. + // + if (found == LFH_NOT_FOUND) + { + retval = ExceptionContinueSearch; + // we need to be sure to switch back into Cooperative mode since we are going to + // jump to the exit: label and follow the normal return path (it is expected that + // CPFH_RealFirstPassHandler returns in COOP. 
+ GCX_PREEMP_NO_DTOR_END(); + goto exit; + } + } +#endif + + LOG((LF_EH, LL_INFO100, "CPFH_RealFirstPassHandler: handler found: %s\n", tct.pFunc->m_pszDebugMethodName)); + + CallRtlUnwindSafe(pEstablisherFrame, RtlUnwindCallback, pExceptionRecord, 0); + // on x86 at least, RtlUnwind always returns + + // Note: we've completed the unwind pass up to the establisher frame, and we're headed off to finish our + // cleanup and end up back in jitted code. Any more FS0 handlers pushed from this point on out will _not_ be + // unwound. + // Note: we are still in Preemptive mode here and that is correct, COMPlusAfterUnwind will switch us back + // into Cooperative mode. + return COMPlusAfterUnwind(pExceptionRecord, pEstablisherFrame, tct); + } + +exit: + { + // We need to be in COOP if we get here + GCX_ASSERT_COOP(); + } + + // If we got as far as saving pExInfo, save the context pointer so it's available for the unwind. + if (pExInfo) + { + pExInfo->m_pContext = pContext; + // pExInfo->m_pExceptionPointers points to a local structure, which is now going out of scope. + pExInfo->m_pExceptionPointers = NULL; + } + +#if defined(USE_FEF) + if (bPopFaultingExceptionFrame) + { + faultingExceptionFrame.Pop(); + } +#endif // USE_FEF + + if (bPopNestedHandlerExRecord) + { + UNINSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + } + return retval; +} // CPFH_RealFirstPassHandler() + + +//****************************************************************************** +// +void InitializeExceptionHandling() +{ + WRAPPER_NO_CONTRACT; + + InitSavedExceptionInfo(); + + CLRAddVectoredHandlers(); + + // Initialize the lock used for synchronizing access to the stacktrace in the exception object + g_StackTraceArrayLock.Init(LOCK_TYPE_DEFAULT, TRUE); +} + +//****************************************************************************** +static inline EXCEPTION_DISPOSITION __cdecl +CPFH_FirstPassHandler(EXCEPTION_RECORD *pExceptionRecord, + EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + CONTEXT *pContext, + DISPATCHER_CONTEXT *pDispatcherContext) +{ + WRAPPER_NO_CONTRACT; + EXCEPTION_DISPOSITION retval; + + _ASSERTE (!(pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND))); + + DWORD exceptionCode = pExceptionRecord->ExceptionCode; + + Thread *pThread = GetThread(); + + STRESS_LOG4(LF_EH, LL_INFO100, + "CPFH_FirstPassHandler: pEstablisherFrame = %x EH code = %x EIP = %x with ESP = %x\n", + pEstablisherFrame, exceptionCode, pContext ? GetIP(pContext) : 0, pContext ? GetSP(pContext) : 0); + + EXCEPTION_POINTERS ptrs = { pExceptionRecord, pContext }; + + // Call to the vectored handler to give other parts of the Runtime a chance to jump in and take over an + // exception before we do too much with it. The most important point in the vectored handler is not to toggle + // the GC mode. + DWORD filter = CLRVectoredExceptionHandler(&ptrs); + + if (filter == (DWORD) EXCEPTION_CONTINUE_EXECUTION) + { + return ExceptionContinueExecution; + } + else if (filter == EXCEPTION_CONTINUE_SEARCH) + { + return ExceptionContinueSearch; + } + +#if defined(STRESS_HEAP) + // + // Check to see if this exception is due to GCStress. Since the GCStress mechanism only injects these faults + // into managed code, we only need to check for them in CPFH_FirstPassHandler. 
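+    // If it is, returning ExceptionContinueExecution below resumes the interrupted thread
+    // rather than treating the stress-induced fault as a real managed exception.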
+ // + if (IsGcMarker(exceptionCode, pContext)) + { + return ExceptionContinueExecution; + } +#endif // STRESS_HEAP + + // We always want to be in co-operative mode when we run this function and whenever we return + // from it, want to go to pre-emptive mode because are returning to OS. + BOOL disabled = pThread->PreemptiveGCDisabled(); + GCX_COOP_NO_DTOR(); + + BOOL bAsynchronousThreadStop = IsThreadHijackedForThreadStop(pThread, pExceptionRecord); + + if (bAsynchronousThreadStop) + { + // If we ever get here in preemptive mode, we're in trouble. We've + // changed the thread's IP to point at a little function that throws ... if + // the thread were to be in preemptive mode and a GC occurred, the stack + // crawl would have been all messed up (becuase we have no frame that points + // us back to the right place in managed code). + _ASSERTE(disabled); + + AdjustContextForThreadStop(pThread, pContext); + LOG((LF_EH, LL_INFO100, "CPFH_FirstPassHandler is Asynchronous Thread Stop or Abort\n")); + } + + pThread->ResetThrowControlForThread(); + + CPFH_VerifyThreadIsInValidState(pThread, exceptionCode, pEstablisherFrame); + + // If we were in cooperative mode when we came in here, then its okay to see if we should do HandleManagedFault + // and push a FaultingExceptionFrame. If we weren't in coop mode coming in here, then it means that there's no + // way the exception could really be from managed code. I might look like it was from managed code, but in + // reality its a rethrow from unmanaged code, either unmanaged user code, or unmanaged EE implementation. + if (disabled && ShouldHandleManagedFault(pExceptionRecord, pContext, pEstablisherFrame, pThread)) + { +#if defined(USE_FEF) + HandleManagedFault(pExceptionRecord, pContext, pEstablisherFrame, pThread); + retval = ExceptionContinueExecution; + goto exit; +#else // USE_FEF + // Save the context pointer in the Thread's EXInfo, so that a stack crawl can recover the + // register values from the fault. + + //@todo: I haven't yet found any case where we need to do anything here. If there are none, eliminate + // this entire if () {} block. +#endif // USE_FEF + } + + // OK. We're finally ready to start the real work. Nobody else grabbed the exception in front of us. Now we can + // get started. + retval = CPFH_RealFirstPassHandler(pExceptionRecord, + pEstablisherFrame, + pContext, + pDispatcherContext, + bAsynchronousThreadStop, + disabled); + +#if defined(USE_FEF) // This label is only used in the HandleManagedFault() case above. +exit: +#endif + if (retval != ExceptionContinueExecution || !disabled) + { + GCX_PREEMP_NO_DTOR(); + } + + STRESS_LOG1(LF_EH, LL_INFO100, "CPFH_FirstPassHandler: exiting with retval %d\n", retval); + return retval; +} // CPFH_FirstPassHandler() + +//****************************************************************************** +inline void +CPFH_UnwindFrames1(Thread* pThread, EXCEPTION_REGISTRATION_RECORD* pEstablisherFrame, DWORD exceptionCode) +{ + WRAPPER_NO_CONTRACT; + + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + + // Ready to unwind the stack... + ThrowCallbackType tct; + tct.Init(); + tct.bIsUnwind = TRUE; + tct.pTopFrame = GetCurrFrame(pEstablisherFrame); // highest frame to search to + tct.pBottomFrame = NULL; + + // Set the flag indicating if the current exception represents a longjmp. + // See comment in COMPlusUnwindCallback for details. 
+ CORRUPTING_EXCEPTIONS_ONLY(tct.m_fIsLongJump = (exceptionCode == STATUS_LONGJUMP);) + + #ifdef _DEBUG + tct.pCurrentExceptionRecord = pEstablisherFrame; + tct.pPrevExceptionRecord = GetPrevSEHRecord(pEstablisherFrame); + #endif + + #ifdef DEBUGGING_SUPPORTED + EXCEPTION_REGISTRATION_RECORD *pInterceptEstablisherFrame = NULL; + + // If the exception is intercepted, use information stored in the DebuggerExState to unwind the stack. + if (pExInfo->m_ExceptionFlags.DebuggerInterceptInfo()) + { + pExInfo->m_DebuggerExState.GetDebuggerInterceptInfo(&pInterceptEstablisherFrame, + NULL, // MethodDesc **ppFunc, + NULL, // int *pdHandler, + NULL, // BYTE **ppStack + NULL, // ULONG_PTR *pNativeOffset, + NULL // Frame **ppFrame) + ); + LOG((LF_EH, LL_INFO1000, "CPFH_UnwindFrames1: frames are Est 0x%X, Intercept 0x%X\n", + pEstablisherFrame, pInterceptEstablisherFrame)); + + // + // When we set up for the interception we store off the CPFH or CPNEH that we + // *know* will handle unwinding the destination of the intercept. + // + // However, a CPNEH with the same limiting Capital-F-rame could do the work + // and unwind us, so... + // + // If this is the exact frame handler we are supposed to search for, or + // if this frame handler services the same Capital-F-rame as the frame handler + // we are looking for (i.e. this frame handler may do the work that we would + // expect our frame handler to do), + // then + // we need to pass the interception destination during this unwind. + // + _ASSERTE(IsUnmanagedToManagedSEHHandler(pEstablisherFrame)); + + if ((pEstablisherFrame == pInterceptEstablisherFrame) || + (GetCurrFrame(pEstablisherFrame) == GetCurrFrame(pInterceptEstablisherFrame))) + { + pExInfo->m_DebuggerExState.GetDebuggerInterceptInfo(NULL, + &(tct.pFunc), + &(tct.dHandler), + &(tct.pStack), + NULL, + &(tct.pBottomFrame) + ); + + LOG((LF_EH, LL_INFO1000, "CPFH_UnwindFrames1: going to: pFunc:%#X, pStack:%#X\n", + tct.pFunc, tct.pStack)); + + } + + } + #endif + + UnwindFrames(pThread, &tct); + + LOG((LF_EH, LL_INFO1000, "CPFH_UnwindFrames1: after unwind ec:%#x, tct.pTopFrame:0x%p, pSearchBndry:0x%p\n" + " pEstFrame:0x%p, IsC+NestExRec:%d, !Nest||Active:%d\n", + exceptionCode, tct.pTopFrame, pExInfo->m_pSearchBoundary, pEstablisherFrame, + IsComPlusNestedExceptionRecord(pEstablisherFrame), + (!IsComPlusNestedExceptionRecord(pEstablisherFrame) || reinterpret_cast(pEstablisherFrame)->m_ActiveForUnwind))); + + if (tct.pTopFrame >= pExInfo->m_pSearchBoundary && + (!IsComPlusNestedExceptionRecord(pEstablisherFrame) || + reinterpret_cast(pEstablisherFrame)->m_ActiveForUnwind) ) + { + // If this is the search boundary, and we're not a nested handler, then + // this is the last time we'll see this exception. Time to unwind our + // exinfo. + STRESS_LOG0(LF_EH, LL_INFO100, "CPFH_UnwindFrames1: Exception unwind -- unmanaged catcher detected\n"); + pExInfo->UnwindExInfo((VOID*)pEstablisherFrame); + } +} // CPFH_UnwindFrames1() + +//****************************************************************************** +inline EXCEPTION_DISPOSITION __cdecl +CPFH_UnwindHandler(EXCEPTION_RECORD *pExceptionRecord, + EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, + CONTEXT *pContext, + void *pDispatcherContext) +{ + WRAPPER_NO_CONTRACT; + _ASSERTE (pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)); + + #ifdef _DEBUG + // Note: you might be inclined to write "static int breakOnSecondPass = CLRConfig::GetConfigValue(...);", but + // you can't do that here. 
That causes C++ EH to be generated under the covers for this function, and this + // function isn't allowed to have any C++ EH in it because its never going to return. + static int breakOnSecondPass; // = 0 + static BOOL breakOnSecondPassSetup; // = FALSE + if (!breakOnSecondPassSetup) + { + breakOnSecondPass = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_BreakOnSecondPass); + breakOnSecondPassSetup = TRUE; + } + if (breakOnSecondPass != 0) + { + _ASSERTE(!"Unwind handler"); + } + #endif + + DWORD exceptionCode = pExceptionRecord->ExceptionCode; + Thread *pThread = GetThread(); + + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + + STRESS_LOG4(LF_EH, LL_INFO100, "In CPFH_UnwindHandler EHCode = %x EIP = %x with ESP = %x, pEstablisherFrame = 0x%p\n", exceptionCode, + pContext ? GetIP(pContext) : 0, pContext ? GetSP(pContext) : 0, pEstablisherFrame); + + // We always want to be in co-operative mode when we run this function. Whenever we return + // from it, want to go to pre-emptive mode because are returning to OS. + + { + // needs to be in its own scope to avoid polluting the namespace, since + // we don't do a _END then we don't revert the state + GCX_COOP_NO_DTOR(); + } + + CPFH_VerifyThreadIsInValidState(pThread, exceptionCode, pEstablisherFrame); + + if (IsComPlusNestedExceptionRecord(pEstablisherFrame)) + { + NestedHandlerExRecord *pHandler = reinterpret_cast(pEstablisherFrame); + if (pHandler->m_pCurrentExInfo != NULL) + { + // See the comment at the end of COMPlusNestedExceptionHandler about nested exception. + // OS is going to skip the EstablisherFrame before our NestedHandler. + if (pHandler->m_pCurrentExInfo->m_pBottomMostHandler <= pHandler->m_pCurrentHandler) + { + // We're unwinding -- the bottom most handler is potentially off top-of-stack now. If + // it is, change it to the next COM+ frame. (This one is not good, as it's about to + // disappear.) + EXCEPTION_REGISTRATION_RECORD *pNextBottomMost = GetNextCOMPlusSEHRecord(pHandler->m_pCurrentHandler); + + STRESS_LOG3(LF_EH, LL_INFO10000, "COMPlusNestedExceptionHandler: setting ExInfo:0x%p m_pBottomMostHandler from 0x%p to 0x%p\n", + pHandler->m_pCurrentExInfo, pHandler->m_pCurrentExInfo->m_pBottomMostHandler, pNextBottomMost); + + pHandler->m_pCurrentExInfo->m_pBottomMostHandler = pNextBottomMost; + } + } + } + + // this establishes a marker so can determine if are processing a nested exception + // don't want to use the current frame to limit search as it could have been unwound by + // the time get to nested handler (ie if find an exception, unwind to the call point and + // then resume in the catch and then get another exception) so make the nested handler + // have the same boundary as this one. If nested handler can't find a handler, we won't + // end up searching this frame list twice because the nested handler will set the search + // boundary in the thread and so if get back to this handler it will have a range that starts + // and ends at the same place. + NestedHandlerExRecord nestedHandlerExRecord; + nestedHandlerExRecord.Init((PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler, GetCurrFrame(pEstablisherFrame)); + + nestedHandlerExRecord.m_ActiveForUnwind = TRUE; + nestedHandlerExRecord.m_pCurrentExInfo = pExInfo; + nestedHandlerExRecord.m_pCurrentHandler = pEstablisherFrame; + + INSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + + // Unwind the stack. The establisher frame sets the boundary. 
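+    // (CPFH_UnwindFrames1 limits the Frame chain unwind to GetCurrFrame(pEstablisherFrame),
+    // which it stores in tct.pTopFrame as the highest frame to search to.)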
+ CPFH_UnwindFrames1(pThread, pEstablisherFrame, exceptionCode); + + // We're unwinding -- the bottom most handler is potentially off top-of-stack now. If + // it is, change it to the next COM+ frame. (This one is not good, as it's about to + // disappear.) + if (pExInfo->m_pBottomMostHandler && + pExInfo->m_pBottomMostHandler <= pEstablisherFrame) + { + EXCEPTION_REGISTRATION_RECORD *pNextBottomMost = GetNextCOMPlusSEHRecord(pEstablisherFrame); + + // If there is no previous COM+ SEH handler, GetNextCOMPlusSEHRecord() will return -1. Much later, we will dereference that and AV. + _ASSERTE (pNextBottomMost != EXCEPTION_CHAIN_END); + + STRESS_LOG3(LF_EH, LL_INFO10000, "CPFH_UnwindHandler: setting ExInfo:0x%p m_pBottomMostHandler from 0x%p to 0x%p\n", + pExInfo, pExInfo->m_pBottomMostHandler, pNextBottomMost); + + pExInfo->m_pBottomMostHandler = pNextBottomMost; + } + + { + // needs to be in its own scope to avoid polluting the namespace, since + // we don't do a _END then we don't revert the state + GCX_PREEMP_NO_DTOR(); + } + UNINSTALL_EXCEPTION_HANDLING_RECORD(&(nestedHandlerExRecord.m_ExReg)); + + // If we are here, then exception was not caught in managed code protected by this + // ComplusFrameHandler. Hence, reset thread abort state if this is the last personality routine, + // for managed code, on the stack. + ResetThreadAbortState(pThread, pEstablisherFrame); + + STRESS_LOG0(LF_EH, LL_INFO100, "CPFH_UnwindHandler: Leaving with ExceptionContinueSearch\n"); + return ExceptionContinueSearch; +} // CPFH_UnwindHandler() + +//****************************************************************************** +// This is the first handler that is called in the context of managed code +// It is the first level of defense and tries to find a handler in the user +// code to handle the exception +//------------------------------------------------------------------------- +// EXCEPTION_DISPOSITION __cdecl COMPlusFrameHandler( +// EXCEPTION_RECORD *pExceptionRecord, +// _EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame, +// CONTEXT *pContext, +// DISPATCHER_CONTEXT *pDispatcherContext) +// +// See http://www.microsoft.com/msj/0197/exception/exception.aspx for a background piece on Windows +// unmanaged structured exception handling. +EXCEPTION_HANDLER_IMPL(COMPlusFrameHandler) +{ + WRAPPER_NO_CONTRACT; + _ASSERTE(!DebugIsEECxxException(pExceptionRecord) && "EE C++ Exception leaked into managed code!"); + + STRESS_LOG5(LF_EH, LL_INFO100, "In COMPlusFrameHander EH code = %x flag = %x EIP = %x with ESP = %x, pEstablisherFrame = 0x%p\n", + pExceptionRecord->ExceptionCode, pExceptionRecord->ExceptionFlags, + pContext ? GetIP(pContext) : 0, pContext ? GetSP(pContext) : 0, pEstablisherFrame); + + _ASSERTE((pContext == NULL) || ((pContext->ContextFlags & CONTEXT_CONTROL) == CONTEXT_CONTROL)); + + if (g_fNoExceptions) + return ExceptionContinueSearch; // No EH during EE shutdown. + + // Check if the exception represents a GCStress Marker. If it does, + // we shouldnt record its entry in the TLS as such exceptions are + // continuable and can confuse the VM to treat them as CSE, + // as they are implemented using illegal instruction exception. + + bool fIsGCMarker = false; + +#ifdef HAVE_GCCOVER // This is a debug only macro + if (GCStress::IsEnabled()) + { + // UnsafeTlsGetValue trashes last error. When Complus_GCStress=4, GC is invoked + // on every allowable JITed instruction by means of our exception handling machanism + // it is very easy to trash the last error. 
For example, a p/invoke called a native method
+        // which sets the last error. Before the IL stub gets a chance to read the last error, it could be
+        // trashed here, so preserve it across the GC marker check.
+        DWORD dwLastError = GetLastError();
+        fIsGCMarker = IsGcMarker(pExceptionRecord->ExceptionCode, pContext);
+        if (!fIsGCMarker)
+        {
+            SaveCurrentExceptionInfo(pExceptionRecord, pContext);
+        }
+        SetLastError(dwLastError);
+    }
+    else
+#endif
+    {
+        // GCStress does not exist on retail builds (see IsGcMarker implementation for details).
+        SaveCurrentExceptionInfo(pExceptionRecord, pContext);
+    }
+
+    if (fIsGCMarker)
+    {
+        // If this was a GCStress marker exception, then return
+        // ExceptionContinueExecution to the OS.
+        return ExceptionContinueExecution;
+    }
+
+    EXCEPTION_DISPOSITION retVal = ExceptionContinueSearch;
+
+    Thread *pThread = GetThread();
+    if ((pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)) == 0)
+    {
+        if (IsSOExceptionCode(pExceptionRecord->ExceptionCode))
+        {
+            EEPolicy::HandleStackOverflow(SOD_ManagedFrameHandler, (void*)pEstablisherFrame);
+
+            // VC's unhandled exception filter plays with the stack. It VirtualAlloc's a new stack and
+            // then launches Watson from that new stack. When Watson asks the CLR to save the required data,
+            // we are not able to walk the stack.
+            // Set the context in the ExInfo so that our Watson dump routine knows how to walk this stack.
+            ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo);
+            pExInfo->m_pContext = pContext;
+
+            // Save the reference to the topmost handler we see during the first pass when an SO goes past us.
+            // When an unwind gets triggered for the exception, we will reset the frame chain when we reach
+            // the topmost handler we saw during the first pass.
+            //
+            // This unifies, behaviour-wise, 32bit with 64bit.
+            if ((pExInfo->m_pTopMostHandlerDuringSO == NULL) ||
+                (pEstablisherFrame > pExInfo->m_pTopMostHandlerDuringSO))
+            {
+                pExInfo->m_pTopMostHandlerDuringSO = pEstablisherFrame;
+            }
+
+            // Switch to preemptive mode since we are returning back to the OS.
+            // We will do the quick switch since we are short of stack.
+            FastInterlockAnd (&pThread->m_fPreemptiveGCDisabled, 0);
+
+            return ExceptionContinueSearch;
+        }
+        else
+        {
+#ifdef FEATURE_STACK_PROBE
+            if (GetEEPolicy()->GetActionOnFailure(FAIL_StackOverflow) == eRudeUnloadAppDomain)
+            {
+                RetailStackProbe(static_cast<unsigned int>(ADJUST_PROBE(BACKOUT_CODE_STACK_LIMIT)), pThread);
+            }
+#endif
+        }
+    }
+    else
+    {
+        DWORD exceptionCode = pExceptionRecord->ExceptionCode;
+
+        if (exceptionCode == STATUS_UNWIND)
+        {
+            // If exceptionCode is STATUS_UNWIND, RtlUnwind was called with a NULL ExceptionRecord,
+            // so the OS uses a faked ExceptionRecord with the STATUS_UNWIND code. In that case we need to
+            // look at our saved exception code.
+            exceptionCode = GetCurrentExceptionCode();
+        }
+
+        if (IsSOExceptionCode(exceptionCode))
+        {
+            // We saved the context during the first pass in case the stack overflow exception was
+            // unhandled and the Watson dump code needed it. Now we are in the second pass, so
+            // either the exception has been handled by user code, or we have finished the unhandled-exception
+            // filter process and the OS is unwinding the stack. Either way, we don't need the
+            // context any more. It is very important to reset the context so that our code does not
+            // accidentally walk the frame using the dangling context in ExInfoWalker::WalkToPosition.
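+            // For illustration: "second pass" here means the unwind pass, which this handler
+            // recognizes from the exception flags tested near the top of this function. A hedged
+            // sketch of that test (hypothetical helper, shown only to make the flow explicit):
+            //
+            //     bool IsUnwindPass(const EXCEPTION_RECORD *pRec)
+            //     {
+            //         return (pRec->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)) != 0;
+            //     }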
+ ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + pExInfo->m_pContext = NULL; + + // We should have the reference to the topmost handler seen during the first pass of SO + _ASSERTE(pExInfo->m_pTopMostHandlerDuringSO != NULL); + + // Reset frame chain till we reach the topmost establisher frame we saw in the first pass. + // This will ensure that if any intermediary frame calls back into managed (e.g. native frame + // containing a __finally that reverse pinvokes into managed), then we have the correct + // explicit frame on the stack. Resetting the frame chain only when we reach the topmost + // personality routine seen in the first pass may not result in expected behaviour, + // specially during stack walks when crawl frame needs to be initialized from + // explicit frame. + if (pEstablisherFrame <= pExInfo->m_pTopMostHandlerDuringSO) + { + GCX_COOP_NO_DTOR(); + + if (pThread->GetFrame() < GetCurrFrame(pEstablisherFrame)) + { + // We are very short of stack. We avoid calling UnwindFrame which may + // run unknown code here. + pThread->SetFrame(GetCurrFrame(pEstablisherFrame)); + } + } + + // Switch to preemp mode since we are returning back to the OS. + // We will do the quick switch since we are short of stack + FastInterlockAnd(&pThread->m_fPreemptiveGCDisabled, 0); + + return ExceptionContinueSearch; + } + } + + // . We need to probe here, but can't introduce destructors etc. + BEGIN_CONTRACT_VIOLATION(SOToleranceViolation); + + if (pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)) + { + retVal = CPFH_UnwindHandler(pExceptionRecord, + pEstablisherFrame, + pContext, + pDispatcherContext); + } + else + { + + /* Make no assumptions about the current machine state. + @PERF: Only needs to be called by the very first handler invoked by SEH */ + ResetCurrentContext(); + + retVal = CPFH_FirstPassHandler(pExceptionRecord, + pEstablisherFrame, + pContext, + pDispatcherContext); + + } + + END_CONTRACT_VIOLATION; + + return retVal; +} // COMPlusFrameHandler() + + +//------------------------------------------------------------------------- +// This is called by the EE to restore the stack pointer if necessary. +//------------------------------------------------------------------------- + +// This can't be inlined into the caller to avoid introducing EH frame +NOINLINE LPVOID COMPlusEndCatchWorker(Thread * pThread) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + STATIC_CONTRACT_SO_INTOLERANT; + + LOG((LF_EH, LL_INFO1000, "COMPlusPEndCatch:called with " + "pThread:0x%x\n",pThread)); + + // indicate that we are out of the managed clause as early as possible + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(FALSE); + + void* esp = NULL; + + // @todo . We need to probe in the EH code, but can't introduce destructors etc. + BEGIN_CONTRACT_VIOLATION(SOToleranceViolation); + + // Notify the profiler that the catcher has finished running + // IL stubs don't contain catch blocks so inability to perform this check does not matter. 
+ // if (!pFunc->IsILStub()) + EEToProfilerExceptionInterfaceWrapper::ExceptionCatcherLeave(); + + // no need to set pExInfo->m_ClauseType = (DWORD)COR_PRF_CLAUSE_NONE now that the + // notification is done because because the ExInfo record is about to be popped off anyway + + LOG((LF_EH, LL_INFO1000, "COMPlusPEndCatch:pThread:0x%x\n",pThread)); + +#ifdef _DEBUG + gLastResumedExceptionFunc = NULL; + gLastResumedExceptionHandler = 0; +#endif + // Set the thrown object to NULL as no longer needed. This also sets the last thrown object to NULL. + pThread->SafeSetThrowables(NULL); + + // reset the stashed exception info + pExInfo->m_pExceptionRecord = NULL; + pExInfo->m_pContext = NULL; + pExInfo->m_pExceptionPointers = NULL; + + if (pExInfo->m_pShadowSP) + { + *pExInfo->m_pShadowSP = 0; // Reset the shadow SP + } + + // pExInfo->m_dEsp was set in ResumeAtJITEH(). It is the Esp of the + // handler nesting level which catches the exception. + esp = (void*)(size_t)pExInfo->m_dEsp; + + pExInfo->UnwindExInfo(esp); + + // Prepare to sync managed exception state + // + // In a case when we're nested inside another catch block, the domain in which we're executing may not be the + // same as the one the domain of the throwable that was just made the current throwable above. Therefore, we + // make a special effort to preserve the domain of the throwable as we update the the last thrown object. + // + // This function (COMPlusEndCatch) can also be called by the in-proc debugger helper thread on x86 when + // an attempt to SetIP takes place to set IP outside the catch clause. In such a case, managed thread object + // will not be available. Thus, we should reset the severity only if its not such a thread. + // + // This behaviour (of debugger doing SetIP) is not allowed on 64bit since the catch clauses are implemented + // as a seperate funclet and it's just not allowed to set the IP across EH scopes, such as from inside a catch + // clause to outside of the catch clause. + bool fIsDebuggerHelperThread = (g_pDebugInterface == NULL) ? false : g_pDebugInterface->ThisIsHelperThread(); + + // Sync managed exception state, for the managed thread, based upon any active exception tracker + pThread->SyncManagedExceptionState(fIsDebuggerHelperThread); + + LOG((LF_EH, LL_INFO1000, "COMPlusPEndCatch: esp=%p\n", esp)); + + END_CONTRACT_VIOLATION; + + return esp; +} + +// +// This function works in conjunction with JIT_EndCatch. On input, the parameters are set as follows: +// ebp, ebx, edi, esi: the values of these registers at the end of the catch block +// *pRetAddress: the next instruction after the call to JIT_EndCatch +// +// On output, *pRetAddress is the instruction at which to resume execution. This may be user code, +// or it may be ThrowControlForThread (which will re-raise a pending ThreadAbortException). +// +// Returns the esp to set before resuming at *pRetAddress. +// +LPVOID STDCALL COMPlusEndCatch(LPVOID ebp, DWORD ebx, DWORD edi, DWORD esi, LPVOID* pRetAddress) +{ + // + // PopNestedExceptionRecords directly manipulates fs:[0] chain. This method can't have any EH! + // + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + STATIC_CONTRACT_SO_INTOLERANT; + + ETW::ExceptionLog::ExceptionCatchEnd(); + ETW::ExceptionLog::ExceptionThrownEnd(); + + void* esp = COMPlusEndCatchWorker(GetThread()); + + // We are going to resume at a handler nesting level whose esp is dEsp. Pop off any SEH records below it. 
This
+    // would be the COMPlusNestedExceptionHandler we had inserted.
+    PopNestedExceptionRecords(esp);
+
+    //
+    // Set up m_OSContext for the call to COMPlusCheckForAbort
+    //
+    Thread* pThread = GetThread();
+    _ASSERTE(pThread != NULL);
+
+    SetIP(pThread->m_OSContext, (PCODE)*pRetAddress);
+    SetSP(pThread->m_OSContext, (TADDR)esp);
+    SetFP(pThread->m_OSContext, (TADDR)ebp);
+    pThread->m_OSContext->Ebx = ebx;
+    pThread->m_OSContext->Edi = edi;
+    pThread->m_OSContext->Esi = esi;
+
+    LPVOID throwControl = COMPlusCheckForAbort((UINT_PTR)*pRetAddress);
+    if (throwControl)
+        *pRetAddress = throwControl;
+
+    return esp;
+}
+
+#endif // !DACCESS_COMPILE
+
+PTR_CONTEXT GetCONTEXTFromRedirectedStubStackFrame(CONTEXT * pContext)
+{
+    LIMITED_METHOD_DAC_CONTRACT;
+
+    UINT_PTR stackSlot = pContext->Ebp + REDIRECTSTUB_EBP_OFFSET_CONTEXT;
+    PTR_PTR_CONTEXT ppContext = dac_cast<PTR_PTR_CONTEXT>((TADDR)stackSlot);
+    return *ppContext;
+}
+
+#if !defined(DACCESS_COMPILE)
+
+PEXCEPTION_REGISTRATION_RECORD GetCurrentSEHRecord()
+{
+    WRAPPER_NO_CONTRACT;
+
+    LPVOID fs0 = (LPVOID)__readfsdword(0);
+
+#if 0 // This walk is too expensive considering we hit it every time we enter a CONTRACT(NOTHROW)
+#ifdef _DEBUG
+    EXCEPTION_REGISTRATION_RECORD *pEHR = (EXCEPTION_REGISTRATION_RECORD *)fs0;
+    LPVOID spVal;
+    __asm {
+        mov spVal, esp
+    }
+
+    // check that all the EH frames are greater than the current stack value. If not, the
+    // stack has been updated somehow without unwinding the SEH chain.
+
+    // LOG((LF_EH, LL_INFO1000000, "ER Chain:\n"));
+    while (pEHR != NULL && pEHR != EXCEPTION_CHAIN_END) {
+        // LOG((LF_EH, LL_INFO1000000, "\tp: prev:p handler:%x\n", pEHR, pEHR->Next, pEHR->Handler));
+        if (pEHR < spVal) {
+            if (gLastResumedExceptionFunc != 0)
+                _ASSERTE(!"Stack is greater than start of SEH chain - possible missing leave in handler. See gLastResumedExceptionHandler & gLastResumedExceptionFunc for info");
+            else
+                _ASSERTE(!"Stack is greater than start of SEH chain (FS:0)");
+        }
+        if (pEHR->Handler == (void *)-1)
+            _ASSERTE(!"Handler value has been corrupted");
+
+        _ASSERTE(pEHR < pEHR->Next);
+
+        pEHR = pEHR->Next;
+    }
+#endif
+#endif
+
+    return (EXCEPTION_REGISTRATION_RECORD*) fs0;
+}
+
+PEXCEPTION_REGISTRATION_RECORD GetFirstCOMPlusSEHRecord(Thread *pThread) {
+    WRAPPER_NO_CONTRACT;
+    EXCEPTION_REGISTRATION_RECORD *pEHR = *(pThread->GetExceptionListPtr());
+    if (pEHR == EXCEPTION_CHAIN_END || IsUnmanagedToManagedSEHHandler(pEHR)) {
+        return pEHR;
+    } else {
+        return GetNextCOMPlusSEHRecord(pEHR);
+    }
+}
+
+
+PEXCEPTION_REGISTRATION_RECORD GetPrevSEHRecord(EXCEPTION_REGISTRATION_RECORD *next)
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(IsUnmanagedToManagedSEHHandler(next));
+
+    EXCEPTION_REGISTRATION_RECORD *pEHR = GetCurrentSEHRecord();
+    _ASSERTE(pEHR != 0 && pEHR != EXCEPTION_CHAIN_END);
+
+    EXCEPTION_REGISTRATION_RECORD *pBest = 0;
+    while (pEHR != next) {
+        if (IsUnmanagedToManagedSEHHandler(pEHR))
+            pBest = pEHR;
+        pEHR = pEHR->Next;
+        _ASSERTE(pEHR != 0 && pEHR != EXCEPTION_CHAIN_END);
+    }
+
+    return pBest;
+}
+
+VOID SetCurrentSEHRecord(EXCEPTION_REGISTRATION_RECORD *pSEH)
+{
+    WRAPPER_NO_CONTRACT;
+    *GetThread()->GetExceptionListPtr() = pSEH;
+}
+
+
+//
+// Unwinds pExInfo, pops FS:[0] handlers until the interception context SP, and
+// resumes at the interception context.
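+// In sketch form (for illustration only; the real work is done by PopNestedExceptionRecords
+// below): every registration record whose address is below the interception ESP is dropped
+// from the head of the chain before control is transferred, roughly
+//
+//     PEXCEPTION_REGISTRATION_RECORD pRec = GetCurrentSEHRecord();
+//     while ((LPVOID)pRec < (LPVOID)(size_t)context->Esp)
+//         pRec = pRec->Next;
+//     SetCurrentSEHRecord(pRec);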
+// +VOID UnwindExceptionTrackerAndResumeInInterceptionFrame(ExInfo* pExInfo, EHContext* context) +{ + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_MODE_COOPERATIVE; + STATIC_CONTRACT_SO_TOLERANT; + + _ASSERTE(pExInfo && context); + + pExInfo->UnwindExInfo((LPVOID)(size_t)context->Esp); + PopNestedExceptionRecords((LPVOID)(size_t)context->Esp); + + STRESS_LOG3(LF_EH|LF_CORDB, LL_INFO100, "UnwindExceptionTrackerAndResumeInInterceptionFrame: completing intercept at EIP = %p ESP = %p EBP = %p\n", context->Eip, context->Esp, context->Ebp); + + ResumeAtJitEHHelper(context); + UNREACHABLE_MSG("Should never return from ResumeAtJitEHHelper!"); +} + +// +// Pop SEH records below the given target ESP. This is only used to pop nested exception records. +// If bCheckForUnknownHandlers is set, it only checks for unknown FS:[0] handlers. +// +BOOL PopNestedExceptionRecords(LPVOID pTargetSP, BOOL bCheckForUnknownHandlers) +{ + // No CONTRACT here, because we can't run the risk of it pushing any SEH into the current method. + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_SO_TOLERANT; + + PEXCEPTION_REGISTRATION_RECORD pEHR = GetCurrentSEHRecord(); + + while ((LPVOID)pEHR < pTargetSP) + { + // + // The only handler type we're allowed to have below the limit on the FS:0 chain in these cases is a nested + // exception record, so we verify that here. + // + // There is a special case, of course: for an unhandled exception, when the default handler does the exit + // unwind, we may have an exception that escapes a finally clause, thus replacing the original unhandled + // exception. If we find a catcher for that new exception, then we'll go ahead and do our own unwind, then + // jump to the catch. When we are called here, just before jumpping to the catch, we'll pop off our nested + // handlers, then we'll pop off one more handler: the handler that ntdll!ExecuteHandler2 pushed before + // calling our nested handler. We go ahead and pop off that handler, too. Its okay, its only there to catch + // exceptions from handlers and turn them into collided unwind status codes... there's no cleanup in the + // handler that we're removing, and that's the important point. The handler that ExecuteHandler2 pushes + // isn't a public export from ntdll, but its named "UnwindHandler" and is physically shortly after + // ExecuteHandler2 in ntdll. + // + static HINSTANCE ExecuteHandler2Module = 0; + static BOOL ExecuteHandler2ModuleInited = FALSE; + + // Cache the handle to the dll with the handler pushed by ExecuteHandler2. + if (!ExecuteHandler2ModuleInited) + { + ExecuteHandler2Module = WszGetModuleHandle(W("ntdll.dll")); + ExecuteHandler2ModuleInited = TRUE; + } + + if (bCheckForUnknownHandlers) + { + if (!IsComPlusNestedExceptionRecord(pEHR) || + !((ExecuteHandler2Module != NULL) && IsIPInModule(ExecuteHandler2Module, (PCODE)pEHR->Handler))) + { + return TRUE; + } + } +#ifdef _DEBUG + else + { + // Note: if we can't find the module containing ExecuteHandler2, we'll just be really strict and require + // that we're only popping nested handlers. 
+ _ASSERTE(IsComPlusNestedExceptionRecord(pEHR) || + ((ExecuteHandler2Module != NULL) && IsIPInModule(ExecuteHandler2Module, (PCODE)pEHR->Handler))); + } +#endif // _DEBUG + + pEHR = pEHR->Next; + } + + if (!bCheckForUnknownHandlers) + { + SetCurrentSEHRecord(pEHR); + } + return FALSE; +} + +// +// This is implemented differently from the PopNestedExceptionRecords above because it's called in the context of +// the DebuggerRCThread to operate on the stack of another thread. +// +VOID PopNestedExceptionRecords(LPVOID pTargetSP, CONTEXT *pCtx, void *pSEH) +{ + // No CONTRACT here, because we can't run the risk of it pushing any SEH into the current method. + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + +#ifdef _DEBUG + LOG((LF_CORDB,LL_INFO1000, "\nPrintSEHRecords:\n")); + + EXCEPTION_REGISTRATION_RECORD *pEHR = (EXCEPTION_REGISTRATION_RECORD *)(size_t)*(DWORD *)pSEH; + + // check that all the eh frames are all greater than the current stack value. If not, the + // stack has been updated somehow w/o unwinding the SEH chain. + while (pEHR != NULL && pEHR != EXCEPTION_CHAIN_END) + { + LOG((LF_EH, LL_INFO1000000, "\t%08x: next:%08x handler:%x\n", pEHR, pEHR->Next, pEHR->Handler)); + pEHR = pEHR->Next; + } +#endif + + DWORD dwCur = *(DWORD*)pSEH; // 'EAX' in the original routine + DWORD dwPrev = (DWORD)(size_t)pSEH; + + while (dwCur < (DWORD)(size_t)pTargetSP) + { + // Watch for the OS handler + // for nested exceptions, or any C++ handlers for destructors in our call + // stack, or anything else. + if (dwCur < (DWORD)GetSP(pCtx)) + dwPrev = dwCur; + + dwCur = *(DWORD *)(size_t)dwCur; + + LOG((LF_CORDB,LL_INFO10000, "dwCur: 0x%x dwPrev:0x%x pTargetSP:0x%x\n", + dwCur, dwPrev, pTargetSP)); + } + + *(DWORD *)(size_t)dwPrev = dwCur; + +#ifdef _DEBUG + pEHR = (EXCEPTION_REGISTRATION_RECORD *)(size_t)*(DWORD *)pSEH; + // check that all the eh frames are all greater than the current stack value. If not, the + // stack has been updated somehow w/o unwinding the SEH chain. + + LOG((LF_CORDB,LL_INFO1000, "\nPopSEHRecords:\n")); + while (pEHR != NULL && pEHR != (void *)-1) + { + LOG((LF_EH, LL_INFO1000000, "\t%08x: next:%08x handler:%x\n", pEHR, pEHR->Next, pEHR->Handler)); + pEHR = pEHR->Next; + } +#endif +} + +//========================================================================== +// COMPlusThrowCallback +// +//========================================================================== + +/* + * + * COMPlusThrowCallbackHelper + * + * This function is a simple helper function for COMPlusThrowCallback. It is needed + * because of the EX_TRY macro. This macro does an alloca(), which allocates space + * off the stack, not free'ing it. Thus, doing a EX_TRY in a loop can easily result + * in a stack overflow error. By factoring out the EX_TRY into a separate function, + * we recover that stack space. + * + * Parameters: + * pJitManager - The JIT manager that will filter the EH. + * pCf - The frame to crawl. + * EHClausePtr + * nestingLevel + * pThread - Used to determine if the thread is throwable or not. + * + * Return: + * Exception status. 
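+ *
+ *    For illustration, a hedged sketch of how the caller (COMPlusThrowCallback, below) consumes
+ *    this value:
+ *
+ *        int iFilt = COMPlusThrowCallbackHelper(pJitManager, pCf, pData, &EHClause,
+ *                                               nestingLevel, throwable, pThread);
+ *        if (iFilt != EXCEPTION_EXECUTE_HANDLER)
+ *            continue;      // filter declined; keep searching this method's clauses
+ *        // otherwise the catch location is recorded and the stack walk stops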
+ * + */ +int COMPlusThrowCallbackHelper(IJitManager *pJitManager, + CrawlFrame *pCf, + ThrowCallbackType* pData, + EE_ILEXCEPTION_CLAUSE *EHClausePtr, + DWORD nestingLevel, + OBJECTREF throwable, + Thread *pThread + ) +{ + CONTRACTL + { + NOTHROW; + GC_TRIGGERS; + MODE_COOPERATIVE; + } + CONTRACTL_END; + + int iFilt = 0; + BOOL impersonating = FALSE; + + EX_TRY + { + GCPROTECT_BEGIN (throwable); + if (pData->hCallerToken != NULL) + { + STRESS_LOG1(LF_EH, LL_INFO100, "In COMPlusThrowCallbackHelper hCallerToken = %d\n",pData->hCallerToken); + // CLR_ImpersonateLoggedOnUser fails fast on error + COMPrincipal::CLR_ImpersonateLoggedOnUser(pData->hCallerToken); + impersonating = TRUE; + } + + // We want to call filters even if the thread is aborting, so suppress abort + // checks while the filter runs. + ThreadPreventAsyncHolder preventAbort; + + BYTE* startAddress = (BYTE*)pCf->GetCodeInfo()->GetStartAddress(); + iFilt = ::CallJitEHFilter(pCf, startAddress, EHClausePtr, nestingLevel, throwable); + + if (impersonating) + { + STRESS_LOG1(LF_EH, LL_INFO100, "In COMPlusThrowCallbackHelper hImpersonationToken = %d\n",pData->hImpersonationToken); + // CLR_ImpersonateLoggedOnUser fails fast on error + COMPrincipal::CLR_ImpersonateLoggedOnUser(pData->hImpersonationToken); + impersonating = FALSE; + } + GCPROTECT_END(); + } + EX_CATCH + { + if (impersonating) + { + STRESS_LOG1(LF_EH, LL_INFO100, "In COMPlusThrowCallbackHelper EX_CATCH hImpersonationToken = %d\n",pData->hImpersonationToken); + // CLR_ImpersonateLoggedOnUser fails fast on error + COMPrincipal::CLR_ImpersonateLoggedOnUser(pData->hImpersonationToken); + impersonating = FALSE; + } + + // We had an exception in filter invocation that remained unhandled. + // Sync managed exception state, for the managed thread, based upon the active exception tracker. + pThread->SyncManagedExceptionState(false); + + // + // Swallow exception. Treat as exception continue search. + // + iFilt = EXCEPTION_CONTINUE_SEARCH; + + } + EX_END_CATCH(SwallowAllExceptions) + + return iFilt; +} + +//****************************************************************************** +// The stack walk callback for exception handling on x86. +// Returns one of: +// SWA_CONTINUE = 0, // continue walking +// SWA_ABORT = 1, // stop walking, early out in "failure case" +// SWA_FAILED = 2 // couldn't walk stack +StackWalkAction COMPlusThrowCallback( // SWA value + CrawlFrame *pCf, // Data from StackWalkFramesEx + ThrowCallbackType *pData) // Context data passed through from CPFH +{ + // We don't want to use a runtime contract here since this codepath is used during + // the processing of a hard SO. Contracts use a significant amount of stack + // which we can't afford for those cases. 
+ STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + + Frame *pFrame = pCf->GetFrame(); + MethodDesc *pFunc = pCf->GetFunction(); + + #if defined(_DEBUG) + #define METHODNAME(pFunc) (pFunc?pFunc->m_pszDebugMethodName:"") + #else + #define METHODNAME(pFunc) "" + #endif + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusThrowCallback: STACKCRAWL method:%pM ('%s'), Frame:%p, FrameVtable = %pV\n", + pFunc, METHODNAME(pFunc), pFrame, pCf->IsFrameless()?0:(*(void**)pFrame)); + #undef METHODNAME + + Thread *pThread = GetThread(); + + if (pFrame && pData->pTopFrame == pFrame) + /* Don't look past limiting frame if there is one */ + return SWA_ABORT; + + if (!pFunc) + return SWA_CONTINUE; + + if (pThread->IsRudeAbortInitiated() && !pThread->IsWithinCer(pCf)) + { + return SWA_CONTINUE; + } + + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + + _ASSERTE(!pData->bIsUnwind); +#ifdef _DEBUG + // It SHOULD be the case that any frames we consider live between this exception + // record and the previous one. + if (!pExInfo->m_pPrevNestedInfo) { + if (pData->pCurrentExceptionRecord) { + if (pFrame) _ASSERTE(pData->pCurrentExceptionRecord > pFrame); + if (pCf->IsFrameless()) _ASSERTE((ULONG_PTR)pData->pCurrentExceptionRecord >= GetRegdisplaySP(pCf->GetRegisterSet())); + } + if (pData->pPrevExceptionRecord) { + // FCALLS have an extra SEH record in debug because of the desctructor + // associated with ForbidGC checking. This is benign, so just ignore it. + if (pFrame) _ASSERTE(pData->pPrevExceptionRecord < pFrame || pFrame->GetVTablePtr() == HelperMethodFrame::GetMethodFrameVPtr()); + if (pCf->IsFrameless()) _ASSERTE((ULONG_PTR)pData->pPrevExceptionRecord <= GetRegdisplaySP(pCf->GetRegisterSet())); + } + } +#endif + + UINT_PTR currentIP = 0; + UINT_PTR currentSP = 0; + + if (pCf->IsFrameless()) + { + currentIP = (UINT_PTR)GetControlPC(pCf->GetRegisterSet()); + currentSP = (UINT_PTR)GetRegdisplaySP(pCf->GetRegisterSet()); + } + else if (InlinedCallFrame::FrameHasActiveCall(pFrame)) + { + // don't have the IP, SP for native code + currentIP = 0; + currentSP = 0; + } + else + { + currentIP = (UINT_PTR)(pCf->GetFrame()->GetIP()); + currentSP = 0; //Don't have an SP to get. + } + + if (!pFunc->IsILStub()) + { + // Append the current frame to the stack trace and save the save trace to the managed Exception object. + pExInfo->m_StackTraceInfo.AppendElement(pData->bAllowAllocMem, currentIP, currentSP, pFunc, pCf); + + pExInfo->m_StackTraceInfo.SaveStackTrace(pData->bAllowAllocMem, + pThread->GetThrowableAsHandle(), + pData->bReplaceStack, + pData->bSkipLastElement); + } + else + { + LOG((LF_EH, LL_INFO1000, "COMPlusThrowCallback: Skipping AppendElement/SaveStackTrace for IL stub MD %p\n", pFunc)); + } + + // Fire an exception thrown ETW event when an exception occurs + ETW::ExceptionLog::ExceptionThrown(pCf, pData->bSkipLastElement, pData->bReplaceStack); + + // Reset the flags. These flags are set only once before each stack walk done by LookForHandler(), and + // they apply only to the first frame we append to the stack trace. Subsequent frames are always appended. 
+ if (pData->bReplaceStack) + { + pData->bReplaceStack = FALSE; + } + if (pData->bSkipLastElement) + { + pData->bSkipLastElement = FALSE; + } + + // Check for any impersonation on the frame and save that for use during EH filter callbacks + OBJECTREF* pRefSecDesc = pCf->GetAddrOfSecurityObject(); + if (pRefSecDesc != NULL && *pRefSecDesc != NULL) + { + FRAMESECDESCREF fsdRef = (FRAMESECDESCREF)*pRefSecDesc; + if (fsdRef->GetCallerToken() != NULL) + { + // Impersonation info present on the Frame + pData->hCallerToken = fsdRef->GetCallerToken(); + STRESS_LOG1(LF_EH, LL_INFO100, "In COMPlusThrowCallback. Found non-NULL callertoken on FSD:%d\n",pData->hCallerToken); + if (!pData->bImpersonationTokenSet) + { + pData->hImpersonationToken = fsdRef->GetImpersonationToken(); + STRESS_LOG1(LF_EH, LL_INFO100, "In COMPlusThrowCallback. Found non-NULL impersonationtoken on FSD:%d\n",pData->hImpersonationToken); + pData->bImpersonationTokenSet = TRUE; + } + } + } + + // now we've got the stack trace, if we aren't allowed to catch this and we're first pass, return + if (pData->bDontCatch) + return SWA_CONTINUE; + + if (!pCf->IsFrameless()) + { + // @todo - remove this once SIS is fully enabled. + extern bool g_EnableSIS; + if (g_EnableSIS) + { + // For debugger, we may want to notify 1st chance exceptions if they're coming out of a stub. + // We recognize stubs as Frames with a M2U transition type. The debugger's stackwalker also + // recognizes these frames and publishes ICorDebugInternalFrames in the stackwalk. It's + // important to use pFrame as the stack address so that the Exception callback matches up + // w/ the ICorDebugInternlFrame stack range. + if (CORDebuggerAttached()) + { + Frame * pFrameStub = pCf->GetFrame(); + Frame::ETransitionType t = pFrameStub->GetTransitionType(); + if (t == Frame::TT_M2U) + { + // Use address of the frame as the stack address. + currentSP = (SIZE_T) ((void*) pFrameStub); + currentIP = 0; // no IP. + EEToDebuggerExceptionInterfaceWrapper::FirstChanceManagedException(pThread, (SIZE_T)currentIP, (SIZE_T)currentSP); +#ifdef FEATURE_EXCEPTION_NOTIFICATIONS + // Deliver the FirstChanceNotification after the debugger, if not already delivered. + if (!pExInfo->DeliveredFirstChanceNotification()) + { + ExceptionNotifications::DeliverFirstChanceNotification(); + } +#endif // FEATURE_EXCEPTION_NOTIFICATIONS + } + } + } + return SWA_CONTINUE; + } + + bool fIsILStub = pFunc->IsILStub(); + bool fGiveDebuggerAndProfilerNotification = !fIsILStub; + BOOL fMethodCanHandleException = TRUE; + + MethodDesc * pUserMDForILStub = NULL; + Frame * pILStubFrame = NULL; + if (fIsILStub) + pUserMDForILStub = GetUserMethodForILStub(pThread, currentSP, pFunc, &pILStubFrame); + +#ifdef FEATURE_CORRUPTING_EXCEPTIONS + CorruptionSeverity currentSeverity = pThread->GetExceptionState()->GetCurrentExceptionTracker()->GetCorruptionSeverity(); + { + // We must defer to the MethodDesc of the user method instead of the IL stub + // itself because the user can specify the policy on a per-method basis and + // that won't be reflected via the IL stub's MethodDesc. + MethodDesc * pMDWithCEAttribute = fIsILStub ? pUserMDForILStub : pFunc; + + // Check if the exception can be delivered to the method? It will check if the exception + // is a CE or not. If it is, it will check if the method can process it or not. 
+ fMethodCanHandleException = CEHelper::CanMethodHandleException(currentSeverity, pMDWithCEAttribute); + } +#endif // FEATURE_CORRUPTING_EXCEPTIONS + + // Let the profiler know that we are searching for a handler within this function instance + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionEnter(pFunc); + + // The following debugger notification and AppDomain::FirstChanceNotification should be scoped together + // since the AD notification *must* follow immediately after the debugger's notification. + { +#ifdef DEBUGGING_SUPPORTED + // + // Go ahead and notify any debugger of this exception. + // + EEToDebuggerExceptionInterfaceWrapper::FirstChanceManagedException(pThread, (SIZE_T)currentIP, (SIZE_T)currentSP); + + if (CORDebuggerAttached() && pExInfo->m_ExceptionFlags.DebuggerInterceptInfo()) + { + return SWA_ABORT; + } +#endif // DEBUGGING_SUPPORTED + +#ifdef FEATURE_EXCEPTION_NOTIFICATIONS + // Attempt to deliver the first chance notification to the AD only *AFTER* the debugger + // has done that, provided we have not already done that. + if (!pExInfo->DeliveredFirstChanceNotification()) + { + ExceptionNotifications::DeliverFirstChanceNotification(); + } +#endif // FEATURE_EXCEPTION_NOTIFICATIONS + } + IJitManager* pJitManager = pCf->GetJitManager(); + _ASSERTE(pJitManager); + EH_CLAUSE_ENUMERATOR pEnumState; + unsigned EHCount = 0; + +#ifdef FEATURE_CORRUPTING_EXCEPTIONS + // If exception cannot be handled, then just bail out. We shouldnt examine the EH clauses + // in such a method. + if (!fMethodCanHandleException) + { + LOG((LF_EH, LL_INFO100, "COMPlusThrowCallback - CEHelper decided not to look for exception handlers in the method(MD:%p).\n", pFunc)); + + // Set the flag to skip this frame since the CE cannot be delivered + _ASSERTE(currentSeverity == ProcessCorrupting); + + // Ensure EHClause count is zero + EHCount = 0; + } + else +#endif // FEATURE_CORRUPTING_EXCEPTIONS + { + EHCount = pJitManager->InitializeEHEnumeration(pCf->GetMethodToken(), &pEnumState); + } + + if (EHCount == 0) + { + // Inform the profiler that we're leaving, and what pass we're on + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionLeave(pFunc); + return SWA_CONTINUE; + } + + TypeHandle thrownType = TypeHandle(); + // if we are being called on an unwind for an exception that we did not try to catch, eg. + // an internal EE exception, then pThread->GetThrowable will be null + { + OBJECTREF throwable = pThread->GetThrowable(); + if (throwable != NULL) + { + throwable = PossiblyUnwrapThrowable(throwable, pCf->GetAssembly()); + thrownType = TypeHandle(throwable->GetTrueMethodTable()); + } + } + + PREGDISPLAY regs = pCf->GetRegisterSet(); + BYTE *pStack = (BYTE *) GetRegdisplaySP(regs); +#ifdef DEBUGGING_SUPPORTED + BYTE *pHandlerEBP = (BYTE *) GetRegdisplayFP(regs); +#endif + + DWORD offs = (DWORD)pCf->GetRelOffset(); //= (BYTE*) (*regs->pPC) - (BYTE*) pCf->GetStartAddress(); + STRESS_LOG1(LF_EH, LL_INFO10000, "COMPlusThrowCallback: offset is %d\n", offs); + + EE_ILEXCEPTION_CLAUSE EHClause; + unsigned start_adjust, end_adjust; + + start_adjust = !(pCf->HasFaulted() || pCf->IsIPadjusted()); + end_adjust = pCf->IsActiveFunc(); + + for(ULONG i=0; i < EHCount; i++) + { + pJitManager->GetNextEHClause(&pEnumState, &EHClause); + _ASSERTE(IsValidClause(&EHClause)); + + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusThrowCallback: considering '%s' clause [%d,%d], ofs:%d\n", + (IsFault(&EHClause) ? 
"fault" : ( + IsFinally(&EHClause) ? "finally" : ( + IsFilterHandler(&EHClause) ? "filter" : ( + IsTypedHandler(&EHClause) ? "typed" : "unknown")))), + EHClause.TryStartPC, + EHClause.TryEndPC, + offs + ); + + // Checking the exception range is a bit tricky because + // on CPU faults (null pointer access, div 0, ..., the IP points + // to the faulting instruction, but on calls, the IP points + // to the next instruction. + // This means that we should not include the start point on calls + // as this would be a call just preceding the try block. + // Also, we should include the end point on calls, but not faults. + + // If we're in the FILTER part of a filter clause, then we + // want to stop crawling. It's going to be caught in a + // EX_CATCH just above us. If not, the exception + if ( IsFilterHandler(&EHClause) + && ( offs > EHClause.FilterOffset + || offs == EHClause.FilterOffset && !start_adjust) + && ( offs < EHClause.HandlerStartPC + || offs == EHClause.HandlerStartPC && !end_adjust)) { + + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusThrowCallback: Fault inside filter [%d,%d] startAdj %d endAdj %d\n", + EHClause.FilterOffset, EHClause.HandlerStartPC, start_adjust, end_adjust); + + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionLeave(pFunc); + return SWA_ABORT; + } + + if ( (offs < EHClause.TryStartPC) || + (offs > EHClause.TryEndPC) || + (offs == EHClause.TryStartPC && start_adjust) || + (offs == EHClause.TryEndPC && end_adjust)) + continue; + + BOOL typeMatch = FALSE; + BOOL isTypedHandler = IsTypedHandler(&EHClause); + + if (isTypedHandler && !thrownType.IsNull()) + { + if (EHClause.TypeHandle == (void*)(size_t)mdTypeRefNil) + { + // this is a catch(...) + typeMatch = TRUE; + } + else + { + TypeHandle exnType = pJitManager->ResolveEHClause(&EHClause,pCf); + + // if doesn't have cached class then class wasn't loaded so couldn't have been thrown + typeMatch = !exnType.IsNull() && ExceptionIsOfRightType(exnType, thrownType); + } + } + + // @PERF: Is this too expensive? Consider storing the nesting level + // instead of the HandlerEndPC. + + // Determine the nesting level of EHClause. Just walk the table + // again, and find out how many handlers enclose it + DWORD nestingLevel = 0; + + if (IsFaultOrFinally(&EHClause)) + continue; + if (isTypedHandler) + { + LOG((LF_EH, LL_INFO100, "COMPlusThrowCallback: %s match for typed handler.\n", typeMatch?"Found":"Did not find")); + if (!typeMatch) + { + continue; + } + } + else + { + // Must be an exception filter (__except() part of __try{}__except(){}). + nestingLevel = ComputeEnclosingHandlerNestingLevel(pJitManager, + pCf->GetMethodToken(), + EHClause.HandlerStartPC); + + // We just need *any* address within the method. This will let the debugger + // resolve the EnC version of the method. 
+ PCODE pMethodAddr = GetControlPC(regs); + if (fGiveDebuggerAndProfilerNotification) + EEToDebuggerExceptionInterfaceWrapper::ExceptionFilter(pFunc, pMethodAddr, EHClause.FilterOffset, pHandlerEBP); + + UINT_PTR uStartAddress = (UINT_PTR)pCf->GetCodeInfo()->GetStartAddress(); + + // save clause information in the exinfo + pExInfo->m_EHClauseInfo.SetInfo(COR_PRF_CLAUSE_FILTER, + uStartAddress + EHClause.FilterOffset, + StackFrame((UINT_PTR)pHandlerEBP)); + + // Let the profiler know we are entering a filter + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFilterEnter(pFunc); + + COUNTER_ONLY(GetPerfCounters().m_Excep.cFiltersExecuted++); + + STRESS_LOG3(LF_EH, LL_INFO10, "COMPlusThrowCallback: calling filter code, EHClausePtr:%08x, Start:%08x, End:%08x\n", + &EHClause, EHClause.HandlerStartPC, EHClause.HandlerEndPC); + + OBJECTREF throwable = PossiblyUnwrapThrowable(pThread->GetThrowable(), pCf->GetAssembly()); + + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(TRUE); + + int iFilt = COMPlusThrowCallbackHelper(pJitManager, + pCf, + pData, + &EHClause, + nestingLevel, + throwable, + pThread); + + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(FALSE); + + // Let the profiler know we are leaving a filter + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFilterLeave(); + + pExInfo->m_EHClauseInfo.ResetInfo(); + + if (pThread->IsRudeAbortInitiated() && !pThread->IsWithinCer(pCf)) + { + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionLeave(pFunc); + return SWA_CONTINUE; + } + + // If this filter didn't want the exception, keep looking. + if (EXCEPTION_EXECUTE_HANDLER != iFilt) + continue; + } + + // Record this location, to stop the unwind phase, later. + pData->pFunc = pFunc; + pData->dHandler = i; + pData->pStack = pStack; + + // Notify the profiler that a catcher has been found + if (fGiveDebuggerAndProfilerNotification) + { + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchCatcherFound(pFunc); + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionLeave(pFunc); + } + +#ifdef DEBUGGING_SUPPORTED + // + // Notify debugger that a catcher has been found. + // + if (fIsILStub) + { + EEToDebuggerExceptionInterfaceWrapper::NotifyOfCHFFilter(pExInfo->m_pExceptionPointers, pILStubFrame); + } + else + if (fGiveDebuggerAndProfilerNotification && + CORDebuggerAttached() && !pExInfo->m_ExceptionFlags.DebuggerInterceptInfo()) + { + _ASSERTE(pData); + // We just need *any* address within the method. This will let the debugger + // resolve the EnC version of the method. + PCODE pMethodAddr = GetControlPC(regs); + + EEToDebuggerExceptionInterfaceWrapper::FirstChanceManagedExceptionCatcherFound(pThread, + pData->pFunc, pMethodAddr, + (SIZE_T)pData->pStack, + &EHClause); + } +#endif // DEBUGGING_SUPPORTED + + return SWA_ABORT; + } + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionSearchFunctionLeave(pFunc); + return SWA_CONTINUE; +} // StackWalkAction COMPlusThrowCallback() + + +//========================================================================== +// COMPlusUnwindCallback +//========================================================================== + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning (disable : 4740) // There is inline asm code in this function, which disables + // global optimizations. 
+#pragma warning (disable : 4731) +#endif +StackWalkAction COMPlusUnwindCallback (CrawlFrame *pCf, ThrowCallbackType *pData) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_MODE_COOPERATIVE; + + _ASSERTE(pData->bIsUnwind); + + Frame *pFrame = pCf->GetFrame(); + MethodDesc *pFunc = pCf->GetFunction(); + + #if defined(_DEBUG) + #define METHODNAME(pFunc) (pFunc?pFunc->m_pszDebugMethodName:"") + #else + #define METHODNAME(pFunc) "" + #endif + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusUnwindCallback: STACKCRAWL method:%pM ('%s'), Frame:%p, FrameVtable = %pV\n", + pFunc, METHODNAME(pFunc), pFrame, pCf->IsFrameless()?0:(*(void**)pFrame)); + #undef METHODNAME + + if (pFrame && pData->pTopFrame == pFrame) + /* Don't look past limiting frame if there is one */ + return SWA_ABORT; + + if (!pFunc) + return SWA_CONTINUE; + + if (!pCf->IsFrameless()) + return SWA_CONTINUE; + + Thread *pThread = GetThread(); + + // If the thread is being RudeAbort, we will not run any finally + if (pThread->IsRudeAbortInitiated() && !pThread->IsWithinCer(pCf)) + { + return SWA_CONTINUE; + } + + IJitManager* pJitManager = pCf->GetJitManager(); + _ASSERTE(pJitManager); + + ExInfo *pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + + PREGDISPLAY regs = pCf->GetRegisterSet(); + BYTE *pStack = (BYTE *) GetRegdisplaySP(regs); + + TypeHandle thrownType = TypeHandle(); + + BOOL fCanMethodHandleException = TRUE; +#ifdef FEATURE_CORRUPTING_EXCEPTIONS + // MethodDesc's security information (i.e. whether it is critical or transparent) is calculated lazily. + // If this method's security information was not precalculated, then it would have been in the first pass + // already using Security::IsMethodCritical which could take have taken us down a path which is GC_TRIGGERS. + // + // + // However, this unwind callback (for X86) is GC_NOTRIGGER and at this point the security information would have been + // calculated already. Hence, we wouldnt endup in the GC_TRIGGERS path. Thus, to keep SCAN.EXE (static contract analyzer) happy, + // we will pass a FALSE to the CanMethodHandleException call, indicating we dont need to calculate security information (and thus, + // not go down the GC_TRIGGERS path. + // + // Check if the exception can be delivered to the method? It will check if the exception + // is a CE or not. If it is, it will check if the method can process it or not. + CorruptionSeverity currentSeverity = pThread->GetExceptionState()->GetCurrentExceptionTracker()->GetCorruptionSeverity(); + + // We have to do this check for x86 since, unlike 64bit which will setup a new exception tracker for longjmp, + // x86 only sets up new trackers in the first pass (and longjmp is 2nd pass only exception). Hence, we pass + // this information in the callback structure without affecting any existing exception tracker (incase longjmp was + // a nested exception). + if (pData->m_fIsLongJump) + { + // Longjump is not a CSE. With a CSE in progress, this can be invoked by either: + // + // 1) Managed code (e.g. finally/fault/catch), OR + // 2) By native code + // + // In scenario (1), managed code can invoke it only if it was attributed with HPCSE attribute. Thus, + // longjmp is no different than managed code doing a "throw new Exception();". + // + // In scenario (2), longjmp is no different than any other non-CSE native exception raised. + // + // In both these case, longjmp should be treated as non-CSE. 
Since x86 does not setup a tracker for + // it (see comment above), we pass this information (of whether the current exception is a longjmp or not) + // to this callback (from UnwindFrames) to setup the correct corruption severity. + // + // http://www.nynaeve.net/?p=105 has a brief description of how exception-safe setjmp/longjmp works. + currentSeverity = NotCorrupting; + } + { + MethodDesc * pFuncWithCEAttribute = pFunc; + Frame * pILStubFrame = NULL; + if (pFunc->IsILStub()) + { + // We must defer to the MethodDesc of the user method instead of the IL stub + // itself because the user can specify the policy on a per-method basis and + // that won't be reflected via the IL stub's MethodDesc. + pFuncWithCEAttribute = GetUserMethodForILStub(pThread, (UINT_PTR)pStack, pFunc, &pILStubFrame); + } + fCanMethodHandleException = CEHelper::CanMethodHandleException(currentSeverity, pFuncWithCEAttribute, FALSE); + } +#endif // FEATURE_CORRUPTING_EXCEPTIONS + +#ifdef DEBUGGING_SUPPORTED + LOG((LF_EH, LL_INFO1000, "COMPlusUnwindCallback: Intercept %d, pData->pFunc 0x%X, pFunc 0x%X, pData->pStack 0x%X, pStack 0x%X\n", + pExInfo->m_ExceptionFlags.DebuggerInterceptInfo(), + pData->pFunc, + pFunc, + pData->pStack, + pStack)); + + // + // If the debugger wants to intercept this exception here, go do that. + // + if (pExInfo->m_ExceptionFlags.DebuggerInterceptInfo() && (pData->pFunc == pFunc) && (pData->pStack == pStack)) + { + goto LDoDebuggerIntercept; + } +#endif + + bool fGiveDebuggerAndProfilerNotification; + fGiveDebuggerAndProfilerNotification = !pFunc->IsILStub(); + + // Notify the profiler of the function we're dealing with in the unwind phase + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFunctionEnter(pFunc); + + EH_CLAUSE_ENUMERATOR pEnumState; + unsigned EHCount; + +#ifdef FEATURE_CORRUPTING_EXCEPTIONS + if (!fCanMethodHandleException) + { + LOG((LF_EH, LL_INFO100, "COMPlusUnwindCallback - CEHelper decided not to look for exception handlers in the method(MD:%p).\n", pFunc)); + + // Set the flag to skip this frame since the CE cannot be delivered + _ASSERTE(currentSeverity == ProcessCorrupting); + + // Force EHClause count to be zero + EHCount = 0; + } + else +#endif // FEATURE_CORRUPTING_EXCEPTIONS + { + EHCount = pJitManager->InitializeEHEnumeration(pCf->GetMethodToken(), &pEnumState); + } + + if (EHCount == 0) + { + // Inform the profiler that we're leaving, and what pass we're on + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFunctionLeave(pFunc); + + return SWA_CONTINUE; + } + + // if we are being called on an unwind for an exception that we did not try to catch, eg. 
+ // an internal EE exception, then pThread->GetThrowable will be null + { + OBJECTREF throwable = pThread->GetThrowable(); + if (throwable != NULL) + { + throwable = PossiblyUnwrapThrowable(throwable, pCf->GetAssembly()); + thrownType = TypeHandle(throwable->GetTrueMethodTable()); + } + } +#ifdef DEBUGGING_SUPPORTED + BYTE *pHandlerEBP; + pHandlerEBP = (BYTE *) GetRegdisplayFP(regs); +#endif + + DWORD offs; + offs = (DWORD)pCf->GetRelOffset(); //= (BYTE*) (*regs->pPC) - (BYTE*) pCf->GetStartAddress(); + + LOG((LF_EH, LL_INFO100, "COMPlusUnwindCallback: current EIP offset in method 0x%x, \n", offs)); + + EE_ILEXCEPTION_CLAUSE EHClause; + unsigned start_adjust, end_adjust; + + start_adjust = !(pCf->HasFaulted() || pCf->IsIPadjusted()); + end_adjust = pCf->IsActiveFunc(); + + for(ULONG i=0; i < EHCount; i++) + { + pJitManager->GetNextEHClause(&pEnumState, &EHClause); + _ASSERTE(IsValidClause(&EHClause)); + + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusUnwindCallback: considering '%s' clause [%d,%d], offs:%d\n", + (IsFault(&EHClause) ? "fault" : ( + IsFinally(&EHClause) ? "finally" : ( + IsFilterHandler(&EHClause) ? "filter" : ( + IsTypedHandler(&EHClause) ? "typed" : "unknown")))), + EHClause.TryStartPC, + EHClause.TryEndPC, + offs + ); + + // Checking the exception range is a bit tricky because + // on CPU faults (null pointer access, div 0, ..., the IP points + // to the faulting instruction, but on calls, the IP points + // to the next instruction. + // This means that we should not include the start point on calls + // as this would be a call just preceding the try block. + // Also, we should include the end point on calls, but not faults. + + if ( IsFilterHandler(&EHClause) + && ( offs > EHClause.FilterOffset + || offs == EHClause.FilterOffset && !start_adjust) + && ( offs < EHClause.HandlerStartPC + || offs == EHClause.HandlerStartPC && !end_adjust) + ) { + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusUnwindCallback: Fault inside filter [%d,%d] startAdj %d endAdj %d\n", + EHClause.FilterOffset, EHClause.HandlerStartPC, start_adjust, end_adjust); + + // Make the filter as done. See comment in CallJitEHFilter + // on why we have to do it here. + Frame* pFilterFrame = pThread->GetFrame(); + _ASSERTE(pFilterFrame->GetVTablePtr() == ExceptionFilterFrame::GetMethodFrameVPtr()); + ((ExceptionFilterFrame*)pFilterFrame)->SetFilterDone(); + + // Inform the profiler that we're leaving, and what pass we're on + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFunctionLeave(pFunc); + + return SWA_ABORT; + } + + if ( (offs < EHClause.TryStartPC) || + (offs > EHClause.TryEndPC) || + (offs == EHClause.TryStartPC && start_adjust) || + (offs == EHClause.TryEndPC && end_adjust)) + continue; + + // @PERF : Is this too expensive? Consider storing the nesting level + // instead of the HandlerEndPC. + + // Determine the nesting level of EHClause. Just walk the table + // again, and find out how many handlers enclose it + + DWORD nestingLevel = ComputeEnclosingHandlerNestingLevel(pJitManager, + pCf->GetMethodToken(), + EHClause.HandlerStartPC); + + // We just need *any* address within the method. This will let the debugger + // resolve the EnC version of the method. 
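+        // For illustration, a hedged sketch of what "walk the table again" means for the
+        // ComputeEnclosingHandlerNestingLevel call above: re-enumerate the method's EH clauses
+        // and count how many of them enclose the handler's start offset, roughly
+        //
+        //     EH_CLAUSE_ENUMERATOR enumState;
+        //     unsigned count = pJitManager->InitializeEHEnumeration(pCf->GetMethodToken(), &enumState);
+        //     DWORD level = 0;
+        //     for (unsigned j = 0; j < count; j++)
+        //     {
+        //         EE_ILEXCEPTION_CLAUSE c;
+        //         pJitManager->GetNextEHClause(&enumState, &c);
+        //         if (c.TryStartPC <= EHClause.HandlerStartPC && EHClause.HandlerStartPC < c.TryEndPC)
+        //             level++;
+        //     }
+        //
+        // (simplified; the real helper may apply further adjustments)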
+ PCODE pMethodAddr = GetControlPC(regs); + + UINT_PTR uStartAddress = (UINT_PTR)pCf->GetCodeInfo()->GetStartAddress(); + + if (IsFaultOrFinally(&EHClause)) + { + COUNTER_ONLY(GetPerfCounters().m_Excep.cFinallysExecuted++); + + if (fGiveDebuggerAndProfilerNotification) + EEToDebuggerExceptionInterfaceWrapper::ExceptionHandle(pFunc, pMethodAddr, EHClause.HandlerStartPC, pHandlerEBP); + + pExInfo->m_EHClauseInfo.SetInfo(COR_PRF_CLAUSE_FINALLY, + uStartAddress + EHClause.HandlerStartPC, + StackFrame((UINT_PTR)pHandlerEBP)); + + // Notify the profiler that we are about to execute the finally code + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFinallyEnter(pFunc); + + LOG((LF_EH, LL_INFO100, "COMPlusUnwindCallback: finally clause [%d,%d] - call\n", EHClause.TryStartPC, EHClause.TryEndPC)); + + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(TRUE); + + ::CallJitEHFinally(pCf, (BYTE *)uStartAddress, &EHClause, nestingLevel); + + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(FALSE); + + LOG((LF_EH, LL_INFO100, "COMPlusUnwindCallback: finally - returned\n")); + + // Notify the profiler that we are done with the finally code + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFinallyLeave(); + + pExInfo->m_EHClauseInfo.ResetInfo(); + + continue; + } + + // Current is not a finally, check if it's the catching handler (or filter). + if (pData->pFunc != pFunc || (ULONG)(pData->dHandler) != i || pData->pStack != pStack) + { + continue; + } + +#ifdef _DEBUG + gLastResumedExceptionFunc = pCf->GetFunction(); + gLastResumedExceptionHandler = i; +#endif + + // save clause information in the exinfo + pExInfo->m_EHClauseInfo.SetInfo(COR_PRF_CLAUSE_CATCH, + uStartAddress + EHClause.HandlerStartPC, + StackFrame((UINT_PTR)pHandlerEBP)); + + // Notify the profiler that we are about to resume at the catcher. + if (fGiveDebuggerAndProfilerNotification) + { + DACNotify::DoExceptionCatcherEnterNotification(pFunc, EHClause.HandlerStartPC); + + EEToProfilerExceptionInterfaceWrapper::ExceptionCatcherEnter(pThread, pFunc); + + EEToDebuggerExceptionInterfaceWrapper::ExceptionHandle(pFunc, pMethodAddr, EHClause.HandlerStartPC, pHandlerEBP); + } + + STRESS_LOG4(LF_EH, LL_INFO100, "COMPlusUnwindCallback: offset 0x%x matches clause [0x%x, 0x%x) matches in method %pM\n", + offs, EHClause.TryStartPC, EHClause.TryEndPC, pFunc); + + // ResumeAtJitEH will set pExInfo->m_EHClauseInfo.m_fManagedCodeEntered = TRUE; at the appropriate time + ::ResumeAtJitEH(pCf, (BYTE *)uStartAddress, &EHClause, nestingLevel, pThread, pData->bUnwindStack); + //UNREACHABLE_MSG("ResumeAtJitEH shouldn't have returned!"); + + // we do not set pExInfo->m_EHClauseInfo.m_fManagedCodeEntered = FALSE here, + // that happens when the catch clause calls back to COMPlusEndCatch + + } + + STRESS_LOG1(LF_EH, LL_INFO100, "COMPlusUnwindCallback: no handler found in method %pM\n", pFunc); + if (fGiveDebuggerAndProfilerNotification) + EEToProfilerExceptionInterfaceWrapper::ExceptionUnwindFunctionLeave(pFunc); + + return SWA_CONTINUE; + + +#ifdef DEBUGGING_SUPPORTED +LDoDebuggerIntercept: + + STRESS_LOG1(LF_EH|LF_CORDB, LL_INFO100, "COMPlusUnwindCallback: Intercepting in method %pM\n", pFunc); + + // + // Setup up the easy parts of the context to restart at. 
+ // + EHContext context; + + // + // Note: EAX ECX EDX are scratch + // + context.Esp = (DWORD)(size_t)(GetRegdisplaySP(regs)); + context.Ebx = *regs->pEbx; + context.Esi = *regs->pEsi; + context.Edi = *regs->pEdi; + context.Ebp = *regs->pEbp; + + // + // Set scratch registers to 0 to avoid reporting incorrect values to GC in case of debugger changing the IP + // in the middle of a scratch register lifetime (see Dev10 754922) + // + context.Eax = 0; + context.Ecx = 0; + context.Edx = 0; + + // + // Ok, now set the target Eip to the address the debugger requested. + // + ULONG_PTR nativeOffset; + pExInfo->m_DebuggerExState.GetDebuggerInterceptInfo(NULL, NULL, NULL, NULL, &nativeOffset, NULL); + context.Eip = GetControlPC(regs) - (pCf->GetRelOffset() - nativeOffset); + + // + // Finally we need to get the correct Esp for this nested level + // + + context.Esp = pCf->GetCodeManager()->GetAmbientSP(regs, + pCf->GetCodeInfo(), + nativeOffset, + pData->dHandler, + pCf->GetCodeManState() + ); + // + // In case we see unknown FS:[0] handlers we delay the interception point until we reach the handler that protects the interception point. + // This way we have both FS:[0] handlers being poped up by RtlUnwind and managed capital F Frames being unwinded by managed stackwalker. + // + BOOL fCheckForUnknownHandler = TRUE; + if (PopNestedExceptionRecords((LPVOID)(size_t)context.Esp, fCheckForUnknownHandler)) + { + // Let ClrDebuggerDoUnwindAndIntercept RtlUnwind continue to unwind frames until we reach the handler protected by COMPlusNestedExceptionHandler. + pExInfo->m_InterceptionContext = context; + pExInfo->m_ValidInterceptionContext = TRUE; + STRESS_LOG0(LF_EH|LF_CORDB, LL_INFO100, "COMPlusUnwindCallback: Skip interception until unwinding reaches the actual handler protected by COMPlusNestedExceptionHandler\n"); + } + else + { + // + // Pop off all the Exception information up to this point in the stack + // + UnwindExceptionTrackerAndResumeInInterceptionFrame(pExInfo, &context); + } + return SWA_ABORT; +#endif // DEBUGGING_SUPPORTED +} // StackWalkAction COMPlusUnwindCallback () +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning (disable : 4740) // There is inline asm code in this function, which disables + // global optimizations. 
+#pragma warning (disable : 4731) +#endif +void ResumeAtJitEH(CrawlFrame* pCf, + BYTE* startPC, + EE_ILEXCEPTION_CLAUSE *EHClausePtr, + DWORD nestingLevel, + Thread *pThread, + BOOL unwindStack) +{ + // No dynamic contract here because this function doesn't return and destructors wouldn't be executed + WRAPPER_NO_CONTRACT; + + EHContext context; + + context.Setup(PCODE(startPC + EHClausePtr->HandlerStartPC), pCf->GetRegisterSet()); + + size_t * pShadowSP = NULL; // Write Esp to *pShadowSP before jumping to handler + size_t * pHandlerEnd = NULL; + + OBJECTREF throwable = PossiblyUnwrapThrowable(pThread->GetThrowable(), pCf->GetAssembly()); + + pCf->GetCodeManager()->FixContext(ICodeManager::CATCH_CONTEXT, + &context, + pCf->GetCodeInfo(), + EHClausePtr->HandlerStartPC, + nestingLevel, + throwable, + pCf->GetCodeManState(), + &pShadowSP, + &pHandlerEnd); + + if (pHandlerEnd) + { + *pHandlerEnd = EHClausePtr->HandlerEndPC; + } + + // save esp so that endcatch can restore it (it always restores, so want correct value) + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + pExInfo->m_dEsp = (LPVOID)context.GetSP(); + LOG((LF_EH, LL_INFO1000, "ResumeAtJitEH: current m_dEsp set to %p\n", context.GetSP())); + + PVOID dEsp = GetCurrentSP(); + + if (!unwindStack) + { + // If we don't want to unwind the stack, then the guard page had better not be gone! + _ASSERTE(pThread->DetermineIfGuardPagePresent()); + + // so down below won't really update esp + context.SetSP(dEsp); + pExInfo->m_pShadowSP = pShadowSP; // so that endcatch can zero it back + + if (pShadowSP) + { + *pShadowSP = (size_t)dEsp; + } + } + else + { + // so shadow SP has the real SP as we are going to unwind the stack + dEsp = (LPVOID)context.GetSP(); + + // BEGIN: pExInfo->UnwindExInfo(dEsp); + ExInfo *pPrevNestedInfo = pExInfo->m_pPrevNestedInfo; + + while (pPrevNestedInfo && pPrevNestedInfo->m_StackAddress < dEsp) + { + LOG((LF_EH, LL_INFO1000, "ResumeAtJitEH: popping nested ExInfo at 0x%p\n", pPrevNestedInfo->m_StackAddress)); + + pPrevNestedInfo->DestroyExceptionHandle(); + pPrevNestedInfo->m_StackTraceInfo.FreeStackTrace(); + +#ifdef DEBUGGING_SUPPORTED + if (g_pDebugInterface != NULL) + { + g_pDebugInterface->DeleteInterceptContext(pPrevNestedInfo->m_DebuggerExState.GetDebuggerInterceptContext()); + } +#endif // DEBUGGING_SUPPORTED + + pPrevNestedInfo = pPrevNestedInfo->m_pPrevNestedInfo; + } + + pExInfo->m_pPrevNestedInfo = pPrevNestedInfo; + + _ASSERTE(pExInfo->m_pPrevNestedInfo == 0 || pExInfo->m_pPrevNestedInfo->m_StackAddress >= dEsp); + + // Before we unwind the SEH records, get the Frame from the top-most nested exception record. + Frame* pNestedFrame = GetCurrFrame(FindNestedEstablisherFrame(GetCurrentSEHRecord())); + + PopNestedExceptionRecords((LPVOID)(size_t)dEsp); + + EXCEPTION_REGISTRATION_RECORD* pNewBottomMostHandler = GetCurrentSEHRecord(); + + pExInfo->m_pShadowSP = pShadowSP; + + // The context and exception record are no longer any good. + _ASSERTE(pExInfo->m_pContext < dEsp); // It must be off the top of the stack. + pExInfo->m_pContext = 0; // Whack it. + pExInfo->m_pExceptionRecord = 0; + pExInfo->m_pExceptionPointers = 0; + + // We're going to put one nested record back on the stack before we resume. This is + // where it goes. + NestedHandlerExRecord *pNestedHandlerExRecord = (NestedHandlerExRecord*)((BYTE*)dEsp - ALIGN_UP(sizeof(NestedHandlerExRecord), STACK_ALIGN_SIZE)); + + // The point of no return. The next statement starts scribbling on the stack. 
It's + // deep enough that we won't hit our own locals. (That's important, 'cuz we're still + // using them.) + // + _ASSERTE(dEsp > &pCf); + pNestedHandlerExRecord->m_handlerInfo.m_hThrowable=NULL; // This is random memory. Handle + // must be initialized to null before + // calling Init(), as Init() will try + // to free any old handle. + pNestedHandlerExRecord->Init((PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler, pNestedFrame); + + INSTALL_EXCEPTION_HANDLING_RECORD(&(pNestedHandlerExRecord->m_ExReg)); + + context.SetSP(pNestedHandlerExRecord); + + // We might have moved the bottommost handler. The nested record itself is never + // the bottom most handler -- it's pushed afte the fact. So we have to make the + // bottom-most handler the one BEFORE the nested record. + if (pExInfo->m_pBottomMostHandler < pNewBottomMostHandler) + { + STRESS_LOG3(LF_EH, LL_INFO10000, "ResumeAtJitEH: setting ExInfo:0x%p m_pBottomMostHandler from 0x%p to 0x%p\n", + pExInfo, pExInfo->m_pBottomMostHandler, pNewBottomMostHandler); + pExInfo->m_pBottomMostHandler = pNewBottomMostHandler; + } + + if (pShadowSP) + { + *pShadowSP = context.GetSP(); + } + } + + STRESS_LOG3(LF_EH, LL_INFO100, "ResumeAtJitEH: resuming at EIP = %p ESP = %p EBP = %p\n", + context.Eip, context.GetSP(), context.GetFP()); + +#ifdef STACK_GUARDS_DEBUG + // We are transitioning back to managed code, so ensure that we are in + // SO-tolerant mode before we do so. + RestoreSOToleranceState(); +#endif + + // we want this to happen as late as possible but certainly after the notification + // that the handle for the current ExInfo has been freed has been delivered + pExInfo->m_EHClauseInfo.SetManagedCodeEntered(TRUE); + + ETW::ExceptionLog::ExceptionCatchBegin(pCf->GetCodeInfo()->GetMethodDesc(), (PVOID)pCf->GetCodeInfo()->GetStartAddress()); + + ResumeAtJitEHHelper(&context); + UNREACHABLE_MSG("Should never return from ResumeAtJitEHHelper!"); + + // we do not set pExInfo->m_EHClauseInfo.m_fManagedCodeEntered = FALSE here, + // that happens when the catch clause calls back to COMPlusEndCatch + // we don't return to this point so it would be moot (see unreachable_msg above) + +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +// Must be in a separate function because INSTALL_COMPLUS_EXCEPTION_HANDLER has a filter +int CallJitEHFilterWorker(size_t *pShadowSP, EHContext *pContext) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + STATIC_CONTRACT_SO_INTOLERANT; + + int retVal = EXCEPTION_CONTINUE_SEARCH; + + BEGIN_CALL_TO_MANAGED(); + + retVal = CallJitEHFilterHelper(pShadowSP, pContext); + + END_CALL_TO_MANAGED(); + + return retVal; +} + +int CallJitEHFilter(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel, OBJECTREF thrownObj) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_COOPERATIVE; + + int retVal = EXCEPTION_CONTINUE_SEARCH; + size_t * pShadowSP = NULL; + EHContext context; + + context.Setup(PCODE(startPC + EHClausePtr->FilterOffset), pCf->GetRegisterSet()); + + size_t * pEndFilter = NULL; // Write + pCf->GetCodeManager()->FixContext(ICodeManager::FILTER_CONTEXT, &context, pCf->GetCodeInfo(), + EHClausePtr->FilterOffset, nestingLevel, thrownObj, pCf->GetCodeManState(), + &pShadowSP, &pEndFilter); + + // End of the filter is the same as start of handler + if (pEndFilter) + { + *pEndFilter = EHClausePtr->HandlerStartPC; + } + + // ExceptionFilterFrame serves two purposes: + // + // 1. 
It serves as a frame that stops the managed search for handler + // if we fault in the filter. ThrowCallbackType.pTopFrame is going point + // to this frame during search for exception handler inside filter. + // The search for handler needs a frame to stop. If we had no frame here, + // the exceptions in filters would not be swallowed correctly since we would + // walk past the EX_TRY/EX_CATCH block in COMPlusThrowCallbackHelper. + // + // 2. It allows setting of SHADOW_SP_FILTER_DONE flag in UnwindFrames() + // if we fault in the filter. We have to set this flag together with unwinding + // of the filter frame. Using a regular C++ holder to clear this flag here would cause + // GC holes. The stack would be in inconsistent state when we trigger gc just before + // returning from UnwindFrames. + + FrameWithCookie exceptionFilterFrame(pShadowSP); + + ETW::ExceptionLog::ExceptionFilterBegin(pCf->GetCodeInfo()->GetMethodDesc(), (PVOID)pCf->GetCodeInfo()->GetStartAddress()); + + retVal = CallJitEHFilterWorker(pShadowSP, &context); + + ETW::ExceptionLog::ExceptionFilterEnd(); + + exceptionFilterFrame.Pop(); + + return retVal; +} + +void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel) +{ + WRAPPER_NO_CONTRACT; + + EHContext context; + context.Setup(PCODE(startPC + EHClausePtr->HandlerStartPC), pCf->GetRegisterSet()); + + size_t * pShadowSP = NULL; // Write Esp to *pShadowSP before jumping to handler + + size_t * pFinallyEnd = NULL; + pCf->GetCodeManager()->FixContext( + ICodeManager::FINALLY_CONTEXT, &context, pCf->GetCodeInfo(), + EHClausePtr->HandlerStartPC, nestingLevel, ObjectToOBJECTREF((Object *) NULL), pCf->GetCodeManState(), + &pShadowSP, &pFinallyEnd); + + if (pFinallyEnd) + { + *pFinallyEnd = EHClausePtr->HandlerEndPC; + } + + ETW::ExceptionLog::ExceptionFinallyBegin(pCf->GetCodeInfo()->GetMethodDesc(), (PVOID)pCf->GetCodeInfo()->GetStartAddress()); + + CallJitEHFinallyHelper(pShadowSP, &context); + + ETW::ExceptionLog::ExceptionFinallyEnd(); + + // + // Update the registers using new context + // + // This is necessary to reflect GC pointer changes during the middle of a unwind inside a + // finally clause, because: + // 1. GC won't see the part of stack inside try (which has thrown an exception) that is already + // unwinded and thus GC won't update GC pointers for this portion of the stack, but rather the + // call stack in finally. + // 2. upon return of finally, the unwind process continues and unwinds stack based on the part + // of stack inside try and won't see the updated values in finally. + // As a result, we need to manually update the context using register values upon return of finally + // + // Note that we only update the registers for finally clause because + // 1. For filter handlers, stack walker is able to see the whole stack (including the try part) + // with the help of ExceptionFilterFrame as filter handlers are called in first pass + // 2. For catch handlers, the current unwinding is already finished + // + context.UpdateFrame(pCf->GetRegisterSet()); + + // This does not need to be guarded by a holder because the frame is dead if an exception gets thrown. Filters are different + // since they are run in the first pass, so we must update the shadowSP reset in CallJitEHFilter. 
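+ //
+ // Roughly, the shadow-SP slot lifecycle looks like this (an illustrative sketch,
+ // not literal code from the helpers):
+ //
+ //     *pShadowSP = context.Esp;   // done by the Call*Helper before jumping to the clause
+ //     ... the funclet runs ...
+ //     *pShadowSP = 0;             // done below, once the finally has returned
+ //
+ // Filters are handled differently (see CallJitEHFilter and the SHADOW_SP_FILTER_DONE
+ // handling described above) because they run during the first pass.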
+ if (pShadowSP) { + *pShadowSP = 0; // reset the shadowSP to 0 + } +} +#if defined(_MSC_VER) +#pragma warning (default : 4731) +#endif + +//===================================================================== +// ********************************************************************* +BOOL ComPlusFrameSEH(EXCEPTION_REGISTRATION_RECORD* pEHR) +{ + LIMITED_METHOD_CONTRACT; + + return ((LPVOID)pEHR->Handler == (LPVOID)COMPlusFrameHandler || (LPVOID)pEHR->Handler == (LPVOID)COMPlusNestedExceptionHandler); +} + + +// +//------------------------------------------------------------------------- +// This is installed when we call COMPlusFrameHandler to provide a bound to +// determine when are within a nested exception +//------------------------------------------------------------------------- +EXCEPTION_HANDLER_IMPL(COMPlusNestedExceptionHandler) +{ + WRAPPER_NO_CONTRACT; + + if (pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)) + { + LOG((LF_EH, LL_INFO100, " COMPlusNestedHandler(unwind) with %x at %x\n", pExceptionRecord->ExceptionCode, + pContext ? GetIP(pContext) : 0)); + + + // We're unwinding past a nested exception record, which means that we've thrown + // a new exception out of a region in which we're handling a previous one. The + // previous exception is overridden -- and needs to be unwound. + + // The preceding is ALMOST true. There is one more case, where we use setjmp/longjmp + // from withing a nested handler. We won't have a nested exception in that case -- just + // the unwind. + + Thread* pThread = GetThread(); + _ASSERTE(pThread); + ExInfo* pExInfo = &(pThread->GetExceptionState()->m_currentExInfo); + ExInfo* pPrevNestedInfo = pExInfo->m_pPrevNestedInfo; + + if (pPrevNestedInfo == &((NestedHandlerExRecord*)pEstablisherFrame)->m_handlerInfo) + { + _ASSERTE(pPrevNestedInfo); + + LOG((LF_EH, LL_INFO100, "COMPlusNestedExceptionHandler: PopExInfo(): popping nested ExInfo at 0x%p\n", pPrevNestedInfo)); + + pPrevNestedInfo->DestroyExceptionHandle(); + pPrevNestedInfo->m_StackTraceInfo.FreeStackTrace(); + +#ifdef DEBUGGING_SUPPORTED + if (g_pDebugInterface != NULL) + { + g_pDebugInterface->DeleteInterceptContext(pPrevNestedInfo->m_DebuggerExState.GetDebuggerInterceptContext()); + } +#endif // DEBUGGING_SUPPORTED + + pExInfo->m_pPrevNestedInfo = pPrevNestedInfo->m_pPrevNestedInfo; + + } else { + // The whacky setjmp/longjmp case. Nothing to do. + } + + } else { + LOG((LF_EH, LL_INFO100, " InCOMPlusNestedHandler with %x at %x\n", pExceptionRecord->ExceptionCode, + pContext ? GetIP(pContext) : 0)); + } + + + // There is a nasty "gotcha" in the way exception unwinding, finally's, and nested exceptions + // interact. Here's the scenario ... it involves two exceptions, one normal one, and one + // raised in a finally. + // + // The first exception occurs, and is caught by some handler way up the stack. That handler + // calls RtlUnwind -- and handlers that didn't catch this first exception are called again, with + // the UNWIND flag set. If, one of the handlers throws an exception during + // unwind (like, a throw from a finally) -- then that same handler is not called during + // the unwind pass of the second exception. [ASIDE: It is called on first-pass.] + // + // What that means is -- the COMPlusExceptionHandler, can't count on unwinding itself correctly + // if an exception is thrown from a finally. Instead, it relies on the NestedExceptionHandler + // that it pushes for this. 
+ // + + EXCEPTION_DISPOSITION retval = EXCEPTION_HANDLER_FWD(COMPlusFrameHandler); + LOG((LF_EH, LL_INFO100, "Leaving COMPlusNestedExceptionHandler with %d\n", retval)); + return retval; +} + +EXCEPTION_REGISTRATION_RECORD *FindNestedEstablisherFrame(EXCEPTION_REGISTRATION_RECORD *pEstablisherFrame) +{ + LIMITED_METHOD_CONTRACT; + + while (pEstablisherFrame->Handler != (PEXCEPTION_ROUTINE)COMPlusNestedExceptionHandler) { + pEstablisherFrame = pEstablisherFrame->Next; + _ASSERTE(pEstablisherFrame != EXCEPTION_CHAIN_END); // should always find one + } + return pEstablisherFrame; +} + +EXCEPTION_HANDLER_IMPL(FastNExportExceptHandler) +{ + WRAPPER_NO_CONTRACT; + + // Most of our logic is in commin with COMPlusFrameHandler. + EXCEPTION_DISPOSITION retval = EXCEPTION_HANDLER_FWD(COMPlusFrameHandler); + +#ifdef _DEBUG + // If the exception is escaping the last CLR personality routine on the stack, + // then state a flag on the thread to indicate so. + if (retval == ExceptionContinueSearch) + { + SetReversePInvokeEscapingUnhandledExceptionStatus(IS_UNWINDING(pExceptionRecord->ExceptionFlags), pEstablisherFrame); + } +#endif // _DEBUG + + return retval; +} + + +// Just like a regular NExport handler -- except it pops an extra frame on unwind. A handler +// like this is needed by the COMMethodStubProlog code. It first pushes a frame -- and then +// pushes a handler. When we unwind, we need to pop the extra frame to avoid corrupting the +// frame chain in the event of an unmanaged catcher. +// +EXCEPTION_HANDLER_IMPL(UMThunkPrestubHandler) +{ + // @todo: we'd like to have a dynamic contract here, but there's a problem. (Bug 129180) Enter on the CRST used + // in HandleManagedFault leaves the no-trigger count incremented. The destructor of this contract will restore + // it to zero, then when we leave the CRST in LinkFrameAndThrow, we assert because we're trying to decrement the + // gc-trigger count down past zero. The solution is to fix what we're doing with this CRST. + STATIC_CONTRACT_THROWS; // COMPlusFrameHandler throws + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_ANY; + + EXCEPTION_DISPOSITION retval = ExceptionContinueSearch; + + BEGIN_CONTRACT_VIOLATION(SOToleranceViolation); + + // We must forward to the COMPlusFrameHandler. This will unwind the Frame Chain up to here, and also leave the + // preemptive GC mode set correctly. + retval = EXCEPTION_HANDLER_FWD(COMPlusFrameHandler); + +#ifdef _DEBUG + // If the exception is escaping the last CLR personality routine on the stack, + // then state a flag on the thread to indicate so. + if (retval == ExceptionContinueSearch) + { + SetReversePInvokeEscapingUnhandledExceptionStatus(IS_UNWINDING(pExceptionRecord->ExceptionFlags), pEstablisherFrame); + } +#endif // _DEBUG + + if (IS_UNWINDING(pExceptionRecord->ExceptionFlags)) + { + // Pops an extra frame on unwind. + + GCX_COOP(); // Must be cooperative to modify frame chain. 
+ + Thread *pThread = GetThread(); + _ASSERTE(pThread); + Frame *pFrame = pThread->GetFrame(); + pFrame->ExceptionUnwind(); + pFrame->Pop(pThread); + } + + END_CONTRACT_VIOLATION; + + return retval; +} + +LONG CLRNoCatchHandler(EXCEPTION_POINTERS* pExceptionInfo, PVOID pv) +{ + WRAPPER_NO_CONTRACT; + STATIC_CONTRACT_ENTRY_POINT; + + LONG result = EXCEPTION_CONTINUE_SEARCH; + + // This function can be called during the handling of a SO + //BEGIN_ENTRYPOINT_VOIDRET; + + result = CLRVectoredExceptionHandler(pExceptionInfo); + + if (EXCEPTION_EXECUTE_HANDLER == result) + { + result = EXCEPTION_CONTINUE_SEARCH; + } + + //END_ENTRYPOINT_VOIDRET; + + return result; +} + +#ifdef FEATURE_COMINTEROP +// The reverse COM interop path needs to be sure to pop the ComMethodFrame that is pushed, but we do not want +// to have an additional FS:0 handler between the COM callsite and the call into managed. So we push this +// FS:0 handler, which will defer to the usual COMPlusFrameHandler and then perform the cleanup of the +// ComMethodFrame, if needed. +EXCEPTION_HANDLER_IMPL(COMPlusFrameHandlerRevCom) +{ + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + STATIC_CONTRACT_MODE_ANY; + + // Defer to COMPlusFrameHandler + EXCEPTION_DISPOSITION result = EXCEPTION_HANDLER_FWD(COMPlusFrameHandler); + + if (pExceptionRecord->ExceptionFlags & (EXCEPTION_UNWINDING | EXCEPTION_EXIT_UNWIND)) + { + // Do cleanup as needed + ComMethodFrame::DoSecondPassHandlerCleanup(GetCurrFrame(pEstablisherFrame)); + } + + return result; +} +#endif // FEATURE_COMINTEROP + + +// Returns TRUE if caller should resume execution. +BOOL +AdjustContextForVirtualStub( + EXCEPTION_RECORD *pExceptionRecord, + CONTEXT *pContext) +{ + LIMITED_METHOD_CONTRACT; + + Thread * pThread = GetThread(); + + // We may not have a managed thread object. Example is an AV on the helper thread. + // (perhaps during StubManager::IsStub) + if (pThread == NULL) + { + return FALSE; + } + + PCODE f_IP = GetIP(pContext); + + VirtualCallStubManager::StubKind sk; + /* VirtualCallStubManager *pMgr = */ VirtualCallStubManager::FindStubManager(f_IP, &sk); + + if (sk == VirtualCallStubManager::SK_DISPATCH) + { + if (*PTR_WORD(f_IP) != X86_INSTR_CMP_IND_ECX_IMM32) + { + _ASSERTE(!"AV in DispatchStub at unknown instruction"); + return FALSE; + } + } + else + if (sk == VirtualCallStubManager::SK_RESOLVE) + { + if (*PTR_WORD(f_IP) != X86_INSTR_MOV_EAX_ECX_IND) + { + _ASSERTE(!"AV in ResolveStub at unknown instruction"); + return FALSE; + } + + SetSP(pContext, dac_cast(dac_cast(GetSP(pContext)) + sizeof(void*))); // rollback push eax + } + else + { + return FALSE; + } + + PCODE callsite = GetAdjustedCallAddress(*dac_cast(GetSP(pContext))); + pExceptionRecord->ExceptionAddress = (PVOID)callsite; + SetIP(pContext, callsite); + + // put ESP back to what it was before the call. + SetSP(pContext, dac_cast(dac_cast(GetSP(pContext)) + sizeof(void*))); + + return TRUE; +} + +#endif // !DACCESS_COMPILE diff --git a/src/vm/i386/fptext.asm b/src/vm/i386/fptext.asm new file mode 100644 index 0000000000..2190d18519 --- /dev/null +++ b/src/vm/i386/fptext.asm @@ -0,0 +1,277 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. 
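+
+; For reference, the NUMBER offsets defined below (nPrecision/nScale/nSign/nDigits)
+; correspond to a C struct shaped roughly like the following; the field names are
+; illustrative, and only the offsets 0/4/8/12 and the 2-byte digit elements are
+; actually relied on by this code:
+;
+;     struct NUMBER {
+;         int     precision;   // +0
+;         int     scale;       // +4
+;         int     sign;        // +8
+;         wchar_t digits[];    // +12, NUL-terminated run of '0'..'9' characters
+;     };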
+ +; ==++== +; + +; +; ==--== + .386 + .model flat + + option casemap:none + public _DoubleToNumber,_NumberToDouble + +; NUMBER structure + +nPrecision equ (dword ptr 0) +nScale equ (dword ptr 4) +nSign equ (dword ptr 8) +nDigits equ (word ptr 12) + + .code + +; Powers of 10 from 1.0E1 to 1.0E15 increasing by 1 + +Pow10By1 label tbyte + + dt 1.0E1 + dt 1.0E2 + dt 1.0E3 + dt 1.0E4 + dt 1.0E5 + dt 1.0E6 + dt 1.0E7 + dt 1.0E8 + dt 1.0E9 + dt 1.0E10 + dt 1.0E11 + dt 1.0E12 + dt 1.0E13 + dt 1.0E14 + dt 1.0E15 + +; Powers of 10 from 1.0E16 to 1.0E336 increasing by 16 + +Pow10By16 label tbyte + + dt 1.0E16 + dt 1.0E32 + dt 1.0E48 + dt 1.0E64 + dt 1.0E80 + dt 1.0E96 + dt 1.0E112 + dt 1.0E128 + dt 1.0E144 + dt 1.0E160 + dt 1.0E176 + dt 1.0E192 + dt 1.0E208 + dt 1.0E224 + dt 1.0E240 + dt 1.0E256 + dt 1.0E272 + dt 1.0E288 + dt 1.0E304 + dt 1.0E320 + dt 1.0E336 + +; Single precision constants + +Single10 dd 10.0 +SingleINF dd 7F800000H + +g_CwStd dw 137fH ;Mask all errors, 64-bit, round near + +; void _cdecl DoubleToNumber(double value, int precision, NUMBER* number) + +_DoubleToNumber proc + +value equ (qword ptr [ebp+8]) +precision equ (dword ptr [ebp+16]) +number equ (dword ptr [ebp+20]) +paramSize = 16 + +cwsave equ (word ptr [ebp-24]) +digits equ (tbyte ptr [ebp-20]) +temp equ (tbyte ptr [ebp-10]) +localSize = 24 + + push ebp + mov ebp,esp + sub esp,localSize + push edi + push ebx + fnstcw cwsave + fldcw g_CwStd + fld value + fstp temp + mov edi,number + mov eax,precision + mov nPrecision[edi],eax + movzx eax,word ptr temp[8] + mov edx,eax + shr edx,15 + mov nSign[edi],edx + and eax,7FFFH + je DN1 + cmp eax,7FFFH + jne DN10 + mov eax,80000000H + cmp dword ptr temp[4],eax + jne DN1 + cmp dword ptr temp[0],0 + jne DN1 + dec eax +DN1: mov nScale[edi],eax + mov nDigits[edi],0 + jmp DN30 +DN10: fld value + sub eax,16382+58 ;Remove bias and 58 bits + imul eax,19728 ;log10(2) * 2^16 = .30103 * 65536 + add eax,0FFFFH ;Round up + sar eax,16 ;Only use high half + lea edx,[eax+18] + mov nScale[edi],edx + neg eax + call ScaleByPow10 + fbstp digits + xor eax,eax + xor ebx,ebx + mov ecx,precision + inc ecx + mov edx,8 + mov al,byte ptr digits[8] + test al,0F0H + jne DN11 + dec nScale[edi] + jmp DN12 +DN11: shr al,4 + dec ecx + je DN20 + add al,'0' + mov nDigits[edi+ebx*2],ax + inc ebx + mov al,byte ptr digits[edx] +DN12: and al,0FH + dec ecx + je DN20 + add al,'0' + mov nDigits[edi+ebx*2],ax + inc ebx + dec edx + jl DN22 ; We've run out of digits & don't have a rounding digit, so we'll skip the rounding step. 
+ mov al,byte ptr digits[edx] + jmp DN11 +DN20: cmp al,5 + jb DN22 +DN21: dec ebx + inc nDigits[edi+ebx*2] + cmp nDigits[edi+ebx*2],'9' + jbe DN23 + or ebx,ebx + jne DN21 + mov nDigits[edi+ebx*2],'1' + inc nScale[edi] + jmp DN23 +DN22: dec ebx + cmp nDigits[edi+ebx*2],'0' + je DN22 +DN23: mov nDigits[edi+ebx*2+2],0 +DN30: + fldcw cwsave ;;Restore original CW + pop ebx + pop edi + mov esp,ebp + pop ebp + ret ;made _cdecl for WinCE paramSize + +_DoubleToNumber endp + +; void _cdecl NumberToDouble(NUMBER* number, double* value) +_NumberToDouble proc + +number equ (dword ptr [ebp+8]) +value equ (dword ptr [ebp+12]) +paramSize = 8 + +cwsave equ (word ptr [ebp-8]) +temp equ (dword ptr [ebp-4]) +localSize = 8 + + push ebp + mov ebp,esp ; Save the stack ptr + sub esp,localSize ; + fnstcw cwsave + fldcw g_CwStd + fldz ; zero the register + mov ecx,number ; move precision into ecx + xor edx,edx ; clear edx + cmp dx,nDigits[ecx] ; if the first digit is 0 goto SignResult + je SignResult + mov eax,nScale[ecx] ; store the scale in eax + cmp eax,-330 ; if the scale is less than or equal to -330 goto Cleanup + jle Cleanup + cmp eax,310 ; if the scale is less than 310, goto ParseDigits + jl ParseDigits + fstp st(0) ; store value on the top of the floating point stack + fld SingleINF ; Load infinity + jmp SignResult ; Goto SignResult +ParseDigits: + movzx eax,nDigits[ecx+edx*2]; load the character at nDigits[edx]; + sub eax,'0' ; subtract '0' + jc ScaleResult ; jump to ScaleResult if this produces a negative value + mov temp,eax ; store the first digit in temp + fmul Single10 ; Multiply by 10 + fiadd temp ; Add the digit which we just found + inc edx ; increment the counter + cmp edx,18 ; if (eax<18) goto ParseDigits + jb ParseDigits +ScaleResult: + mov eax,nScale[ecx] ; eax = scale + sub eax,edx ; scale -= (number of digits) + call ScaleByPow10 ; multiply the result by 10^scale +SignResult: + cmp nSign[ecx],0 ; If the sign is 0 already go to Cleanup, otherwise change the sign. + je Cleanup + fchs +Cleanup: + mov edx,value ; store value in edx + fstp qword ptr [edx] ; copy from value to the fp stack + fldcw cwsave ; Restore original CW + mov esp,ebp ; restore the stack frame & exit. + pop ebp + ret ;Made _cdecl for WinCE paramSize + +_NumberToDouble endp + +; Scale st(0) by 10^eax + +ScaleByPow10 proc + test eax,eax + je SP2 + jl SP3 + mov edx,eax + and edx,0FH + je SP1 + lea edx,[edx+edx*4] + fld Pow10By1[edx*2-10] + fmul +SP1: mov edx,eax + shr edx,4 + test edx, edx ; remove partial flag stall caused by shr + je SP2 + lea edx,[edx+edx*4] + fld Pow10By16[edx*2-10] + fmul +SP2: ret +SP3: neg eax + mov edx,eax + and edx,0FH + je SP4 + lea edx,[edx+edx*4] + fld Pow10By1[edx*2-10] + fdiv +SP4: mov edx,eax + shr edx,4 + test edx, edx ; remove partial flag stall caused by shr + je SP5 + lea edx,[edx+edx*4] + fld Pow10By16[edx*2-10] + fdiv +SP5: ret +ScaleByPow10 endp + + end diff --git a/src/vm/i386/gmsasm.asm b/src/vm/i386/gmsasm.asm new file mode 100644 index 0000000000..6b6044b50d --- /dev/null +++ b/src/vm/i386/gmsasm.asm @@ -0,0 +1,37 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. 
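+
+; The C++ side (gmscpu.h) reaches this routine through the CAPTURE_STATE macro,
+; which expands to roughly:
+;
+;     if (LazyMachStateCaptureState(machState)) ret
+;
+; The routine always returns 0 (xor eax, eax below), so 'ret' is never executed; it
+; exists only to give the epilog walker in gmsx86.cpp a short path to the epilog.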
+ +; ==++== +; + +; +; ==--== +; +; *** NOTE: If you make changes to this file, propagate the changes to +; gmsasm.s in this directory + + .586 + .model flat + +include asmconstants.inc + + option casemap:none + .code + +; int __fastcall LazyMachStateCaptureState(struct LazyMachState *pState); +@LazyMachStateCaptureState@4 proc public + mov [ecx+MachState__pRetAddr], 0 ; marks that this is not yet valid + mov [ecx+MachState__edi], edi ; remember register values + mov [ecx+MachState__esi], esi + mov [ecx+MachState__ebx], ebx + mov [ecx+LazyMachState_captureEbp], ebp + mov [ecx+LazyMachState_captureEsp], esp + + mov eax, [esp] ; capture return address + mov [ecx+LazyMachState_captureEip], eax + xor eax, eax + retn +@LazyMachStateCaptureState@4 endp + +end diff --git a/src/vm/i386/gmscpu.h b/src/vm/i386/gmscpu.h new file mode 100644 index 0000000000..0aecefac21 --- /dev/null +++ b/src/vm/i386/gmscpu.h @@ -0,0 +1,140 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/**************************************************************/ +/* gmscpu.h */ +/**************************************************************/ +/* HelperFrame is defines 'GET_STATE(machState)' macro, which + figures out what the state of the machine will be when the + current method returns. It then stores the state in the + JIT_machState structure. */ + +/**************************************************************/ + +#ifndef __gmsx86_h__ +#define __gmsx86_h__ + +#define __gmsx86_h__ + +#ifdef _DEBUG +class HelperMethodFrame; +struct MachState; +EXTERN_C MachState* STDCALL HelperMethodFrameConfirmState(HelperMethodFrame* frame, void* esiVal, void* ediVal, void* ebxVal, void* ebpVal); +#endif + + // A MachState indicates the register state of the processor at some point in time (usually + // just before or after a call is made). It can be made one of two ways. Either explicitly + // (when you for some reason know the values of all the registers), or implicitly using the + // GET_STATE macros. 
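+ // For example, the implicit path used by the helper-frame code looks roughly like
+ // this (an illustrative sketch only; the real callers are the HelperMethodFrame
+ // machinery together with LazyMachState::unwindLazyState in gmsx86.cpp):
+ //
+ //     LazyMachState ms;
+ //     CAPTURE_STATE(&ms, return 0);         // cheap capture at the call site
+ //     ...
+ //     MachState unwound;
+ //     LazyMachState::unwindLazyState(&ms, &unwound, GetCurrentThreadId());
+ //     TADDR retAddr = unwound.GetRetAddr(); // caller's state, as of its return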
+ +typedef DPTR(struct MachState) PTR_MachState; +struct MachState { + + MachState() + { + LIMITED_METHOD_DAC_CONTRACT; + INDEBUG(memset(this, 0xCC, sizeof(MachState));) + } + + bool isValid() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(dac_cast(_pRetAddr) != INVALID_POINTER_CC); return(_pRetAddr != 0); } + TADDR* pEdi() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(dac_cast(_pEdi) != INVALID_POINTER_CC); return(_pEdi); } + TADDR* pEsi() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(dac_cast(_pEsi) != INVALID_POINTER_CC); return(_pEsi); } + TADDR* pEbx() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(dac_cast(_pEbx) != INVALID_POINTER_CC); return(_pEbx); } + TADDR* pEbp() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(dac_cast(_pEbp) != INVALID_POINTER_CC); return(_pEbp); } + TADDR esp() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(isValid()); return(_esp); } + PTR_TADDR pRetAddr() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(isValid()); return(_pRetAddr); } + TADDR GetRetAddr() { LIMITED_METHOD_DAC_CONTRACT; _ASSERTE(isValid()); return *_pRetAddr; } +#ifndef DACCESS_COMPILE + void SetRetAddr(TADDR* addr) { LIMITED_METHOD_CONTRACT; _ASSERTE(isValid()); _pRetAddr = addr; } +#endif + + friend class HelperMethodFrame; + friend class CheckAsmOffsets; + friend struct LazyMachState; +#ifdef _DEBUG + friend MachState* STDCALL HelperMethodFrameConfirmState(HelperMethodFrame* frame, void* esiVal, void* ediVal, void* ebxVal, void* ebpVal); +#endif + + +protected: + // Note the fields are layed out to make generating a + // MachState structure from assembly code very easy + + // The state of all the callee saved registers. + // If the register has been spill to the stack p + // points at this location, otherwise it points + // at the field field itself + PTR_TADDR _pEdi; + TADDR _edi; + PTR_TADDR _pEsi; + TADDR _esi; + PTR_TADDR _pEbx; + TADDR _ebx; + PTR_TADDR _pEbp; + TADDR _ebp; + + TADDR _esp; // stack pointer after the function returns + PTR_TADDR _pRetAddr; // The address of the stored IP address (points into the stack) +}; + +/********************************************************************/ +/* This allows you to defer the computation of the Machine state + until later. Note that we don't reuse slots, because we want + this to be threadsafe without locks */ + +struct LazyMachState; +typedef DPTR(LazyMachState) PTR_LazyMachState; +struct LazyMachState : public MachState { + // compute the machine state of the processor as it will exist just + // after the return after at most'funCallDepth' number of functions. + // if 'testFtn' is non-NULL, the return address is tested at each + // return instruction encountered. 
If this test returns non-NULL, + // then stack walking stops (thus you can walk up to the point that the + // return address matches some criteria + + // Normally this is called with funCallDepth=1 and testFtn = 0 so that + // it returns the state of the processor after the function that called 'captureState()' + void setLazyStateFromUnwind(MachState* copy); + static void unwindLazyState(LazyMachState* baseState, + MachState* lazyState, + DWORD threadId, + int funCallDepth = 1, + HostCallPreference hostCallPreference = AllowHostCalls); + + friend class HelperMethodFrame; + friend class CheckAsmOffsets; +private: + TADDR captureEbp; // Ebp at the time of capture + TADDR captureEsp; // Esp at the time of capture + TADDR captureEip; // Eip at the time of capture +}; + +inline void LazyMachState::setLazyStateFromUnwind(MachState* copy) +{ + // _pRetAddr has to be the last thing updated when we make the copy (because its + // is the the _pRetAddr becoming non-zero that flips this from invalid to valid. + // we assert that it is the last field in the struct. + static_assert_no_msg(offsetof(MachState, _pRetAddr) + sizeof(_pRetAddr) == sizeof(MachState)); + + memcpy(this, copy, offsetof(MachState, _pRetAddr)); + + // this has to be last + VolatileStore((TADDR*)&_pRetAddr, dac_cast(copy->_pRetAddr)); +} + +// Do the initial capture of the machine state. This is meant to be +// as light weight as possible, as we may never need the state that +// we capture. Thus to complete the process you need to call +// 'getMachState()', which finishes the process +EXTERN_C int __fastcall LazyMachStateCaptureState(struct LazyMachState *pState); + +// CAPTURE_STATE captures just enough register state so that the state of the +// processor can be deterined just after the the routine that has CAPTURE_STATE in +// it returns. + +// Note that the return is never taken, is is there for epilog walking +#define CAPTURE_STATE(machState, ret) \ + if (LazyMachStateCaptureState(machState)) ret + +#endif diff --git a/src/vm/i386/gmsx86.cpp b/src/vm/i386/gmsx86.cpp new file mode 100644 index 0000000000..e7e16b70ab --- /dev/null +++ b/src/vm/i386/gmsx86.cpp @@ -0,0 +1,1245 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/**************************************************************/ +/* gmsx86.cpp */ +/**************************************************************/ + +#include "common.h" +#include "gmscpu.h" + +/***************************************************************/ +/* setMachState figures out what the state of the CPU will be + when the function that calls 'setMachState' returns. It stores + this information in 'frame' + + setMachState works by simulating the execution of the + instructions starting at the instruction following the + call to 'setMachState' and continuing until a return instruction + is simulated. To avoid having to process arbitrary code, the + call to 'setMachState' should be called as follows + + if (machState.setMachState != 0) return; + + setMachState is guarnenteed to return 0 (so the return + statement will never be executed), but the expression above + insures insures that there is a 'quick' path to epilog + of the function. This insures that setMachState will only + have to parse a limited number of X86 instructions. 
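+
+ (In terms of the fields captured by LazyMachStateCaptureState, the simulation starts
+ at captureEip with ESP just above captureEsp -- the captured return address is popped
+ first -- and runs until 'funCallDepth' returns have been simulated, or until the
+ return address lands in managed code.)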
*/ + + +/***************************************************************/ +#ifndef POISONC +#define POISONC ((sizeof(int *) == 4)?0xCCCCCCCCU:UI64(0xCCCCCCCCCCCCCCCC)) +#endif + +/***************************************************************/ +/* the 'zeroFtn and 'recursiveFtn' are only here to determine + if if mscorwks itself has been instrumented by a profiler + that intercepts calls or epilogs of functions. (the + callsInstrumented and epilogInstrumented functions). */ + +#if !defined(DACCESS_COMPILE) + +#pragma optimize("gsy", on ) // optimize to insure that code generation does not have junk in it +#pragma warning(disable:4717) + +static int __stdcall zeroFtn() { + return 0; +} + +static int __stdcall recursiveFtn() { + return recursiveFtn()+1; +} + +#pragma optimize("", on ) + + +/* Has mscorwks been instrumented so that calls are morphed into push XXXX call */ +static bool callsInstrumented() { + // Does the recusive function begin with push XXXX call + PTR_BYTE ptr = PTR_BYTE(recursiveFtn); + + return (ptr[0] == 0x68 && ptr[5] == 0xe8); // PUSH XXXX, call +} + +/* Has mscorwks been instrumented so function prolog and epilogs are replaced with + jmp [XXXX] */ + +static bool epilogInstrumented() { + + PTR_BYTE ptr = PTR_BYTE(zeroFtn); + if (ptr[0] == 0xe8) // call (prolog instrumentation) + ptr += 5; + if (ptr[0] == 0x33 && ptr[1] == 0xc0) // xor eax eax + ptr += 2; + return (ptr[0] == 0xeb || ptr[0] == 0xe9); // jmp +} + +#else + + // Note that we have the callsInstrumeted and epilogInstrumented + // functions so that the looser heuristics used for instrumented code + // can't foul up an instrumented mscorwks. For simplicity sake we + // don't bother with this in the DAC, which means that the DAC could + // be misled more frequently than mscorwks itself, but I still think + // it will not be misled in any real scenario +static bool callsInstrumented() { LIMITED_METHOD_DAC_CONTRACT; return true; } +static bool epilogInstrumented() { LIMITED_METHOD_DAC_CONTRACT; return true; } + +#endif // !defined(DACCESS_COMPILE) + +/***************************************************************/ +/* returns true if a call to 'ip' should be entered by the + epilog walker. Bascically we are looking for things that look + like __SEH_epilog. In particular we look for things that + pops a register before doing a push. If we see something + that we don't recognise, we dont consider it a epilog helper + and return false. +*/ + +static bool shouldEnterCall(PTR_BYTE ip) { + SUPPORTS_DAC; + + int datasize; // helper variable for decoding of address modes + int mod; // helper variable for decoding of mod r/m + int rm; // helper variable for decoding of mod r/m + + int pushes = 0; + + // we should start unbalenced pops within 48 instrs. If not, it is not a special epilog function + // the only reason we need as many instructions as we have below is because coreclr + // gets instrumented for profiling, code coverage, BBT etc, and we want these things to + // just work. + for (int i = 0; i < 48; i++) { + switch(*ip) { + case 0xF2: // repne + case 0xF3: // repe + ip++; + break; + + case 0x68: // push 0xXXXXXXXX + ip += 5; + + // For office profiler. 
They morph tail calls into push TARGET; jmp helper + // so if you see + // + // push XXXX + // jmp xxxx + // + // and we notice that coreclr has been instrumented and + // xxxx starts with a JMP [] then do what you would do for jmp XXXX + if (*ip == 0xE9 && callsInstrumented()) { // jmp helper + PTR_BYTE tmpIp = ip + 5; + PTR_BYTE target = tmpIp + (__int32)*((PTR_TADDR)(PTR_TO_TADDR(tmpIp) - 4)); + if (target[0] == 0xFF && target[1] == 0x25) { // jmp [xxxx] (to external dll) + ip = PTR_BYTE(*((PTR_TADDR)(PTR_TO_TADDR(ip) - 4))); + } + } + else { + pushes++; + } + break; + + case 0x50: // push EAX + case 0x51: // push ECX + case 0x52: // push EDX + case 0x53: // push EBX + case 0x55: // push EBP + case 0x56: // push ESI + case 0x57: // push EDI + pushes++; + ip++; + break; + + case 0xE8: // call + ip += 5; + pushes = 0; // This assumes that all of the previous pushes are arguments to this call + break; + + case 0xFF: + if (ip[1] != 0x15) // call [XXXX] is OK (prolog of epilog helper is intrumented) + return false; // but everything else is not OK. + ip += 6; + pushes = 0; // This assumes that all of the previous pushes are arguments to this call + break; + + case 0x9C: // pushfd + case 0x9D: // popfd + // a pushfd can never be an argument, so we model a pair of + // these instruction as not changing the stack so that a call + // that occurs between them does not consume the value of pushfd + ip++; + break; + + case 0x5D: // pop EBP + case 0x5E: // pop ESI + case 0x5F: // pop EDI + case 0x5B: // pop EBX + case 0x58: // pop EAX + case 0x59: // pop ECX + case 0x5A: // pop EDX + if (pushes <= 0) { + // We now have more pops than pushes. This is our indication + // that we are in an EH_epilog function so we return true. + // This is the only way to exit this method with a retval of true. 
+ return true; + } + --pushes; + ip++; + break; + + case 0xA1: // MOV EAX, [XXXX] + ip += 5; + break; + + case 0xC6: // MOV r/m8, imm8 + datasize = 1; + goto decodeRM; + + case 0x89: // MOV r/m, reg + if (ip[1] == 0xE5) // MOV EBP, ESP + return false; + if (ip[1] == 0xEC) // MOV ESP, EBP + return false; + goto move; + + case 0x8B: // MOV reg, r/m + if (ip[1] == 0xE5) // MOV ESP, EBP + return false; + if (ip[1] == 0xEC) // MOV EBP, ESP + return false; + goto move; + + case 0x88: // MOV reg, r/m (BYTE) + case 0x8A: // MOV r/m, reg (BYTE) + + case 0x31: // XOR + case 0x32: // XOR + case 0x33: // XOR + + move: + datasize = 0; + + decodeRM: + // Note that we don't want to read from ip[] after + // we do ANY incrementing of ip + + mod = (ip[1] & 0xC0) >> 6; + if (mod != 3) { + rm = (ip[1] & 0x07); + if (mod == 0) { // (mod == 0) + if (rm == 5) + ip += 4; // disp32 + else if (rm == 4) + ip += 1; // [reg*K+reg] + // otherwise [reg] + + } + else if (mod == 1) { // (mod == 1) + ip += 1; // for disp8 + if (rm == 4) + ip += 1; // [reg*K+reg+disp8] + // otherwise [reg+disp8] + } + else { // (mod == 2) + ip += 4; // for disp32 + if (rm == 4) + ip += 1; // [reg*K+reg+disp32] + // otherwise [reg+disp32] + } + } + + ip += 2; + ip += datasize; + break; + + case 0x64: // FS: prefix + ip++; + break; + + case 0xEB: // jmp + ip += (signed __int8) ip[1] + 2; + break; + + case 0xE9: // jmp + ip += (__int32)*PTR_DWORD(PTR_TO_TADDR(ip) + 1) + 5; + break; + + case 0xF7: // test r/m32, imm32 + // Magellan code coverage build + if ( (ip[1] & 0x38) == 0x00) + { + datasize = 4; + goto decodeRM; + } + else + { + return false; + } + break; + + case 0x75: // jnz + // Magellan code coverage build + // We always follow forward jump to avoid possible looping. + { + PTR_BYTE tmpIp = ip + (TADDR)(signed __int8) ip[1] + 2; + if (tmpIp > ip) { + ip = tmpIp; // follow forwards jump + } + else { + return false; // backwards jump implies not EH_epilog function + } + } + break; + + case 0xC2: // ret + case 0xC3: // ret n + default: + return false; + } + } + + return false; +} + + +/***************************************************************/ +#ifdef _PREFAST_ +#pragma warning(push) +#pragma warning(disable:21000) // Suppress PREFast warning about overly large function +#endif + +/***************************************************************/ +// A fundamental requirement of managed code is that we need to be able to enumerate all GC references on the +// stack at GC time. To do this we need to be able to 'crawl' the stack. We know how to do this in JIT +// compiled code (it generates additional information like the frame size etc), but we don't know how to do +// this for unmanaged code. For PINVOKE calls, we leave a pointer to the transition boundary between managed +// and unmanaged code and we simply ignore the lower part of the stack. However setting up this transition is +// a bit expensive (1-2 dozen instructions), and while that is acceptable for PINVOKE, it is not acceptable +// for high volume calls, like NEW, CAST, WriterBarrier, Stack field fetch and others. +// +// To get around this, for transitions into the runtime (which we call FCALLS), we DEFER setting up the +// boundary variables (what we call the transition frame), until we actually need it (we will do an operation +// that might cause a GC). 
This allow us to handle the common case (where we might find the thing in a cache, +// or be service the 'new' from a allocation quantum), and only pay the cost of setting up the transition +// frame when it will actually be used. +// +// The problem is that in order to set up a transition frame we need to be able to find ALL REGISTERS AT THE +// TIME THE TRANSITION TO UNMANAGED CODE WAS MADE (because we might need to update them if they have GC +// references). Because we have executed ordinary C++ code (which might spill the registers to the stack at +// any time), we have a problem. LazyMachState is our 'solution' to this problem. We take advantage of the +// fact that the C++ code MUST RESTORE the register before returning. Thus we simulate the execution from the +// current location to the return and 'watch' where the registers got restored from. This is what +// unwindLazyState does (determine what the registers would be IF you had never executed and unmanaged C++ +// code). +// +// By design, this code does not handle all X86 instructions, but only those instructions needed in an +// epilog. If you get a failure because of a missing instruction, it MAY simply be because the compiler +// changed and now emits a new instruction in the epilog, but it MAY also be because the unwinder is +// 'confused' and is trying to follow a code path that is NOT AN EPILOG, and in this case adding +// instructions to 'fix' it is inappropriate. +// +void LazyMachState::unwindLazyState(LazyMachState* baseState, + MachState* lazyState, + DWORD threadId, + int funCallDepth /* = 1 */, + HostCallPreference hostCallPreference /* = (HostCallPreference)(-1) */) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + SUPPORTS_DAC; + } CONTRACTL_END; + + lazyState->_edi = baseState->_edi; + lazyState->_esi = baseState->_esi; + lazyState->_ebx = baseState->_ebx; + lazyState->_ebp = baseState->captureEbp; +#ifndef DACCESS_COMPILE + lazyState->_pEdi = &baseState->_edi; + lazyState->_pEsi = &baseState->_esi; + lazyState->_pEbx = &baseState->_ebx; + lazyState->_pEbp = &baseState->_ebp; +#endif + + // We have captured the state of the registers as they exist in 'captureState' + // we need to simulate execution from the return address captured in 'captureState + // until we return from the caller of captureState. + + PTR_BYTE ip = PTR_BYTE(baseState->captureEip); + PTR_TADDR ESP = PTR_TADDR(baseState->captureEsp); + ESP++; // pop captureState's return address + + + // VC now has small helper calls that it uses in epilogs. We need to walk into these + // helpers if we are to decode the stack properly. After we walk the helper we need + // to return and continue walking the epiliog. This varaible remembers were to return to + PTR_BYTE epilogCallRet = PTR_BYTE((TADDR)0); + + // The very first conditional jump that we are going to encounter is + // the one testing for the return value of LazyMachStateCaptureState. + // The non-zero path is the one directly leading to a return statement. + // This variable keeps track of whether we are still looking for that + // first conditional jump. + BOOL bFirstCondJmp = TRUE; + + // The general strategy is that we always try to plough forward: + // we follow a conditional jump if and only if it is a forward jump. + // However, in fcall functions that set up a HELPER_METHOD_FRAME in + // more than one place, gcc will have both of them share the same + // epilog - and the second one may actually be a backward jump. + // This can lead us to loop in a destructor code loop. 
To protect + // against this, we remember the ip of the last conditional jump + // we followed, and if we encounter it again, we take the other branch. + PTR_BYTE lastCondJmpIp = PTR_BYTE((TADDR)0); + + int datasize; // helper variable for decoding of address modes + int mod; // helper variable for decoding of mod r/m + int rm; // helper variable for decoding of mod r/m + +#ifdef _DEBUG + int count = 0; + const DWORD cInstructions = 1000; + PTR_BYTE *instructionBytes = (PTR_BYTE*)alloca(cInstructions * sizeof(PTR_BYTE)); + memset(instructionBytes, 0, cInstructions * sizeof(PTR_BYTE)); +#endif + bool bset16bit=false; + bool b16bit=false; + for(;;) + { + _ASSERTE(count++ < 1000); // we should never walk more than 1000 instructions! + b16bit=bset16bit; + bset16bit=false; + +#ifndef DACCESS_COMPILE + again: +#endif +#ifdef _DEBUG + instructionBytes[count-1] = ip; +#endif + switch(*ip) + { + + case 0x64: // FS: prefix + bset16bit=b16bit; // In case we have just seen a 0x66 prefix + goto incIp1; + + case 0x66: + bset16bit=true; // Remember that we saw the 0x66 prefix [16-bit datasize override] + goto incIp1; + + case 0x50: // push EAX + case 0x51: // push ECX + case 0x52: // push EDX + case 0x53: // push EBX + case 0x55: // push EBP + case 0x56: // push ESI + case 0x57: // push EDI + case 0x9C: // pushfd + --ESP; + case 0x40: // inc EAX + case 0x41: // inc ECX + case 0x42: // inc EDX + case 0x43: // inc EBX + case 0x46: // inc ESI + case 0x47: // inc EDI + goto incIp1; + + case 0x58: // pop EAX + case 0x59: // pop ECX + case 0x5A: // pop EDX + case 0x9D: // popfd + ESP++; + // FALL THROUGH + + case 0x90: // nop + incIp1: + ip++; + break; + + case 0x5B: // pop EBX + lazyState->_pEbx = ESP; + lazyState->_ebx = *ESP++; + goto incIp1; + case 0x5D: // pop EBP + lazyState->_pEbp = ESP; + lazyState->_ebp = *ESP++; + goto incIp1; + case 0x5E: // pop ESI + lazyState->_pEsi = ESP; + lazyState->_esi = *ESP++; + goto incIp1; + case 0x5F: // pop EDI + lazyState->_pEdi = ESP; + lazyState->_edi = *ESP++; + goto incIp1; + + case 0xEB: // jmp + ip += (signed __int8) ip[1] + 2; + break; + + case 0x72: // jb for gcc. 
+ { + PTR_BYTE tmpIp = ip + (int)(signed __int8)ip[1] + 2; + if (tmpIp > ip) + ip = tmpIp; + else + ip += 2; + } + break; + + case 0xE8: // call + ip += 5; + if (epilogCallRet == 0) + { + PTR_BYTE target = ip + (__int32)*PTR_DWORD(PTR_TO_TADDR(ip) - 4); // calculate target + + if (shouldEnterCall(target)) + { + epilogCallRet = ip; // remember our return address + --ESP; // simulate pushing the return address + ip = target; + } + } + break; + + case 0xE9: // jmp + { + PTR_BYTE tmpIp = ip + + ((__int32)*dac_cast(ip + 1) + 5); + ip = tmpIp; + } + break; + + case 0x0f: // follow non-zero jumps: + if (ip[1] >= 0x90 && ip[1] <= 0x9f) { + if ((ip[2] & 0xC0) != 0xC0) // set reg + goto badOpcode; + ip += 3; + break; + } + else if ((ip[1] & 0xf0) == 0x40) { //cmov mod/rm + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] >= 0x10 && ip[1] <= 0x17) { // movups, movlps, movhps, unpcklpd, unpckhpd + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0x1f) { // nop (multi-byte) + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0x57) { // xorps + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0xb6 || ip[1] == 0xb7) { //movzx reg, r/m8 + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0xbf) { //movsx reg, r/m16 + ++ip; + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0xd6 || ip[1] == 0x7e) { // movq + ++ip; + datasize = 0; + goto decodeRM; + } + else if (bFirstCondJmp) { + bFirstCondJmp = FALSE; + if (ip[1] == 0x85) // jne + ip += (__int32)*dac_cast(ip + 2) + 6; + else if (ip[1] >= 0x80 && ip[1] <= 0x8F) // jcc + ip += 6; + else + goto badOpcode; + } + else { + if ((ip[1] >= 0x80) && (ip[1] <= 0x8F)) { + PTR_BYTE tmpIp = ip + (__int32)*dac_cast(ip + 2) + 6; + + if ((tmpIp > ip) == (lastCondJmpIp != ip)) { + lastCondJmpIp = ip; + ip = tmpIp; + } + else { + lastCondJmpIp = ip; + ip += 6; + } + } + else + goto badOpcode; + } + break; + + // This is here because VC seems to not always optimize + // away a test for a literal constant + case 0x6A: // push 0xXX + ip += 2; + --ESP; + break; + + case 0x68: // push 0xXXXXXXXX + if ((ip[5] == 0xFF) && (ip[6] == 0x15)) { + ip += 11; // + } + else { + ip += 5; + + // For office profiler. They morph calls into push TARGET; call helper + // so if you see + // + // push XXXX + // call xxxx + // + // and we notice that mscorwks has been instrumented and + // xxxx starts with a JMP [] then do what you would do for call XXXX + if ((*ip & 0xFE) == 0xE8 && callsInstrumented()) { // It is a call or a jump (E8 or E9) + PTR_BYTE tmpIp = ip + 5; + PTR_BYTE target = tmpIp + (__int32)*PTR_DWORD(PTR_TO_TADDR(tmpIp) - 4); + if (target[0] == 0xFF && target[1] == 0x25) { // jmp [xxxx] (to external dll) + target = PTR_BYTE(*PTR_TADDR(PTR_TO_TADDR(ip) - 4)); + if (*ip == 0xE9) { // Do logic for jmp + ip = target; + } + else if (shouldEnterCall(target)) { // Do logic for calls + epilogCallRet = ip; // remember our return address + --ESP; // simulate pushing the return address + ip = target; + } + } + } + } + break; + + case 0x74: // jz + if (bFirstCondJmp) { + bFirstCondJmp = FALSE; + ip += 2; // follow the non-zero path + break; + } + goto condJumpDisp8; + + case 0x75: // jnz + // Except the first jump, we always follow forward jump to avoid possible looping. 
+ // + if (bFirstCondJmp) { + bFirstCondJmp = FALSE; + ip += (signed __int8) ip[1] + 2; // follow the non-zero path + break; + } + goto condJumpDisp8; + + case 0x77: // ja + case 0x78: // js + case 0x79: // jns + case 0x7d: // jge + case 0x7c: // jl + goto condJumpDisp8; + + condJumpDisp8: + { + PTR_BYTE tmpIp = ip + (TADDR)(signed __int8) ip[1] + 2; + if ((tmpIp > ip) == (lastCondJmpIp != ip)) { + lastCondJmpIp = ip; + ip = tmpIp; + } + else { + lastCondJmpIp = ip; + ip += 2; + } + } + break; + + case 0x84: + case 0x85: + mod = (ip[1] & 0xC0) >> 6; + if (mod != 3) // test reg1, reg2 + goto badOpcode; + ip += 2; + break; + + case 0x31: + case 0x32: + case 0x33: +#ifdef __GNUC__ + //there are lots of special workarounds for XOR for msvc. For GnuC + //just do the normal Mod/rm stuff. + datasize = 0; + goto decodeRM; +#else + mod = (ip[1] & 0xC0) >> 6; + if (mod == 3) + { + // XOR reg1, reg2 + + // VC generates this sequence in some code: + // xor reg, reg + // test reg reg + // je + // This is just an unconditional branch, so jump to it + if ((ip[1] & 7) == ((ip[1] >> 3) & 7)) { // reg1 == reg2? + if (ip[2] == 0x85 && ip[3] == ip[1]) { // TEST reg, reg + if (ip[4] == 0x74) { + ip += (signed __int8) ip[5] + 6; // follow the non-zero path + break; + } + _ASSERTE(ip[4] != 0x0f || ((ip[5] & 0xF0)!=0x80)); // If this goes off, we need the big jumps + } + else + { + if (ip[2]==0x74) + { + ip += (signed __int8) ip[3] + 4; + break; + } + _ASSERTE(ip[2] != 0x0f || ((ip[3] & 0xF0)!=0x80)); // If this goes off, we need the big jumps + } + } + ip += 2; + } + else if (mod == 1) + { + // XOR reg1, [reg+offs8] + // Used by the /GS flag for call to __security_check_cookie() + // Should only be XOR ECX,[EBP+4] + _ASSERTE((((ip[1] >> 3) & 0x7) == 0x1) && ((ip[1] & 0x7) == 0x5) && (ip[2] == 4)); + ip += 3; + } + else if (mod == 2) + { + // XOR reg1, [reg+offs32] + // Should not happen but may occur with __security_check_cookie() + _ASSERTE(!"Unexpected XOR reg1, [reg+offs32]"); + ip += 6; + } + else // (mod == 0) + { + // XOR reg1, [reg] + goto badOpcode; + } + break; +#endif + + case 0x05: + // added to handle gcc 3.3 generated code + // add %reg, constant + ip += 5; + break; + + case 0xFF: + if ( (ip[1] & 0x38) == 0x30) + { + // opcode generated by Vulcan/BBT instrumentation + // search for push dword ptr[esp]; push imm32; call disp32 and if found ignore it + if ((ip[1] == 0x34) && (ip[2] == 0x24) && // push dword ptr[esp] (length 3 bytes) + (ip[3] == 0x68) && // push imm32 (length 5 bytes) + (ip[8] == 0xe8)) // call disp32 (length 5 bytes) + { + // found the magic seq emitted by Vulcan instrumentation + ip += 13; // (3+5+5) + break; + } + + --ESP; // push r/m + datasize = 0; + goto decodeRM; + } + else if ( (ip[1] & 0x38) == 0x10) + { + // added to handle gcc 3.3 generated code + // This is a call *(%eax) generated by gcc for destructor calls. 
+ // We can safely skip over the call + datasize = 0; + goto decodeRM; + } + else if (ip[1] == 0xe0) + { + goto badOpcode; +#if 0 + // Handles jmp *%eax from gcc + datasize = 0; + goto decodeRM; +#endif + } + else if (ip[1] == 0x25 && epilogInstrumented()) // is it jmp [XXXX] + { + // this is a office profiler epilog (this jmp is acting as a return instruction) + PTR_BYTE epilogHelper = PTR_BYTE(*PTR_TADDR(*PTR_TADDR(PTR_TO_TADDR(ip) + 2))); + + ip = PTR_BYTE(*ESP); + lazyState->_pRetAddr = ESP++; + + if (epilogHelper[0] != 0x6A) // push + goto badOpcode; + unsigned disp = *PTR_BYTE(PTR_TO_TADDR(epilogHelper) + 1) * 4; + ESP = PTR_TADDR(PTR_TO_TADDR(ESP) + disp); // pop args + goto ret_with_epilogHelperCheck; + + } + else + { + goto badOpcode; + } + break; + + case 0x39: // comp r/m, reg + case 0x3B: // comp reg, r/m + datasize = 0; + goto decodeRM; + + case 0xA1: // MOV EAX, [XXXX] + ip += 5; + break; + + case 0x89: // MOV r/m, reg + if (ip[1] == 0xEC) // MOV ESP, EBP + goto mov_esp_ebp; + // FALL THROUGH + + case 0x18: // SBB r/m8, r8 + case 0x19: // SBB r/m[16|32], r[16|32] + case 0x1A: // SBB r8, r/m8 + case 0x1B: // SBB r[16|32], r/m[16|32] + + case 0x88: // MOV reg, r/m (BYTE) + case 0x8A: // MOV r/m, reg (BYTE) + + move: + datasize = 0; + + decodeRM: + // Note that we don't want to read from ip[] + // after we do ANY incrementing of ip + + mod = (ip[1] & 0xC0) >> 6; + if (mod != 3) { + rm = (ip[1] & 0x07); + if (mod == 0) { // (mod == 0) + if (rm == 5) // has disp32? + ip += 4; // [disp32] + else if (rm == 4) // has SIB byte? + ip += 1; // [reg*K+reg] + } + else if (mod == 1) { // (mod == 1) + if (rm == 4) // has SIB byte? + ip += 1; // [reg*K+reg+disp8] + ip += 1; // for disp8 + } + else { // (mod == 2) + if (rm == 4) // has SIB byte? + ip += 1; // [reg*K+reg+disp32] + ip += 4; // for disp32 + } + } + ip += 2; // opcode and Mod R/M byte + ip += datasize; + break; + + case 0x80: // OP r/m8, + datasize = 1; + goto decodeRM; + + case 0x81: // OP r/m32, + if (!b16bit && ip[1] == 0xC4) { // ADD ESP, + ESP = dac_cast(dac_cast(ESP) + + (__int32)*dac_cast(ip + 2)); + ip += 6; + break; + } else if (!b16bit && ip[1] == 0xC5) { // ADD EBP, + lazyState->_ebp += (__int32)*dac_cast(ip + 2); + ip += 6; + break; + } + + datasize = b16bit?2:4; + goto decodeRM; + + case 0x01: // ADD mod/rm + case 0x03: + case 0x29: // SUB mod/rm + case 0x2B: + datasize = 0; + goto decodeRM; + case 0x83: // OP r/m32, + if (ip[1] == 0xC4) { // ADD ESP, + ESP = dac_cast(dac_cast(ESP) + (signed __int8)ip[2]); + ip += 3; + break; + } + if (ip[1] == 0xec) { // SUB ESP, + ESP = PTR_TADDR(PTR_TO_TADDR(ESP) - (signed __int8)ip[2]); + ip += 3; + break; + } + if (ip[1] == 0xe4) { // AND ESP, + ESP = PTR_TADDR(PTR_TO_TADDR(ESP) & (signed __int8)ip[2]); + ip += 3; + break; + } + if (ip[1] == 0xc5) { // ADD EBP, + lazyState->_ebp += (signed __int8)ip[2]; + ip += 3; + break; + } + + datasize = 1; + goto decodeRM; + + case 0x8B: // MOV reg, r/m + if (ip[1] == 0xE5) { // MOV ESP, EBP + mov_esp_ebp: + ESP = PTR_TADDR(lazyState->_ebp); + ip += 2; + break; + } + + if ((ip[1] & 0xc7) == 0x4 && ip[2] == 0x24) // move reg, [esp] + { + if ( ip[1] == 0x1C ) { // MOV EBX, [ESP] + lazyState->_pEbx = ESP; + lazyState->_ebx = *lazyState->_pEbx; + } + else if ( ip[1] == 0x34 ) { // MOV ESI, [ESP] + lazyState->_pEsi = ESP; + lazyState->_esi = *lazyState->_pEsi; + } + else if ( ip[1] == 0x3C ) { // MOV EDI, [ESP] + lazyState->_pEdi = ESP; + lazyState->_edi = *lazyState->_pEdi; + } + else if ( ip[1] == 0x24 /*ESP*/ || ip[1] == 0x2C /*EBP*/) + goto 
badOpcode; + + ip += 3; + break; + } + + if ((ip[1] & 0xc7) == 0x44 && ip[2] == 0x24) // move reg, [esp+imm8] + { + if ( ip[1] == 0x5C ) { // MOV EBX, [ESP+XX] + lazyState->_pEbx = PTR_TADDR(PTR_TO_TADDR(ESP) + (signed __int8)ip[3]); + lazyState->_ebx = *lazyState->_pEbx ; + } + else if ( ip[1] == 0x74 ) { // MOV ESI, [ESP+XX] + lazyState->_pEsi = PTR_TADDR(PTR_TO_TADDR(ESP) + (signed __int8)ip[3]); + lazyState->_esi = *lazyState->_pEsi; + } + else if ( ip[1] == 0x7C ) { // MOV EDI, [ESP+XX] + lazyState->_pEdi = PTR_TADDR(PTR_TO_TADDR(ESP) + (signed __int8)ip[3]); + lazyState->_edi = *lazyState->_pEdi; + } + else if ( ip[1] == 0x64 /*ESP*/ || ip[1] == 0x6C /*EBP*/) + goto badOpcode; + + ip += 4; + break; + } + + if ((ip[1] & 0xC7) == 0x45) { // MOV reg, [EBP + imm8] + // gcc sometimes restores callee-preserved registers + // via 'mov reg, [ebp-xx]' instead of 'pop reg' + if ( ip[1] == 0x5D ) { // MOV EBX, [EBP+XX] + lazyState->_pEbx = PTR_TADDR(lazyState->_ebp + (signed __int8)ip[2]); + lazyState->_ebx = *lazyState->_pEbx ; + } + else if ( ip[1] == 0x75 ) { // MOV ESI, [EBP+XX] + lazyState->_pEsi = PTR_TADDR(lazyState->_ebp + (signed __int8)ip[2]); + lazyState->_esi = *lazyState->_pEsi; + } + else if ( ip[1] == 0x7D ) { // MOV EDI, [EBP+XX] + lazyState->_pEdi = PTR_TADDR(lazyState->_ebp + (signed __int8)ip[2]); + lazyState->_edi = *lazyState->_pEdi; + } + else if ( ip[1] == 0x65 /*ESP*/ || ip[1] == 0x6D /*EBP*/) + goto badOpcode; + + // We don't track the values of EAX,ECX,EDX + + ip += 3; // MOV reg, [reg + imm8] + break; + } + + if ((ip[1] & 0xC7) == 0x85) { // MOV reg, [EBP+imm32] + // gcc sometimes restores callee-preserved registers + // via 'mov reg, [ebp-xx]' instead of 'pop reg' + if ( ip[1] == 0xDD ) { // MOV EBX, [EBP+XXXXXXXX] + lazyState->_pEbx = PTR_TADDR(lazyState->_ebp + (__int32)*dac_cast(ip + 2)); + lazyState->_ebx = *lazyState->_pEbx ; + } + else if ( ip[1] == 0xF5 ) { // MOV ESI, [EBP+XXXXXXXX] + lazyState->_pEsi = PTR_TADDR(lazyState->_ebp + (__int32)*dac_cast(ip + 2)); + lazyState->_esi = *lazyState->_pEsi; + } + else if ( ip[1] == 0xFD ) { // MOV EDI, [EBP+XXXXXXXX] + lazyState->_pEdi = PTR_TADDR(lazyState->_ebp + (__int32)*dac_cast(ip + 2)); + lazyState->_edi = *lazyState->_pEdi; + } + else if ( ip[1] == 0xE5 /*ESP*/ || ip[1] == 0xED /*EBP*/) + goto badOpcode; // Add more registers + + // We don't track the values of EAX,ECX,EDX + + ip += 6; // MOV reg, [reg + imm32] + break; + } + goto move; + + case 0x8D: // LEA + if ((ip[1] & 0x38) == 0x20) { // Don't allow ESP to be updated + if (ip[1] == 0xA5) // LEA ESP, [EBP+XXXX] + ESP = PTR_TADDR(lazyState->_ebp + (__int32)*dac_cast(ip + 2)); + else if (ip[1] == 0x65) // LEA ESP, [EBP+XX] + ESP = PTR_TADDR(lazyState->_ebp + (signed __int8) ip[2]); + else if (ip[1] == 0x24 && ip[2] == 0x24) // LEA ESP, [ESP] + ; + else if (ip[1] == 0xa4 && ip[2] == 0x24 && *((DWORD *)(&ip[3])) == 0) // Another form of: LEA ESP, [ESP] + ; + else if (ip[1] == 0x64 && ip[2] == 0x24 && ip[3] == 0) // Yet another form of: LEA ESP, [ESP] (8 bit offset) + ; + else + { + goto badOpcode; + } + } + + datasize = 0; + goto decodeRM; + + case 0xB0: // MOV AL, imm8 + ip += 2; + break; + case 0xB8: // MOV EAX, imm32 + case 0xB9: // MOV ECX, imm32 + case 0xBA: // MOV EDX, imm32 + case 0xBB: // MOV EBX, imm32 + case 0xBE: // MOV ESI, imm32 + case 0xBF: // MOV EDI, imm32 + if(b16bit) + ip += 3; + else + ip += 5; + break; + + case 0xC2: // ret N + { + unsigned __int16 disp = *dac_cast(ip + 1); + ip = PTR_BYTE(*ESP); + lazyState->_pRetAddr = ESP++; + 
_ASSERTE(disp < 64); // sanity check (although strictly speaking not impossible) + ESP = dac_cast(dac_cast(ESP) + disp); // pop args + goto ret; + } + case 0xC3: // ret + ip = PTR_BYTE(*ESP); + lazyState->_pRetAddr = ESP++; + + ret_with_epilogHelperCheck: + if (epilogCallRet != 0) { // we are returning from a special epilog helper + ip = epilogCallRet; + epilogCallRet = 0; + break; // this does not count toward funCallDepth + } + ret: + if (funCallDepth > 0) + { + --funCallDepth; + if (funCallDepth == 0) + goto done; + } + else + { + // Determine whether given IP resides in JITted code. (It returns nonzero in that case.) + // Use it now to see if we've unwound to managed code yet. + BOOL fFailedReaderLock = FALSE; + BOOL fIsManagedCode = ExecutionManager::IsManagedCode(*lazyState->pRetAddr(), hostCallPreference, &fFailedReaderLock); + if (fFailedReaderLock) + { + // We don't know if we would have been able to find a JIT + // manager, because we couldn't enter the reader lock without + // yielding (and our caller doesn't want us to yield). So abort + // now. + + // Invalidate the lazyState we're returning, so the caller knows + // we aborted before we could fully unwind + lazyState->_pRetAddr = NULL; + return; + } + + if (fIsManagedCode) + goto done; + } + + bFirstCondJmp = TRUE; + break; + + case 0xC6: // MOV r/m8, imm8 + datasize = 1; + goto decodeRM; + + case 0xC7: // MOV r/m32, imm32 + datasize = b16bit?2:4; + goto decodeRM; + + case 0xC9: // leave + ESP = PTR_TADDR(lazyState->_ebp); + lazyState->_pEbp = ESP; + lazyState->_ebp = *ESP++; + ip++; + break; + +#ifndef DACCESS_COMPILE + case 0xCC: + if (IsDebuggerPresent()) + { + OutputDebugStringA("CLR: Invalid breakpoint in a helpermethod frame epilog\n"); + DebugBreak(); + goto again; + } +#ifndef _PREFIX_ + *((int*) 0) = 1; // If you get at this error, it is because yout + // set a breakpoint in a helpermethod frame epilog + // you can't do that unfortunately. Just move it + // into the interior of the method to fix it +#endif // !_PREFIX_ + goto done; +#endif //!DACCESS_COMPILE + + case 0xD0: // shl REG16, 1 + case 0xD1: // shl REG32, 1 + if (0xE4 == ip[1] || 0xE5 == ip[1]) // shl, ESP, 1 or shl EBP, 1 + goto badOpcode; // Doesn't look like valid code + ip += 2; + break; + + case 0xC1: // shl REG32, imm8 + if (0xE4 == ip[1] || 0xE5 == ip[1]) // shl, ESP, imm8 or shl EBP, imm8 + goto badOpcode; // Doesn't look like valid code + ip += 3; + break; + + case 0xD9: // single prefix + if (0xEE == ip[1]) + { + ip += 2; // FLDZ + break; + } + // + // INTENTIONAL FALL THRU + // + case 0xDD: // double prefix + if ((ip[1] & 0xC0) != 0xC0) + { + datasize = 0; // floatop r/m + goto decodeRM; + } + else + { + goto badOpcode; + } + break; + + case 0xf2: // repne prefix + case 0xF3: // rep prefix + ip += 1; + break; + + case 0xA4: // MOVS byte + case 0xA5: // MOVS word/dword + ip += 1; + break; + + case 0xA8: //test AL, imm8 + ip += 2; + break; + case 0xA9: //test EAX, imm32 + ip += 5; + break; + case 0xF6: + if ( (ip[1] & 0x38) == 0x00) // TEST r/m8, imm8 + { + datasize = 1; + goto decodeRM; + } + else + { + goto badOpcode; + } + break; + + case 0xF7: + if ( (ip[1] & 0x38) == 0x00) // TEST r/m32, imm32 + { + datasize = b16bit?2:4; + goto decodeRM; + } + else if ((ip[1] & 0xC8) == 0xC8) //neg reg + { + ip += 2; + break; + } + else if ((ip[1] & 0x30) == 0x30) //div eax by mod/rm + { + datasize = 0; + goto decodeRM; + } + else + { + goto badOpcode; + } + break; + +#ifdef __GNUC__ + case 0x2e: + // Group 2 instruction prefix. 
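            // Editor's note (hedged): 0F 1F /0 is the documented multi-byte NOP encoding;
            // GCC sometimes pads with a CS-segment-override form such as
            //   2E 0F 1F 84 00 00 00 00 00   (a 9-byte NOP)
            // The handling below simply steps over the 2E/0F prefix bytes and lets the
            // ordinary Mod R/M logic consume the rest of the instruction.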
+ if (ip[1] == 0x0f && ip[2] == 0x1f) + { + // Although not the recommended multi-byte sequence for 9-byte + // nops (the suggestion is to use 0x66 as the prefix), this shows + // up in GCC-optimized code. + ip += 2; + datasize = 0; + goto decodeRM; + } + else + { + goto badOpcode; + } + break; +#endif // __GNUC__ + + default: + badOpcode: + _ASSERTE(!"Bad opcode"); + // FIX what to do here? +#ifndef DACCESS_COMPILE +#ifndef _PREFIX_ + *((unsigned __int8**) 0) = ip; // cause an access violation (Free Build assert) +#endif // !_PREFIX_ +#else + DacNotImpl(); +#endif + goto done; + } + } +done: + _ASSERTE(epilogCallRet == 0); + + // At this point the fields in 'frame' coorespond exactly to the register + // state when the the helper returns to its caller. + lazyState->_esp = dac_cast(ESP); +} +#ifdef _PREFAST_ +#pragma warning(pop) +#endif diff --git a/src/vm/i386/jithelp.asm b/src/vm/i386/jithelp.asm new file mode 100644 index 0000000000..ac767287ee --- /dev/null +++ b/src/vm/i386/jithelp.asm @@ -0,0 +1,2574 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. + +; ==++== +; + +; +; ==--== +; *********************************************************************** +; File: JIThelp.asm +; +; *********************************************************************** +; +; *** NOTE: If you make changes to this file, propagate the changes to +; jithelp.s in this directory + +; This contains JITinterface routines that are 100% x86 assembly + + .586 + .model flat + + include asmconstants.inc + + option casemap:none + .code +; +; @TODO Switch to g_ephemeral_low and g_ephemeral_high +; @TODO instead of g_lowest_address, g_highest address +; + +ARGUMENT_REG1 equ ecx +ARGUMENT_REG2 equ edx +g_ephemeral_low TEXTEQU <_g_ephemeral_low> +g_ephemeral_high TEXTEQU <_g_ephemeral_high> +g_lowest_address TEXTEQU <_g_lowest_address> +g_highest_address TEXTEQU <_g_highest_address> +g_card_table TEXTEQU <_g_card_table> +WriteBarrierAssert TEXTEQU <_WriteBarrierAssert@8> +JIT_LLsh TEXTEQU <_JIT_LLsh@0> +JIT_LRsh TEXTEQU <_JIT_LRsh@0> +JIT_LRsz TEXTEQU <_JIT_LRsz@0> +JIT_LMul TEXTEQU <@JIT_LMul@16> +JIT_Dbl2LngOvf TEXTEQU <@JIT_Dbl2LngOvf@8> +JIT_Dbl2Lng TEXTEQU <@JIT_Dbl2Lng@8> +JIT_Dbl2IntSSE2 TEXTEQU <@JIT_Dbl2IntSSE2@8> +JIT_Dbl2LngP4x87 TEXTEQU <@JIT_Dbl2LngP4x87@8> +JIT_Dbl2LngSSE3 TEXTEQU <@JIT_Dbl2LngSSE3@8> +JIT_InternalThrowFromHelper TEXTEQU <@JIT_InternalThrowFromHelper@4> +JIT_WriteBarrierReg_PreGrow TEXTEQU <_JIT_WriteBarrierReg_PreGrow@0> +JIT_WriteBarrierReg_PostGrow TEXTEQU <_JIT_WriteBarrierReg_PostGrow@0> +JIT_TailCall TEXTEQU <_JIT_TailCall@0> +JIT_TailCallLeave TEXTEQU <_JIT_TailCallLeave@0> +JIT_TailCallVSDLeave TEXTEQU <_JIT_TailCallVSDLeave@0> +JIT_TailCallHelper TEXTEQU <_JIT_TailCallHelper@4> +JIT_TailCallReturnFromVSD TEXTEQU <_JIT_TailCallReturnFromVSD@0> + +EXTERN g_ephemeral_low:DWORD +EXTERN g_ephemeral_high:DWORD +EXTERN g_lowest_address:DWORD +EXTERN g_highest_address:DWORD +EXTERN g_card_table:DWORD +ifdef _DEBUG +EXTERN WriteBarrierAssert:PROC +endif ; _DEBUG +EXTERN JIT_InternalThrowFromHelper:PROC +ifdef FEATURE_HIJACK +EXTERN JIT_TailCallHelper:PROC +endif +EXTERN _g_TailCallFrameVptr:DWORD +EXTERN @JIT_FailFast@0:PROC +EXTERN _s_gsCookie:DWORD +EXTERN @JITutil_IsInstanceOfInterface@8:PROC +EXTERN @JITutil_ChkCastInterface@8:PROC +EXTERN @JITutil_IsInstanceOfAny@8:PROC +EXTERN @JITutil_ChkCastAny@8:PROC +ifdef FEATURE_IMPLICIT_TLS 
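; Editor's note (hedged sketch, not original source): the decorated names used in
; this file follow the Win32 x86 convention - __stdcall symbols get a leading
; underscore and an @N suffix giving the argument-byte count, __fastcall symbols a
; leading '@' with the first two arguments in ecx/edx. The extern below therefore
; corresponds roughly to the C++ declaration
;   Thread* __stdcall GetThread();        // links as _GetThread@0
; while, for example, @JIT_LMul@16 above takes 16 bytes of arguments.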
+EXTERN _GetThread@0:PROC +endif + +ifdef WRITE_BARRIER_CHECK +; Those global variables are always defined, but should be 0 for Server GC +g_GCShadow TEXTEQU +g_GCShadowEnd TEXTEQU +EXTERN g_GCShadow:DWORD +EXTERN g_GCShadowEnd:DWORD +INVALIDGCVALUE equ 0CCCCCCCDh +endif + +ifdef FEATURE_REMOTING +EXTERN _TransparentProxyStub_CrossContext@0:PROC +EXTERN _InContextTPQuickDispatchAsmStub@0:PROC +endif + +.686P +.XMM +; The following macro is needed because of a MASM issue with the +; movsd mnemonic +; +$movsd MACRO op1, op2 + LOCAL begin_movsd, end_movsd +begin_movsd: + movupd op1, op2 +end_movsd: + org begin_movsd + db 0F2h + org end_movsd +ENDM +.586 + +; The following macro is used to match the JITs +; multi-byte NOP sequence +$nop3 MACRO + db 090h + db 090h + db 090h +ENDM + + + +;*** +;JIT_WriteBarrier* - GC write barrier helper +; +;Purpose: +; Helper calls in order to assign an object to a field +; Enables book-keeping of the GC. +; +;Entry: +; EDX - address of ref-field (assigned to) +; the resp. other reg - RHS of assignment +; +;Exit: +; +;Uses: +; EDX is destroyed. +; +;Exceptions: +; +;******************************************************************************* + +; The code here is tightly coupled with AdjustContextForWriteBarrier, if you change +; anything here, you might need to change AdjustContextForWriteBarrier as well +WriteBarrierHelper MACRO rg + ALIGN 4 + + ;; The entry point is the fully 'safe' one in which we check if EDX (the REF + ;; begin updated) is actually in the GC heap + +PUBLIC _JIT_CheckedWriteBarrier&rg&@0 +_JIT_CheckedWriteBarrier&rg&@0 PROC + ;; check in the REF being updated is in the GC heap + cmp edx, g_lowest_address + jb WriteBarrier_NotInHeap_&rg + cmp edx, g_highest_address + jae WriteBarrier_NotInHeap_&rg + + ;; fall through to unchecked routine + ;; note that its entry point also happens to be aligned + +ifdef WRITE_BARRIER_CHECK + ;; This entry point is used when you know the REF pointer being updated + ;; is in the GC heap +PUBLIC _JIT_DebugWriteBarrier&rg&@0 +_JIT_DebugWriteBarrier&rg&@0: +endif + +ifdef _DEBUG + push edx + push ecx + push eax + + push rg + push edx + call WriteBarrierAssert + + pop eax + pop ecx + pop edx +endif ;_DEBUG + + ; in the !WRITE_BARRIER_CHECK case this will be the move for all + ; addresses in the GCHeap, addresses outside the GCHeap will get + ; taken care of below at WriteBarrier_NotInHeap_&rg + +ifndef WRITE_BARRIER_CHECK + mov DWORD PTR [edx], rg +endif + +ifdef WRITE_BARRIER_CHECK + ; Test dest here so if it is bad AV would happen before we change register/stack + ; status. This makes job of AdjustContextForWriteBarrier easier. + cmp [edx], 0 + ;; ALSO update the shadow GC heap if that is enabled + ; Make ebp into the temporary src register. 
We need to do this so that we can use ecx + ; in the calculation of the shadow GC address, but still have access to the src register + push ecx + push ebp + mov ebp, rg + + ; if g_GCShadow is 0, don't perform the check + cmp g_GCShadow, 0 + je WriteBarrier_NoShadow_&rg + + mov ecx, edx + sub ecx, g_lowest_address ; U/V + jb WriteBarrier_NoShadow_&rg + add ecx, [g_GCShadow] + cmp ecx, [g_GCShadowEnd] + ja WriteBarrier_NoShadow_&rg + + ; TODO: In Orcas timeframe if we move to P4+ only on X86 we should enable + ; mfence barriers on either side of these two writes to make sure that + ; they stay as close together as possible + + ; edx contains address in GC + ; ecx contains address in ShadowGC + ; ebp temporarially becomes the src register + + ;; When we're writing to the shadow GC heap we want to be careful to minimize + ;; the risk of a race that can occur here where the GC and ShadowGC don't match + mov DWORD PTR [edx], ebp + mov DWORD PTR [ecx], ebp + + ;; We need a scratch register to verify the shadow heap. We also need to + ;; construct a memory barrier so that the write to the shadow heap happens + ;; before the read from the GC heap. We can do both by using SUB/XCHG + ;; rather than PUSH. + ;; + ;; TODO: Should be changed to a push if the mfence described above is added. + ;; + sub esp, 4 + xchg [esp], eax + + ;; As part of our race avoidance (see above) we will now check whether the values + ;; in the GC and ShadowGC match. There is a possibility that we're wrong here but + ;; being overaggressive means we might mask a case where someone updates GC refs + ;; without going to a write barrier, but by its nature it will be indeterminant + ;; and we will find real bugs whereas the current implementation is indeterminant + ;; but only leads to investigations that find that this code is fundamentally flawed + mov eax, [edx] + cmp [ecx], eax + je WriteBarrier_CleanupShadowCheck_&rg + mov [ecx], INVALIDGCVALUE + +WriteBarrier_CleanupShadowCheck_&rg: + pop eax + + jmp WriteBarrier_ShadowCheckEnd_&rg + +WriteBarrier_NoShadow_&rg: + ; If we come here then we haven't written the value to the GC and need to. + ; ebp contains rg + ; We restore ebp/ecx immediately after this, and if either of them is the src + ; register it will regain its value as the src register. + mov DWORD PTR [edx], ebp +WriteBarrier_ShadowCheckEnd_&rg: + pop ebp + pop ecx +endif + cmp rg, g_ephemeral_low + jb WriteBarrier_NotInEphemeral_&rg + cmp rg, g_ephemeral_high + jae WriteBarrier_NotInEphemeral_&rg + + shr edx, 10 + add edx, [g_card_table] + cmp BYTE PTR [edx], 0FFh + jne WriteBarrier_UpdateCardTable_&rg + ret + +WriteBarrier_UpdateCardTable_&rg: + mov BYTE PTR [edx], 0FFh + ret + +WriteBarrier_NotInHeap_&rg: + ; If it wasn't in the heap then we haven't updated the dst in memory yet + mov DWORD PTR [edx], rg +WriteBarrier_NotInEphemeral_&rg: + ; If it is in the GC Heap but isn't in the ephemeral range we've already + ; updated the Heap with the Object*. + ret +_JIT_CheckedWriteBarrier&rg&@0 ENDP + +ENDM + + +;*** +;JIT_ByRefWriteBarrier* - GC write barrier helper +; +;Purpose: +; Helper calls in order to assign an object to a byref field +; Enables book-keeping of the GC. 
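; Editor's note (illustrative sketch, not original source): for this barrier and
; the checked barrier above, the book-keeping consists of marking the card that
; covers the updated slot whenever the stored reference points into the ephemeral
; generation. With the 1KB cards implied by the 'shr ..., 10' the update is roughly
;   if (g_ephemeral_low <= ref && ref < g_ephemeral_high)
;       g_card_table[(size_t)dst >> 10] = 0xFF;
; and the compare against 0FFh before the store skips the write when the card is
; already dirty.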
+; +;Entry: +; EDI - address of ref-field (assigned to) +; ESI - address of the data (source) +; ECX can be trashed +; +;Exit: +; +;Uses: +; EDI and ESI are incremented by a DWORD +; +;Exceptions: +; +;******************************************************************************* + +; The code here is tightly coupled with AdjustContextForWriteBarrier, if you change +; anything here, you might need to change AdjustContextForWriteBarrier as well + +ByRefWriteBarrierHelper MACRO + ALIGN 4 +PUBLIC _JIT_ByRefWriteBarrier@0 +_JIT_ByRefWriteBarrier@0 PROC + ;;test for dest in range + mov ecx, [esi] + cmp edi, g_lowest_address + jb ByRefWriteBarrier_NotInHeap + cmp edi, g_highest_address + jae ByRefWriteBarrier_NotInHeap + +ifndef WRITE_BARRIER_CHECK + ;;write barrier + mov [edi],ecx +endif + +ifdef WRITE_BARRIER_CHECK + ; Test dest here so if it is bad AV would happen before we change register/stack + ; status. This makes job of AdjustContextForWriteBarrier easier. + cmp [edi], 0 + + ;; ALSO update the shadow GC heap if that is enabled + + ; use edx for address in GC Shadow, + push edx + + ;if g_GCShadow is 0, don't do the update + cmp g_GCShadow, 0 + je ByRefWriteBarrier_NoShadow + + mov edx, edi + sub edx, g_lowest_address ; U/V + jb ByRefWriteBarrier_NoShadow + add edx, [g_GCShadow] + cmp edx, [g_GCShadowEnd] + ja ByRefWriteBarrier_NoShadow + + ; TODO: In Orcas timeframe if we move to P4+ only on X86 we should enable + ; mfence barriers on either side of these two writes to make sure that + ; they stay as close together as possible + + ; edi contains address in GC + ; edx contains address in ShadowGC + ; ecx is the value to assign + + ;; When we're writing to the shadow GC heap we want to be careful to minimize + ;; the risk of a race that can occur here where the GC and ShadowGC don't match + mov DWORD PTR [edi], ecx + mov DWORD PTR [edx], ecx + + ;; We need a scratch register to verify the shadow heap. We also need to + ;; construct a memory barrier so that the write to the shadow heap happens + ;; before the read from the GC heap. We can do both by using SUB/XCHG + ;; rather than PUSH. + ;; + ;; TODO: Should be changed to a push if the mfence described above is added. + ;; + sub esp, 4 + xchg [esp], eax + + ;; As part of our race avoidance (see above) we will now check whether the values + ;; in the GC and ShadowGC match. There is a possibility that we're wrong here but + ;; being overaggressive means we might mask a case where someone updates GC refs + ;; without going to a write barrier, but by its nature it will be indeterminant + ;; and we will find real bugs whereas the current implementation is indeterminant + ;; but only leads to investigations that find that this code is fundamentally flawed + + mov eax, [edi] + cmp [edx], eax + je ByRefWriteBarrier_CleanupShadowCheck + mov [edx], INVALIDGCVALUE +ByRefWriteBarrier_CleanupShadowCheck: + pop eax + jmp ByRefWriteBarrier_ShadowCheckEnd + +ByRefWriteBarrier_NoShadow: + ; If we come here then we haven't written the value to the GC and need to. 
+ mov DWORD PTR [edi], ecx + +ByRefWriteBarrier_ShadowCheckEnd: + pop edx +endif + ;;test for *src in ephemeral segement + cmp ecx, g_ephemeral_low + jb ByRefWriteBarrier_NotInEphemeral + cmp ecx, g_ephemeral_high + jae ByRefWriteBarrier_NotInEphemeral + + mov ecx, edi + add esi,4 + add edi,4 + + shr ecx, 10 + add ecx, [g_card_table] + cmp byte ptr [ecx], 0FFh + jne ByRefWriteBarrier_UpdateCardTable + ret +ByRefWriteBarrier_UpdateCardTable: + mov byte ptr [ecx], 0FFh + ret + +ByRefWriteBarrier_NotInHeap: + ; If it wasn't in the heap then we haven't updated the dst in memory yet + mov [edi],ecx +ByRefWriteBarrier_NotInEphemeral: + ; If it is in the GC Heap but isn't in the ephemeral range we've already + ; updated the Heap with the Object*. + add esi,4 + add edi,4 + ret +_JIT_ByRefWriteBarrier@0 ENDP +ENDM + +;******************************************************************************* +; Write barrier wrappers with fcall calling convention +; +UniversalWriteBarrierHelper MACRO name + ALIGN 4 +PUBLIC @JIT_&name&@8 +@JIT_&name&@8 PROC + mov eax,edx + mov edx,ecx + jmp _JIT_&name&EAX@0 +@JIT_&name&@8 ENDP +ENDM + +; WriteBarrierStart and WriteBarrierEnd are used to determine bounds of +; WriteBarrier functions so can determine if got AV in them. +; +PUBLIC _JIT_WriteBarrierStart@0 +_JIT_WriteBarrierStart@0 PROC +ret +_JIT_WriteBarrierStart@0 ENDP + +ifdef FEATURE_USE_ASM_GC_WRITE_BARRIERS +; Only define these if we're using the ASM GC write barriers; if this flag is not defined, +; we'll use C++ versions of these write barriers. +UniversalWriteBarrierHelper +UniversalWriteBarrierHelper +endif + +WriteBarrierHelper +WriteBarrierHelper +WriteBarrierHelper +WriteBarrierHelper +WriteBarrierHelper +WriteBarrierHelper + +ByRefWriteBarrierHelper + +PUBLIC _JIT_WriteBarrierLast@0 +_JIT_WriteBarrierLast@0 PROC +ret +_JIT_WriteBarrierLast@0 ENDP + +; This is the first function outside the "keep together range". Used by BBT scripts. +PUBLIC _JIT_WriteBarrierEnd@0 +_JIT_WriteBarrierEnd@0 PROC +ret +_JIT_WriteBarrierEnd@0 ENDP + +;*********************************************************************/ +; In cases where we support it we have an optimized GC Poll callback. Normall (when we're not trying to +; suspend for GC, the CORINFO_HELP_POLL_GC helper points to this nop routine. When we're ready to suspend +; for GC, we whack the Jit Helper table entry to point to the real helper. When we're done with GC we +; whack it back. +PUBLIC @JIT_PollGC_Nop@0 +@JIT_PollGC_Nop@0 PROC +ret +@JIT_PollGC_Nop@0 ENDP + +;*********************************************************************/ +;llshl - long shift left +; +;Purpose: +; Does a Long Shift Left (signed and unsigned are identical) +; Shifts a long left any number of bits. +; +; NOTE: This routine has been adapted from the Microsoft CRTs. +; +;Entry: +; EDX:EAX - long value to be shifted +; ECX - number of bits to shift by +; +;Exit: +; EDX:EAX - shifted value +; + ALIGN 16 +PUBLIC JIT_LLsh +JIT_LLsh PROC +; Handle shifts of between bits 0 and 31 + cmp ecx, 32 + jae short LLshMORE32 + shld edx,eax,cl + shl eax,cl + ret +; Handle shifts of between bits 32 and 63 +LLshMORE32: + ; The x86 shift instructions only use the lower 5 bits. + mov edx,eax + xor eax,eax + shl edx,cl + ret +JIT_LLsh ENDP + + +;*********************************************************************/ +;LRsh - long shift right +; +;Purpose: +; Does a signed Long Shift Right +; Shifts a long right any number of bits. +; +; NOTE: This routine has been adapted from the Microsoft CRTs. 
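; Editor's note (hedged sketch): in C terms the routine below computes, with the
; 64-bit value in edx:eax and the count in ecx,
;   if (count < 32) { lo = (lo >> count) | (hi << (32 - count)); hi >>= count; }
;   else            { lo = hi >> (count - 32); hi = hi >> 31; /* sign fill */ }
; where the shifts of 'hi' are arithmetic. The 'count - 32' is implicit because the
; x86 shift instructions only look at the low 5 bits of CL.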
+; +;Entry: +; EDX:EAX - long value to be shifted +; ECX - number of bits to shift by +; +;Exit: +; EDX:EAX - shifted value +; + ALIGN 16 +PUBLIC JIT_LRsh +JIT_LRsh PROC +; Handle shifts of between bits 0 and 31 + cmp ecx, 32 + jae short LRshMORE32 + shrd eax,edx,cl + sar edx,cl + ret +; Handle shifts of between bits 32 and 63 +LRshMORE32: + ; The x86 shift instructions only use the lower 5 bits. + mov eax,edx + sar edx, 31 + sar eax,cl + ret +JIT_LRsh ENDP + + +;*********************************************************************/ +; LRsz: +;Purpose: +; Does a unsigned Long Shift Right +; Shifts a long right any number of bits. +; +; NOTE: This routine has been adapted from the Microsoft CRTs. +; +;Entry: +; EDX:EAX - long value to be shifted +; ECX - number of bits to shift by +; +;Exit: +; EDX:EAX - shifted value +; + ALIGN 16 +PUBLIC JIT_LRsz +JIT_LRsz PROC +; Handle shifts of between bits 0 and 31 + cmp ecx, 32 + jae short LRszMORE32 + shrd eax,edx,cl + shr edx,cl + ret +; Handle shifts of between bits 32 and 63 +LRszMORE32: + ; The x86 shift instructions only use the lower 5 bits. + mov eax,edx + xor edx,edx + shr eax,cl + ret +JIT_LRsz ENDP + +;*********************************************************************/ +; LMul: +;Purpose: +; Does a long multiply (same for signed/unsigned) +; +; NOTE: This routine has been adapted from the Microsoft CRTs. +; +;Entry: +; Parameters are passed on the stack: +; 1st pushed: multiplier (QWORD) +; 2nd pushed: multiplicand (QWORD) +; +;Exit: +; EDX:EAX - product of multiplier and multiplicand +; + ALIGN 16 +PUBLIC JIT_LMul +JIT_LMul PROC + +; AHI, BHI : upper 32 bits of A and B +; ALO, BLO : lower 32 bits of A and B +; +; ALO * BLO +; ALO * BHI +; + BLO * AHI +; --------------------- + + mov eax,[esp + 8] ; AHI + mov ecx,[esp + 16] ; BHI + or ecx,eax ;test for both hiwords zero. + mov ecx,[esp + 12] ; BLO + jnz LMul_hard ;both are zero, just mult ALO and BLO + + mov eax,[esp + 4] + mul ecx + + ret 16 ; callee restores the stack + +LMul_hard: + push ebx + + mul ecx ;eax has AHI, ecx has BLO, so AHI * BLO + mov ebx,eax ;save result + + mov eax,[esp + 8] ; ALO + mul dword ptr [esp + 20] ;ALO * BHI + add ebx,eax ;ebx = ((ALO * BHI) + (AHI * BLO)) + + mov eax,[esp + 8] ; ALO ;ecx = BLO + mul ecx ;so edx:eax = ALO*BLO + add edx,ebx ;now edx has all the LO*HI stuff + + pop ebx + + ret 16 ; callee restores the stack + +JIT_LMul ENDP + +;*********************************************************************/ +; JIT_Dbl2LngOvf + +;Purpose: +; converts a double to a long truncating toward zero (C semantics) +; with check for overflow +; +; uses stdcall calling conventions +; +PUBLIC JIT_Dbl2LngOvf +JIT_Dbl2LngOvf PROC + fnclex + fld qword ptr [esp+4] + push ecx + push ecx + fstp qword ptr [esp] + call JIT_Dbl2Lng + mov ecx,eax + fnstsw ax + test ax,01h + jnz Dbl2LngOvf_throw + mov eax,ecx + ret 8 + +Dbl2LngOvf_throw: + mov ECX, CORINFO_OverflowException_ASM + call JIT_InternalThrowFromHelper + ret 8 +JIT_Dbl2LngOvf ENDP + +;*********************************************************************/ +; JIT_Dbl2Lng + +;Purpose: +; converts a double to a long truncating toward zero (C semantics) +; +; uses stdcall calling conventions +; +; note that changing the rounding mode is very expensive. This +; routine basiclly does the truncation sematics without changing +; the rounding mode, resulting in a win. 
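; Editor's note (hedged sketch, not original source): instead of switching the FPCW
; to round-toward-zero, the routine below converts with the default round-to-nearest
; mode and then corrects the result by one, roughly:
;   __int64 r   = round_to_nearest(d);         (the fistp/fild pair)
;   float   dif = (float)(d - (double)r);      (difference kept in single precision)
;   if (d >= 0 && dif < 0) r -= 1;             (rounded away from zero: pull back)
;   if (d <  0 && dif > 0) r += 1;             (rounded away from zero: pull back)
; which gives C truncation-toward-zero semantics without the costly FPCW round-trip.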
+; +PUBLIC JIT_Dbl2Lng +JIT_Dbl2Lng PROC + fld qword ptr[ESP+4] ; fetch arg + lea ecx,[esp-8] + sub esp,16 ; allocate frame + and ecx,-8 ; align pointer on boundary of 8 + fld st(0) ; duplciate top of stack + fistp qword ptr[ecx] ; leave arg on stack, also save in temp + fild qword ptr[ecx] ; arg, round(arg) now on stack + mov edx,[ecx+4] ; high dword of integer + mov eax,[ecx] ; low dword of integer + test eax,eax + je integer_QNaN_or_zero + +arg_is_not_integer_QNaN: + fsubp st(1),st ; TOS=d-round(d), + ; { st(1)=st(1)-st & pop ST } + test edx,edx ; what's sign of integer + jns positive + ; number is negative + ; dead cycle + ; dead cycle + fstp dword ptr[ecx] ; result of subtraction + mov ecx,[ecx] ; dword of difference(single precision) + add esp,16 + xor ecx,80000000h + add ecx,7fffffffh ; if difference>0 then increment integer + adc eax,0 ; inc eax (add CARRY flag) + adc edx,0 ; propagate carry flag to upper bits + ret 8 + +positive: + fstp dword ptr[ecx] ;17-18 ; result of subtraction + mov ecx,[ecx] ; dword of difference (single precision) + add esp,16 + add ecx,7fffffffh ; if difference<0 then decrement integer + sbb eax,0 ; dec eax (subtract CARRY flag) + sbb edx,0 ; propagate carry flag to upper bits + ret 8 + +integer_QNaN_or_zero: + test edx,7fffffffh + jnz arg_is_not_integer_QNaN + fstp st(0) ;; pop round(arg) + fstp st(0) ;; arg + add esp,16 + ret 8 +JIT_Dbl2Lng ENDP + +;*********************************************************************/ +; JIT_Dbl2LngP4x87 + +;Purpose: +; converts a double to a long truncating toward zero (C semantics) +; +; uses stdcall calling conventions +; +; This code is faster on a P4 than the Dbl2Lng code above, but is +; slower on a PIII. Hence we choose this code when on a P4 or above. +; +PUBLIC JIT_Dbl2LngP4x87 +JIT_Dbl2LngP4x87 PROC +arg1 equ <[esp+0Ch]> + + sub esp, 8 ; get some local space + + fld qword ptr arg1 ; fetch arg + fnstcw word ptr arg1 ; store FPCW + movzx eax, word ptr arg1 ; zero extend - wide + or ah, 0Ch ; turn on OE and DE flags + mov dword ptr [esp], eax ; store new FPCW bits + fldcw word ptr [esp] ; reload FPCW with new bits + fistp qword ptr [esp] ; convert + mov eax, dword ptr [esp] ; reload FP result + mov edx, dword ptr [esp+4] ; + fldcw word ptr arg1 ; reload original FPCW value + + add esp, 8 ; restore stack + + ret 8 +JIT_Dbl2LngP4x87 ENDP + +;*********************************************************************/ +; JIT_Dbl2LngSSE3 + +;Purpose: +; converts a double to a long truncating toward zero (C semantics) +; +; uses stdcall calling conventions +; +; This code is faster than the above P4 x87 code for Intel processors +; equal or later than Core2 and Atom that have SSE3 support +; +.686P +.XMM +PUBLIC JIT_Dbl2LngSSE3 +JIT_Dbl2LngSSE3 PROC +arg1 equ <[esp+0Ch]> + + sub esp, 8 ; get some local space + + fld qword ptr arg1 ; fetch arg + fisttp qword ptr [esp] ; convert + mov eax, dword ptr [esp] ; reload FP result + mov edx, dword ptr [esp+4] + + add esp, 8 ; restore stack + + ret 8 +JIT_Dbl2LngSSE3 ENDP +.586 + +;*********************************************************************/ +; JIT_Dbl2IntSSE2 + +;Purpose: +; converts a double to a long truncating toward zero (C semantics) +; +; uses stdcall calling conventions +; +; This code is even faster than the P4 x87 code for Dbl2LongP4x87, +; but only returns a 32 bit value (only good for int). 
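; Editor's note (hedged): the SSE2 variant below needs no correction step at all,
; because cvttsd2si truncates toward zero directly; in C it is simply
;   int JIT_Dbl2IntSSE2(double d) { return (int)d; }
; The $movsd macro defined near the top of this file is used instead of the movsd
; mnemonic because MASM would otherwise treat 'movsd' as the string-move
; instruction; the macro assembles movupd (66 0F 10) and then patches the 66 prefix
; byte to F2, producing the scalar-double load encoding.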
+; +.686P +.XMM +PUBLIC JIT_Dbl2IntSSE2 +JIT_Dbl2IntSSE2 PROC + $movsd xmm0, [esp+4] + cvttsd2si eax, xmm0 + ret 8 +JIT_Dbl2IntSSE2 ENDP +.586 + + +;*********************************************************************/ +; This is the small write barrier thunk we use when we know the +; ephemeral generation is higher in memory than older generations. +; The 0x0F0F0F0F values are bashed by the two functions above. +; This the generic version - wherever the code says ECX, +; the specific register is patched later into a copy +; Note: do not replace ECX by EAX - there is a smaller encoding for +; the compares just for EAX, which won't work for other registers. +; +; READ THIS!!!!!! +; it is imperative that the addresses of of the values that we overwrite +; (card table, ephemeral region ranges, etc) are naturally aligned since +; there are codepaths that will overwrite these values while the EE is running. +; +PUBLIC JIT_WriteBarrierReg_PreGrow +JIT_WriteBarrierReg_PreGrow PROC + mov DWORD PTR [edx], ecx + cmp ecx, 0F0F0F0F0h + jb NoWriteBarrierPre + + shr edx, 10 + nop ; padding for alignment of constant + cmp byte ptr [edx+0F0F0F0F0h], 0FFh + jne WriteBarrierPre +NoWriteBarrierPre: + ret + nop ; padding for alignment of constant + nop ; padding for alignment of constant +WriteBarrierPre: + mov byte ptr [edx+0F0F0F0F0h], 0FFh + ret +JIT_WriteBarrierReg_PreGrow ENDP + +;*********************************************************************/ +; This is the larger write barrier thunk we use when we know that older +; generations may be higher in memory than the ephemeral generation +; The 0x0F0F0F0F values are bashed by the two functions above. +; This the generic version - wherever the code says ECX, +; the specific register is patched later into a copy +; Note: do not replace ECX by EAX - there is a smaller encoding for +; the compares just for EAX, which won't work for other registers. +; NOTE: we need this aligned for our validation to work properly + ALIGN 4 +PUBLIC JIT_WriteBarrierReg_PostGrow +JIT_WriteBarrierReg_PostGrow PROC + mov DWORD PTR [edx], ecx + cmp ecx, 0F0F0F0F0h + jb NoWriteBarrierPost + cmp ecx, 0F0F0F0F0h + jae NoWriteBarrierPost + + shr edx, 10 + nop ; padding for alignment of constant + cmp byte ptr [edx+0F0F0F0F0h], 0FFh + jne WriteBarrierPost +NoWriteBarrierPost: + ret + nop ; padding for alignment of constant + nop ; padding for alignment of constant +WriteBarrierPost: + mov byte ptr [edx+0F0F0F0F0h], 0FFh + ret +JIT_WriteBarrierReg_PostGrow ENDP + +;*********************************************************************/ +; + + ; a fake virtual stub dispatch register indirect callsite + $nop3 + call dword ptr [eax] + + +PUBLIC JIT_TailCallReturnFromVSD +JIT_TailCallReturnFromVSD: +ifdef _DEBUG + nop ; blessed callsite +endif + call VSDHelperLabel ; keep call-ret count balanced. +VSDHelperLabel: + +; Stack at this point : +; ... 
+; m_ReturnAddress +; m_regs +; m_CallerAddress +; m_pThread +; vtbl +; GSCookie +; &VSDHelperLabel +OffsetOfTailCallFrame = 8 + +; ebx = pThread + +ifdef _DEBUG + mov esi, _s_gsCookie ; GetProcessGSCookie() + cmp dword ptr [esp+OffsetOfTailCallFrame-SIZEOF_GSCookie], esi + je TailCallFrameGSCookieIsValid + call @JIT_FailFast@0 + TailCallFrameGSCookieIsValid: +endif + ; remove the padding frame from the chain + mov esi, dword ptr [esp+OffsetOfTailCallFrame+4] ; esi = TailCallFrame::m_Next + mov dword ptr [ebx + Thread_m_pFrame], esi + + ; skip the frame + add esp, 20 ; &VSDHelperLabel, GSCookie, vtbl, m_Next, m_CallerAddress + + pop edi ; restore callee saved registers + pop esi + pop ebx + pop ebp + + ret ; return to m_ReturnAddress + +;------------------------------------------------------------------------------ +; + +PUBLIC JIT_TailCall +JIT_TailCall PROC + +; the stack layout at this point is: +; +; ebp+8+4*nOldStackArgs <- end of argument destination +; ... ... +; ebp+8+ old args (size is nOldStackArgs) +; ... ... +; ebp+8 <- start of argument destination +; ebp+4 ret addr +; ebp+0 saved ebp +; ebp-c saved ebx, esi, edi (if have callee saved regs = 1) +; +; other stuff (local vars) in the jitted callers' frame +; +; esp+20+4*nNewStackArgs <- end of argument source +; ... ... +; esp+20+ new args (size is nNewStackArgs) to be passed to the target of the tail-call +; ... ... +; esp+20 <- start of argument source +; esp+16 nOldStackArgs +; esp+12 nNewStackArgs +; esp+8 flags (1 = have callee saved regs, 2 = virtual stub dispatch) +; esp+4 target addr +; esp+0 retaddr +; +; If you change this function, make sure you update code:TailCallStubManager as well. + +RetAddr equ 0 +TargetAddr equ 4 +nNewStackArgs equ 12 +nOldStackArgs equ 16 +NewArgs equ 20 + +; extra space is incremented as we push things on the stack along the way +ExtraSpace = 0 + + call _GetThread@0; eax = Thread* + push eax ; Thread* + + ; save ArgumentRegisters + push ecx + push edx + +ExtraSpace = 12 ; pThread, ecx, edx + +ifdef FEATURE_HIJACK + ; Make sure that the EE does have the return address patched. So we can move it around. 
+ test dword ptr [eax+Thread_m_State], TS_Hijacked_ASM + jz NoHijack + + ; JIT_TailCallHelper(Thread *) + push eax + call JIT_TailCallHelper ; this is __stdcall + +NoHijack: +endif + + mov edx, dword ptr [esp+ExtraSpace+JIT_TailCall_StackOffsetToFlags] ; edx = flags + + mov eax, dword ptr [esp+ExtraSpace+nOldStackArgs] ; eax = nOldStackArgs + mov ecx, dword ptr [esp+ExtraSpace+nNewStackArgs] ; ecx = nNewStackArgs + + ; restore callee saved registers + ; @TODO : esp based - doesnt work with localloc + test edx, 1 + jz NoCalleeSaveRegisters + + mov edi, dword ptr [ebp-4] ; restore edi + mov esi, dword ptr [ebp-8] ; restore esi + mov ebx, dword ptr [ebp-12] ; restore ebx + +NoCalleeSaveRegisters: + + push dword ptr [ebp+4] ; save the original return address for later + push edi + push esi + +ExtraSpace = 24 ; pThread, ecx, edx, orig retaddr, edi, esi +CallersEsi = 0 +CallersEdi = 4 +OrigRetAddr = 8 +pThread = 20 + + lea edi, [ebp+8+4*eax] ; edi = the end of argument destination + lea esi, [esp+ExtraSpace+NewArgs+4*ecx] ; esi = the end of argument source + + mov ebp, dword ptr [ebp] ; restore ebp (do not use ebp as scratch register to get a good stack trace in debugger) + + test edx, 2 + jnz VSDTailCall + + ; copy the arguments to the final destination + test ecx, ecx + jz ArgumentsCopied +ArgumentCopyLoop: + ; At this point, this is the value of the registers : + ; edi = end of argument dest + ; esi = end of argument source + ; ecx = nNewStackArgs + mov eax, dword ptr [esi-4] + sub edi, 4 + sub esi, 4 + mov dword ptr [edi], eax + dec ecx + jnz ArgumentCopyLoop +ArgumentsCopied: + + ; edi = the start of argument destination + + mov eax, dword ptr [esp+4+4] ; return address + mov ecx, dword ptr [esp+ExtraSpace+TargetAddr] ; target address + + mov dword ptr [edi-4], eax ; return address + mov dword ptr [edi-8], ecx ; target address + + lea eax, [edi-8] ; new value for esp + + pop esi + pop edi + pop ecx ; skip original return address + pop edx + pop ecx + + mov esp, eax + +PUBLIC JIT_TailCallLeave ; add a label here so that TailCallStubManager can access it +JIT_TailCallLeave: + retn ; Will branch to targetAddr. This matches the + ; "call" done by JITted code, keeping the + ; call-ret count balanced. + + ;---------------------------------------------------------------------- +VSDTailCall: + ;---------------------------------------------------------------------- + + ; For the Virtual Stub Dispatch, we create a fake callsite to fool + ; the callsite probes. In order to create the call site, we need to insert TailCallFrame + ; if we do not have one already. + ; + ; ecx = nNewStackArgs + ; esi = the end of argument source + ; edi = the end of argument destination + ; + ; The stub has pushed the following onto the stack at this point : + ; pThread, ecx, edx, orig retaddr, edi, esi + + + cmp dword ptr [esp+OrigRetAddr], JIT_TailCallReturnFromVSD + jz VSDTailCallFrameInserted_DoSlideUpArgs ; There is an exiting TailCallFrame that can be reused + + ; try to allocate space for the frame / check whether there is enough space + ; If there is sufficient space, we will setup the frame and then slide + ; the arguments up the stack. Else, we first need to slide the arguments + ; down the stack to make space for the TailCallFrame + sub edi, (SIZEOF_GSCookie + SIZEOF_TailCallFrame) + cmp edi, esi + jae VSDSpaceForFrameChecked + + ; There is not sufficient space to wedge in the TailCallFrame without + ; overwriting the new arguments. 
+ ; We need to allocate the extra space on the stack, + ; and slide down the new arguments + + mov eax, esi + sub eax, edi + sub esp, eax + + mov eax, ecx ; to subtract the size of arguments + mov edx, ecx ; for counter + + neg eax + + ; copy down the arguments to the final destination, need to copy all temporary storage as well + add edx, (ExtraSpace+NewArgs)/4 + + lea esi, [esi+4*eax-(ExtraSpace+NewArgs)] + lea edi, [edi+4*eax-(ExtraSpace+NewArgs)] + +VSDAllocFrameCopyLoop: + mov eax, dword ptr [esi] + mov dword ptr [edi], eax + add esi, 4 + add edi, 4 + dec edx + jnz VSDAllocFrameCopyLoop + + ; the argument source and destination are same now + mov esi, edi + +VSDSpaceForFrameChecked: + + ; At this point, we have enough space on the stack for the TailCallFrame, + ; and we may already have slided down the arguments + + mov eax, _s_gsCookie ; GetProcessGSCookie() + mov dword ptr [edi], eax ; set GSCookie + mov eax, _g_TailCallFrameVptr ; vptr + mov edx, dword ptr [esp+OrigRetAddr] ; orig return address + mov dword ptr [edi+SIZEOF_GSCookie], eax ; TailCallFrame::vptr + mov dword ptr [edi+SIZEOF_GSCookie+28], edx ; TailCallFrame::m_ReturnAddress + + mov eax, dword ptr [esp+CallersEdi] ; restored edi + mov edx, dword ptr [esp+CallersEsi] ; restored esi + mov dword ptr [edi+SIZEOF_GSCookie+12], eax ; TailCallFrame::m_regs::edi + mov dword ptr [edi+SIZEOF_GSCookie+16], edx ; TailCallFrame::m_regs::esi + mov dword ptr [edi+SIZEOF_GSCookie+20], ebx ; TailCallFrame::m_regs::ebx + mov dword ptr [edi+SIZEOF_GSCookie+24], ebp ; TailCallFrame::m_regs::ebp + + mov ebx, dword ptr [esp+pThread] ; ebx = pThread + + mov eax, dword ptr [ebx+Thread_m_pFrame] + lea edx, [edi+SIZEOF_GSCookie] + mov dword ptr [edi+SIZEOF_GSCookie+4], eax ; TailCallFrame::m_pNext + mov dword ptr [ebx+Thread_m_pFrame], edx ; hook the new frame into the chain + + ; setup ebp chain + lea ebp, [edi+SIZEOF_GSCookie+24] ; TailCallFrame::m_regs::ebp + + ; Do not copy arguments again if they are in place already + ; Otherwise, we will need to slide the new arguments up the stack + cmp esi, edi + jne VSDTailCallFrameInserted_DoSlideUpArgs + + ; At this point, we must have already previously slided down the new arguments, + ; or the TailCallFrame is a perfect fit + ; set the caller address + mov edx, dword ptr [esp+ExtraSpace+RetAddr] ; caller address + mov dword ptr [edi+SIZEOF_GSCookie+8], edx ; TailCallFrame::m_CallerAddress + + ; adjust edi as it would by copying + neg ecx + lea edi, [edi+4*ecx] + + jmp VSDArgumentsCopied + +VSDTailCallFrameInserted_DoSlideUpArgs: + ; set the caller address + mov edx, dword ptr [esp+ExtraSpace+RetAddr] ; caller address + mov dword ptr [edi+SIZEOF_GSCookie+8], edx ; TailCallFrame::m_CallerAddress + + ; copy the arguments to the final destination + test ecx, ecx + jz VSDArgumentsCopied +VSDArgumentCopyLoop: + mov eax, dword ptr [esi-4] + sub edi, 4 + sub esi, 4 + mov dword ptr [edi], eax + dec ecx + jnz VSDArgumentCopyLoop +VSDArgumentsCopied: + + ; edi = the start of argument destination + + mov ecx, dword ptr [esp+ExtraSpace+TargetAddr] ; target address + + mov dword ptr [edi-4], JIT_TailCallReturnFromVSD ; return address + mov dword ptr [edi-12], ecx ; address of indirection cell + mov ecx, [ecx] + mov dword ptr [edi-8], ecx ; target address + + ; skip original return address and saved esi, edi + add esp, 12 + + pop edx + pop ecx + + lea esp, [edi-12] ; new value for esp + pop eax + +PUBLIC JIT_TailCallVSDLeave ; add a label here so that TailCallStubManager can access it +JIT_TailCallVSDLeave: + 
retn ; Will branch to targetAddr. This matches the + ; "call" done by JITted code, keeping the + ; call-ret count balanced. + +JIT_TailCall ENDP + + +;------------------------------------------------------------------------------ + +; HCIMPL2_VV(float, JIT_FltRem, float dividend, float divisor) +@JIT_FltRem@8 proc public + fld dword ptr [esp+4] ; divisor + fld dword ptr [esp+8] ; dividend +fremloop: + fprem + fstsw ax + fwait + sahf + jp fremloop ; Continue while the FPU status bit C2 is set + fxch ; swap, so divisor is on top and result is in st(1) + fstp ST(0) ; Pop the divisor from the FP stack + retn 8 ; Return value is in st(0) +@JIT_FltRem@8 endp + +; HCIMPL2_VV(float, JIT_DblRem, float dividend, float divisor) +@JIT_DblRem@16 proc public + fld qword ptr [esp+4] ; divisor + fld qword ptr [esp+12] ; dividend +fremloopd: + fprem + fstsw ax + fwait + sahf + jp fremloopd ; Continue while the FPU status bit C2 is set + fxch ; swap, so divisor is on top and result is in st(1) + fstp ST(0) ; Pop the divisor from the FP stack + retn 16 ; Return value is in st(0) +@JIT_DblRem@16 endp + +;------------------------------------------------------------------------------ + +g_SystemInfo TEXTEQU +g_SpinConstants TEXTEQU +g_pSyncTable TEXTEQU +JITutil_MonEnterWorker TEXTEQU <@JITutil_MonEnterWorker@4> +JITutil_MonReliableEnter TEXTEQU <@JITutil_MonReliableEnter@8> +JITutil_MonTryEnter TEXTEQU <@JITutil_MonTryEnter@12> +JITutil_MonExitWorker TEXTEQU <@JITutil_MonExitWorker@4> +JITutil_MonContention TEXTEQU <@JITutil_MonContention@4> +JITutil_MonReliableContention TEXTEQU <@JITutil_MonReliableContention@8> +JITutil_MonSignal TEXTEQU <@JITutil_MonSignal@4> +JIT_InternalThrow TEXTEQU <@JIT_InternalThrow@4> +EXTRN g_SystemInfo:BYTE +EXTRN g_SpinConstants:BYTE +EXTRN g_pSyncTable:DWORD +EXTRN JITutil_MonEnterWorker:PROC +EXTRN JITutil_MonReliableEnter:PROC +EXTRN JITutil_MonTryEnter:PROC +EXTRN JITutil_MonExitWorker:PROC +EXTRN JITutil_MonContention:PROC +EXTRN JITutil_MonReliableContention:PROC +EXTRN JITutil_MonSignal:PROC +EXTRN JIT_InternalThrow:PROC + +ifdef MON_DEBUG +ifdef TRACK_SYNC +EnterSyncHelper TEXTEQU <_EnterSyncHelper@8> +LeaveSyncHelper TEXTEQU <_LeaveSyncHelper@8> +EXTRN EnterSyncHelper:PROC +EXTRN LeaveSyncHelper:PROC +endif ;TRACK_SYNC +endif ;MON_DEBUG + +; The following macro is needed because MASM returns +; "instruction prefix not allowed" error message for +; rep nop mnemonic +$repnop MACRO + db 0F3h + db 090h +ENDM + +; Safe ThreadAbort does not abort a thread if it is running finally or has lock counts. +; At the time we call Monitor.Enter, we initiate the abort if we can. +; We do not need to do the same for Monitor.Leave, since most of time, Monitor.Leave is called +; during finally. + +;********************************************************************** +; This is a frameless helper for entering a monitor on a object. +; The object is in ARGUMENT_REG1. This tries the normal case (no +; blocking or object allocation) in line and calls a framed helper +; for the other cases. +; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined +; to make sure you don't break the non-debug build. This is very fragile code. +; Also, propagate the changes to jithelp.s which contains the same helper and assembly code +; (in AT&T syntax) for gnu assembler. 
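; Editor's note (hedged sketch, not part of the original source): the fast path
; below operates on the object header dword at [obj - SyncBlockIndexOffset].
; In rough C terms:
;   DWORD old = header;
;   if ((old & SBLK_COMBINED_MASK) == 0)                     (thin lock, free)
;       try cmpxchg(header, old | currentThreadId);
;   else if ((old & SBLK_MASK_LOCK_THREADID) == currentThreadId)
;       try cmpxchg(header, old + SBLK_LOCK_RECLEVEL_INC);   (recursive acquire)
;   else
;       spin with exponential backoff, then fall back to the sync-block path.
; The exact bit assignments come from asmconstants.inc, included at the top of this
; file.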
+@JIT_MonEnterWorker@4 proc public + ; Initialize delay value for retry with exponential backoff + push ebx + mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration + + ; We need yet another register to avoid refetching the thread object + push esi + + ; Check if the instance is NULL. + test ARGUMENT_REG1, ARGUMENT_REG1 + jz MonEnterFramedLockHelper + + call _GetThread@0 + mov esi,eax + + ; Check if we can abort here + mov eax, [esi+Thread_m_State] + and eax, TS_CatchAtSafePoint_ASM + jz MonEnterRetryThinLock + ; go through the slow code path to initiate ThreadAbort. + jmp MonEnterFramedLockHelper + +MonEnterRetryThinLock: + ; Fetch the object header dword + mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM] + + ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set + ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL + test eax, SBLK_COMBINED_MASK_ASM + jnz MonEnterNeedMoreTests + + ; Everything is fine - get the thread id to store in the lock + mov edx, [esi+Thread_m_ThreadId] + + ; If the thread id is too large, we need a syncblock for sure + cmp edx, SBLK_MASK_LOCK_THREADID_ASM + ja MonEnterFramedLockHelper + + ; We want to store a new value with the current thread id set in the low 10 bits + or edx,eax + lock cmpxchg dword ptr [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx + jnz MonEnterPrepareToWaitThinLock + + ; Everything went fine and we're done + add [esi+Thread_m_dwLockCount],1 + pop esi + pop ebx + ret + +MonEnterNeedMoreTests: + ; Ok, it's not the simple case - find out which case it is + test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM + jnz MonEnterHaveHashOrSyncBlockIndex + + ; The header is transitioning or the lock - treat this as if the lock was taken + test eax, BIT_SBLK_SPIN_LOCK_ASM + jnz MonEnterPrepareToWaitThinLock + + ; Here we know we have the "thin lock" layout, but the lock is not free. + ; It could still be the recursion case - compare the thread id to check + mov edx,eax + and edx, SBLK_MASK_LOCK_THREADID_ASM + cmp edx, [esi+Thread_m_ThreadId] + jne MonEnterPrepareToWaitThinLock + + ; Ok, the thread id matches, it's the recursion case. + ; Bump up the recursion level and check for overflow + lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM] + test edx, SBLK_MASK_LOCK_RECLEVEL_ASM + jz MonEnterFramedLockHelper + + ; Try to put the new recursion level back. If the header was changed in the meantime, + ; we need a full retry, because the layout could have changed. 
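    ; Editor's note (sketch, hedged): the lock cmpxchg below compares EAX (the
    ; header value read at MonEnterRetryThinLock) against the live header dword;
    ; if another thread changed it in the meantime the exchange fails and control
    ; goes back to re-read and re-classify the header, so the overall shape is
    ;   for (;;) { DWORD old = header; DWORD newv = build_from(old);
    ;              if (InterlockedCompareExchange(&header, newv, old) == old) break; }
    ; where build_from is a stand-in for the classification above, rather than a
    ; blind retry of the same write.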
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx + jnz MonEnterRetryHelperThinLock + + ; Everything went fine and we're done + pop esi + pop ebx + ret + +MonEnterPrepareToWaitThinLock: + ; If we are on an MP system, we try spinning for a certain number of iterations + cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1 + jle MonEnterFramedLockHelper + + ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII) + mov eax, ebx +MonEnterdelayLoopThinLock: + $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs) + dec eax + jnz MonEnterdelayLoopThinLock + + ; next time, wait a factor longer + imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor + + cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration + jle MonEnterRetryHelperThinLock + + jmp MonEnterFramedLockHelper + +MonEnterRetryHelperThinLock: + jmp MonEnterRetryThinLock + +MonEnterHaveHashOrSyncBlockIndex: + ; If we have a hash code already, we need to create a sync block + test eax, BIT_SBLK_IS_HASHCODE_ASM + jnz MonEnterFramedLockHelper + + ; Ok, we have a sync block index - just and out the top bits and grab the syncblock index + and eax, MASK_SYNCBLOCKINDEX_ASM + + ; Get the sync block pointer. + mov ARGUMENT_REG2, dword ptr g_pSyncTable + mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock] + + ; Check if the sync block has been allocated. + test ARGUMENT_REG2, ARGUMENT_REG2 + jz MonEnterFramedLockHelper + + ; Get a pointer to the lock object. + lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor] + + ; Attempt to acquire the lock. +MonEnterRetrySyncBlock: + mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld] + test eax,eax + jne MonEnterHaveWaiters + + ; Common case, lock isn't held and there are no waiters. Attempt to + ; gain ownership ourselves. + mov ARGUMENT_REG1,1 + lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], ARGUMENT_REG1 + jnz MonEnterRetryHelperSyncBlock + + ; Success. Save the thread object in the lock and increment the use count. + mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + inc dword ptr [esi+Thread_m_dwLockCount] + inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop esi + pop ebx + ret + + ; It's possible to get here with waiters but no lock held, but in this + ; case a signal is about to be fired which will wake up a waiter. So + ; for fairness sake we should wait too. + ; Check first for recursive lock attempts on the same thread. +MonEnterHaveWaiters: + ; Is mutex already owned by current thread? + cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + jne MonEnterPrepareToWait + + ; Yes, bump our use count. 
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop esi + pop ebx + ret + +MonEnterPrepareToWait: + ; If we are on an MP system, we try spinning for a certain number of iterations + cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1 + jle MonEnterHaveWaiters1 + + ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII) + mov eax,ebx +MonEnterdelayLoop: + $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs) + dec eax + jnz MonEnterdelayLoop + + ; next time, wait a factor longer + imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor + + cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration + jle MonEnterRetrySyncBlock + +MonEnterHaveWaiters1: + + pop esi + pop ebx + + ; Place AwareLock in arg1 then call contention helper. + mov ARGUMENT_REG1, ARGUMENT_REG2 + jmp JITutil_MonContention + +MonEnterRetryHelperSyncBlock: + jmp MonEnterRetrySyncBlock + + ; ECX has the object to synchronize on +MonEnterFramedLockHelper: + pop esi + pop ebx + jmp JITutil_MonEnterWorker + +@JIT_MonEnterWorker@4 endp + +;********************************************************************** +; This is a frameless helper for entering a monitor on a object, and +; setting a flag to indicate that the lock was taken. +; The object is in ARGUMENT_REG1. The flag is in ARGUMENT_REG2. +; This tries the normal case (no blocking or object allocation) in line +; and calls a framed helper for the other cases. +; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined +; to make sure you don't break the non-debug build. This is very fragile code. +; Also, propagate the changes to jithelp.s which contains the same helper and assembly code +; (in AT&T syntax) for gnu assembler. +@JIT_MonReliableEnter@8 proc public + ; Initialize delay value for retry with exponential backoff + push ebx + mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration + + ; Put pbLockTaken in edi + push edi + mov edi, ARGUMENT_REG2 + + ; We need yet another register to avoid refetching the thread object + push esi + + ; Check if the instance is NULL. + test ARGUMENT_REG1, ARGUMENT_REG1 + jz MonReliableEnterFramedLockHelper + + call _GetThread@0 + mov esi,eax + + ; Check if we can abort here + mov eax, [esi+Thread_m_State] + and eax, TS_CatchAtSafePoint_ASM + jz MonReliableEnterRetryThinLock + ; go through the slow code path to initiate ThreadAbort. 
+ jmp MonReliableEnterFramedLockHelper + +MonReliableEnterRetryThinLock: + ; Fetch the object header dword + mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM] + + ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set + ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL + test eax, SBLK_COMBINED_MASK_ASM + jnz MonReliableEnterNeedMoreTests + + ; Everything is fine - get the thread id to store in the lock + mov edx, [esi+Thread_m_ThreadId] + + ; If the thread id is too large, we need a syncblock for sure + cmp edx, SBLK_MASK_LOCK_THREADID_ASM + ja MonReliableEnterFramedLockHelper + + ; We want to store a new value with the current thread id set in the low 10 bits + or edx,eax + lock cmpxchg dword ptr [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx + jnz MonReliableEnterPrepareToWaitThinLock + + ; Everything went fine and we're done + add [esi+Thread_m_dwLockCount],1 + ; Set *pbLockTaken=true + mov byte ptr [edi],1 + pop esi + pop edi + pop ebx + ret + +MonReliableEnterNeedMoreTests: + ; Ok, it's not the simple case - find out which case it is + test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM + jnz MonReliableEnterHaveHashOrSyncBlockIndex + + ; The header is transitioning or the lock - treat this as if the lock was taken + test eax, BIT_SBLK_SPIN_LOCK_ASM + jnz MonReliableEnterPrepareToWaitThinLock + + ; Here we know we have the "thin lock" layout, but the lock is not free. + ; It could still be the recursion case - compare the thread id to check + mov edx,eax + and edx, SBLK_MASK_LOCK_THREADID_ASM + cmp edx, [esi+Thread_m_ThreadId] + jne MonReliableEnterPrepareToWaitThinLock + + ; Ok, the thread id matches, it's the recursion case. + ; Bump up the recursion level and check for overflow + lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM] + test edx, SBLK_MASK_LOCK_RECLEVEL_ASM + jz MonReliableEnterFramedLockHelper + + ; Try to put the new recursion level back. If the header was changed in the meantime, + ; we need a full retry, because the layout could have changed. + lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx + jnz MonReliableEnterRetryHelperThinLock + + ; Everything went fine and we're done + ; Set *pbLockTaken=true + mov byte ptr [edi],1 + pop esi + pop edi + pop ebx + ret + +MonReliableEnterPrepareToWaitThinLock: + ; If we are on an MP system, we try spinning for a certain number of iterations + cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1 + jle MonReliableEnterFramedLockHelper + + ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII) + mov eax, ebx +MonReliableEnterdelayLoopThinLock: + $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs) + dec eax + jnz MonReliableEnterdelayLoopThinLock + + ; next time, wait a factor longer + imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor + + cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration + jle MonReliableEnterRetryHelperThinLock + + jmp MonReliableEnterFramedLockHelper + +MonReliableEnterRetryHelperThinLock: + jmp MonReliableEnterRetryThinLock + +MonReliableEnterHaveHashOrSyncBlockIndex: + ; If we have a hash code already, we need to create a sync block + test eax, BIT_SBLK_IS_HASHCODE_ASM + jnz MonReliableEnterFramedLockHelper + + ; Ok, we have a sync block index - just and out the top bits and grab the syncblock index + and eax, MASK_SYNCBLOCKINDEX_ASM + + ; Get the sync block pointer. 
+ mov ARGUMENT_REG2, dword ptr g_pSyncTable + mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock] + + ; Check if the sync block has been allocated. + test ARGUMENT_REG2, ARGUMENT_REG2 + jz MonReliableEnterFramedLockHelper + + ; Get a pointer to the lock object. + lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor] + + ; Attempt to acquire the lock. +MonReliableEnterRetrySyncBlock: + mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld] + test eax,eax + jne MonReliableEnterHaveWaiters + + ; Common case, lock isn't held and there are no waiters. Attempt to + ; gain ownership ourselves. + mov ARGUMENT_REG1,1 + lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], ARGUMENT_REG1 + jnz MonReliableEnterRetryHelperSyncBlock + + ; Success. Save the thread object in the lock and increment the use count. + mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + inc dword ptr [esi+Thread_m_dwLockCount] + inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] + ; Set *pbLockTaken=true + mov byte ptr [edi],1 + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop esi + pop edi + pop ebx + ret + + ; It's possible to get here with waiters but no lock held, but in this + ; case a signal is about to be fired which will wake up a waiter. So + ; for fairness sake we should wait too. + ; Check first for recursive lock attempts on the same thread. +MonReliableEnterHaveWaiters: + ; Is mutex already owned by current thread? + cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + jne MonReliableEnterPrepareToWait + + ; Yes, bump our use count. + inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] + ; Set *pbLockTaken=true + mov byte ptr [edi],1 +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop esi + pop edi + pop ebx + ret + +MonReliableEnterPrepareToWait: + ; If we are on an MP system, we try spinning for a certain number of iterations + cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1 + jle MonReliableEnterHaveWaiters1 + + ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII) + mov eax,ebx +MonReliableEnterdelayLoop: + $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs) + dec eax + jnz MonReliableEnterdelayLoop + + ; next time, wait a factor longer + imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor + + cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration + jle MonReliableEnterRetrySyncBlock + +MonReliableEnterHaveWaiters1: + + ; Place AwareLock in arg1, pbLockTaken in arg2, then call contention helper. + mov ARGUMENT_REG1, ARGUMENT_REG2 + mov ARGUMENT_REG2, edi + + pop esi + pop edi + pop ebx + + jmp JITutil_MonReliableContention + +MonReliableEnterRetryHelperSyncBlock: + jmp MonReliableEnterRetrySyncBlock + + ; ECX has the object to synchronize on +MonReliableEnterFramedLockHelper: + mov ARGUMENT_REG2, edi + pop esi + pop edi + pop ebx + jmp JITutil_MonReliableEnter + +@JIT_MonReliableEnter@8 endp + +;************************************************************************ +; This is a frameless helper for trying to enter a monitor on a object. +; The object is in ARGUMENT_REG1 and a timeout in ARGUMENT_REG2. This tries the +; normal case (no object allocation) in line and calls a framed helper for the +; other cases. 
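; Editor's note (hedged): with the fastcall decoration @JIT_MonTryEnter@12 the
; helper receives roughly
;   (Object* obj /* ecx */, INT32 timeout /* edx */, byte-sized flag* pbLockTaken /* [esp+4] */)
; On the fast acquisition paths shown here the code stores 1 through pbLockTaken
; ('mov byte ptr [eax], 1') and returns with 'ret 4' to pop that single stack
; argument; the timeout is validated up front but only consumed on the slower,
; framed paths.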
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined +; to make sure you don't break the non-debug build. This is very fragile code. +; Also, propagate the changes to jithelp.s which contains the same helper and assembly code +; (in AT&T syntax) for gnu assembler. +@JIT_MonTryEnter@12 proc public + ; Save the timeout parameter. + push ARGUMENT_REG2 + + ; Initialize delay value for retry with exponential backoff + push ebx + mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration + + ; The thin lock logic needs another register to store the thread + push esi + + ; Check if the instance is NULL. + test ARGUMENT_REG1, ARGUMENT_REG1 + jz MonTryEnterFramedLockHelper + + ; Check if the timeout looks valid + cmp ARGUMENT_REG2,-1 + jl MonTryEnterFramedLockHelper + + ; Get the thread right away, we'll need it in any case + call _GetThread@0 + mov esi,eax + + ; Check if we can abort here + mov eax, [esi+Thread_m_State] + and eax, TS_CatchAtSafePoint_ASM + jz MonTryEnterRetryThinLock + ; go through the slow code path to initiate ThreadAbort. + jmp MonTryEnterFramedLockHelper + +MonTryEnterRetryThinLock: + ; Get the header dword and check its layout + mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM] + + ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set + ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL + test eax, SBLK_COMBINED_MASK_ASM + jnz MonTryEnterNeedMoreTests + + ; Ok, everything is fine. Fetch the thread id and make sure it's small enough for thin locks + mov edx, [esi+Thread_m_ThreadId] + cmp edx, SBLK_MASK_LOCK_THREADID_ASM + ja MonTryEnterFramedLockHelper + + ; Try to put our thread id in there + or edx,eax + lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx + jnz MonTryEnterRetryHelperThinLock + + ; Got the lock - everything is fine" + add [esi+Thread_m_dwLockCount],1 + pop esi + + ; Delay value no longer needed + pop ebx + + ; Timeout parameter not needed, ditch it from the stack. + add esp,4 + + mov eax, [esp+4] + mov byte ptr [eax], 1 + ret 4 + +MonTryEnterNeedMoreTests: + ; Ok, it's not the simple case - find out which case it is + test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM + jnz MonTryEnterHaveSyncBlockIndexOrHash + + ; The header is transitioning or the lock is taken + test eax, BIT_SBLK_SPIN_LOCK_ASM + jnz MonTryEnterRetryHelperThinLock + + mov edx, eax + and edx, SBLK_MASK_LOCK_THREADID_ASM + cmp edx, [esi+Thread_m_ThreadId] + jne MonTryEnterPrepareToWaitThinLock + + ; Ok, the thread id matches, it's the recursion case. + ; Bump up the recursion level and check for overflow + lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM] + test edx, SBLK_MASK_LOCK_RECLEVEL_ASM + jz MonTryEnterFramedLockHelper + + ; Try to put the new recursion level back. If the header was changed in the meantime, + ; we need a full retry, because the layout could have changed. + lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx + jnz MonTryEnterRetryHelperThinLock + + ; Everything went fine and we're done + pop esi + pop ebx + + ; Timeout parameter not needed, ditch it from the stack. 
+ add esp, 4 + mov eax, [esp+4] + mov byte ptr [eax], 1 + ret 4 + +MonTryEnterPrepareToWaitThinLock: + ; If we are on an MP system, we try spinning for a certain number of iterations + cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1 + jle MonTryEnterFramedLockHelper + + ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII) + mov eax, ebx +MonTryEnterdelayLoopThinLock: + $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs) + dec eax + jnz MonTryEnterdelayLoopThinLock + + ; next time, wait a factor longer + imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor + + cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration + jle MonTryEnterRetryHelperThinLock + + jmp MonTryEnterWouldBlock + +MonTryEnterRetryHelperThinLock: + jmp MonTryEnterRetryThinLock + + +MonTryEnterHaveSyncBlockIndexOrHash: + ; If we have a hash code already, we need to create a sync block + test eax, BIT_SBLK_IS_HASHCODE_ASM + jnz MonTryEnterFramedLockHelper + + ; Just and out the top bits and grab the syncblock index + and eax, MASK_SYNCBLOCKINDEX_ASM + + ; Get the sync block pointer. + mov ARGUMENT_REG2, dword ptr g_pSyncTable + mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock] + + ; Check if the sync block has been allocated. + test ARGUMENT_REG2, ARGUMENT_REG2 + jz MonTryEnterFramedLockHelper + + ; Get a pointer to the lock object. + lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor] + +MonTryEnterRetrySyncBlock: + ; Attempt to acquire the lock. + mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld] + test eax,eax + jne MonTryEnterHaveWaiters + + ; We need another scratch register for what follows, so save EBX now so" + ; we can use it for that purpose." + push ebx + + ; Common case, lock isn't held and there are no waiters. Attempt to + ; gain ownership ourselves. + mov ebx,1 + lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld],ebx + + pop ebx + + jnz MonTryEnterRetryHelperSyncBlock + + ; Success. Save the thread object in the lock and increment the use count. + mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] + inc dword ptr [esi+Thread_m_dwLockCount] + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + + pop esi + pop ebx + + ; Timeout parameter not needed, ditch it from the stack." + add esp,4 + + mov eax, [esp+4] + mov byte ptr [eax], 1 + ret 4 + + ; It's possible to get here with waiters but no lock held, but in this + ; case a signal is about to be fired which will wake up a waiter. So + ; for fairness sake we should wait too. + ; Check first for recursive lock attempts on the same thread. +MonTryEnterHaveWaiters: + ; Is mutex already owned by current thread? + cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + jne MonTryEnterPrepareToWait + + ; Yes, bump our use count. + inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG2 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop esi + pop ebx + + ; Timeout parameter not needed, ditch it from the stack. 
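The spin-wait blocks above all follow the same exponential backoff pattern: pause for roughly the current delay (the $repnop is a pause hint), retry the lock, multiply the delay by a backoff factor, and give up once it passes a maximum. A hedged C++ equivalent, with placeholder numbers standing in for the g_SpinConstants fields, is sketched below.

#include <immintrin.h>   // _mm_pause

// Hypothetical stand-ins for g_SpinConstants (values are illustrative).
struct SpinConstants { unsigned initial, backoffFactor, maximum; };
static const SpinConstants kSpin = { 50, 3, 20000 };

// Spin with exponential backoff while 'tryAcquire' keeps failing.
// Returns true if the lock was acquired while spinning, false if the
// caller should fall back to the blocking (framed) helper.
template <typename TryAcquire>
bool SpinWithBackoff(TryAcquire tryAcquire, unsigned processorCount)
{
    if (processorCount <= 1)
        return false;                       // never spin on a uniprocessor

    for (unsigned duration = kSpin.initial;
         duration <= kSpin.maximum;
         duration *= kSpin.backoffFactor)   // next time, wait a factor longer
    {
        for (unsigned i = duration; i != 0; --i)
            _mm_pause();                    // same hint as the $repnop above

        if (tryAcquire())
            return true;
    }
    return false;
}

In the helpers above, the retry jumps back to the thin-lock or sync-block attempt; here that attempt is abstracted as tryAcquire.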
+ add esp,4
+
+ mov eax, [esp+4]
+ mov byte ptr [eax], 1
+ ret 4
+
+MonTryEnterPrepareToWait:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonTryEnterWouldBlock
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax, ebx
+MonTryEnterdelayLoop:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonTryEnterdelayLoop
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonTryEnterRetrySyncBlock
+
+ ; We would need to block to enter the section. Return failure if
+ ; timeout is zero, else call the framed helper to do the blocking
+ ; form of TryEnter.
+MonTryEnterWouldBlock:
+ pop esi
+ pop ebx
+ pop ARGUMENT_REG2
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jnz MonTryEnterBlock
+ mov eax, [esp+4]
+ mov byte ptr [eax], 0
+ ret 4
+
+MonTryEnterRetryHelperSyncBlock:
+ jmp MonTryEnterRetrySyncBlock
+
+MonTryEnterFramedLockHelper:
+ ; ARGUMENT_REG1 has the object to synchronize on, must retrieve the
+ ; timeout parameter from the stack.
+ pop esi
+ pop ebx
+ pop ARGUMENT_REG2
+MonTryEnterBlock:
+ jmp JITutil_MonTryEnter
+
+@JIT_MonTryEnter@12 endp
+
+;**********************************************************************
+; This is a frameless helper for exiting a monitor on an object.
+; The object is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
+@JIT_MonExitWorker@4 proc public
+ ; The thin lock logic needs an additional register to hold the thread, unfortunately
+ push esi
+
+ ; Check if the instance is NULL.
+ test ARGUMENT_REG1, ARGUMENT_REG1
+ jz MonExitFramedLockHelper
+
+ call _GetThread@0
+ mov esi,eax
+
+MonExitRetryThinLock:
+ ; Fetch the header dword and check its layout and the spin lock bit
+ mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM]
+ ;BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_SPIN_LOCK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK
+ test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_SPIN_LOCK_ASM
+ jnz MonExitNeedMoreTests
+
+ ; Ok, we have a "thin lock" layout - check whether the thread id matches
+ mov edx,eax
+ and edx, SBLK_MASK_LOCK_THREADID_ASM
+ cmp edx, [esi+Thread_m_ThreadId]
+ jne MonExitFramedLockHelper
+
+ ; Check the recursion level
+ test eax, SBLK_MASK_LOCK_RECLEVEL_ASM
+ jne MonExitDecRecursionLevel
+
+ ; It's zero - we're leaving the lock.
+ ; So try to put back a zero thread id.
+ ; edx and eax match in the thread id bits, and edx is zero elsewhere, so the xor is sufficient + xor edx,eax + lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx + jnz MonExitRetryHelperThinLock + + ; We're done + sub [esi+Thread_m_dwLockCount],1 + pop esi + ret + +MonExitDecRecursionLevel: + lea edx, [eax-SBLK_LOCK_RECLEVEL_INC_ASM] + lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx + jnz MonExitRetryHelperThinLock + + ; We're done + pop esi + ret + +MonExitNeedMoreTests: + ;Forward all special cases to the slow helper + ;BIT_SBLK_IS_HASHCODE_OR_SPIN_LOCK_ASM = BIT_SBLK_IS_HASHCODE + BIT_SBLK_SPIN_LOCK + test eax, BIT_SBLK_IS_HASHCODE_OR_SPIN_LOCK_ASM + jnz MonExitFramedLockHelper + + ; Get the sync block index and use it to compute the sync block pointer + mov ARGUMENT_REG2, dword ptr g_pSyncTable + and eax, MASK_SYNCBLOCKINDEX_ASM + mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock] + + ; was there a sync block? + test ARGUMENT_REG2, ARGUMENT_REG2 + jz MonExitFramedLockHelper + + ; Get a pointer to the lock object. + lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor] + + ; Check if lock is held. + cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi + jne MonExitFramedLockHelper + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG1 ; preserve regs + push ARGUMENT_REG2 + + push ARGUMENT_REG2 ; AwareLock + push [esp+8] ; return address + call LeaveSyncHelper + + pop ARGUMENT_REG2 ; restore regs + pop ARGUMENT_REG1 +endif ;TRACK_SYNC +endif ;MON_DEBUG + ; Reduce our recursion count. + dec dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion] + jz MonExitLastRecursion + + pop esi + ret + +MonExitRetryHelperThinLock: + jmp MonExitRetryThinLock + +MonExitFramedLockHelper: + pop esi + jmp JITutil_MonExitWorker + + ; This is the last count we held on this lock, so release the lock. +MonExitLastRecursion: + dec dword ptr [esi+Thread_m_dwLockCount] + mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],0 + +MonExitRetry: + mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld] + lea esi, [eax-1] + lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], esi + jne MonExitRetryHelper + pop esi + test eax,0FFFFFFFEh + jne MonExitMustSignal + + ret + +MonExitMustSignal: + mov ARGUMENT_REG1, ARGUMENT_REG2 + jmp JITutil_MonSignal + +MonExitRetryHelper: + jmp MonExitRetry + +@JIT_MonExitWorker@4 endp + +;********************************************************************** +; This is a frameless helper for entering a static monitor on a class. +; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no +; blocking or object allocation) in line and calls a framed helper +; for the other cases. +; Note we are changing the methoddesc parameter to a pointer to the +; AwareLock. +; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined +; to make sure you don't break the non-debug build. This is very fragile code. +; Also, propagate the changes to jithelp.s which contains the same helper and assembly code +; (in AT&T syntax) for gnu assembler. +@JIT_MonEnterStatic@4 proc public + ; We need another scratch register for what follows, so save EBX now so + ; we can use it for that purpose. + push ebx + + ; Attempt to acquire the lock +MonEnterStaticRetry: + mov eax, [ARGUMENT_REG1+AwareLock_m_MonitorHeld] + test eax,eax + jne MonEnterStaticHaveWaiters + + ; Common case, lock isn't held and there are no waiters. Attempt to + ; gain ownership ourselves. 
+ mov ebx,1 + lock cmpxchg [ARGUMENT_REG1+AwareLock_m_MonitorHeld],ebx + jnz MonEnterStaticRetryHelper + + pop ebx + + ; Success. Save the thread object in the lock and increment the use count. + call _GetThread@0 + mov [ARGUMENT_REG1+AwareLock_m_HoldingThread], eax + inc dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion] + inc dword ptr [eax+Thread_m_dwLockCount] + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG1 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + ret + + ; It's possible to get here with waiters but no lock held, but in this + ; case a signal is about to be fired which will wake up a waiter. So + ; for fairness sake we should wait too. + ; Check first for recursive lock attempts on the same thread. +MonEnterStaticHaveWaiters: + ; Get thread but preserve EAX (contains cached contents of m_MonitorHeld). + push eax + call _GetThread@0 + mov ebx,eax + pop eax + + ; Is mutex already owned by current thread? + cmp [ARGUMENT_REG1+AwareLock_m_HoldingThread],ebx + jne MonEnterStaticPrepareToWait + + ; Yes, bump our use count. + inc dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion] +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG1 ; AwareLock + push [esp+4] ; return address + call EnterSyncHelper +endif ;TRACK_SYNC +endif ;MON_DEBUG + pop ebx + ret + +MonEnterStaticPrepareToWait: + pop ebx + + ; ARGUMENT_REG1 should have AwareLock. Call contention helper. + jmp JITutil_MonContention + +MonEnterStaticRetryHelper: + jmp MonEnterStaticRetry +@JIT_MonEnterStatic@4 endp + +;********************************************************************** +; A frameless helper for exiting a static monitor on a class. +; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no +; blocking or object allocation) in line and calls a framed helper +; for the other cases. +; Note we are changing the methoddesc parameter to a pointer to the +; AwareLock. +; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined +; to make sure you don't break the non-debug build. This is very fragile code. +; Also, propagate the changes to jithelp.s which contains the same helper and assembly code +; (in AT&T syntax) for gnu assembler. +@JIT_MonExitStatic@4 proc public + +ifdef MON_DEBUG +ifdef TRACK_SYNC + push ARGUMENT_REG1 ; preserve regs + + push ARGUMENT_REG1 ; AwareLock + push [esp+8] ; return address + call LeaveSyncHelper + + pop [ARGUMENT_REG1] ; restore regs +endif ;TRACK_SYNC +endif ;MON_DEBUG + + ; Check if lock is held. + call _GetThread@0 + cmp [ARGUMENT_REG1+AwareLock_m_HoldingThread],eax + jne MonExitStaticLockError + + ; Reduce our recursion count. + dec dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion] + jz MonExitStaticLastRecursion + + ret + + ; This is the last count we held on this lock, so release the lock. +MonExitStaticLastRecursion: + ; eax must have the thread object + dec dword ptr [eax+Thread_m_dwLockCount] + mov dword ptr [ARGUMENT_REG1+AwareLock_m_HoldingThread],0 + push ebx + +MonExitStaticRetry: + mov eax, [ARGUMENT_REG1+AwareLock_m_MonitorHeld] + lea ebx, [eax-1] + lock cmpxchg [ARGUMENT_REG1+AwareLock_m_MonitorHeld],ebx + jne MonExitStaticRetryHelper + pop ebx + test eax,0FFFFFFFEh + jne MonExitStaticMustSignal + + ret + +MonExitStaticMustSignal: + jmp JITutil_MonSignal + +MonExitStaticRetryHelper: + jmp MonExitStaticRetry + ; Throw a synchronization lock exception. 
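The static monitor helpers above operate directly on the AwareLock fields: m_MonitorHeld goes from 0 to 1 via an interlocked compare-exchange to take the lock, m_Recursion counts nested acquires by the owner, and the final release clears the owner, decrements m_MonitorHeld, and signals a waiter when any bit other than the low one was set. The following condensed C++ model of that protocol uses simplified types and a hypothetical SignalWaiter() in place of JITutil_MonSignal; it is an illustration, not the runtime's implementation.

#include <atomic>
#include <cstdint>

struct Thread;                       // opaque here
void SignalWaiter() {}               // hypothetical stand-in for the signalling helper

// Simplified model of the AwareLock fields used by the helpers above.
struct SimpleAwareLock
{
    std::atomic<int32_t> monitorHeld{0};   // bit 0: held, upper bits: waiter count
    uint32_t             recursion = 0;
    Thread*              holdingThread = nullptr;

    // Fast-path enter; returns false when the caller must take the
    // contention (blocking) path instead.
    bool TryEnter(Thread* self)
    {
        int32_t expected = 0;
        if (monitorHeld.compare_exchange_strong(expected, 1))
        {
            holdingThread = self;          // we own it now
            recursion = 1;
            return true;
        }
        if (holdingThread == self)         // recursive acquire by the owner
        {
            ++recursion;
            return true;
        }
        return false;                      // held elsewhere or waiters present
    }

    // Fast-path exit; assumes the caller verified ownership first.
    void Exit()
    {
        if (--recursion != 0)
            return;                        // still held recursively

        holdingThread = nullptr;
        int32_t old = monitorHeld.fetch_sub(1);
        if ((old & ~1) != 0)               // other bits set means waiters to wake
            SignalWaiter();
    }
};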
+MonExitStaticLockError: + mov ARGUMENT_REG1, CORINFO_SynchronizationLockException_ASM + jmp JIT_InternalThrow + +@JIT_MonExitStatic@4 endp + +; PatchedCodeStart and PatchedCodeEnd are used to determine bounds of patched code. +; + +_JIT_PatchedCodeStart@0 proc public +ret +_JIT_PatchedCodeStart@0 endp + +; +; Optimized TLS getters +; + + ALIGN 4 + +ifndef FEATURE_IMPLICIT_TLS +_GetThread@0 proc public + ; This will be overwritten at runtime with optimized GetThread implementation + jmp short _GetTLSDummy@0 + ; Just allocate space that will be filled in at runtime + db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh) +_GetThread@0 endp + + ALIGN 4 + +_GetAppDomain@0 proc public + ; This will be overwritten at runtime with optimized GetAppDomain implementation + jmp short _GetTLSDummy@0 + ; Just allocate space that will be filled in at runtime + db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh) +_GetAppDomain@0 endp + +_GetTLSDummy@0 proc public + xor eax,eax + ret +_GetTLSDummy@0 endp + + ALIGN 4 + +_ClrFlsGetBlock@0 proc public + ; This will be overwritten at runtime with optimized ClrFlsGetBlock implementation + jmp short _GetTLSDummy@0 + ; Just allocate space that will be filled in at runtime + db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh) +_ClrFlsGetBlock@0 endp +endif + +;********************************************************************** +; Write barriers generated at runtime + +PUBLIC _JIT_PatchedWriteBarrierStart@0 +_JIT_PatchedWriteBarrierStart@0 PROC +ret +_JIT_PatchedWriteBarrierStart@0 ENDP + +PatchedWriteBarrierHelper MACRO rg + ALIGN 8 +PUBLIC _JIT_WriteBarrier&rg&@0 +_JIT_WriteBarrier&rg&@0 PROC + ; Just allocate space that will be filled in at runtime + db (48) DUP (0CCh) +_JIT_WriteBarrier&rg&@0 ENDP + +ENDM + +PatchedWriteBarrierHelper +PatchedWriteBarrierHelper +PatchedWriteBarrierHelper +PatchedWriteBarrierHelper +PatchedWriteBarrierHelper +PatchedWriteBarrierHelper + +PUBLIC _JIT_PatchedWriteBarrierLast@0 +_JIT_PatchedWriteBarrierLast@0 PROC +ret +_JIT_PatchedWriteBarrierLast@0 ENDP + +;********************************************************************** +; PrecodeRemotingThunk is patched at runtime to activate it +ifdef FEATURE_REMOTING + ALIGN 16 +_PrecodeRemotingThunk@0 proc public + + ret ; This is going to be patched to "test ecx,ecx" + nop + + jz RemotingDone ; predicted not taken + + cmp dword ptr [ecx],11111111h ; This is going to be patched to address of the transparent proxy + je RemotingCheck ; predicted not taken + +RemotingDone: + ret + +RemotingCheck: + push eax ; save method desc + mov eax, dword ptr [ecx + TransparentProxyObject___stubData] + call [ecx + TransparentProxyObject___stub] + test eax, eax + jnz RemotingCtxMismatch + mov eax, [esp] + mov ax, [eax + MethodDesc_m_wFlags] + and ax, MethodDesc_mdcClassification + cmp ax, MethodDesc_mcComInterop + je ComPlusCall + pop eax ; throw away method desc + jmp RemotingDone + +RemotingCtxMismatch: + pop eax ; restore method desc + add esp, 4 ; pop return address into the precode + jmp _TransparentProxyStub_CrossContext@0 + +ComPlusCall: + pop eax ; restore method desc + mov [esp],eax ; replace return address into the precode with method desc (argument for TP stub) + jmp _InContextTPQuickDispatchAsmStub@0 + +_PrecodeRemotingThunk@0 endp +endif ; FEATURE_REMOTING + +_JIT_PatchedCodeLast@0 proc public +ret +_JIT_PatchedCodeLast@0 endp + +; This is the first function outside the "keep together range". Used by BBT scripts. 
+_JIT_PatchedCodeEnd@0 proc public +ret +_JIT_PatchedCodeEnd@0 endp + +; This is the ASM portion of JIT_IsInstanceOfInterface. For all the bizarre cases, it quickly +; fails and falls back on the JITutil_IsInstanceOfAny helper. So all failure cases take +; the slow path, too. +; +; ARGUMENT_REG1 = array or interface to check for. +; ARGUMENT_REG2 = instance to be cast. + + ALIGN 16 +PUBLIC @JIT_IsInstanceOfInterface@8 +@JIT_IsInstanceOfInterface@8 PROC + test ARGUMENT_REG2, ARGUMENT_REG2 + jz IsNullInst + + mov eax, [ARGUMENT_REG2] ; get MethodTable + + push ebx + push esi + movzx ebx, word ptr [eax+MethodTable_m_wNumInterfaces] + + ; check if this MT implements any interfaces + test ebx, ebx + jz IsInstanceOfInterfaceDoBizarre + + ; move Interface map ptr into eax + mov eax, [eax+MethodTable_m_pInterfaceMap] + +IsInstanceOfInterfaceTop: + ; eax -> current InterfaceInfo_t entry in interface map list +ifdef FEATURE_PREJIT + mov esi, [eax] + test esi, 1 + ; Move the deference out of line so that this jump is correctly predicted for the case + ; when there is no indirection + jnz IsInstanceOfInterfaceIndir + cmp ARGUMENT_REG1, esi +else + cmp ARGUMENT_REG1, [eax] +endif + je IsInstanceOfInterfaceFound + +IsInstanceOfInterfaceNext: + add eax, SIZEOF_InterfaceInfo_t + dec ebx + jnz IsInstanceOfInterfaceTop + + ; fall through to DoBizarre + +IsInstanceOfInterfaceDoBizarre: + pop esi + pop ebx + mov eax, [ARGUMENT_REG2] ; get MethodTable + test dword ptr [eax+MethodTable_m_dwFlags], NonTrivialInterfaceCastFlags + jnz IsInstanceOfInterfaceNonTrivialCast + +IsNullInst: + xor eax,eax + ret + +ifdef FEATURE_PREJIT +IsInstanceOfInterfaceIndir: + cmp ARGUMENT_REG1,[esi-1] + jne IsInstanceOfInterfaceNext +endif + +IsInstanceOfInterfaceFound: + pop esi + pop ebx + mov eax, ARGUMENT_REG2 ; the successful instance + ret + +IsInstanceOfInterfaceNonTrivialCast: + jmp @JITutil_IsInstanceOfInterface@8 + +@JIT_IsInstanceOfInterface@8 endp + +; This is the ASM portion of JIT_ChkCastInterface. For all the bizarre cases, it quickly +; fails and falls back on the JITutil_ChkCastAny helper. So all failure cases take +; the slow path, too. +; +; ARGUMENT_REG1 = array or interface to check for. +; ARGUMENT_REG2 = instance to be cast. 
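JIT_IsInstanceOfInterface above is essentially a bounded linear scan of the method table's interface map, with a tail call to the slower helper for the "bizarre" cases (no match plus a non-trivial-cast flag, FEATURE_PREJIT indirections, and so on). A schematic C++ version of the scan is sketched below; the types are simplified stand-ins, not the real MethodTable/InterfaceInfo_t layout.

#include <cstdint>

// Simplified stand-ins; the real layout lives in MethodTable / InterfaceInfo_t.
struct InterfaceEntry { const void* interfaceMT; };

struct SimpleMethodTable
{
    uint16_t              numInterfaces;
    const InterfaceEntry* interfaceMap;
    bool                  nonTrivialCast;   // models NonTrivialInterfaceCastFlags
};

struct ObjectStub { const SimpleMethodTable* methodTable; };

// Stand-in for the framed helper (the assembly tail-calls JITutil_IsInstanceOfInterface);
// the real one handles arrays, variance, COM objects and more. Here it just fails.
static const ObjectStub* IsInstanceOfInterfaceSlow(const void*, const ObjectStub*)
{
    return nullptr;
}

const ObjectStub* IsInstanceOfInterfaceFast(const void* interfaceMT, const ObjectStub* obj)
{
    if (obj == nullptr)
        return nullptr;                       // null never matches

    const SimpleMethodTable* mt = obj->methodTable;
    for (uint16_t i = 0; i < mt->numInterfaces; ++i)
    {
        if (mt->interfaceMap[i].interfaceMT == interfaceMT)
            return obj;                       // found: cast succeeds
    }

    // Not found in the flat map: only worth the slow-path call if the type
    // has one of the "non-trivial cast" properties; otherwise fail fast.
    return mt->nonTrivialCast ? IsInstanceOfInterfaceSlow(interfaceMT, obj) : nullptr;
}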
+ + ALIGN 16 +PUBLIC @JIT_ChkCastInterface@8 +@JIT_ChkCastInterface@8 PROC + test ARGUMENT_REG2, ARGUMENT_REG2 + jz ChkCastInterfaceIsNullInst + + mov eax, [ARGUMENT_REG2] ; get MethodTable + + push ebx + push esi + movzx ebx, word ptr [eax+MethodTable_m_wNumInterfaces] + + ; speculatively move Interface map ptr into eax + mov eax, [eax+MethodTable_m_pInterfaceMap] + + ; check if this MT implements any interfaces + test ebx, ebx + jz ChkCastInterfaceDoBizarre + +ChkCastInterfaceTop: + ; eax -> current InterfaceInfo_t entry in interface map list +ifdef FEATURE_PREJIT + mov esi, [eax] + test esi, 1 + ; Move the deference out of line so that this jump is correctly predicted for the case + ; when there is no indirection + jnz ChkCastInterfaceIndir + cmp ARGUMENT_REG1, esi +else + cmp ARGUMENT_REG1, [eax] +endif + je ChkCastInterfaceFound + +ChkCastInterfaceNext: + add eax, SIZEOF_InterfaceInfo_t + dec ebx + jnz ChkCastInterfaceTop + + ; fall through to DoBizarre + +ChkCastInterfaceDoBizarre: + pop esi + pop ebx + jmp @JITutil_ChkCastInterface@8 + +ifdef FEATURE_PREJIT +ChkCastInterfaceIndir: + cmp ARGUMENT_REG1,[esi-1] + jne ChkCastInterfaceNext +endif + +ChkCastInterfaceFound: + pop esi + pop ebx + +ChkCastInterfaceIsNullInst: + mov eax, ARGUMENT_REG2 ; either null, or the successful instance + ret + +@JIT_ChkCastInterface@8 endp + + end diff --git a/src/vm/i386/jitinterfacex86.cpp b/src/vm/i386/jitinterfacex86.cpp new file mode 100644 index 0000000000..949b115ce2 --- /dev/null +++ b/src/vm/i386/jitinterfacex86.cpp @@ -0,0 +1,1922 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// =========================================================================== +// File: JITinterfaceX86.CPP +// +// =========================================================================== + +// This contains JITinterface routines that are tailored for +// X86 platforms. Non-X86 versions of these can be found in +// JITinterfaceGen.cpp + + +#include "common.h" +#include "jitinterface.h" +#include "eeconfig.h" +#include "excep.h" +#include "comdelegate.h" +#ifdef FEATURE_REMOTING +#include "remoting.h" // create context bound and remote class instances +#endif +#include "field.h" +#include "ecall.h" +#include "asmconstants.h" +#include "virtualcallstub.h" +#include "eventtrace.h" +#include "threadsuspend.h" + +#if defined(_DEBUG) && !defined (WRITE_BARRIER_CHECK) +#define WRITE_BARRIER_CHECK 1 +#endif + +// To test with MON_DEBUG off, comment out the following line. DO NOT simply define +// to be 0 as the checks are for #ifdef not #if 0. 
+// +#ifdef _DEBUG +#define MON_DEBUG 1 +#endif + +class generation; +extern "C" generation generation_table[]; + +extern "C" void STDCALL JIT_WriteBarrierReg_PreGrow();// JIThelp.asm/JIThelp.s +extern "C" void STDCALL JIT_WriteBarrierReg_PostGrow();// JIThelp.asm/JIThelp.s + +#ifdef _DEBUG +extern "C" void STDCALL WriteBarrierAssert(BYTE* ptr, Object* obj) +{ + STATIC_CONTRACT_SO_TOLERANT; + WRAPPER_NO_CONTRACT; + + static BOOL fVerifyHeap = -1; + + if (fVerifyHeap == -1) + fVerifyHeap = g_pConfig->GetHeapVerifyLevel() & EEConfig::HEAPVERIFY_GC; + + if (fVerifyHeap) + { + obj->Validate(FALSE); + if(GCHeap::GetGCHeap()->IsHeapPointer(ptr)) + { + Object* pObj = *(Object**)ptr; + _ASSERTE (pObj == NULL || GCHeap::GetGCHeap()->IsHeapPointer(pObj)); + } + } + else + { + _ASSERTE((g_lowest_address <= ptr && ptr < g_highest_address) || + ((size_t)ptr < MAX_UNCHECKED_OFFSET_FOR_NULL_OBJECT)); + } +} + +#endif // _DEBUG + +/****************************************************************************/ +/* assigns 'val to 'array[idx], after doing all the proper checks */ + +/* note that we can do almost as well in portable code, but this + squezes the last little bit of perf out */ + +__declspec(naked) void F_CALL_CONV JIT_Stelem_Ref(PtrArray* array, unsigned idx, Object* val) +{ + STATIC_CONTRACT_SO_TOLERANT; + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + + enum { CanCast = TypeHandle::CanCast, +#if CHECK_APP_DOMAIN_LEAKS + EEClassFlags = EEClass::AUXFLAG_APP_DOMAIN_AGILE | + EEClass::AUXFLAG_CHECK_APP_DOMAIN_AGILE, +#endif // CHECK_APP_DOMAIN_LEAKS + }; + + __asm { + mov EAX, [ESP+4] // EAX = val + + test ECX, ECX + je ThrowNullReferenceException + + cmp EDX, [ECX+4]; // test if in bounds + jae ThrowIndexOutOfRangeException + + test EAX, EAX + jz Assigning0 + +#if CHECK_APP_DOMAIN_LEAKS + mov EAX,[g_pConfig] + movzx EAX, [EAX]EEConfig.fAppDomainLeaks; + test EAX, EAX + jz NoCheck + // Check if the instance is agile or check agile + mov EAX, [ECX] + mov EAX, [EAX]MethodTable.m_ElementTypeHnd + test EAX, 2 // Check for non-MT + jnz NoCheck + // Check VMflags of element type + mov EAX, [EAX]MethodTable.m_pEEClass + mov EAX, dword ptr [EAX]EEClass.m_wAuxFlags + test EAX, EEClassFlags + jnz NeedFrame // Jump to the generic case so we can do an app domain check + NoCheck: + mov EAX, [ESP+4] // EAX = val +#endif // CHECK_APP_DOMAIN_LEAKS + + push EDX + mov EDX, [ECX] + mov EDX, [EDX]MethodTable.m_ElementTypeHnd + + cmp EDX, [EAX] // do we have an exact match + jne NotExactMatch + +DoWrite2: + pop EDX + lea EDX, [ECX + 4*EDX + 8] + call JIT_WriteBarrierEAX + ret 4 + +Assigning0: + // write barrier is not necessary for assignment of NULL references + mov [ECX + 4*EDX + 8], EAX + ret 4 + +DoWrite: + mov EAX, [ESP+4] // EAX = val + lea EDX, [ECX + 4*EDX + 8] + call JIT_WriteBarrierEAX + ret 4 + +NotExactMatch: + cmp EDX, [g_pObjectClass] // are we assigning to Array of objects + je DoWrite2 + + // push EDX // caller-save ECX and EDX + push ECX + + push EDX // element type handle + push EAX // object + + call ObjIsInstanceOfNoGC + + pop ECX // caller-restore ECX and EDX + pop EDX + + cmp EAX, CanCast + je DoWrite + +#if CHECK_APP_DOMAIN_LEAKS +NeedFrame: +#endif + // Call the helper that knows how to erect a frame + push EDX + push ECX + + lea ECX, [ESP+8+4] // ECX = address of object being stored + lea EDX, [ESP] // EDX = address of array + + call ArrayStoreCheck + + pop ECX // these might have been updated! 
+ pop EDX + + cmp EAX, EAX // set zero flag + jnz Epilog // This jump never happens, it keeps the epilog walker happy + + jmp DoWrite + +ThrowNullReferenceException: + mov ECX, CORINFO_NullReferenceException + jmp Throw + +ThrowIndexOutOfRangeException: + mov ECX, CORINFO_IndexOutOfRangeException + +Throw: + call JIT_InternalThrowFromHelper +Epilog: + ret 4 + } +} + +extern "C" __declspec(naked) Object* F_CALL_CONV JIT_IsInstanceOfClass(MethodTable *pMT, Object *pObject) +{ + STATIC_CONTRACT_SO_TOLERANT; + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + +#if defined(FEATURE_TYPEEQUIVALENCE) || defined(FEATURE_REMOTING) + enum + { + MTEquivalenceFlags = MethodTable::public_enum_flag_HasTypeEquivalence, + }; +#endif + + __asm + { + // Check if the instance is NULL + test ARGUMENT_REG2, ARGUMENT_REG2 + je ReturnInst + + // Get the method table for the instance. + mov eax, dword ptr [ARGUMENT_REG2] + + // Check if they are the same. + cmp eax, ARGUMENT_REG1 + jne CheckParent + + ReturnInst: + // We matched the class. + mov eax, ARGUMENT_REG2 + ret + + // Check if the parent class matches. + CheckParent: + mov eax, dword ptr [eax]MethodTable.m_pParentMethodTable + cmp eax, ARGUMENT_REG1 + je ReturnInst + + // Check if we hit the top of the hierarchy. + test eax, eax + jne CheckParent + + // Check if the instance is a proxy. +#if defined(FEATURE_TYPEEQUIVALENCE) || defined(FEATURE_REMOTING) + mov eax, [ARGUMENT_REG2] + test dword ptr [eax]MethodTable.m_dwFlags, MTEquivalenceFlags + jne SlowPath +#endif + // It didn't match and it isn't a proxy and it doesn't have type equivalence + xor eax, eax + ret + + // Cast didn't match, so try the worker to check for the proxy/equivalence case. +#if defined(FEATURE_TYPEEQUIVALENCE) || defined(FEATURE_REMOTING) + SlowPath: + jmp JITutil_IsInstanceOfAny +#endif + } +} + +extern "C" __declspec(naked) Object* F_CALL_CONV JIT_ChkCastClass(MethodTable *pMT, Object *pObject) +{ + STATIC_CONTRACT_SO_TOLERANT; + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + + __asm + { + // Check if the instance is NULL + test ARGUMENT_REG2, ARGUMENT_REG2 + je ReturnInst + + // Get the method table for the instance. + mov eax, dword ptr [ARGUMENT_REG2] + + // Check if they are the same. + cmp eax, ARGUMENT_REG1 + jne CheckParent + + ReturnInst: + // We matched the class. + mov eax, ARGUMENT_REG2 + ret + + // Check if the parent class matches. + CheckParent: + mov eax, dword ptr [eax]MethodTable.m_pParentMethodTable + cmp eax, ARGUMENT_REG1 + je ReturnInst + + // Check if we hit the top of the hierarchy. + test eax, eax + jne CheckParent + + // Call out to JITutil_ChkCastAny to handle the proxy case and throw a rich + // InvalidCastException in case of failure. + jmp JITutil_ChkCastAny + } +} + +extern "C" __declspec(naked) Object* F_CALL_CONV JIT_ChkCastClassSpecial(MethodTable *pMT, Object *pObject) +{ + STATIC_CONTRACT_SO_TOLERANT; + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + + // Assumes that the check for the trivial cases has been inlined by the JIT. + + __asm + { + // Get the method table for the instance. + mov eax, dword ptr [ARGUMENT_REG2] + + // Check if the parent class matches. + CheckParent: + mov eax, dword ptr [eax]MethodTable.m_pParentMethodTable + cmp eax, ARGUMENT_REG1 + jne CheckNull + + // We matched the class. + mov eax, ARGUMENT_REG2 + ret + + CheckNull: + // Check if we hit the top of the hierarchy. 
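JIT_IsInstanceOfClass and JIT_ChkCastClass above share one core loop: compare the object's MethodTable with the target, then follow m_pParentMethodTable until a match or the root, deferring proxies and type equivalence to the slower JITutil_* helpers. A minimal C++ sketch of just that walk (simplified types, no proxy or equivalence handling):

struct SimpleMT { const SimpleMT* parent; };
struct Obj      { const SimpleMT* mt; };

// Returns obj if obj is an instance of target or of one of its subclasses,
// otherwise nullptr. The real helpers additionally branch to JITutil_* for
// proxies and type equivalence; that part is omitted here.
const Obj* IsInstanceOfClassFast(const SimpleMT* target, const Obj* obj)
{
    if (obj == nullptr)
        return nullptr;                    // null input just yields null

    for (const SimpleMT* mt = obj->mt; mt != nullptr; mt = mt->parent)
    {
        if (mt == target)
            return obj;                    // exact type or some base class matched
    }
    return nullptr;                        // walked off the top of the hierarchy
}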
+ test eax, eax + jne CheckParent + + // Call out to JITutil_ChkCastAny to handle the proxy case and throw a rich + // InvalidCastException in case of failure. + jmp JITutil_ChkCastAny + } +} + +HCIMPL1_V(INT32, JIT_Dbl2IntOvf, double val) +{ + FCALL_CONTRACT; + + INT64 ret = HCCALL1_V(JIT_Dbl2Lng, val); + + if (ret != (INT32) ret) + goto THROW; + + return (INT32) ret; + +THROW: + FCThrow(kOverflowException); +} +HCIMPLEND + + +FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_); + +#ifdef FEATURE_REMOTING +HCIMPL1(Object*, JIT_NewCrossContextHelper, CORINFO_CLASS_HANDLE typeHnd_) +{ + CONTRACTL + { + FCALL_CHECK; + } + CONTRACTL_END; + + TypeHandle typeHnd(typeHnd_); + + OBJECTREF newobj = NULL; + HELPER_METHOD_FRAME_BEGIN_RET_0(); // Set up a frame + + _ASSERTE(!typeHnd.IsTypeDesc()); // we never use this helper for arrays + MethodTable *pMT = typeHnd.AsMethodTable(); + pMT->CheckRestore(); + + // Remoting services determines if the current context is appropriate + // for activation. If the current context is OK then it creates an object + // else it creates a proxy. + // Note: 3/20/03 Added fIsNewObj flag to indicate that CreateProxyOrObject + // is being called from Jit_NewObj ... the fIsCom flag is FALSE by default - + // which used to be the case before this change as well. + newobj = CRemotingServices::CreateProxyOrObject(pMT,FALSE /*fIsCom*/,TRUE/*fIsNewObj*/); + + HELPER_METHOD_FRAME_END(); + return(OBJECTREFToObject(newobj)); +} +HCIMPLEND +#endif // FEATURE_REMOTING + +HCIMPL1(Object*, AllocObjectWrapper, MethodTable *pMT) +{ + CONTRACTL + { + FCALL_CHECK; + } + CONTRACTL_END; + + OBJECTREF newObj = NULL; + HELPER_METHOD_FRAME_BEGIN_RET_0(); // Set up a frame + newObj = AllocateObject(pMT); + HELPER_METHOD_FRAME_END(); + return OBJECTREFToObject(newObj); +} +HCIMPLEND + +/*********************************************************************/ +// This is a frameless helper for allocating an object whose type derives +// from marshalbyref. We check quickly to see if it is configured to +// have remote activation. If not, we use the superfast allocator to +// allocate the object. Otherwise, we take the slow path of allocating +// the object via remoting services. 
+#ifdef FEATURE_REMOTING +__declspec(naked) Object* F_CALL_CONV JIT_NewCrossContext(CORINFO_CLASS_HANDLE typeHnd_) +{ + STATIC_CONTRACT_SO_TOLERANT; + STATIC_CONTRACT_THROWS; + STATIC_CONTRACT_GC_TRIGGERS; + + _asm + { + // Check if remoting has been configured + push ARGUMENT_REG1 // save registers + push ARGUMENT_REG1 + call CRemotingServices::RequiresManagedActivation + test eax, eax + // Jump to the slow path + jne SpecialOrXCtxHelper +#ifdef _DEBUG + push LL_INFO10 + push LF_GCALLOC + call LoggingOn + test eax, eax + jne AllocWithLogHelper +#endif // _DEBUG + + // if the object doesn't have a finalizer and the size is small, jump to super fast asm helper + mov ARGUMENT_REG1, [esp] + call MethodTable::CannotUseSuperFastHelper + test eax, eax + jne FastHelper + + pop ARGUMENT_REG1 + // Jump to the super fast helper + jmp dword ptr [hlpDynamicFuncTable + DYNAMIC_CORINFO_HELP_NEWSFAST * SIZE VMHELPDEF]VMHELPDEF.pfnHelper + +FastHelper: + pop ARGUMENT_REG1 + // Jump to the helper + jmp JIT_New + +SpecialOrXCtxHelper: +#ifdef FEATURE_COMINTEROP + test eax, ComObjectType + jz XCtxHelper + pop ARGUMENT_REG1 + // Jump to the helper + jmp JIT_New + +XCtxHelper: +#endif // FEATURE_COMINTEROP + + pop ARGUMENT_REG1 + // Jump to the helper + jmp JIT_NewCrossContextHelper + +#ifdef _DEBUG +AllocWithLogHelper: + pop ARGUMENT_REG1 + // Jump to the helper + jmp AllocObjectWrapper +#endif // _DEBUG + } +} +#endif // FEATURE_REMOTING + + +/*********************************************************************/ +extern "C" void* g_TailCallFrameVptr; +void* g_TailCallFrameVptr; + +#ifdef FEATURE_HIJACK +extern "C" void STDCALL JIT_TailCallHelper(Thread * pThread); +void STDCALL JIT_TailCallHelper(Thread * pThread) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + SO_TOLERANT; + } CONTRACTL_END; + + pThread->UnhijackThread(); +} +#endif // FEATURE_HIJACK + +#if CHECK_APP_DOMAIN_LEAKS +HCIMPL1(void *, SetObjectAppDomain, Object *pObject) +{ + FCALL_CONTRACT; + DEBUG_ONLY_FUNCTION; + + HELPER_METHOD_FRAME_BEGIN_RET_ATTRIB_NOPOLL(Frame::FRAME_ATTR_CAPTURE_DEPTH_2|Frame::FRAME_ATTR_EXACT_DEPTH|Frame::FRAME_ATTR_NO_THREAD_ABORT); + pObject->SetAppDomain(); + HELPER_METHOD_FRAME_END(); + + return pObject; +} +HCIMPLEND +#endif // CHECK_APP_DOMAIN_LEAKS + + // emit code that adds MIN_OBJECT_SIZE to reg if reg is unaligned thus making it aligned +void JIT_TrialAlloc::EmitAlignmentRoundup(CPUSTUBLINKER *psl, X86Reg testAlignReg, X86Reg adjReg, Flags flags) +{ + STANDARD_VM_CONTRACT; + + _ASSERTE((MIN_OBJECT_SIZE & 7) == 4); // want to change alignment + + CodeLabel *AlreadyAligned = psl->NewCodeLabel(); + + // test reg, 7 + psl->Emit16(0xC0F7 | (static_cast(testAlignReg) << 8)); + psl->Emit32(0x7); + + // jz alreadyAligned + if (flags & ALIGN8OBJ) + { + psl->X86EmitCondJump(AlreadyAligned, X86CondCode::kJNZ); + } + else + { + psl->X86EmitCondJump(AlreadyAligned, X86CondCode::kJZ); + } + + psl->X86EmitAddReg(adjReg, MIN_OBJECT_SIZE); + // AlreadyAligned: + psl->EmitLabel(AlreadyAligned); +} + + // if 'reg' is unaligned, then set the dummy object at EAX and increment EAX past + // the dummy object +void JIT_TrialAlloc::EmitDummyObject(CPUSTUBLINKER *psl, X86Reg alignTestReg, Flags flags) +{ + STANDARD_VM_CONTRACT; + + CodeLabel *AlreadyAligned = psl->NewCodeLabel(); + + // test reg, 7 + psl->Emit16(0xC0F7 | (static_cast(alignTestReg) << 8)); + psl->Emit32(0x7); + + // jz alreadyAligned + if (flags & ALIGN8OBJ) + { + psl->X86EmitCondJump(AlreadyAligned, X86CondCode::kJNZ); + } + else + { + 
psl->X86EmitCondJump(AlreadyAligned, X86CondCode::kJZ); + } + + // Make the fake object + // mov EDX, [g_pObjectClass] + psl->Emit16(0x158B); + psl->Emit32((int)(size_t)&g_pObjectClass); + + // mov [EAX], EDX + psl->X86EmitOffsetModRM(0x89, kEDX, kEAX, 0); + +#if CHECK_APP_DOMAIN_LEAKS + EmitSetAppDomain(psl); +#endif + + // add EAX, MIN_OBJECT_SIZE + psl->X86EmitAddReg(kEAX, MIN_OBJECT_SIZE); + + // AlreadyAligned: + psl->EmitLabel(AlreadyAligned); +} + +void JIT_TrialAlloc::EmitCore(CPUSTUBLINKER *psl, CodeLabel *noLock, CodeLabel *noAlloc, Flags flags) +{ + STANDARD_VM_CONTRACT; + + // Upon entry here, ecx contains the method we are to try allocate memory for + // Upon exit, eax contains the allocated memory, edx is trashed, and ecx undisturbed + + if (flags & MP_ALLOCATOR) + { + if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ)) + { + if (flags & ALIGN8OBJ) + { + // mov eax, [ecx]MethodTable.m_BaseSize + psl->X86EmitIndexRegLoad(kEAX, kECX, offsetof(MethodTable, m_BaseSize)); + } + + psl->X86EmitPushReg(kEBX); // we need a spare register + } + else + { + // mov eax, [ecx]MethodTable.m_BaseSize + psl->X86EmitIndexRegLoad(kEAX, kECX, offsetof(MethodTable, m_BaseSize)); + } + + assert( ((flags & ALIGN8)==0 || // EAX loaded by else statement + (flags & SIZE_IN_EAX) || // EAX already comes filled out + (flags & ALIGN8OBJ) ) // EAX loaded in the if (flags & ALIGN8OBJ) statement + && "EAX should contain size for allocation and it doesnt!!!"); + + // Fetch current thread into EDX, preserving EAX and ECX + psl->X86EmitCurrentThreadFetch(kEDX, (1<X86EmitOffsetModRM(0x8B, kEBX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(alloc_context, alloc_ptr)); + // add EAX, EBX + psl->Emit16(0xC303); + if (flags & ALIGN8) + EmitAlignmentRoundup(psl, kEBX, kEAX, flags); // bump EAX up size by 12 if EBX unaligned (so that we are aligned) + } + else + { + // add eax, [edx]Thread.m_alloc_context.alloc_ptr + psl->X86EmitOffsetModRM(0x03, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(alloc_context, alloc_ptr)); + } + + // cmp eax, [edx]Thread.m_alloc_context.alloc_limit + psl->X86EmitOffsetModRM(0x3b, kEAX, kEDX, offsetof(Thread, m_alloc_context) + offsetof(alloc_context, alloc_limit)); + + // ja noAlloc + psl->X86EmitCondJump(noAlloc, X86CondCode::kJA); + + // Fill in the allocation and get out. + + // mov [edx]Thread.m_alloc_context.alloc_ptr, eax + psl->X86EmitIndexRegStore(kEDX, offsetof(Thread, m_alloc_context) + offsetof(alloc_context, alloc_ptr), kEAX); + + if (flags & (ALIGN8 | SIZE_IN_EAX | ALIGN8OBJ)) + { + // mov EAX, EBX + psl->Emit16(0xC38B); + // pop EBX + psl->X86EmitPopReg(kEBX); + + if (flags & ALIGN8) + EmitDummyObject(psl, kEAX, flags); + } + else + { + // sub eax, [ecx]MethodTable.m_BaseSize + psl->X86EmitOffsetModRM(0x2b, kEAX, kECX, offsetof(MethodTable, m_BaseSize)); + } + + // mov dword ptr [eax], ecx + psl->X86EmitIndexRegStore(kEAX, 0, kECX); + } + else + { + // Take the GC lock (there is no lock prefix required - we will use JIT_TrialAllocSFastMP on an MP System). + // inc dword ptr [m_GCLock] + psl->Emit16(0x05ff); + psl->Emit32((int)(size_t)&m_GCLock); + + // jnz NoLock + psl->X86EmitCondJump(noLock, X86CondCode::kJNZ); + + if (flags & SIZE_IN_EAX) + { + // mov edx, eax + psl->Emit16(0xd08b); + } + else + { + // mov edx, [ecx]MethodTable.m_BaseSize + psl->X86EmitIndexRegLoad(kEDX, kECX, offsetof(MethodTable, m_BaseSize)); + } + + // mov eax, dword ptr [generation_table] + psl->Emit8(0xA1); + psl->Emit32((int)(size_t)&generation_table); + + // Try the allocation. 
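EmitCore above emits the classic bump-pointer fast path for the per-thread allocator: add the object size to alloc_ptr, compare against alloc_limit, then either commit the new pointer and stamp the MethodTable into the first word, or bail out to the framed helper. For orientation, here is a C++ sketch of that fast path over a simplified allocation context; the field names only loosely model alloc_context and this is not the runtime's code.

#include <cstddef>
#include <cstdint>

// Simplified per-thread allocation context (models alloc_ptr / alloc_limit).
struct AllocContext
{
    uint8_t* allocPtr;
    uint8_t* allocLimit;
};

// Fast-path allocation: returns the new object, or nullptr when the context
// is exhausted and the caller must take the slow (framed, GC-capable) helper.
void* TryFastAllocate(AllocContext& ctx, const void* methodTable, size_t baseSize)
{
    uint8_t* result = ctx.allocPtr;
    uint8_t* newPtr = result + baseSize;

    if (newPtr > ctx.allocLimit)            // the "ja noAlloc" in the emitted code
        return nullptr;

    ctx.allocPtr = newPtr;                  // commit the bump
    *reinterpret_cast<const void**>(result) = methodTable;  // MT goes in the first word
    return result;
}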
+ // add edx, eax + psl->Emit16(0xd003); + + if (flags & (ALIGN8 | ALIGN8OBJ)) + EmitAlignmentRoundup(psl, kEAX, kEDX, flags); // bump up EDX size by 12 if EAX unaligned (so that we are aligned) + + // cmp edx, dword ptr [generation_table+4] + psl->Emit16(0x153b); + psl->Emit32((int)(size_t)&generation_table + 4); + + // ja noAlloc + psl->X86EmitCondJump(noAlloc, X86CondCode::kJA); + + // Fill in the allocation and get out. + // mov dword ptr [generation_table], edx + psl->Emit16(0x1589); + psl->Emit32((int)(size_t)&generation_table); + + if (flags & (ALIGN8 | ALIGN8OBJ)) + EmitDummyObject(psl, kEAX, flags); + + // mov dword ptr [eax], ecx + psl->X86EmitIndexRegStore(kEAX, 0, kECX); + + // mov dword ptr [m_GCLock], 0FFFFFFFFh + psl->Emit16(0x05C7); + psl->Emit32((int)(size_t)&m_GCLock); + psl->Emit32(0xFFFFFFFF); + } + + +#ifdef INCREMENTAL_MEMCLR + // We're planning to get rid of this anyhow according to Patrick + _ASSERTE(!"NYI"); +#endif // INCREMENTAL_MEMCLR +} + +#if CHECK_APP_DOMAIN_LEAKS +void JIT_TrialAlloc::EmitSetAppDomain(CPUSTUBLINKER *psl) +{ + STANDARD_VM_CONTRACT; + + if (!g_pConfig->AppDomainLeaks()) + return; + + // At both entry & exit, eax contains the allocated object. + // ecx is preserved, edx is not. + + // + // Add in a call to SetAppDomain. (Note that this + // probably would have been easier to implement by just not using + // the generated helpers in a checked build, but we'd lose code + // coverage that way.) + // + + // Save ECX over function call + psl->X86EmitPushReg(kECX); + + // mov object to ECX + // mov ecx, eax + psl->Emit16(0xc88b); + + // SetObjectAppDomain pops its arg & returns object in EAX + psl->X86EmitCall(psl->NewExternalCodeLabel((LPVOID)SetObjectAppDomain), 4); + + psl->X86EmitPopReg(kECX); +} + +#endif // CHECK_APP_DOMAIN_LEAKS + + +void JIT_TrialAlloc::EmitNoAllocCode(CPUSTUBLINKER *psl, Flags flags) +{ + STANDARD_VM_CONTRACT; + + if (flags & MP_ALLOCATOR) + { + if (flags & (ALIGN8|SIZE_IN_EAX)) + psl->X86EmitPopReg(kEBX); + } + else + { + // mov dword ptr [m_GCLock], 0FFFFFFFFh + psl->Emit16(0x05c7); + psl->Emit32((int)(size_t)&m_GCLock); + psl->Emit32(0xFFFFFFFF); + } +} + +void *JIT_TrialAlloc::GenAllocSFast(Flags flags) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + + CodeLabel *noLock = sl.NewCodeLabel(); + CodeLabel *noAlloc = sl.NewCodeLabel(); + + // Emit the main body of the trial allocator, be it SP or MP + EmitCore(&sl, noLock, noAlloc, flags); + +#if CHECK_APP_DOMAIN_LEAKS + EmitSetAppDomain(&sl); +#endif + + // Here we are at the end of the success case - just emit a ret + sl.X86EmitReturn(0); + + // Come here in case of no space + sl.EmitLabel(noAlloc); + + // Release the lock in the uniprocessor case + EmitNoAllocCode(&sl, flags); + + // Come here in case of failure to get the lock + sl.EmitLabel(noLock); + + // Jump to the framed helper + sl.X86EmitNearJump(sl.NewExternalCodeLabel((LPVOID)JIT_New)); + + Stub *pStub = sl.Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); + + return (void *)pStub->GetEntryPoint(); +} + + +void *JIT_TrialAlloc::GenBox(Flags flags) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + + CodeLabel *noLock = sl.NewCodeLabel(); + CodeLabel *noAlloc = sl.NewCodeLabel(); + + // Save address of value to be boxed + sl.X86EmitPushReg(kEBX); + sl.Emit16(0xda8b); + + // Save the MethodTable ptr + sl.X86EmitPushReg(kECX); + + // mov ecx, [ecx]MethodTable.m_pWriteableData + sl.X86EmitOffsetModRM(0x8b, kECX, kECX, offsetof(MethodTable, m_pWriteableData)); + + // Check whether the class has 
not been initialized + // test [ecx]MethodTableWriteableData.m_dwFlags,MethodTableWriteableData::enum_flag_Unrestored + sl.X86EmitOffsetModRM(0xf7, (X86Reg)0x0, kECX, offsetof(MethodTableWriteableData, m_dwFlags)); + sl.Emit32(MethodTableWriteableData::enum_flag_Unrestored); + + // Restore the MethodTable ptr in ecx + sl.X86EmitPopReg(kECX); + + // jne noAlloc + sl.X86EmitCondJump(noAlloc, X86CondCode::kJNE); + + // Emit the main body of the trial allocator + EmitCore(&sl, noLock, noAlloc, flags); + +#if CHECK_APP_DOMAIN_LEAKS + EmitSetAppDomain(&sl); +#endif + + // Here we are at the end of the success case + + // Check whether the object contains pointers + // test [ecx]MethodTable.m_dwFlags,MethodTable::enum_flag_ContainsPointers + sl.X86EmitOffsetModRM(0xf7, (X86Reg)0x0, kECX, offsetof(MethodTable, m_dwFlags)); + sl.Emit32(MethodTable::enum_flag_ContainsPointers); + + CodeLabel *pointerLabel = sl.NewCodeLabel(); + + // jne pointerLabel + sl.X86EmitCondJump(pointerLabel, X86CondCode::kJNE); + + // We have no pointers - emit a simple inline copy loop + + // mov ecx, [ecx]MethodTable.m_BaseSize + sl.X86EmitOffsetModRM(0x8b, kECX, kECX, offsetof(MethodTable, m_BaseSize)); + + // sub ecx,12 + sl.X86EmitSubReg(kECX, 12); + + CodeLabel *loopLabel = sl.NewCodeLabel(); + + sl.EmitLabel(loopLabel); + + // mov edx,[ebx+ecx] + sl.X86EmitOp(0x8b, kEDX, kEBX, 0, kECX, 1); + + // mov [eax+ecx+4],edx + sl.X86EmitOp(0x89, kEDX, kEAX, 4, kECX, 1); + + // sub ecx,4 + sl.X86EmitSubReg(kECX, 4); + + // jg loopLabel + sl.X86EmitCondJump(loopLabel, X86CondCode::kJGE); + + sl.X86EmitPopReg(kEBX); + + sl.X86EmitReturn(0); + + // Arrive at this label if there are pointers in the object + sl.EmitLabel(pointerLabel); + + // Do call to CopyValueClassUnchecked(object, data, pMT) + + // Pass pMT (still in ECX) + sl.X86EmitPushReg(kECX); + + // Pass data (still in EBX) + sl.X86EmitPushReg(kEBX); + + // Save the address of the object just allocated + // mov ebx,eax + sl.Emit16(0xD88B); + + + // Pass address of first user byte in the newly allocated object + sl.X86EmitAddReg(kEAX, 4); + sl.X86EmitPushReg(kEAX); + + // call CopyValueClass + sl.X86EmitCall(sl.NewExternalCodeLabel((LPVOID) CopyValueClassUnchecked), 12); + + // Restore the address of the newly allocated object and return it. + // mov eax,ebx + sl.Emit16(0xC38B); + + sl.X86EmitPopReg(kEBX); + + sl.X86EmitReturn(0); + + // Come here in case of no space + sl.EmitLabel(noAlloc); + + // Release the lock in the uniprocessor case + EmitNoAllocCode(&sl, flags); + + // Come here in case of failure to get the lock + sl.EmitLabel(noLock); + + // Restore the address of the value to be boxed + // mov edx,ebx + sl.Emit16(0xD38B); + + // pop ebx + sl.X86EmitPopReg(kEBX); + + // Jump to the slow version of JIT_Box + sl.X86EmitNearJump(sl.NewExternalCodeLabel((LPVOID) JIT_Box)); + + Stub *pStub = sl.Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); + + return (void *)pStub->GetEntryPoint(); +} + + +HCIMPL2_RAW(Object*, UnframedAllocateObjectArray, /*TypeHandle*/PVOID ArrayType, DWORD cElements) +{ + // This isn't _really_ an FCALL and therefore shouldn't have the + // SO_TOLERANT part of the FCALL_CONTRACT b/c it is not entered + // from managed code. 
+ CONTRACTL { + THROWS; + GC_TRIGGERS; + MODE_COOPERATIVE; + SO_INTOLERANT; + } CONTRACTL_END; + + return OBJECTREFToObject(AllocateArrayEx(TypeHandle::FromPtr(ArrayType), + (INT32 *)(&cElements), + 1, + FALSE + DEBUG_ARG(FALSE))); +} +HCIMPLEND_RAW + + +HCIMPL2_RAW(Object*, UnframedAllocatePrimitiveArray, CorElementType type, DWORD cElements) +{ + // This isn't _really_ an FCALL and therefore shouldn't have the + // SO_TOLERANT part of the FCALL_CONTRACT b/c it is not entered + // from managed code. + CONTRACTL { + THROWS; + GC_TRIGGERS; + MODE_COOPERATIVE; + SO_INTOLERANT; + } CONTRACTL_END; + + return OBJECTREFToObject( AllocatePrimitiveArray(type, cElements, FALSE) ); +} +HCIMPLEND_RAW + + +void *JIT_TrialAlloc::GenAllocArray(Flags flags) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + + CodeLabel *noLock = sl.NewCodeLabel(); + CodeLabel *noAlloc = sl.NewCodeLabel(); + + // We were passed a type descriptor in ECX, which contains the (shared) + // array method table and the element type. + + // If this is the allocator for use from unmanaged code, ECX contains the + // element type descriptor, or the CorElementType. + + // We need to save ECX for later + + // push ecx + sl.X86EmitPushReg(kECX); + + // The element count is in EDX - we need to save it for later. + + // push edx + sl.X86EmitPushReg(kEDX); + + if (flags & NO_FRAME) + { + if (flags & OBJ_ARRAY) + { + // we need to load the true method table from the type desc + sl.X86EmitIndexRegLoad(kECX, kECX, offsetof(ArrayTypeDesc,m_TemplateMT)-2); + } + else + { + // mov ecx,[g_pPredefinedArrayTypes+ecx*4] + sl.Emit8(0x8b); + sl.Emit16(0x8d0c); + sl.Emit32((int)(size_t)&g_pPredefinedArrayTypes); + + // test ecx,ecx + sl.Emit16(0xc985); + + // je noLock + sl.X86EmitCondJump(noLock, X86CondCode::kJZ); + + // we need to load the true method table from the type desc + sl.X86EmitIndexRegLoad(kECX, kECX, offsetof(ArrayTypeDesc,m_TemplateMT)); + } + } + else + { + // we need to load the true method table from the type desc + sl.X86EmitIndexRegLoad(kECX, kECX, offsetof(ArrayTypeDesc,m_TemplateMT)-2); + +#ifdef FEATURE_PREJIT + CodeLabel *indir = sl.NewCodeLabel(); + + // test cl,1 + sl.Emit16(0xC1F6); + sl.Emit8(0x01); + + // je indir + sl.X86EmitCondJump(indir, X86CondCode::kJZ); + + // mov ecx, [ecx-1] + sl.X86EmitIndexRegLoad(kECX, kECX, -1); + + sl.EmitLabel(indir); +#endif + } + + // Do a conservative check here. This is to avoid doing overflow checks within this function. We'll + // still have to do a size check before running through the body of EmitCore. The way we do the check + // against the allocation quantum there requires that we not overflow when adding the size to the + // current allocation context pointer. There is exactly LARGE_OBJECT_SIZE of headroom there, so do that + // check before we EmitCore. + // + // For reference types, we can just pick the correct value of maxElems and skip the second check. + // + // By the way, we use 258 as a "slack" value to ensure that we don't overflow because of the size of the + // array header or alignment. 
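To make the idea above concrete: rather than checking baseSize + count*componentSize for overflow exactly, the stub first rejects any element count that could not possibly stay under the large-object threshold (with a couple hundred elements of slack for the header and alignment), which keeps the later size arithmetic safe. The sketch below illustrates that clamp in C++ with placeholder constants; it deliberately drops the double-array threshold case the real stub also handles.

#include <cstddef>
#include <cstdint>

// Placeholder values for illustration only.
constexpr size_t kLargeObjectSize = 85000;   // stand-in for LARGE_OBJECT_SIZE
constexpr size_t kSlackElements   = 256;     // slack for header + alignment

// Returns true if 'numElements' is small enough that the later
// baseSize + numElements*componentSize arithmetic cannot overflow the
// checks performed against the allocation context.
bool ElementCountLooksSafe(uint32_t numElements, size_t componentSize)
{
    size_t maxElems = 0xffff - kSlackElements;              // generic cap used by the stub

    if (componentSize == sizeof(void*))                     // object arrays: exact cap
        maxElems = (kLargeObjectSize / sizeof(void*)) - kSlackElements;

    return numElements < maxElems;                          // otherwise: slow helper
}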
+ sl.Emit16(0xfa81); + + + // The large object heap is 8 byte aligned, so for double arrays we + // want to bias toward putting things in the large object heap + unsigned maxElems = 0xffff - 256; + + if ((flags & ALIGN8) && g_pConfig->GetDoubleArrayToLargeObjectHeapThreshold() < maxElems) + maxElems = g_pConfig->GetDoubleArrayToLargeObjectHeapThreshold(); + if (flags & OBJ_ARRAY) + { + //Since we know that the array elements are sizeof(OBJECTREF), set maxElems exactly here (use the + //same slack from above. + maxElems = min(maxElems, (LARGE_OBJECT_SIZE/sizeof(OBJECTREF)) - 256); + } + sl.Emit32(maxElems); + + + // jae noLock - seems tempting to jump to noAlloc, but we haven't taken the lock yet + sl.X86EmitCondJump(noLock, X86CondCode::kJAE); + + if (flags & OBJ_ARRAY) + { + // In this case we know the element size is sizeof(void *), or 4 for x86 + // This helps us in two ways - we can shift instead of multiplying, and + // there's no need to align the size either + + _ASSERTE(sizeof(void *) == 4); + + // mov eax, [ecx]MethodTable.m_BaseSize + sl.X86EmitIndexRegLoad(kEAX, kECX, offsetof(MethodTable, m_BaseSize)); + + // lea eax, [eax+edx*4] + sl.X86EmitOp(0x8d, kEAX, kEAX, 0, kEDX, 4); + } + else + { + // movzx eax, [ECX]MethodTable.m_dwFlags /* component size */ + sl.Emit8(0x0f); + sl.X86EmitOffsetModRM(0xb7, kEAX, kECX, offsetof(MethodTable, m_dwFlags /* component size */)); + + // mul eax, edx + sl.Emit16(0xe2f7); + + // add eax, [ecx]MethodTable.m_BaseSize + sl.X86EmitOffsetModRM(0x03, kEAX, kECX, offsetof(MethodTable, m_BaseSize)); + + // Since this is an array of value classes, we need an extra compare here to make sure we're still + // less than LARGE_OBJECT_SIZE. This is the last bit of arithmetic before we compare against the + // allocation context, so do it here. + + // cmp eax, LARGE_OBJECT_SIZE + // ja noLock + sl.Emit8(0x3d); + sl.Emit32(LARGE_OBJECT_SIZE); + sl.X86EmitCondJump(noLock, X86CondCode::kJA); + } + +#if DATA_ALIGNMENT == 4 + if (flags & OBJ_ARRAY) + { + // No need for rounding in this case - element size is 4, and m_BaseSize is guaranteed + // to be a multiple of 4. 
+ } + else +#endif // DATA_ALIGNMENT == 4 + { + // round the size to a multiple of 4 + + // add eax, 3 + sl.X86EmitAddReg(kEAX, (DATA_ALIGNMENT-1)); + + // and eax, ~3 + sl.Emit16(0xe083); + sl.Emit8(~(DATA_ALIGNMENT-1)); + } + + flags = (Flags)(flags | SIZE_IN_EAX); + + // Emit the main body of the trial allocator, be it SP or MP + EmitCore(&sl, noLock, noAlloc, flags); + + // Here we are at the end of the success case - store element count + // and possibly the element type descriptor and return + + // pop edx - element count + sl.X86EmitPopReg(kEDX); + + // pop ecx - array type descriptor + sl.X86EmitPopReg(kECX); + + // mov dword ptr [eax]ArrayBase.m_NumComponents, edx + sl.X86EmitIndexRegStore(kEAX, offsetof(ArrayBase,m_NumComponents), kEDX); + +#if CHECK_APP_DOMAIN_LEAKS + EmitSetAppDomain(&sl); +#endif + + // no stack parameters + sl.X86EmitReturn(0); + + // Come here in case of no space + sl.EmitLabel(noAlloc); + + // Release the lock in the uniprocessor case + EmitNoAllocCode(&sl, flags); + + // Come here in case of failure to get the lock + sl.EmitLabel(noLock); + + // pop edx - element count + sl.X86EmitPopReg(kEDX); + + // pop ecx - array type descriptor + sl.X86EmitPopReg(kECX); + + CodeLabel * target; + if (flags & NO_FRAME) + { + if (flags & OBJ_ARRAY) + { + // Jump to the unframed helper + target = sl.NewExternalCodeLabel((LPVOID)UnframedAllocateObjectArray); + _ASSERTE(target->e.m_pExternalAddress); + } + else + { + // Jump to the unframed helper + target = sl.NewExternalCodeLabel((LPVOID)UnframedAllocatePrimitiveArray); + _ASSERTE(target->e.m_pExternalAddress); + } + } + else + { + // Jump to the framed helper + target = sl.NewExternalCodeLabel((LPVOID)JIT_NewArr1); + _ASSERTE(target->e.m_pExternalAddress); + } + sl.X86EmitNearJump(target); + + Stub *pStub = sl.Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); + + return (void *)pStub->GetEntryPoint(); +} + + +void *JIT_TrialAlloc::GenAllocString(Flags flags) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + + CodeLabel *noLock = sl.NewCodeLabel(); + CodeLabel *noAlloc = sl.NewCodeLabel(); + + // We were passed the number of characters in ECX + + // push ecx + sl.X86EmitPushReg(kECX); + + // mov eax, ecx + sl.Emit16(0xc18b); + + // we need to load the method table for string from the global + + // mov ecx, [g_pStringMethodTable] + sl.Emit16(0x0d8b); + sl.Emit32((int)(size_t)&g_pStringClass); + + // Instead of doing elaborate overflow checks, we just limit the number of elements + // to (LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) or less. + // This will avoid all overflow problems, as well as making sure + // big string objects are correctly allocated in the big object heap. + + _ASSERTE(sizeof(WCHAR) == 2); + + // cmp edx,(LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) + sl.Emit16(0xf881); + sl.Emit32((LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR)); + + // jae noLock - seems tempting to jump to noAlloc, but we haven't taken the lock yet + sl.X86EmitCondJump(noLock, X86CondCode::kJAE); + + // mov edx, [ecx]MethodTable.m_BaseSize + sl.X86EmitIndexRegLoad(kEDX, kECX, offsetof(MethodTable,m_BaseSize)); + + // Calculate the final size to allocate. + // We need to calculate baseSize + cnt*2, then round that up by adding 3 and anding ~3. 
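The size computation that follows is simply baseSize + 2*charCount rounded up to DATA_ALIGNMENT, folded into one lea and one and. The same arithmetic in C++, assuming DATA_ALIGNMENT is 4 as on x86 (for example, a 12-byte base size and 5 characters round up to 24 bytes):

#include <cstddef>

constexpr size_t kDataAlignment = 4;   // DATA_ALIGNMENT on x86

// baseSize + charCount * sizeof(WCHAR), rounded up to the data alignment.
// Example: ComputeStringAllocSize(12, 5) == 24  (12 + 10 = 22, rounded up to 24).
size_t ComputeStringAllocSize(size_t baseSize, size_t charCount)
{
    size_t size = baseSize + charCount * 2;                  // sizeof(WCHAR) == 2
    return (size + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
}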
+ + // lea eax, [edx+eax*2+5] + sl.X86EmitOp(0x8d, kEAX, kEDX, (DATA_ALIGNMENT-1), kEAX, 2); + + // and eax, ~3 + sl.Emit16(0xe083); + sl.Emit8(~(DATA_ALIGNMENT-1)); + + flags = (Flags)(flags | SIZE_IN_EAX); + + // Emit the main body of the trial allocator, be it SP or MP + EmitCore(&sl, noLock, noAlloc, flags); + + // Here we are at the end of the success case - store element count + // and possibly the element type descriptor and return + + // pop ecx - element count + sl.X86EmitPopReg(kECX); + + // mov dword ptr [eax]ArrayBase.m_StringLength, ecx + sl.X86EmitIndexRegStore(kEAX, offsetof(StringObject,m_StringLength), kECX); + +#if CHECK_APP_DOMAIN_LEAKS + EmitSetAppDomain(&sl); +#endif + + // no stack parameters + sl.X86EmitReturn(0); + + // Come here in case of no space + sl.EmitLabel(noAlloc); + + // Release the lock in the uniprocessor case + EmitNoAllocCode(&sl, flags); + + // Come here in case of failure to get the lock + sl.EmitLabel(noLock); + + // pop ecx - element count + sl.X86EmitPopReg(kECX); + + CodeLabel * target; + if (flags & NO_FRAME) + { + // Jump to the unframed helper + target = sl.NewExternalCodeLabel((LPVOID)UnframedAllocateString); + } + else + { + // Jump to the framed helper + target = sl.NewExternalCodeLabel((LPVOID)FramedAllocateString); + } + sl.X86EmitNearJump(target); + + Stub *pStub = sl.Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); + + return (void *)pStub->GetEntryPoint(); +} + + +FastStringAllocatorFuncPtr fastStringAllocator = UnframedAllocateString; + +FastObjectArrayAllocatorFuncPtr fastObjectArrayAllocator = UnframedAllocateObjectArray; + +FastPrimitiveArrayAllocatorFuncPtr fastPrimitiveArrayAllocator = UnframedAllocatePrimitiveArray; + +// For this helper, +// If bCCtorCheck == true +// ECX contains the domain neutral module ID +// EDX contains the class domain ID, and the +// else +// ECX contains the domain neutral module ID +// EDX is junk +// shared static base is returned in EAX. + +// "init" should be the address of a routine which takes an argument of +// the module domain ID, the class domain ID, and returns the static base pointer +void EmitFastGetSharedStaticBase(CPUSTUBLINKER *psl, CodeLabel *init, bool bCCtorCheck, bool bGCStatic, bool bSingleAppDomain) +{ + STANDARD_VM_CONTRACT; + + CodeLabel *DoInit = 0; + if (bCCtorCheck) + { + DoInit = psl->NewCodeLabel(); + } + + // mov eax, ecx + psl->Emit8(0x89); + psl->Emit8(0xc8); + + if(!bSingleAppDomain) + { + // Check tag + CodeLabel *cctorCheck = psl->NewCodeLabel(); + + + // test eax, 1 + psl->Emit8(0xa9); + psl->Emit32(1); + + // jz cctorCheck + psl->X86EmitCondJump(cctorCheck, X86CondCode::kJZ); + + // mov eax GetAppDomain() + psl->X86EmitCurrentAppDomainFetch(kEAX, (1<m_sDomainLocalBlock.m_pModuleSlots] + psl->X86EmitIndexRegLoad(kEAX, kEAX, (__int32) AppDomain::GetOffsetOfModuleSlotsPointer()); + + // Note: weird address arithmetic effectively does: + // shift over 1 to remove tag bit (which is always 1), then multiply by 4. 
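Unpacking the note above: because the domain-neutral module ID carries its tag in the low bit, the slot offset ((id >> 1) * sizeof(void*)) can be computed directly as id*2 - 2, which is exactly what the [eax + ecx*2 - 2] addressing mode produces. A tiny self-check of that identity (the tagging scheme here is inferred from the comment, so treat it as illustrative):

#include <cassert>
#include <cstdint>

// Assume a tagged module ID stores the slot index shifted left by one with
// the low bit set: tagged = (index << 1) | 1.
int main()
{
    for (uint32_t index = 0; index < 1000; ++index)
    {
        uint32_t tagged = (index << 1) | 1;

        uint32_t viaShift = (tagged >> 1) * 4;   // remove tag bit, scale by pointer size
        uint32_t viaLea   = tagged * 2 - 2;      // what [base + tagged*2 - 2] computes

        assert(viaShift == viaLea && viaShift == index * 4);
    }
    return 0;
}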
+ // mov eax [eax + ecx*2 - 2] + psl->X86EmitOp(0x8b, kEAX, kEAX, -2, kECX, 2); + + // cctorCheck: + psl->EmitLabel(cctorCheck); + + } + + if (bCCtorCheck) + { + // test [eax + edx + offsetof(DomainLocalModule, m_pDataBlob], ClassInitFlags::INITIALIZED_FLAG // Is class inited + _ASSERTE(FitsInI1(ClassInitFlags::INITIALIZED_FLAG)); + _ASSERTE(FitsInI1(DomainLocalModule::GetOffsetOfDataBlob())); + + BYTE testClassInit[] = { 0xF6, 0x44, 0x10, + (BYTE) DomainLocalModule::GetOffsetOfDataBlob(), (BYTE)ClassInitFlags::INITIALIZED_FLAG }; + + psl->EmitBytes(testClassInit, sizeof(testClassInit)); + + // jz init // no, init it + psl->X86EmitCondJump(DoInit, X86CondCode::kJZ); + } + + if (bGCStatic) + { + // Indirect to get the pointer to the first GC Static + psl->X86EmitIndexRegLoad(kEAX, kEAX, (__int32) DomainLocalModule::GetOffsetOfGCStaticPointer()); + } + + // ret + psl->X86EmitReturn(0); + + if (bCCtorCheck) + { + // DoInit: + psl->EmitLabel(DoInit); + + // push edx (must be preserved) + psl->X86EmitPushReg(kEDX); + + // call init + psl->X86EmitCall(init, 0); + + // pop edx + psl->X86EmitPopReg(kEDX); + + // ret + psl->X86EmitReturn(0); + } + +} + +void *GenFastGetSharedStaticBase(bool bCheckCCtor, bool bGCStatic, bool bSingleAppDomain) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + + CodeLabel *init; + if (bGCStatic) + { + init = sl.NewExternalCodeLabel((LPVOID)JIT_GetSharedGCStaticBase); + } + else + { + init = sl.NewExternalCodeLabel((LPVOID)JIT_GetSharedNonGCStaticBase); + } + + EmitFastGetSharedStaticBase(&sl, init, bCheckCCtor, bGCStatic, bSingleAppDomain); + + Stub *pStub = sl.Link(SystemDomain::GetGlobalLoaderAllocator()->GetExecutableHeap()); + + return (void*) pStub->GetEntryPoint(); +} + + +#ifdef ENABLE_FAST_GCPOLL_HELPER +void EnableJitGCPoll() +{ + SetJitHelperFunction(CORINFO_HELP_POLL_GC, (void*)JIT_PollGC); +} +void DisableJitGCPoll() +{ + SetJitHelperFunction(CORINFO_HELP_POLL_GC, (void*)JIT_PollGC_Nop); +} +#endif + +#define NUM_WRITE_BARRIERS 6 + +static const BYTE c_rgWriteBarrierRegs[NUM_WRITE_BARRIERS] = { + 0, // EAX + 1, // ECX + 3, // EBX + 6, // ESI + 7, // EDI + 5, // EBP +}; + +static const void * const c_rgWriteBarriers[NUM_WRITE_BARRIERS] = { + (void *)JIT_WriteBarrierEAX, + (void *)JIT_WriteBarrierECX, + (void *)JIT_WriteBarrierEBX, + (void *)JIT_WriteBarrierESI, + (void *)JIT_WriteBarrierEDI, + (void *)JIT_WriteBarrierEBP, +}; + +#ifdef WRITE_BARRIER_CHECK +static const void * const c_rgDebugWriteBarriers[NUM_WRITE_BARRIERS] = { + (void *)JIT_DebugWriteBarrierEAX, + (void *)JIT_DebugWriteBarrierECX, + (void *)JIT_DebugWriteBarrierEBX, + (void *)JIT_DebugWriteBarrierESI, + (void *)JIT_DebugWriteBarrierEDI, + (void *)JIT_DebugWriteBarrierEBP, +}; +#endif // WRITE_BARRIER_CHECK + +#define DEBUG_RANDOM_BARRIER_CHECK DbgGetEXETimeStamp() % 7 == 4 + +/*********************************************************************/ +// Initialize the part of the JIT helpers that require very little of +// EE infrastructure to be in place. 
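+// This wires up the fast allocation stubs, the fast shared static base helpers,
+// and the per-register write barrier copies.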
+/*********************************************************************/ +void InitJITHelpers1() +{ + STANDARD_VM_CONTRACT; + +#define ETW_NUM_JIT_HELPERS 10 + static const LPCWSTR pHelperNames[ETW_NUM_JIT_HELPERS] = { + W("@NewObject"), + W("@NewObjectAlign8"), + W("@Box"), + W("@NewArray1Object"), + W("@NewArray1ValueType"), + W("@NewArray1ObjectAlign8"), + W("@StaticBaseObject"), + W("@StaticBaseNonObject"), + W("@StaticBaseObjectNoCCtor"), + W("@StaticBaseNonObjectNoCCtor") + }; + + PVOID pMethodAddresses[ETW_NUM_JIT_HELPERS]={0}; + + _ASSERTE(g_SystemInfo.dwNumberOfProcessors != 0); + + JIT_TrialAlloc::Flags flags = GCHeap::UseAllocationContexts() ? + JIT_TrialAlloc::MP_ALLOCATOR : JIT_TrialAlloc::NORMAL; + + // Get CPU features and check for SSE2 support. + // This code should eventually probably be moved into codeman.cpp, + // where we set the cpu feature flags for the JIT based on CPU type and features. + DWORD dwCPUFeaturesECX; + DWORD dwCPUFeaturesEDX; + + __asm + { + pushad + mov eax, 1 + cpuid + mov dwCPUFeaturesECX, ecx + mov dwCPUFeaturesEDX, edx + popad + } + + // If bit 26 (SSE2) is set, then we can use the SSE2 flavors + // and faster x87 implementation for the P4 of Dbl2Lng. + if (dwCPUFeaturesEDX & (1<<26)) + { + SetJitHelperFunction(CORINFO_HELP_DBL2INT, JIT_Dbl2IntSSE2); + if (dwCPUFeaturesECX & 1) // check SSE3 + { + SetJitHelperFunction(CORINFO_HELP_DBL2UINT, JIT_Dbl2LngSSE3); + SetJitHelperFunction(CORINFO_HELP_DBL2LNG, JIT_Dbl2LngSSE3); + } + else + { + SetJitHelperFunction(CORINFO_HELP_DBL2UINT, JIT_Dbl2LngP4x87); // SSE2 only for signed + SetJitHelperFunction(CORINFO_HELP_DBL2LNG, JIT_Dbl2LngP4x87); + } + } + + if (!(TrackAllocationsEnabled() + || LoggingOn(LF_GCALLOC, LL_INFO10) +#ifdef _DEBUG + || (g_pConfig->ShouldInjectFault(INJECTFAULT_GCHEAP) != 0) +#endif + ) + ) + { + // Replace the slow helpers with faster version + + pMethodAddresses[0] = JIT_TrialAlloc::GenAllocSFast(flags); + SetJitHelperFunction(CORINFO_HELP_NEWSFAST, pMethodAddresses[0]); + pMethodAddresses[1] = JIT_TrialAlloc::GenAllocSFast((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::ALIGN8 | JIT_TrialAlloc::ALIGN8OBJ)); + SetJitHelperFunction(CORINFO_HELP_NEWSFAST_ALIGN8, pMethodAddresses[1]); + pMethodAddresses[2] = JIT_TrialAlloc::GenBox(flags); + SetJitHelperFunction(CORINFO_HELP_BOX, pMethodAddresses[2]); + pMethodAddresses[3] = JIT_TrialAlloc::GenAllocArray((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::OBJ_ARRAY)); + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, pMethodAddresses[3]); + pMethodAddresses[4] = JIT_TrialAlloc::GenAllocArray(flags); + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, pMethodAddresses[4]); + pMethodAddresses[5] = JIT_TrialAlloc::GenAllocArray((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::ALIGN8)); + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_ALIGN8, pMethodAddresses[5]); + + fastObjectArrayAllocator = (FastObjectArrayAllocatorFuncPtr)JIT_TrialAlloc::GenAllocArray((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::NO_FRAME|JIT_TrialAlloc::OBJ_ARRAY)); + fastPrimitiveArrayAllocator = (FastPrimitiveArrayAllocatorFuncPtr)JIT_TrialAlloc::GenAllocArray((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::NO_FRAME)); + + // If allocation logging is on, then we divert calls to FastAllocateString to an Ecall method, not this + // generated method. Find this workaround in Ecall::Init() in ecall.cpp. 
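+        // Otherwise, generate the fast string allocator and point the FastAllocateString FCall at it.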
+ ECall::DynamicallyAssignFCallImpl((PCODE) JIT_TrialAlloc::GenAllocString(flags), ECall::FastAllocateString); + + // generate another allocator for use from unmanaged code (won't need a frame) + fastStringAllocator = (FastStringAllocatorFuncPtr) JIT_TrialAlloc::GenAllocString((JIT_TrialAlloc::Flags)(flags|JIT_TrialAlloc::NO_FRAME)); + //UnframedAllocateString; + } + + bool bSingleAppDomain = IsSingleAppDomain(); + + // Replace static helpers with faster assembly versions + pMethodAddresses[6] = GenFastGetSharedStaticBase(true, true, bSingleAppDomain); + SetJitHelperFunction(CORINFO_HELP_GETSHARED_GCSTATIC_BASE, pMethodAddresses[6]); + pMethodAddresses[7] = GenFastGetSharedStaticBase(true, false, bSingleAppDomain); + SetJitHelperFunction(CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE, pMethodAddresses[7]); + pMethodAddresses[8] = GenFastGetSharedStaticBase(false, true, bSingleAppDomain); + SetJitHelperFunction(CORINFO_HELP_GETSHARED_GCSTATIC_BASE_NOCTOR, pMethodAddresses[8]); + pMethodAddresses[9] = GenFastGetSharedStaticBase(false, false, bSingleAppDomain); + SetJitHelperFunction(CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE_NOCTOR, pMethodAddresses[9]); + + ETW::MethodLog::StubsInitialized(pMethodAddresses, (PVOID *)pHelperNames, ETW_NUM_JIT_HELPERS); + +#ifdef ENABLE_FAST_GCPOLL_HELPER + // code:JIT_PollGC_Nop + SetJitHelperFunction(CORINFO_HELP_POLL_GC, (void*)JIT_PollGC_Nop); +#endif //ENABLE_FAST_GCPOLL_HELPER + + // All write barrier helpers should fit into one page. + // If you hit this assert on retail build, there is most likely problem with BBT script. + _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (BYTE*)JIT_WriteBarrierLast - (BYTE*)JIT_WriteBarrierStart < PAGE_SIZE); + _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (BYTE*)JIT_PatchedWriteBarrierLast - (BYTE*)JIT_PatchedWriteBarrierStart < PAGE_SIZE); + + // Copy the write barriers to their final resting place. + for (int iBarrier = 0; iBarrier < NUM_WRITE_BARRIERS; iBarrier++) + { + BYTE * pfunc = (BYTE *) JIT_WriteBarrierReg_PreGrow; + + BYTE * pBuf = (BYTE *)c_rgWriteBarriers[iBarrier]; + int reg = c_rgWriteBarrierRegs[iBarrier]; + + memcpy(pBuf, pfunc, 34); + + // assert the copied code ends in a ret to make sure we got the right length + _ASSERTE(pBuf[33] == 0xC3); + + // We need to adjust registers in a couple of instructions + // It would be nice to have the template contain all zeroes for + // the register fields (corresponding to EAX), but that doesn't + // work because then we get a smaller encoding for the compares + // that only works for EAX but not the other registers. + // So we always have to clear the register fields before updating them. 
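+        // For example, the template's 89 02 (mov [edx], eax) becomes 89 1A
+        // (mov [edx], ebx) once reg == 3 is OR'ed into bits 3..5 of the ModR/M byte.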
+
+        // First instruction to patch is a mov [edx], reg
+
+        _ASSERTE(pBuf[0] == 0x89);
+        // Update the reg field (bits 3..5) of the ModR/M byte of this instruction
+        pBuf[1] &= 0xc7;
+        pBuf[1] |= reg << 3;
+
+        // Second instruction to patch is cmp reg, imm32 (low bound)
+
+        _ASSERTE(pBuf[2] == 0x81);
+        // Here the lowest three bits in ModR/M field are the register
+        pBuf[3] &= 0xf8;
+        pBuf[3] |= reg;
+
+#ifdef WRITE_BARRIER_CHECK
+        // Don't do the fancy optimization, just jump to the old one
+        // Use the slow one from time to time in a debug build because
+        // there are some good asserts in the unoptimized one
+        if ((g_pConfig->GetHeapVerifyLevel() & EEConfig::HEAPVERIFY_BARRIERCHECK) || DEBUG_RANDOM_BARRIER_CHECK) {
+            pfunc = &pBuf[0];
+            *pfunc++ = 0xE9;                // JMP c_rgDebugWriteBarriers[iBarrier]
+            *((DWORD*) pfunc) = (BYTE*) c_rgDebugWriteBarriers[iBarrier] - (pfunc + sizeof(DWORD));
+        }
+#endif // WRITE_BARRIER_CHECK
+    }
+
+#ifndef CODECOVERAGE
+    ValidateWriteBarrierHelpers();
+#endif
+
+    // Leave the patched region writable for StompWriteBarrierEphemeral(), StompWriteBarrierResize()
+    // and CTPMethodTable::ActivatePrecodeRemotingThunk
+
+    // Initialize g_TailCallFrameVptr for JIT_TailCall helper
+    g_TailCallFrameVptr = (void*)TailCallFrame::GetMethodFrameVPtr();
+}
+
+// these constants are offsets into our write barrier helpers for values that get updated as the bounds of the managed heap change.
+// ephemeral region
+const int AnyGrow_EphemeralLowerBound = 4; // offset is the same for both pre and post grow functions
+const int PostGrow_EphemeralUpperBound = 12;
+
+// card table
+const int PreGrow_CardTableFirstLocation = 16;
+const int PreGrow_CardTableSecondLocation = 28;
+const int PostGrow_CardTableFirstLocation = 24;
+const int PostGrow_CardTableSecondLocation = 36;
+
+
+#ifndef CODECOVERAGE        // Deactivate alignment validation for code coverage builds
+                            // because the instrumented binaries will not preserve alignment constraints and we will fail.
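+// ValidateWriteBarrierHelpers below asserts that each offset above points at a
+// naturally aligned imm32 (still holding the 0xf0f0f0f0 placeholder) in the barrier code.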
+
+void ValidateWriteBarrierHelpers()
+{
+    // we have an invariant that the addresses of all the values that we update in our write barrier
+    // helpers must be naturally aligned; this is so that the update can happen atomically since there
+    // are places where we update these values while the EE is running
+
+#ifdef WRITE_BARRIER_CHECK
+    // write barrier checking uses the slower helpers that we don't bash so there is no need for validation
+    if ((g_pConfig->GetHeapVerifyLevel() & EEConfig::HEAPVERIFY_BARRIERCHECK) || DEBUG_RANDOM_BARRIER_CHECK)
+        return;
+#endif // WRITE_BARRIER_CHECK
+
+    // first validate the PreGrow helper
+    BYTE* pWriteBarrierFunc = reinterpret_cast<BYTE*>(JIT_WriteBarrierEAX);
+
+    // ephemeral region
+    DWORD* pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[AnyGrow_EphemeralLowerBound]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+
+    // card table
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[PreGrow_CardTableFirstLocation]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[PreGrow_CardTableSecondLocation]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+
+    // now validate the PostGrow helper
+    pWriteBarrierFunc = reinterpret_cast<BYTE*>(JIT_WriteBarrierReg_PostGrow);
+
+    // ephemeral region
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[AnyGrow_EphemeralLowerBound]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[PostGrow_EphemeralUpperBound]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+
+    // card table
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[PostGrow_CardTableFirstLocation]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+    pLocation = reinterpret_cast<DWORD*>(&pWriteBarrierFunc[PostGrow_CardTableSecondLocation]);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", (reinterpret_cast<size_t>(pLocation) & 0x3) == 0);
+    _ASSERTE_ALL_BUILDS("clr/src/VM/i386/JITinterfaceX86.cpp", *pLocation == 0xf0f0f0f0);
+}
+
+#endif //CODECOVERAGE
+/*********************************************************************/
+
+#define WriteBarrierIsPreGrow() (((BYTE *)JIT_WriteBarrierEAX)[10] == 0xc1)
+
+
+/*********************************************************************/
+// When a GC happens, the upper and lower bounds of the ephemeral
+// generation change.  This routine updates the WriteBarrier thunks
+// with the new values.
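+// Only the imm32 bound values embedded in the barrier code are rewritten here;
+// the instruction bytes themselves are left unchanged.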
+void StompWriteBarrierEphemeral(bool /* isRuntimeSuspended */) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + } CONTRACTL_END; + +#ifdef WRITE_BARRIER_CHECK + // Don't do the fancy optimization if we are checking write barrier + if (((BYTE *)JIT_WriteBarrierEAX)[0] == 0xE9) // we are using slow write barrier + return; +#endif // WRITE_BARRIER_CHECK + + BOOL flushICache = FALSE; + + // Update the lower bound. + for (int iBarrier = 0; iBarrier < NUM_WRITE_BARRIERS; iBarrier++) + { + BYTE * pBuf = (BYTE *)c_rgWriteBarriers[iBarrier]; + + // assert there is in fact a cmp r/m32, imm32 there + _ASSERTE(pBuf[2] == 0x81); + + // Update the immediate which is the lower bound of the ephemeral generation + size_t *pfunc = (size_t *) &pBuf[AnyGrow_EphemeralLowerBound]; + //avoid trivial self modifying code + if (*pfunc != (size_t) g_ephemeral_low) + { + flushICache = TRUE; + *pfunc = (size_t) g_ephemeral_low; + } + if (!WriteBarrierIsPreGrow()) + { + // assert there is in fact a cmp r/m32, imm32 there + _ASSERTE(pBuf[10] == 0x81); + + // Update the upper bound if we are using the PostGrow thunk. + pfunc = (size_t *) &pBuf[PostGrow_EphemeralUpperBound]; + //avoid trivial self modifying code + if (*pfunc != (size_t) g_ephemeral_high) + { + flushICache = TRUE; + *pfunc = (size_t) g_ephemeral_high; + } + } + } + + if (flushICache) + FlushInstructionCache(GetCurrentProcess(), (void *)JIT_PatchedWriteBarrierStart, + (BYTE*)JIT_PatchedWriteBarrierLast - (BYTE*)JIT_PatchedWriteBarrierStart); +} + +/*********************************************************************/ +// When the GC heap grows, the ephemeral generation may no longer +// be after the older generations. If this happens, we need to switch +// to the PostGrow thunk that checks both upper and lower bounds. +// regardless we need to update the thunk with the +// card_table - lowest_address. +void StompWriteBarrierResize(bool isRuntimeSuspended, bool bReqUpperBoundsCheck) +{ + CONTRACTL { + NOTHROW; + if (GetThread()) {GC_TRIGGERS;} else {GC_NOTRIGGER;} + } CONTRACTL_END; + +#ifdef WRITE_BARRIER_CHECK + // Don't do the fancy optimization if we are checking write barrier + if (((BYTE *)JIT_WriteBarrierEAX)[0] == 0xE9) // we are using slow write barrier + return; +#endif // WRITE_BARRIER_CHECK + + bool bWriteBarrierIsPreGrow = WriteBarrierIsPreGrow(); + bool bStompWriteBarrierEphemeral = false; + + BOOL bEESuspendedHere = FALSE; + + for (int iBarrier = 0; iBarrier < NUM_WRITE_BARRIERS; iBarrier++) + { + BYTE * pBuf = (BYTE *)c_rgWriteBarriers[iBarrier]; + int reg = c_rgWriteBarrierRegs[iBarrier]; + + size_t *pfunc; + + // Check if we are still using the pre-grow version of the write barrier. + if (bWriteBarrierIsPreGrow) + { + // Check if we need to use the upper bounds checking barrier stub. 
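+            // Switching templates rewrites the barrier's code bytes, so the EE is
+            // suspended below to keep other threads off a half-patched helper.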
+ if (bReqUpperBoundsCheck) + { + GCX_MAYBE_COOP_NO_THREAD_BROKEN((GetThread()!=NULL)); + if( !isRuntimeSuspended && !bEESuspendedHere) { + ThreadSuspend::SuspendEE(ThreadSuspend::SUSPEND_FOR_GC_PREP); + bEESuspendedHere = TRUE; + } + + pfunc = (size_t *) JIT_WriteBarrierReg_PostGrow; + memcpy(pBuf, pfunc, 42); + + // assert the copied code ends in a ret to make sure we got the right length + _ASSERTE(pBuf[41] == 0xC3); + + // We need to adjust registers in a couple of instructions + // It would be nice to have the template contain all zeroes for + // the register fields (corresponding to EAX), but that doesn't + // work because then we get a smaller encoding for the compares + // that only works for EAX but not the other registers + // So we always have to clear the register fields before updating them. + + // First instruction to patch is a mov [edx], reg + + _ASSERTE(pBuf[0] == 0x89); + // Update the reg field (bits 3..5) of the ModR/M byte of this instruction + pBuf[1] &= 0xc7; + pBuf[1] |= reg << 3; + + // Second instruction to patch is cmp reg, imm32 (low bound) + + _ASSERTE(pBuf[2] == 0x81); + // Here the lowest three bits in ModR/M field are the register + pBuf[3] &= 0xf8; + pBuf[3] |= reg; + + // Third instruction to patch is another cmp reg, imm32 (high bound) + + _ASSERTE(pBuf[10] == 0x81); + // Here the lowest three bits in ModR/M field are the register + pBuf[11] &= 0xf8; + pBuf[11] |= reg; + + bStompWriteBarrierEphemeral = true; + // What we're trying to update is the offset field of a + + // cmp offset[edx], 0ffh instruction + _ASSERTE(pBuf[22] == 0x80); + pfunc = (size_t *) &pBuf[PostGrow_CardTableFirstLocation]; + *pfunc = (size_t) g_card_table; + + // What we're trying to update is the offset field of a + // mov offset[edx], 0ffh instruction + _ASSERTE(pBuf[34] == 0xC6); + pfunc = (size_t *) &pBuf[PostGrow_CardTableSecondLocation]; + + } + else + { + // What we're trying to update is the offset field of a + + // cmp offset[edx], 0ffh instruction + _ASSERTE(pBuf[14] == 0x80); + pfunc = (size_t *) &pBuf[PreGrow_CardTableFirstLocation]; + *pfunc = (size_t) g_card_table; + + // What we're trying to update is the offset field of a + + // mov offset[edx], 0ffh instruction + _ASSERTE(pBuf[26] == 0xC6); + pfunc = (size_t *) &pBuf[PreGrow_CardTableSecondLocation]; + } + } + else + { + // What we're trying to update is the offset field of a + + // cmp offset[edx], 0ffh instruction + _ASSERTE(pBuf[22] == 0x80); + pfunc = (size_t *) &pBuf[PostGrow_CardTableFirstLocation]; + *pfunc = (size_t) g_card_table; + + // What we're trying to update is the offset field of a + // mov offset[edx], 0ffh instruction + _ASSERTE(pBuf[34] == 0xC6); + pfunc = (size_t *) &pBuf[PostGrow_CardTableSecondLocation]; + } + + // Stick in the adjustment value. + *pfunc = (size_t) g_card_table; + } + + if (bStompWriteBarrierEphemeral) + { + _ASSERTE(isRuntimeSuspended || bEESuspendedHere); + StompWriteBarrierEphemeral(true); + } + else + { + FlushInstructionCache(GetCurrentProcess(), (void *)JIT_PatchedWriteBarrierStart, + (BYTE*)JIT_PatchedWriteBarrierLast - (BYTE*)JIT_PatchedWriteBarrierStart); + } + + if(bEESuspendedHere) + ThreadSuspend::RestartEE(FALSE, TRUE); +} + diff --git a/src/vm/i386/profiler.cpp b/src/vm/i386/profiler.cpp new file mode 100644 index 0000000000..11d4247aef --- /dev/null +++ b/src/vm/i386/profiler.cpp @@ -0,0 +1,336 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information.
+//
+// FILE: profiler.cpp
+//
+
+//
+
+//
+// ======================================================================================
+
+#include "common.h"
+
+#ifdef PROFILING_SUPPORTED
+#include "proftoeeinterfaceimpl.h"
+
+//
+// The following structure is the format on x86 builds of the data
+// being passed in platformSpecificHandle for ProfileEnter/Leave/Tailcall
+//
+typedef struct _PROFILE_PLATFORM_SPECIFIC_DATA
+{
+    FunctionID functionId;
+    DWORD    doubleBuffer1;
+    DWORD    doubleBuffer2;
+    DWORD    floatBuffer;
+    DWORD    floatingPointValuePresent;
+    UINT_PTR eax;       // eax and edx must be contiguous in this structure to make getting 64 bit return values easier.
+    UINT_PTR edx;
+    UINT_PTR ecx;
+    UINT_PTR esp;
+    UINT_PTR ip;
+} PROFILE_PLATFORM_SPECIFIC_DATA, *PPROFILE_PLATFORM_SPECIFIC_DATA;
+
+
+/*
+ * ProfileGetIPFromPlatformSpecificHandle
+ *
+ * This routine takes the platformSpecificHandle and retrieves from it the
+ * IP value.
+ *
+ * Parameters:
+ *    handle - the platformSpecificHandle passed to ProfileEnter/Leave/Tailcall
+ *
+ * Returns:
+ *    The IP value stored in the handle.
+ */
+UINT_PTR ProfileGetIPFromPlatformSpecificHandle(void *handle)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    return ((PROFILE_PLATFORM_SPECIFIC_DATA *)handle)->ip;
+}
+
+
+/*
+ * ProfileSetFunctionIDInPlatformSpecificHandle
+ *
+ * This routine takes the platformSpecificHandle and functionID, and assigns
+ * functionID to the functionId field of platformSpecificHandle.
+ *
+ * Parameters:
+ *    pPlatformSpecificHandle - the platformSpecificHandle passed to ProfileEnter/Leave/Tailcall
+ *    functionID - the FunctionID to be assigned
+ *
+ * Returns:
+ *    None
+ */
+void ProfileSetFunctionIDInPlatformSpecificHandle(void * pPlatformSpecificHandle, FunctionID functionID)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    _ASSERTE(pPlatformSpecificHandle != NULL);
+    _ASSERTE(functionID != NULL);
+
+    PROFILE_PLATFORM_SPECIFIC_DATA * pData = reinterpret_cast<PROFILE_PLATFORM_SPECIFIC_DATA *>(pPlatformSpecificHandle);
+    pData->functionId = functionID;
+}
+
+/*
+ * ProfileArgIterator::ProfileArgIterator
+ *
+ * Constructor. Initializes for arg iteration.
+ *
+ * Parameters:
+ *    pMetaSig - The signature of the method we are going to iterate over
+ *    platformSpecificHandle - the value passed to ProfileEnter/Leave/Tailcall
+ *
+ * Returns:
+ *    None.
+ */
+ProfileArgIterator::ProfileArgIterator(MetaSig * pMetaSig, void * platformSpecificHandle):
+    m_argIterator(pMetaSig)
+{
+    //
+    // It would be really nice to contract this, but the underlying functions are convolutedly
+    // contracted.  Basically everything should be loaded by the time the profiler gets a call
+    // back, so everything is NOTHROW/NOTRIGGER, but there is no mechanism for saying that the
+    // contracts in called functions should be for the best case, not the worst case, now.
+    //
+    WRAPPER_NO_CONTRACT;
+
+    m_handle = platformSpecificHandle;
+}
+
+/*
+ * ProfileArgIterator::~ProfileArgIterator
+ *
+ * Destructor, releases all resources.
+ *
+ */
+ProfileArgIterator::~ProfileArgIterator()
+{
+    LIMITED_METHOD_CONTRACT;
+}
+
+/*
+ * ProfileArgIterator::GetNextArgAddr
+ *
+ * After initialization, this method is called repeatedly until it
+ * returns NULL to get the address of each arg.  Note: this address
+ * could be anywhere on the stack.
+ *
+ * Returns:
+ *    Address of the argument, or NULL if iteration is complete.
+ */ +LPVOID ProfileArgIterator::GetNextArgAddr() +{ + // + // It would be really nice to contract this, but the underlying functions are convolutedly + // contracted. Basically everything should be loaded by the time the profiler gets a call + // back, so everything is NOTHROW/NOTRIGGER, but there is not mechanism for saying that the + // contracts in called functions should be for the best case, not the worst case, now. + // + WRAPPER_NO_CONTRACT; + + int argOffset = m_argIterator.GetNextOffset(); + + // + // Value is enregistered, figure out where and return that. + // + PROFILE_PLATFORM_SPECIFIC_DATA *pData = (PROFILE_PLATFORM_SPECIFIC_DATA *)m_handle; + + // + // Zero indicates the end of the args. + // + if (argOffset == TransitionBlock::InvalidOffset) + { + return NULL; + } + + if (pData == NULL) + { + // + // Something wrong. + // + _ASSERTE(!"Why do we have a NULL data pointer here?"); + return NULL; + } + + // + // If this is not enregistered, return the value + // + if (TransitionBlock::IsStackArgumentOffset(argOffset)) + { + return ((LPBYTE)pData->esp) + (argOffset - TransitionBlock::GetOffsetOfArgs()); + } + + switch (argOffset - TransitionBlock::GetOffsetOfArgumentRegisters()) + { + case offsetof(ArgumentRegisters, ECX): + return &(pData->ecx); + case offsetof(ArgumentRegisters, EDX): + return &(pData->edx); + } + + _ASSERTE(!"Arg is an unsaved register!"); + return NULL; +} + +/* + * ProfileArgIterator::GetHiddenArgValue + * + * Called after initialization, any number of times, to retrieve any + * hidden argument, so that resolution for Generics can be done. + * + * Parameters: + * None. + * + * Returns: + * Value of the hidden parameter, or NULL if none exists. + */ +LPVOID ProfileArgIterator::GetHiddenArgValue(void) +{ + // + // It would be really nice to contract this, but the underlying functions are convolutedly + // contracted. Basically everything should be loaded by the time the profiler gets a call + // back, so everything is NOTHROW/NOTRIGGER, but there is not mechanism for saying that the + // contracts in called functions should be for the best case, not the worst case, now. + // + WRAPPER_NO_CONTRACT; + + PROFILE_PLATFORM_SPECIFIC_DATA *pData = (PROFILE_PLATFORM_SPECIFIC_DATA *)m_handle; + + MethodDesc *pMethodDesc = FunctionIdToMethodDesc(pData->functionId); + + if (!pMethodDesc->RequiresInstArg()) + { + return NULL; + } + + // + // The ArgIterator::GetParamTypeOffset() can only be called after calling GetNextOffset until the + // entire signature has been walked, but *before* GetNextOffset returns TransitionBlock::InvalidOffset + // - indicating the end. + // + + // + // Get the offset of the hidden arg + // + int argOffset = m_argIterator.GetParamTypeArgOffset(); + + // + // If this is not enregistered, return the value + // + if (TransitionBlock::IsStackArgumentOffset(argOffset)) + { + return *(LPVOID *)(((LPBYTE)pData->esp) + (argOffset - TransitionBlock::GetOffsetOfArgs())); + } + + switch (argOffset - TransitionBlock::GetOffsetOfArgumentRegisters()) + { + case offsetof(ArgumentRegisters, ECX): + return (LPVOID)(pData->ecx); + case offsetof(ArgumentRegisters, EDX): + return (LPVOID)(pData->edx); + } + + _ASSERTE(!"Arg is an unsaved register!"); + return NULL; +} + +/* + * ProfileArgIterator::GetThis + * + * Called after initialization, any number of times, to retrieve the + * value of 'this'. + * + * Parameters: + * None. + * + * Returns: + * value of the 'this' parameter, or NULL if none exists. 
+ */ +LPVOID ProfileArgIterator::GetThis(void) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + } + CONTRACTL_END; + + PROFILE_PLATFORM_SPECIFIC_DATA *pData = (PROFILE_PLATFORM_SPECIFIC_DATA *)m_handle; + + if (pData->ip == 0) + { + return NULL; + } + + if (!m_argIterator.HasThis()) + { + return NULL; + } + + switch (offsetof(ArgumentRegisters, THIS_REG)) + { + case offsetof(ArgumentRegisters, ECX): + return (LPVOID)pData->ecx; + + case offsetof(ArgumentRegisters, EDX): + return (LPVOID)pData->edx; + } + + _ASSERTE(!"This is an unsaved register!"); + return NULL; +} + + + +/* + * ProfileArgIterator::GetReturnBufferAddr + * + * Called after initialization, any number of times, to retrieve the + * address of the return buffer. NULL indicates no return value. + * + * Parameters: + * None. + * + * Returns: + * Address of the return buffer, or NULL if none exists. + */ +LPVOID ProfileArgIterator::GetReturnBufferAddr(void) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + } + CONTRACTL_END; + + PROFILE_PLATFORM_SPECIFIC_DATA *pData = (PROFILE_PLATFORM_SPECIFIC_DATA *)m_handle; + + if (m_argIterator.HasRetBuffArg()) + { + return (void *)(pData->eax); + } + + switch (m_argIterator.GetSig()->GetReturnType()) + { + case ELEMENT_TYPE_R8: + _ASSERTE(pData->floatingPointValuePresent); + return (void *)(&(pData->doubleBuffer1)); + + case ELEMENT_TYPE_R4: + _ASSERTE(pData->floatingPointValuePresent); + return (void *)(&(pData->floatBuffer)); + + default: + return &(pData->eax); + } +} + +#endif // PROFILING_SUPPORTED + diff --git a/src/vm/i386/remotingx86.cpp b/src/vm/i386/remotingx86.cpp new file mode 100644 index 0000000000..3a9e891267 --- /dev/null +++ b/src/vm/i386/remotingx86.cpp @@ -0,0 +1,225 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// +// +// +// File: remotingx86.cpp +// + +// +// +// Purpose: Defines various remoting related functions for the x86 architecture +// + +// +// + +// + +#include "common.h" + +#ifdef FEATURE_REMOTING + +#include "excep.h" +#include "comdelegate.h" +#include "remoting.h" +#include "field.h" +#include "siginfo.hpp" +#include "stackbuildersink.h" +#include "threads.h" +#include "method.hpp" +#include "asmconstants.h" +#include "interoputil.h" +#include "virtualcallstub.h" + +#ifdef FEATURE_COMINTEROP +#include "comcallablewrapper.h" +#include "comcache.h" +#endif // FEATURE_COMINTEROP + +//+---------------------------------------------------------------------------- +// +// Method: CTPMethodTable::CreateThunkForVirtualMethod private +// +// Synopsis: Creates the thunk that pushes the supplied slot number and jumps +// to TP Stub +// +//+---------------------------------------------------------------------------- +PCODE CTPMethodTable::CreateThunkForVirtualMethod(DWORD dwSlot, BYTE *startaddr) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + PRECONDITION(CheckPointer(startaddr)); + } + CONTRACTL_END; + + BYTE *pCode = startaddr; + + // 0000 B8 67 45 23 01 MOV EAX, dwSlot + // 0005 E9 ?? ?? ?? ?? JMP TransparentProxyStub + *pCode++ = 0xB8; + *((DWORD *) pCode) = dwSlot; + pCode += sizeof(DWORD); + *pCode++ = 0xE9; + // self-relative call, based on the start of the next instruction. 
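+    // i.e. rel32 = target - (address of the byte just past the 4-byte displacement),
+    // matching the subtraction of (pCode + sizeof(LONG)) below.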
+ *((LONG *) pCode) = (LONG)((size_t)GetTPStubEntryPoint() - (size_t) (pCode + sizeof(LONG))); + + _ASSERTE(CVirtualThunkMgr::IsThunkByASM((PCODE)startaddr)); + + return (PCODE)startaddr; +} + + +//+---------------------------------------------------------------------------- +// +// Method: CTPMethodTable::ActivatePrecodeRemotingThunk private +// +// Synopsis: Patch the precode remoting thunk to begin interception +// +//+---------------------------------------------------------------------------- +void CTPMethodTable::ActivatePrecodeRemotingThunk() +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + MODE_ANY; + } + CONTRACTL_END; + + // Before activation: + // 0000 C3 ret + // 0001 90 nop + + // After activation: + // 0000 85 C9 test ecx,ecx + + // 0002 74 XX je RemotingDone + // 0004 81 39 XX XX XX XX cmp dword ptr [ecx],11111111h + // 000A 74 XX je RemotingCheck + + // Switch offset and size of patch based on the jump opcode used. + BYTE* pCode = (BYTE*)PrecodeRemotingThunk; + + SIZE_T mtOffset = 0x0006; + SIZE_T size = 0x000A; + + // Patch "ret + nop" to "test ecx,ecx" + *(UINT16 *)pCode = 0xC985; + + // Replace placeholder value with the actual address of TP method table + _ASSERTE(*(PVOID*)(pCode+mtOffset) == (PVOID*)0x11111111); + *(PVOID*)(pCode+mtOffset) = GetMethodTable(); + + FlushInstructionCache(GetCurrentProcess(), pCode, size); +} + +//+---------------------------------------------------------------------------- +// +// Method: CVirtualThunkMgr::DoTraceStub public +// +// Synopsis: Traces the stub given the starting address +// +//+---------------------------------------------------------------------------- +BOOL CVirtualThunkMgr::DoTraceStub(PCODE stubStartAddress, TraceDestination *trace) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + PRECONDITION(stubStartAddress != NULL); + PRECONDITION(CheckPointer(trace)); + } + CONTRACTL_END; + + BOOL bIsStub = FALSE; + + // Find a thunk whose code address matching the starting address + LPBYTE pThunk = FindThunk((LPBYTE)stubStartAddress); + if(NULL != pThunk) + { + LPBYTE pbAddr = NULL; + LONG destAddress = 0; + if((LPBYTE)stubStartAddress == pThunk) + { + + // Extract the long which gives the self relative address + // of the destination + pbAddr = pThunk + sizeof(BYTE) + sizeof(DWORD) + sizeof(BYTE); + destAddress = *(LONG *)pbAddr; + + // Calculate the absolute address by adding the offset of the next + // instruction after the call instruction + destAddress += (LONG)(size_t)(pbAddr + sizeof(LONG)); + + } + + // We cannot tell where the stub will end up until OnCall is reached. 
+ // So we tell the debugger to run till OnCall is reached and then + // come back and ask us again for the actual destination address of + // the call + + Stub *stub = Stub::RecoverStub((TADDR)destAddress); + + trace->InitForFramePush(stub->GetPatchAddress()); + bIsStub = TRUE; + } + + return bIsStub; +} + +//+---------------------------------------------------------------------------- +// +// Method: CVirtualThunkMgr::IsThunkByASM public +// +// Synopsis: Check assembly to see if this one of our thunks +// +//+---------------------------------------------------------------------------- +BOOL CVirtualThunkMgr::IsThunkByASM(PCODE startaddr) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + PRECONDITION(startaddr != NULL); + } + CONTRACTL_END; + + PTR_BYTE pbCode = PTR_BYTE(startaddr); + + return ((pbCode[0] == 0xB8) && + (pbCode[5] == 0xe9) && + (rel32Decode((TADDR)(pbCode + 6)) == CTPMethodTable::GetTPStubEntryPoint())); +} + +//+---------------------------------------------------------------------------- +// +// Method: CVirtualThunkMgr::GetMethodDescByASM public +// +// Synopsis: Parses MethodDesc out of assembly code +// +//+---------------------------------------------------------------------------- +MethodDesc *CVirtualThunkMgr::GetMethodDescByASM(PCODE startaddr, MethodTable *pMT) +{ + CONTRACT (MethodDesc*) + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + PRECONDITION(startaddr != NULL); + PRECONDITION(CheckPointer(pMT)); + POSTCONDITION(CheckPointer(RETVAL)); + } + CONTRACT_END; + + RETURN (pMT->GetMethodDescForSlot(*((DWORD *) (startaddr + 1)))); +} + +#endif// FEATURE_REMOTING + diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp new file mode 100644 index 0000000000..0037a7d3e6 --- /dev/null +++ b/src/vm/i386/stublinkerx86.cpp @@ -0,0 +1,6806 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + +// NOTE on Frame Size C_ASSERT usage in this file +// if the frame size changes then the stubs have to be revisited for correctness +// kindly revist the logic and then update the constants so that the C_ASSERT will again fire +// if someone changes the frame size. 
You are expected to keep this hard coded constant +// up to date so that changes in the frame size trigger errors at compile time if the code is not altered + +// Precompiled Header + +#include "common.h" + +#include "field.h" +#include "stublink.h" + +#include "tls.h" +#include "frames.h" +#include "excep.h" +#include "dllimport.h" +#include "log.h" +#include "security.h" +#include "comdelegate.h" +#include "array.h" +#include "jitinterface.h" +#include "codeman.h" +#ifdef FEATURE_REMOTING +#include "remoting.h" +#endif +#include "dbginterface.h" +#include "eeprofinterfaces.h" +#include "eeconfig.h" +#include "securitydeclarative.h" +#ifdef _TARGET_X86_ +#include "asmconstants.h" +#endif // _TARGET_X86_ +#include "class.h" +#include "stublink.inl" + +#ifdef FEATURE_COMINTEROP +#include "comtoclrcall.h" +#include "runtimecallablewrapper.h" +#include "comcache.h" +#include "olevariant.h" +#include "notifyexternals.h" +#endif // FEATURE_COMINTEROP + +#ifdef FEATURE_PREJIT +#include "compile.h" +#endif + +#if defined(_DEBUG) && defined(STUBLINKER_GENERATES_UNWIND_INFO) +#include +#endif + + +#ifndef DACCESS_COMPILE + +extern "C" VOID __cdecl StubRareEnable(Thread *pThread); +#ifdef FEATURE_COMINTEROP +extern "C" HRESULT __cdecl StubRareDisableHR(Thread *pThread); +#endif // FEATURE_COMINTEROP +extern "C" VOID __cdecl StubRareDisableTHROW(Thread *pThread, Frame *pFrame); + +extern "C" VOID __cdecl ArrayOpStubNullException(void); +extern "C" VOID __cdecl ArrayOpStubRangeException(void); +extern "C" VOID __cdecl ArrayOpStubTypeMismatchException(void); + +#if defined(_TARGET_AMD64_) +#define EXCEPTION_HELPERS(base) \ + extern "C" VOID __cdecl base##_RSIRDI_ScratchArea(void); \ + extern "C" VOID __cdecl base##_ScratchArea(void); \ + extern "C" VOID __cdecl base##_RSIRDI(void); \ + extern "C" VOID __cdecl base(void) +EXCEPTION_HELPERS(ArrayOpStubNullException); +EXCEPTION_HELPERS(ArrayOpStubRangeException); +EXCEPTION_HELPERS(ArrayOpStubTypeMismatchException); +#undef EXCEPTION_HELPERS + +#if defined(_DEBUG) +extern "C" VOID __cdecl DebugCheckStubUnwindInfo(); +#endif +#endif // _TARGET_AMD64_ + +// Presumably this code knows what it is doing with TLS. If we are hiding these +// services from normal code, reveal them here. 
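+// (The TlsGetValue macro is removed below so the name refers to the raw OS API in this file.)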
+#ifdef TlsGetValue +#undef TlsGetValue +#endif + +#ifdef FEATURE_COMINTEROP +Thread* __stdcall CreateThreadBlockReturnHr(ComMethodFrame *pFrame); +#endif + + + +#ifdef _TARGET_AMD64_ + +BOOL IsPreservedReg (X86Reg reg) +{ + UINT16 PreservedRegMask = + (1 << kRBX) + | (1 << kRBP) + | (1 << kRSI) + | (1 << kRDI) + | (1 << kR12) + | (1 << kR13) + | (1 << kR14) + | (1 << kR15); + return PreservedRegMask & (1 << reg); +} + +#endif // _TARGET_AMD64_ + +#ifdef _TARGET_AMD64_ +//----------------------------------------------------------------------- +// InstructionFormat for near Jump and short Jump +//----------------------------------------------------------------------- + +//X64EmitTailcallWithRSPAdjust +class X64NearJumpSetup : public InstructionFormat +{ + public: + X64NearJumpSetup() : InstructionFormat( InstructionFormat::k8|InstructionFormat::k32 + | InstructionFormat::k64Small | InstructionFormat::k64 + ) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT + switch (refsize) + { + case k8: + return 0; + + case k32: + return 0; + + case k64Small: + return 5; + + case k64: + return 10; + + default: + _ASSERTE(!"unexpected refsize"); + return 0; + + } + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT + if (k8 == refsize) + { + // do nothing, X64NearJump will take care of this + } + else if (k32 == refsize) + { + // do nothing, X64NearJump will take care of this + } + else if (k64Small == refsize) + { + UINT64 TargetAddress = (INT64)pOutBuffer + fixedUpReference + GetSizeOfInstruction(refsize, variationCode); + _ASSERTE(FitsInU4(TargetAddress)); + + // mov eax, imm32 ; zero-extended + pOutBuffer[0] = 0xB8; + *((UINT32*)&pOutBuffer[1]) = (UINT32)TargetAddress; + } + else if (k64 == refsize) + { + // mov rax, imm64 + pOutBuffer[0] = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + pOutBuffer[1] = 0xB8; + *((UINT64*)&pOutBuffer[2]) = (UINT64)(((INT64)pOutBuffer) + fixedUpReference + GetSizeOfInstruction(refsize, variationCode)); + } + else + { + _ASSERTE(!"unreached"); + } + } + + virtual BOOL CanReach(UINT refsize, UINT variationCode, BOOL fExternal, INT_PTR offset) + { + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + + + if (fExternal) + { + switch (refsize) + { + case InstructionFormat::k8: + // For external, we don't have enough info to predict + // the offset. + return FALSE; + + case InstructionFormat::k32: + return sizeof(PVOID) <= sizeof(UINT32); + + case InstructionFormat::k64Small: + return FitsInI4(offset); + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + else + { + switch (refsize) + { + case InstructionFormat::k8: + return FitsInI1(offset); + + case InstructionFormat::k32: + return FitsInI4(offset); + + case InstructionFormat::k64Small: + // EmitInstruction emits a non-relative jmp for + // k64Small. We don't have enough info to predict the + // target address. (Even if we did, this would only + // handle the set of unsigned offsets with bit 31 set + // and no higher bits set, too uncommon/hard to test.) 
+ return FALSE; + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + default: + _ASSERTE(0); + return FALSE; + } + } + } +}; + +class X64NearJumpExecute : public InstructionFormat +{ + public: + X64NearJumpExecute() : InstructionFormat( InstructionFormat::k8|InstructionFormat::k32 + | InstructionFormat::k64Small | InstructionFormat::k64 + ) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT + switch (refsize) + { + case k8: + return 2; + + case k32: + return 5; + + case k64Small: + return 3; + + case k64: + return 3; + + default: + _ASSERTE(!"unexpected refsize"); + return 0; + + } + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT + if (k8 == refsize) + { + pOutBuffer[0] = 0xeb; + *((__int8*)(pOutBuffer+1)) = (__int8)fixedUpReference; + } + else if (k32 == refsize) + { + pOutBuffer[0] = 0xe9; + *((__int32*)(pOutBuffer+1)) = (__int32)fixedUpReference; + } + else if (k64Small == refsize) + { + // REX.W jmp rax + pOutBuffer[0] = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + pOutBuffer[1] = 0xFF; + pOutBuffer[2] = 0xE0; + } + else if (k64 == refsize) + { + // REX.W jmp rax + pOutBuffer[0] = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + pOutBuffer[1] = 0xFF; + pOutBuffer[2] = 0xE0; + } + else + { + _ASSERTE(!"unreached"); + } + } + + virtual BOOL CanReach(UINT refsize, UINT variationCode, BOOL fExternal, INT_PTR offset) + { + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + + + if (fExternal) + { + switch (refsize) + { + case InstructionFormat::k8: + // For external, we don't have enough info to predict + // the offset. + return FALSE; + + case InstructionFormat::k32: + return sizeof(PVOID) <= sizeof(UINT32); + + case InstructionFormat::k64Small: + return FitsInI4(offset); + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + else + { + switch (refsize) + { + case InstructionFormat::k8: + return FitsInI1(offset); + + case InstructionFormat::k32: + return FitsInI4(offset); + + case InstructionFormat::k64Small: + // EmitInstruction emits a non-relative jmp for + // k64Small. We don't have enough info to predict the + // target address. (Even if we did, this would only + // handle the set of unsigned offsets with bit 31 set + // and no higher bits set, too uncommon/hard to test.) 
+ return FALSE; + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + default: + _ASSERTE(0); + return FALSE; + } + } + } +}; + +#endif + +//----------------------------------------------------------------------- +// InstructionFormat for near Jump and short Jump +//----------------------------------------------------------------------- +class X86NearJump : public InstructionFormat +{ + public: + X86NearJump() : InstructionFormat( InstructionFormat::k8|InstructionFormat::k32 +#ifdef _TARGET_AMD64_ + | InstructionFormat::k64Small | InstructionFormat::k64 +#endif // _TARGET_AMD64_ + ) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT + switch (refsize) + { + case k8: + return 2; + + case k32: + return 5; +#ifdef _TARGET_AMD64_ + case k64Small: + return 5 + 2; + + case k64: + return 12; +#endif // _TARGET_AMD64_ + default: + _ASSERTE(!"unexpected refsize"); + return 0; + + } + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT + if (k8 == refsize) + { + pOutBuffer[0] = 0xeb; + *((__int8*)(pOutBuffer+1)) = (__int8)fixedUpReference; + } + else if (k32 == refsize) + { + pOutBuffer[0] = 0xe9; + *((__int32*)(pOutBuffer+1)) = (__int32)fixedUpReference; + } +#ifdef _TARGET_AMD64_ + else if (k64Small == refsize) + { + UINT64 TargetAddress = (INT64)pOutBuffer + fixedUpReference + GetSizeOfInstruction(refsize, variationCode); + _ASSERTE(FitsInU4(TargetAddress)); + + // mov eax, imm32 ; zero-extended + pOutBuffer[0] = 0xB8; + *((UINT32*)&pOutBuffer[1]) = (UINT32)TargetAddress; + + // jmp rax + pOutBuffer[5] = 0xFF; + pOutBuffer[6] = 0xE0; + } + else if (k64 == refsize) + { + // mov rax, imm64 + pOutBuffer[0] = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + pOutBuffer[1] = 0xB8; + *((UINT64*)&pOutBuffer[2]) = (UINT64)(((INT64)pOutBuffer) + fixedUpReference + GetSizeOfInstruction(refsize, variationCode)); + + // jmp rax + pOutBuffer[10] = 0xFF; + pOutBuffer[11] = 0xE0; + } +#endif // _TARGET_AMD64_ + else + { + _ASSERTE(!"unreached"); + } + } + + virtual BOOL CanReach(UINT refsize, UINT variationCode, BOOL fExternal, INT_PTR offset) + { + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + + + if (fExternal) + { + switch (refsize) + { + case InstructionFormat::k8: + // For external, we don't have enough info to predict + // the offset. + return FALSE; + + case InstructionFormat::k32: + return sizeof(PVOID) <= sizeof(UINT32); + +#ifdef _TARGET_AMD64_ + case InstructionFormat::k64Small: + return FitsInI4(offset); + + case InstructionFormat::k64: + // intentional fallthru +#endif + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + else + { + switch (refsize) + { + case InstructionFormat::k8: + return FitsInI1(offset); + + case InstructionFormat::k32: +#ifdef _TARGET_AMD64_ + return FitsInI4(offset); +#else + return TRUE; +#endif + +#ifdef _TARGET_AMD64_ + case InstructionFormat::k64Small: + // EmitInstruction emits a non-relative jmp for + // k64Small. We don't have enough info to predict the + // target address. (Even if we did, this would only + // handle the set of unsigned offsets with bit 31 set + // and no higher bits set, too uncommon/hard to test.) 
+ return FALSE; + + case InstructionFormat::k64: + // intentional fallthru +#endif + case InstructionFormat::kAllowAlways: + return TRUE; + default: + _ASSERTE(0); + return FALSE; + } + } + } +}; + + +//----------------------------------------------------------------------- +// InstructionFormat for conditional jump. Set the variationCode +// to members of X86CondCode. +//----------------------------------------------------------------------- +class X86CondJump : public InstructionFormat +{ + public: + X86CondJump(UINT allowedSizes) : InstructionFormat(allowedSizes) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT + return (refsize == k8 ? 2 : 6); + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT + if (refsize == k8) + { + pOutBuffer[0] = static_cast(0x70 | variationCode); + *((__int8*)(pOutBuffer+1)) = (__int8)fixedUpReference; + } + else + { + pOutBuffer[0] = 0x0f; + pOutBuffer[1] = static_cast(0x80 | variationCode); + *((__int32*)(pOutBuffer+2)) = (__int32)fixedUpReference; + } + } +}; + + +//----------------------------------------------------------------------- +// InstructionFormat for near call. +//----------------------------------------------------------------------- +class X86Call : public InstructionFormat +{ + public: + X86Call () + : InstructionFormat( InstructionFormat::k32 +#ifdef _TARGET_AMD64_ + | InstructionFormat::k64Small | InstructionFormat::k64 +#endif // _TARGET_AMD64_ + ) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT; + + switch (refsize) + { + case k32: + return 5; + +#ifdef _TARGET_AMD64_ + case k64Small: + return 5 + 2; + + case k64: + return 10 + 2; +#endif // _TARGET_AMD64_ + + default: + _ASSERTE(!"unexpected refsize"); + return 0; + } + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT + + switch (refsize) + { + case k32: + pOutBuffer[0] = 0xE8; + *((__int32*)(1+pOutBuffer)) = (__int32)fixedUpReference; + break; + +#ifdef _TARGET_AMD64_ + case k64Small: + UINT64 TargetAddress; + + TargetAddress = (INT64)pOutBuffer + fixedUpReference + GetSizeOfInstruction(refsize, variationCode); + _ASSERTE(FitsInU4(TargetAddress)); + + // mov eax, ; zero-extends + pOutBuffer[0] = 0xB8; + *((UINT32*)&pOutBuffer[1]) = (UINT32)TargetAddress; + + // call rax + pOutBuffer[5] = 0xff; + pOutBuffer[6] = 0xd0; + break; + + case k64: + // mov rax, + pOutBuffer[0] = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + pOutBuffer[1] = 0xB8; + *((UINT64*)&pOutBuffer[2]) = (UINT64)(((INT64)pOutBuffer) + fixedUpReference + GetSizeOfInstruction(refsize, variationCode)); + + // call rax + pOutBuffer[10] = 0xff; + pOutBuffer[11] = 0xd0; + break; +#endif // _TARGET_AMD64_ + + default: + _ASSERTE(!"unreached"); + break; + } + } + +// For x86, the default CanReach implementation will suffice. It only needs +// to handle k32. +#ifdef _TARGET_AMD64_ + virtual BOOL CanReach(UINT refsize, UINT variationCode, BOOL fExternal, INT_PTR offset) + { + if (fExternal) + { + switch (refsize) + { + case InstructionFormat::k32: + // For external, we don't have enough info to predict + // the offset. 
+ return FALSE; + + case InstructionFormat::k64Small: + return FitsInI4(offset); + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + else + { + switch (refsize) + { + case InstructionFormat::k32: + return FitsInI4(offset); + + case InstructionFormat::k64Small: + // EmitInstruction emits a non-relative jmp for + // k64Small. We don't have enough info to predict the + // target address. (Even if we did, this would only + // handle the set of unsigned offsets with bit 31 set + // and no higher bits set, too uncommon/hard to test.) + return FALSE; + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + default: + _ASSERTE(0); + return FALSE; + } + } + } +#endif // _TARGET_AMD64_ +}; + + +//----------------------------------------------------------------------- +// InstructionFormat for push imm32. +//----------------------------------------------------------------------- +class X86PushImm32 : public InstructionFormat +{ + public: + X86PushImm32(UINT allowedSizes) : InstructionFormat(allowedSizes) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT; + + return 5; + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT; + + pOutBuffer[0] = 0x68; + // only support absolute pushimm32 of the label address. The fixedUpReference is + // the offset to the label from the current point, so add to get address + *((__int32*)(1+pOutBuffer)) = (__int32)(fixedUpReference); + } +}; + +#if defined(_TARGET_AMD64_) +//----------------------------------------------------------------------- +// InstructionFormat for lea reg, [RIP relative]. +//----------------------------------------------------------------------- +class X64LeaRIP : public InstructionFormat +{ + public: + X64LeaRIP() : InstructionFormat(InstructionFormat::k64Small) + { + LIMITED_METHOD_CONTRACT; + } + + virtual UINT GetSizeOfInstruction(UINT refsize, UINT variationCode) + { + LIMITED_METHOD_CONTRACT; + + return 7; + } + + virtual BOOL CanReach(UINT refsize, UINT variationCode, BOOL fExternal, INT_PTR offset) + { + if (fExternal) + { + switch (refsize) + { + case InstructionFormat::k64Small: + // For external, we don't have enough info to predict + // the offset. + return FALSE; + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + else + { + switch (refsize) + { + case InstructionFormat::k64Small: + return FitsInI4(offset); + + case InstructionFormat::k64: + // intentional fallthru + case InstructionFormat::kAllowAlways: + return TRUE; + + default: + _ASSERTE(0); + return FALSE; + } + } + } + + virtual VOID EmitInstruction(UINT refsize, __int64 fixedUpReference, BYTE *pOutBuffer, UINT variationCode, BYTE *pDataBuffer) + { + LIMITED_METHOD_CONTRACT; + + X86Reg reg = (X86Reg)variationCode; + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + + pOutBuffer[0] = rex; + pOutBuffer[1] = 0x8D; + pOutBuffer[2] = 0x05 | (reg << 3); + // only support absolute pushimm32 of the label address. 
The fixedUpReference is + // the offset to the label from the current point, so add to get address + *((__int32*)(3+pOutBuffer)) = (__int32)(fixedUpReference); + } +}; + +#endif // _TARGET_AMD64_ + +#if defined(_TARGET_AMD64_) +static BYTE gX64NearJumpSetup[sizeof(X64NearJumpSetup)]; +static BYTE gX64NearJumpExecute[sizeof(X64NearJumpExecute)]; +static BYTE gX64LeaRIP[sizeof(X64LeaRIP)]; +#endif + +static BYTE gX86NearJump[sizeof(X86NearJump)]; +static BYTE gX86CondJump[sizeof(X86CondJump)]; +static BYTE gX86Call[sizeof(X86Call)]; +static BYTE gX86PushImm32[sizeof(X86PushImm32)]; + +/* static */ void StubLinkerCPU::Init() +{ + CONTRACTL + { + THROWS; + GC_NOTRIGGER; + INJECT_FAULT(COMPlusThrowOM();); + } + CONTRACTL_END; + new (gX86NearJump) X86NearJump(); + new (gX86CondJump) X86CondJump( InstructionFormat::k8|InstructionFormat::k32); + new (gX86Call) X86Call(); + new (gX86PushImm32) X86PushImm32(InstructionFormat::k32); + +#if defined(_TARGET_AMD64_) + new (gX64NearJumpSetup) X64NearJumpSetup(); + new (gX64NearJumpExecute) X64NearJumpExecute(); + new (gX64LeaRIP) X64LeaRIP(); +#endif +} + +//--------------------------------------------------------------- +// Emits: +// mov destReg, srcReg +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitMovRegReg(X86Reg destReg, X86Reg srcReg) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (destReg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + destReg = X86RegFromAMD64Reg(destReg); + } + if (srcReg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + srcReg = X86RegFromAMD64Reg(srcReg); + } + Emit8(rex); +#endif + + Emit8(0x89); + Emit8(static_cast(0xC0 | (srcReg << 3) | destReg)); +} + +//--------------------------------------------------------------- + +VOID StubLinkerCPU::X86EmitMovSPReg(X86Reg srcReg) +{ + STANDARD_VM_CONTRACT; + const X86Reg kESP = (X86Reg)4; + X86EmitMovRegReg(kESP, srcReg); +} + +VOID StubLinkerCPU::X86EmitMovRegSP(X86Reg destReg) +{ + STANDARD_VM_CONTRACT; + const X86Reg kESP = (X86Reg)4; + X86EmitMovRegReg(destReg, kESP); +} + + +//--------------------------------------------------------------- +// Emits: +// PUSH +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPushReg(X86Reg reg) +{ + STANDARD_VM_CONTRACT; + +#ifdef STUBLINKER_GENERATES_UNWIND_INFO + X86Reg origReg = reg; +#endif + +#ifdef _TARGET_AMD64_ + if (reg >= kR8) + { + Emit8(REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT | REX_OPCODE_REG_EXT); + reg = X86RegFromAMD64Reg(reg); + } +#endif + Emit8(static_cast(0x50 + reg)); + +#ifdef STUBLINKER_GENERATES_UNWIND_INFO + if (IsPreservedReg(origReg)) + { + UnwindPushedReg(origReg); + } + else +#endif + { + Push(sizeof(void*)); + } +} + + +//--------------------------------------------------------------- +// Emits: +// POP +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPopReg(X86Reg reg) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + if (reg >= kR8) + { + Emit8(REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT | REX_OPCODE_REG_EXT); + reg = X86RegFromAMD64Reg(reg); + } +#endif // _TARGET_AMD64_ + + Emit8(static_cast(0x58 + reg)); + Pop(sizeof(void*)); +} + +//--------------------------------------------------------------- +// Emits: +// PUSH +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPushImm32(UINT32 value) +{ + STANDARD_VM_CONTRACT; + + Emit8(0x68); + Emit32(value); + Push(sizeof(void*)); +} + 
+ +//--------------------------------------------------------------- +// Emits: +// PUSH +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPushImm32(CodeLabel &target) +{ + STANDARD_VM_CONTRACT; + + EmitLabelRef(&target, reinterpret_cast(gX86PushImm32), 0); +} + + +//--------------------------------------------------------------- +// Emits: +// PUSH +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPushImm8(BYTE value) +{ + STANDARD_VM_CONTRACT; + + Emit8(0x6a); + Emit8(value); + Push(sizeof(void*)); +} + + +//--------------------------------------------------------------- +// Emits: +// PUSH +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitPushImmPtr(LPVOID value WIN64_ARG(X86Reg tmpReg /*=kR10*/)) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + X86EmitRegLoad(tmpReg, (UINT_PTR) value); + X86EmitPushReg(tmpReg); +#else + X86EmitPushImm32((UINT_PTR) value); +#endif +} + +//--------------------------------------------------------------- +// Emits: +// XOR , +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitZeroOutReg(X86Reg reg) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + // 32-bit results are zero-extended, so we only need the REX byte if + // it's an extended register. + if (reg >= kR8) + { + Emit8(REX_PREFIX_BASE | REX_MODRM_REG_EXT | REX_MODRM_RM_EXT); + reg = X86RegFromAMD64Reg(reg); + } +#endif + Emit8(0x33); + Emit8(static_cast(0xc0 | (reg << 3) | reg)); +} + +//--------------------------------------------------------------- +// Emits: +// jmp [reg] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitJumpReg(X86Reg reg) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + } + CONTRACTL_END; + + Emit8(0xff); + Emit8(static_cast(0xe0) | static_cast(reg)); +} + +//--------------------------------------------------------------- +// Emits: +// CMP ,imm32 +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitCmpRegImm32(X86Reg reg, INT32 imm32) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION((int) reg < NumX86Regs); + } + CONTRACTL_END; + +#ifdef _TARGET_AMD64_ + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_OPCODE_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); +#endif + + if (FitsInI1(imm32)) { + Emit8(0x83); + Emit8(static_cast(0xF8 | reg)); + Emit8((INT8)imm32); + } else { + Emit8(0x81); + Emit8(static_cast(0xF8 | reg)); + Emit32(imm32); + } +} + +#ifdef _TARGET_AMD64_ +//--------------------------------------------------------------- +// Emits: +// CMP [reg+offs], imm32 +// CMP [reg], imm32 +//--------------------------------------------------------------- +VOID StubLinkerCPU:: X86EmitCmpRegIndexImm32(X86Reg reg, INT32 offs, INT32 imm32) +{ + STANDARD_VM_CONTRACT; + + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_OPCODE_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); + + X64EmitCmp32RegIndexImm32(reg, offs, imm32); +} + +VOID StubLinkerCPU:: X64EmitCmp32RegIndexImm32(X86Reg reg, INT32 offs, INT32 imm32) +#else // _TARGET_AMD64_ +VOID StubLinkerCPU:: X86EmitCmpRegIndexImm32(X86Reg reg, INT32 offs, INT32 imm32) +#endif // _TARGET_AMD64_ +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION((int) reg < NumX86Regs); + } + CONTRACTL_END; + + // + // The binary representation of "cmp [mem], imm32" is : + // 
1000-00sw mod11-1r/m + // + + unsigned wBit = (FitsInI1(imm32) ? 0 : 1); + Emit8(static_cast(0x80 | wBit)); + + unsigned modBits; + if (offs == 0) + modBits = 0; + else if (FitsInI1(offs)) + modBits = 1; + else + modBits = 2; + + Emit8(static_cast((modBits << 6) | 0x38 | reg)); + + if (offs) + { + if (FitsInI1(offs)) + Emit8((INT8)offs); + else + Emit32(offs); + } + + if (FitsInI1(imm32)) + Emit8((INT8)imm32); + else + Emit32(imm32); +} + +//--------------------------------------------------------------- +// Emits: +#if defined(_TARGET_AMD64_) +// mov rax, +// add rsp, imm32 +// jmp rax +#else +// add rsp, imm32 +// jmp +#endif +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitTailcallWithESPAdjust(CodeLabel *pTarget, INT32 imm32) +{ + STANDARD_VM_CONTRACT; + +#if defined(_TARGET_AMD64_) + EmitLabelRef(pTarget, reinterpret_cast(gX64NearJumpSetup), 0); + X86EmitAddEsp(imm32); + EmitLabelRef(pTarget, reinterpret_cast(gX64NearJumpExecute), 0); +#else + X86EmitAddEsp(imm32); + X86EmitNearJump(pTarget); +#endif +} + +//--------------------------------------------------------------- +// Emits: +#if defined(_TARGET_AMD64_) +// mov rax, +// pop reg +// jmp rax +#else +// pop reg +// jmp +#endif +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitTailcallWithSinglePop(CodeLabel *pTarget, X86Reg reg) +{ + STANDARD_VM_CONTRACT; + +#if defined(_TARGET_AMD64_) + EmitLabelRef(pTarget, reinterpret_cast(gX64NearJumpSetup), 0); + X86EmitPopReg(reg); + EmitLabelRef(pTarget, reinterpret_cast(gX64NearJumpExecute), 0); +#else + X86EmitPopReg(reg); + X86EmitNearJump(pTarget); +#endif +} + +//--------------------------------------------------------------- +// Emits: +// JMP or +// JMP (gX86NearJump), 0); +} + + +//--------------------------------------------------------------- +// Emits: +// Jcc or +// Jcc +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitCondJump(CodeLabel *target, X86CondCode::cc condcode) +{ + STANDARD_VM_CONTRACT; + EmitLabelRef(target, reinterpret_cast(gX86CondJump), condcode); +} + + +//--------------------------------------------------------------- +// Emits: +// call +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitCall(CodeLabel *target, int iArgBytes) +{ + STANDARD_VM_CONTRACT; + + EmitLabelRef(target, reinterpret_cast(gX86Call), 0); + + INDEBUG(Emit8(0x90)); // Emit a nop after the call in debug so that + // we know that this is a call that can directly call + // managed code +#ifndef _TARGET_AMD64_ + Pop(iArgBytes); +#endif // !_TARGET_AMD64_ +} + + +//--------------------------------------------------------------- +// Emits: +// ret n +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitReturn(WORD wArgBytes) +{ + CONTRACTL + { + STANDARD_VM_CHECK; +#ifdef _TARGET_AMD64_ + PRECONDITION(wArgBytes == 0); +#endif + + } + CONTRACTL_END; + + if (wArgBytes == 0) + Emit8(0xc3); + else + { + Emit8(0xc2); + Emit16(wArgBytes); + } + + Pop(wArgBytes); +} + +#ifdef _TARGET_AMD64_ +//--------------------------------------------------------------- +// Emits: +// JMP or +// JMP (gX64LeaRIP), reg); +} +#endif // _TARGET_AMD64_ + + + +VOID StubLinkerCPU::X86EmitPushRegs(unsigned regSet) +{ + STANDARD_VM_CONTRACT; + + for (X86Reg r = kEAX; r <= NumX86Regs; r = (X86Reg)(r+1)) + if (regSet & (1U<= kEAX; r = (X86Reg)(r-1)) + if (regSet & (1U<, [ + ] 
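+//
+// (Encoding note: this is opcode 8B /r, with the ModRM mod field selecting the
+// displacement size - 00b none, 01b disp8, 10b disp32 - so, for example,
+// "mov eax, [esi+8]" assembles to 8B 46 08.)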
+//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexRegLoad(X86Reg dstreg, + X86Reg srcreg, + __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X86EmitOffsetModRM(0x8b, dstreg, srcreg, ofs); +} + + +//--------------------------------------------------------------- +// Emits: +// mov [ + ], +// +// Note: If you intend to use this to perform 64bit moves to a RSP +// based offset, then this method may not work. Consider +// using X86EmitIndexRegStoreRSP. +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexRegStore(X86Reg dstreg, + __int32 ofs, + X86Reg srcreg) +{ + STANDARD_VM_CONTRACT; + + if (dstreg != kESP_Unsafe) + X86EmitOffsetModRM(0x89, srcreg, dstreg, ofs); + else + X86EmitOp(0x89, srcreg, (X86Reg)kESP_Unsafe, ofs); +} + +#if defined(_TARGET_AMD64_) +//--------------------------------------------------------------- +// Emits: +// mov [RSP + ], +// +// It marks the instruction has 64bit so that the processor +// performs a 8byte data move to a RSP based stack location. +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexRegStoreRSP(__int32 ofs, + X86Reg srcreg) +{ + STANDARD_VM_CONTRACT; + + X86EmitOp(0x89, srcreg, (X86Reg)kESP_Unsafe, ofs, (X86Reg)0, 0, k64BitOp); +} + +//--------------------------------------------------------------- +// Emits: +// mov [R12 + ], +// +// It marks the instruction has 64bit so that the processor +// performs a 8byte data move to a R12 based stack location. +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexRegStoreR12(__int32 ofs, + X86Reg srcreg) +{ + STANDARD_VM_CONTRACT; + + X86EmitOp(0x89, srcreg, (X86Reg)kR12, ofs, (X86Reg)0, 0, k64BitOp); +} +#endif // defined(_TARGET_AMD64_) + +//--------------------------------------------------------------- +// Emits: +// push dword ptr [ + ] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexPush(X86Reg srcreg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + if(srcreg != kESP_Unsafe) + X86EmitOffsetModRM(0xff, (X86Reg)0x6, srcreg, ofs); + else + X86EmitOp(0xff,(X86Reg)0x6, srcreg, ofs); + + Push(sizeof(void*)); +} + +//--------------------------------------------------------------- +// Emits: +// push dword ptr [ + * + ] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitBaseIndexPush( + X86Reg baseReg, + X86Reg indexReg, + __int32 scale, + __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + X86EmitOffsetModRmSIB(0xff, (X86Reg)0x6, baseReg, indexReg, scale, ofs); + Push(sizeof(void*)); +} + +//--------------------------------------------------------------- +// Emits: +// push dword ptr [ESP + ] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitSPIndexPush(__int32 ofs) +{ + STANDARD_VM_CONTRACT; + + __int8 ofs8 = (__int8) ofs; + if (ofs == (__int32) ofs8) + { + // The offset can be expressed in a byte (can use the byte + // form of the push esp instruction) + + BYTE code[] = {0xff, 0x74, 0x24, ofs8}; + EmitBytes(code, sizeof(code)); + } + else + { + // The offset requires 4 bytes (need to use the long form + // of the push esp instruction) + + BYTE code[] = {0xff, 0xb4, 0x24, 0x0, 0x0, 0x0, 0x0}; + *(__int32 *)(&code[3]) = ofs; + EmitBytes(code, sizeof(code)); + } + + Push(sizeof(void*)); +} + + +//--------------------------------------------------------------- +// Emits: +// pop dword ptr [ + ] 
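+//
+// (Encoding note: this is opcode 8F /0; "pop dword ptr [esi+8]" assembles to
+// 8F 46 08. An ESP base cannot be expressed with a bare ModRM byte, which is
+// why the kESP_Unsafe case below routes through X86EmitOp to get a SIB byte.)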
+//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexPop(X86Reg srcreg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + if(srcreg != kESP_Unsafe) + X86EmitOffsetModRM(0x8f, (X86Reg)0x0, srcreg, ofs); + else + X86EmitOp(0x8f,(X86Reg)0x0, srcreg, ofs); + + Pop(sizeof(void*)); +} + +//--------------------------------------------------------------- +// Emits: +// lea , [ + +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitIndexLea(X86Reg dstreg, X86Reg srcreg, __int32 ofs) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION((int) dstreg < NumX86Regs); + PRECONDITION((int) srcreg < NumX86Regs); + } + CONTRACTL_END; + + X86EmitOffsetModRM(0x8d, dstreg, srcreg, ofs); +} + +#if defined(_TARGET_AMD64_) +VOID StubLinkerCPU::X86EmitIndexLeaRSP(X86Reg dstreg, X86Reg srcreg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + X86EmitOp(0x8d, dstreg, (X86Reg)kESP_Unsafe, ofs, (X86Reg)0, 0, k64BitOp); +} +#endif // defined(_TARGET_AMD64_) + +//--------------------------------------------------------------- +// Emits: +// sub esp, IMM +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitSubEsp(INT32 imm32) +{ + STANDARD_VM_CONTRACT; + + if (imm32 < 0x1000-100) + { + // As long as the esp size is less than 1 page plus a small + // safety fudge factor, we can just bump esp. + X86EmitSubEspWorker(imm32); + } + else + { + // Otherwise, must touch at least one byte for each page. + while (imm32 >= 0x1000) + { + + X86EmitSubEspWorker(0x1000-4); + X86EmitPushReg(kEAX); + + imm32 -= 0x1000; + } + if (imm32 < 500) + { + X86EmitSubEspWorker(imm32); + } + else + { + // If the remainder is large, touch the last byte - again, + // as a fudge factor. + X86EmitSubEspWorker(imm32-4); + X86EmitPushReg(kEAX); + } + } +} + + +//--------------------------------------------------------------- +// Emits: +// sub esp, IMM +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitSubEspWorker(INT32 imm32) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + // On Win32, stacks must be faulted in one page at a time. 
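+        // Windows grows the committed stack by touching a guard page; dropping
+        // ESP by more than a page without touching the skipped memory can land
+        // a later access beyond the guard page and fault instead of growing the
+        // stack. That is why X86EmitSubEsp above interleaves a push with each
+        // page-sized adjustment (roughly "sub esp, 0FFCh / push eax" per 4K).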
+ PRECONDITION(imm32 < 0x1000); + } + CONTRACTL_END; + + if (!imm32) + { + // nop + } + else + { + X86_64BitOperands(); + + if (FitsInI1(imm32)) + { + Emit16(0xec83); + Emit8((INT8)imm32); + } + else + { + Emit16(0xec81); + Emit32(imm32); + } + + Push(imm32); + } +} + + +//--------------------------------------------------------------- +// Emits: +// add esp, IMM +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitAddEsp(INT32 imm32) +{ + STANDARD_VM_CONTRACT; + + if (!imm32) + { + // nop + } + else + { + X86_64BitOperands(); + + if (FitsInI1(imm32)) + { + Emit16(0xc483); + Emit8((INT8)imm32); + } + else + { + Emit16(0xc481); + Emit32(imm32); + } + } + Pop(imm32); +} + +VOID StubLinkerCPU::X86EmitAddReg(X86Reg reg, INT32 imm32) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION((int) reg < NumX86Regs); + } + CONTRACTL_END; + + if (imm32 == 0) + return; + +#ifdef _TARGET_AMD64_ + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_OPCODE_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); +#endif + + if (FitsInI1(imm32)) { + Emit8(0x83); + Emit8(static_cast(0xC0 | reg)); + Emit8(static_cast(imm32)); + } else { + Emit8(0x81); + Emit8(static_cast(0xC0 | reg)); + Emit32(imm32); + } +} + +//--------------------------------------------------------------- +// Emits: add destReg, srcReg +//--------------------------------------------------------------- + +VOID StubLinkerCPU::X86EmitAddRegReg(X86Reg destReg, X86Reg srcReg) +{ + STANDARD_VM_CONTRACT; + + X86EmitR2ROp(0x01, srcReg, destReg); +} + + + + +VOID StubLinkerCPU::X86EmitSubReg(X86Reg reg, INT32 imm32) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION((int) reg < NumX86Regs); + } + CONTRACTL_END; + +#ifdef _TARGET_AMD64_ + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_OPCODE_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); +#endif + + if (FitsInI1(imm32)) { + Emit8(0x83); + Emit8(static_cast(0xE8 | reg)); + Emit8(static_cast(imm32)); + } else { + Emit8(0x81); + Emit8(static_cast(0xE8 | reg)); + Emit32(imm32); + } +} + +//--------------------------------------------------------------- +// Emits: sub destReg, srcReg +//--------------------------------------------------------------- + +VOID StubLinkerCPU::X86EmitSubRegReg(X86Reg destReg, X86Reg srcReg) +{ + STANDARD_VM_CONTRACT; + + X86EmitR2ROp(0x29, srcReg, destReg); +} + +#if defined(_TARGET_AMD64_) + +//--------------------------------------------------------------- +// movdqa destXmmreg, srcXmmReg +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovXmmXmm(X86Reg destXmmreg, X86Reg srcXmmReg) +{ + STANDARD_VM_CONTRACT; + // There are several that could be used to mov xmm registers. MovAps is + // what C++ compiler uses so let's use it here too. 
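+    // (MOVAPS xmm,xmm is the two-byte 0F 28 /r form - e.g. "movaps xmm1, xmm2"
+    // is 0F 28 CA - whereas MOVDQA/MOVAPD need an extra 66 prefix; for a plain
+    // register-to-register copy the data type does not matter, so the shorter
+    // encoding is used.)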
+ X86EmitR2ROp(X86_INSTR_MOVAPS_R_RM, destXmmreg, srcXmmReg, k32BitOp); +} + +//--------------------------------------------------------------- +// movdqa XmmN, [baseReg + offset] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovdqaFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0x66, 0x6F, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// movdqa [baseReg + offset], XmmN +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovdqaToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0x66, 0x7F, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// movsd XmmN, [baseReg + offset] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovSDFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0xF2, 0x10, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// movsd [baseReg + offset], XmmN +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovSDToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0xF2, 0x11, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// movss XmmN, [baseReg + offset] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovSSFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0xF3, 0x10, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// movss [baseReg + offset], XmmN +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + X64EmitMovXmmWorker(0xF3, 0x11, Xmmreg, baseReg, ofs); +} + +//--------------------------------------------------------------- +// Helper method for emitting of XMM from/to memory moves +//--------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovXmmWorker(BYTE prefix, BYTE opcode, X86Reg Xmmreg, X86Reg baseReg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + BYTE codeBuffer[10]; + unsigned int nBytes = 0; + + // Setup the legacyPrefix for movsd + codeBuffer[nBytes++] = prefix; + + // By default, assume we dont have to emit the REX byte. + bool fEmitRex = false; + + BYTE rex = REX_PREFIX_BASE; + + if (baseReg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + baseReg = X86RegFromAMD64Reg(baseReg); + fEmitRex = true; + } + if (Xmmreg >= kXMM8) + { + rex |= REX_MODRM_REG_EXT; + Xmmreg = X86RegFromAMD64Reg(Xmmreg); + fEmitRex = true; + } + + if (fEmitRex == true) + { + codeBuffer[nBytes++] = rex; + } + + // Next, specify the two byte opcode - first byte is always 0x0F. + codeBuffer[nBytes++] = 0x0F; + codeBuffer[nBytes++] = opcode; + + BYTE modrm = static_cast((Xmmreg << 3) | baseReg); + bool fOffsetFitsInSignedByte = FitsInI1(ofs)?true:false; + + if (fOffsetFitsInSignedByte) + codeBuffer[nBytes++] = 0x40|modrm; + else + codeBuffer[nBytes++] = 0x80|modrm; + + // If we are dealing with RSP or R12 as the baseReg, we need to emit the SIB byte. 
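+    // (In ModRM, an r/m field of 100b does not name a base register at all; it
+    // means "a SIB byte follows". RSP and R12 both have 100b as their low three
+    // bits, so addressing off either of them needs the SIB byte 0x24 emitted
+    // below: scale 1, index "none", base 100b.)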
+ if ((baseReg == (X86Reg)4 /*kRSP*/) || (baseReg == kR12)) + { + codeBuffer[nBytes++] = 0x24; + } + + // Finally, specify the offset + if (fOffsetFitsInSignedByte) + { + codeBuffer[nBytes++] = (BYTE)ofs; + } + else + { + *((__int32*)(codeBuffer+nBytes)) = ofs; + nBytes += 4; + } + + _ASSERTE(nBytes <= _countof(codeBuffer)); + + // Lastly, emit the encoded bytes + EmitBytes(codeBuffer, nBytes); +} + +#endif // defined(_TARGET_AMD64_) + +//--------------------------------------------------------------- +// Emits a MOD/RM for accessing a dword at [ + ofs32] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitOffsetModRM(BYTE opcode, X86Reg opcodereg, X86Reg indexreg, __int32 ofs) +{ + STANDARD_VM_CONTRACT; + + BYTE codeBuffer[7]; + BYTE* code = codeBuffer; + int nBytes = 0; +#ifdef _TARGET_AMD64_ + code++; + // + // code points to base X86 instruction, + // codeBuffer points to full AMD64 instruction + // + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (indexreg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + indexreg = X86RegFromAMD64Reg(indexreg); + } + if (opcodereg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + opcodereg = X86RegFromAMD64Reg(opcodereg); + } + + nBytes++; + code[-1] = rex; +#endif + code[0] = opcode; + nBytes++; + BYTE modrm = static_cast((opcodereg << 3) | indexreg); + if (ofs == 0 && indexreg != kEBP) + { + code[1] = modrm; + nBytes++; + EmitBytes(codeBuffer, nBytes); + } + else if (FitsInI1(ofs)) + { + code[1] = 0x40|modrm; + code[2] = (BYTE)ofs; + nBytes += 2; + EmitBytes(codeBuffer, nBytes); + } + else + { + code[1] = 0x80|modrm; + *((__int32*)(2+code)) = ofs; + nBytes += 5; + EmitBytes(codeBuffer, nBytes); + } +} + +//--------------------------------------------------------------- +// Emits a MOD/RM for accessing a dword at [ + * + ofs32] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitOffsetModRmSIB(BYTE opcode, X86Reg opcodeOrReg, X86Reg baseReg, X86Reg indexReg, __int32 scale, __int32 ofs) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION(scale == 1 || scale == 2 || scale == 4 || scale == 8); + PRECONDITION(indexReg != kESP_Unsafe); + } + CONTRACTL_END; + + BYTE codeBuffer[8]; + BYTE* code = codeBuffer; + int nBytes = 0; + +#ifdef _TARGET_AMD64_ + _ASSERTE(!"NYI"); +#endif + code[0] = opcode; + nBytes++; + + BYTE scaleEnc = 0; + switch(scale) + { + case 1: scaleEnc = 0; break; + case 2: scaleEnc = 1; break; + case 4: scaleEnc = 2; break; + case 8: scaleEnc = 3; break; + default: _ASSERTE(!"Unexpected"); + } + + BYTE sib = static_cast((scaleEnc << 6) | (indexReg << 3) | baseReg); + + if (FitsInI1(ofs)) + { + code[1] = static_cast(0x44 | (opcodeOrReg << 3)); + code[2] = sib; + code[3] = (BYTE)ofs; + nBytes += 3; + EmitBytes(codeBuffer, nBytes); + } + else + { + code[1] = static_cast(0x84 | (opcodeOrReg << 3)); + code[2] = sib; + *(__int32*)(&code[3]) = ofs; + nBytes += 6; + EmitBytes(codeBuffer, nBytes); + } +} + + + +VOID StubLinkerCPU::X86EmitRegLoad(X86Reg reg, UINT_PTR imm) +{ + STANDARD_VM_CONTRACT; + + if (!imm) + { + X86EmitZeroOutReg(reg); + return; + } + + UINT cbimm = sizeof(void*); + +#ifdef _TARGET_AMD64_ + // amd64 zero-extends all 32-bit operations. If the immediate will fit in + // 32 bits, use the smaller encoding. 
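+    // (For example "mov eax, imm32" is B8+rd imm32 - five bytes - and still
+    // clears the upper half of RAX, while the full "mov rax, imm64" form is
+    // REX.W B8+rd imm64, ten bytes.)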
+ + if (reg >= kR8 || !FitsInU4(imm)) + { + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + if (reg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); + } + else + { + // amd64 is little endian, so the &imm below will correctly read off + // the low 4 bytes. + cbimm = sizeof(UINT32); + } +#endif // _TARGET_AMD64_ + Emit8(0xB8 | (BYTE)reg); + EmitBytes((BYTE*)&imm, cbimm); +} + + +//--------------------------------------------------------------- +// Emits the most efficient form of the operation: +// +// opcode altreg, [basereg + scaledreg*scale + ofs] +// +// or +// +// opcode [basereg + scaledreg*scale + ofs], altreg +// +// (the opcode determines which comes first.) +// +// +// Limitations: +// +// scale must be 0,1,2,4 or 8. +// if scale == 0, scaledreg is ignored. +// basereg and altreg may be equal to 4 (ESP) but scaledreg cannot +// for some opcodes, "altreg" may actually select an operation +// rather than a second register argument. +// if basereg is EBP, scale must be 0. +// +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitOp(WORD opcode, + X86Reg altreg, + X86Reg basereg, + __int32 ofs /*=0*/, + X86Reg scaledreg /*=0*/, + BYTE scale /*=0*/ + AMD64_ARG(X86OperandSize OperandSize /*= k32BitOp*/)) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + // All 2-byte opcodes start with 0x0f. + PRECONDITION(!(opcode >> 8) || (opcode & 0xff) == 0x0f); + + PRECONDITION(scale == 0 || scale == 1 || scale == 2 || scale == 4 || scale == 8); + PRECONDITION(scaledreg != (X86Reg)4); + PRECONDITION(!(basereg == kEBP && scale != 0)); + + PRECONDITION( ((UINT)basereg) < NumX86Regs ); + PRECONDITION( ((UINT)scaledreg) < NumX86Regs ); + PRECONDITION( ((UINT)altreg) < NumX86Regs ); + } + CONTRACTL_END; + +#ifdef _TARGET_AMD64_ + if ( k64BitOp == OperandSize + || altreg >= kR8 + || basereg >= kR8 + || scaledreg >= kR8) + { + BYTE rex = REX_PREFIX_BASE; + + if (k64BitOp == OperandSize) + rex |= REX_OPERAND_SIZE_64BIT; + + if (altreg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + altreg = X86RegFromAMD64Reg(altreg); + } + + if (basereg >= kR8) + { + // basereg might be in the modrm or sib fields. This will be + // decided below, but the encodings are the same either way. + _ASSERTE(REX_SIB_BASE_EXT == REX_MODRM_RM_EXT); + rex |= REX_SIB_BASE_EXT; + basereg = X86RegFromAMD64Reg(basereg); + } + + if (scaledreg >= kR8) + { + rex |= REX_SIB_INDEX_EXT; + scaledreg = X86RegFromAMD64Reg(scaledreg); + } + + Emit8(rex); + } +#endif // _TARGET_AMD64_ + + BYTE modrmbyte = static_cast(altreg << 3); + BOOL fNeedSIB = FALSE; + BYTE SIBbyte = 0; + BYTE ofssize; + BYTE scaleselect= 0; + + if (ofs == 0 && basereg != kEBP) + { + ofssize = 0; // Don't change this constant! + } + else if (FitsInI1(ofs)) + { + ofssize = 1; // Don't change this constant! + } + else + { + ofssize = 2; // Don't change this constant! 
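+        // (The three ofssize values are the ModRM mod-field encodings - 00b no
+        // displacement, 01b disp8, 10b disp32 - and are OR'ed into the ModRM
+        // byte below as (ofssize << 6), hence "don't change".)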
+ } + + switch (scale) + { + case 1: scaleselect = 0; break; + case 2: scaleselect = 1; break; + case 4: scaleselect = 2; break; + case 8: scaleselect = 3; break; + } + + if (scale == 0 && basereg != (X86Reg)4 /*ESP*/) + { + // [basereg + ofs] + modrmbyte |= basereg | (ofssize << 6); + } + else if (scale == 0) + { + // [esp + ofs] + _ASSERTE(basereg == (X86Reg)4); + fNeedSIB = TRUE; + SIBbyte = 0044; + + modrmbyte |= 4 | (ofssize << 6); + } + else + { + + //[basereg + scaledreg*scale + ofs] + + modrmbyte |= 0004 | (ofssize << 6); + fNeedSIB = TRUE; + SIBbyte = static_cast((scaleselect << 6) | (scaledreg << 3) | basereg); + + } + + //Some sanity checks: + _ASSERTE(!(fNeedSIB && basereg == kEBP)); // EBP not valid as a SIB base register. + _ASSERTE(!( (!fNeedSIB) && basereg == (X86Reg)4 )) ; // ESP addressing requires SIB byte + + Emit8((BYTE)opcode); + + if (opcode >> 8) + Emit8(opcode >> 8); + + Emit8(modrmbyte); + if (fNeedSIB) + { + Emit8(SIBbyte); + } + switch (ofssize) + { + case 0: break; + case 1: Emit8( (__int8)ofs ); break; + case 2: Emit32( ofs ); break; + default: _ASSERTE(!"Can't get here."); + } +} + + +// Emits +// +// opcode altreg, modrmreg +// +// or +// +// opcode modrmreg, altreg +// +// (the opcode determines which one comes first) +// +// For single-operand opcodes, "altreg" actually selects +// an operation rather than a register. + +VOID StubLinkerCPU::X86EmitR2ROp (WORD opcode, + X86Reg altreg, + X86Reg modrmreg + AMD64_ARG(X86OperandSize OperandSize /*= k64BitOp*/) + ) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + // All 2-byte opcodes start with 0x0f. + PRECONDITION(!(opcode >> 8) || (opcode & 0xff) == 0x0f); + + PRECONDITION( ((UINT)altreg) < NumX86Regs ); + PRECONDITION( ((UINT)modrmreg) < NumX86Regs ); + } + CONTRACTL_END; + +#ifdef _TARGET_AMD64_ + BYTE rex = 0; + + if (modrmreg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + modrmreg = X86RegFromAMD64Reg(modrmreg); + } + + if (altreg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + altreg = X86RegFromAMD64Reg(altreg); + } + + if (k64BitOp == OperandSize) + rex |= REX_OPERAND_SIZE_64BIT; + + if (rex) + Emit8(REX_PREFIX_BASE | rex); +#endif // _TARGET_AMD64_ + + Emit8((BYTE)opcode); + + if (opcode >> 8) + Emit8(opcode >> 8); + + Emit8(static_cast(0300 | (altreg << 3) | modrmreg)); +} + + +//--------------------------------------------------------------- +// Emits: +// op altreg, [esp+ofs] +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitEspOffset(BYTE opcode, + X86Reg altreg, + __int32 ofs + AMD64_ARG(X86OperandSize OperandSize /*= k64BitOp*/) + ) +{ + STANDARD_VM_CONTRACT; + + BYTE codeBuffer[8]; + BYTE *code = codeBuffer; + int nBytes; + +#ifdef _TARGET_AMD64_ + BYTE rex = 0; + + if (k64BitOp == OperandSize) + rex |= REX_OPERAND_SIZE_64BIT; + + if (altreg >= kR8) + { + rex |= REX_MODRM_REG_EXT; + altreg = X86RegFromAMD64Reg(altreg); + } + + if (rex) + { + *code = (REX_PREFIX_BASE | rex); + code++; + nBytes = 1; + } + else +#endif // _TARGET_AMD64_ + { + nBytes = 0; + } + + code[0] = opcode; + BYTE modrm = static_cast((altreg << 3) | 004); + if (ofs == 0) + { + code[1] = modrm; + code[2] = 0044; + EmitBytes(codeBuffer, 3 + nBytes); + } + else if (FitsInI1(ofs)) + { + code[1] = 0x40|modrm; + code[2] = 0044; + code[3] = (BYTE)ofs; + EmitBytes(codeBuffer, 4 + nBytes); + } + else + { + code[1] = 0x80|modrm; + code[2] = 0044; + *((__int32*)(3+code)) = ofs; + EmitBytes(codeBuffer, 7 + nBytes); + } + +} + +//--------------------------------------------------------------- + +VOID 
StubLinkerCPU::X86EmitPushEBPframe() +{ + STANDARD_VM_CONTRACT; + + // push ebp + X86EmitPushReg(kEBP); + // mov ebp,esp + X86EmitMovRegSP(kEBP); +} + +#ifdef _DEBUG +//--------------------------------------------------------------- +// Emits: +// mov ,0xcccccccc +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitDebugTrashReg(X86Reg reg) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + + if (reg >= kR8) + { + rex |= REX_OPCODE_REG_EXT; + reg = X86RegFromAMD64Reg(reg); + } + Emit8(rex); + Emit8(0xb8|reg); + Emit64(0xcccccccccccccccc); +#else + Emit8(static_cast(0xb8 | reg)); + Emit32(0xcccccccc); +#endif +} +#endif //_DEBUG + + +// Get X86Reg indexes of argument registers based on offset into ArgumentRegister +X86Reg GetX86ArgumentRegisterFromOffset(size_t ofs) +{ + CONTRACT(X86Reg) + { + NOTHROW; + GC_NOTRIGGER; + + } + CONTRACT_END; + + #define ARGUMENT_REGISTER(reg) if (ofs == offsetof(ArgumentRegisters, reg)) RETURN k##reg ; + ENUM_ARGUMENT_REGISTERS(); + #undef ARGUMENT_REGISTER + + _ASSERTE(0);//Can't get here. + RETURN kEBP; +} + + +#ifdef _TARGET_AMD64_ +static const X86Reg c_argRegs[] = { + #define ARGUMENT_REGISTER(regname) k##regname, + ENUM_ARGUMENT_REGISTERS() + #undef ARGUMENT_REGISTER +}; +#endif + + +#ifndef CROSSGEN_COMPILE + +#if defined(_DEBUG) && (defined(_TARGET_AMD64_) || defined(_TARGET_X86_)) && !defined(FEATURE_PAL) +void StubLinkerCPU::EmitJITHelperLoggingThunk(PCODE pJitHelper, LPVOID helperFuncCount) +{ + STANDARD_VM_CONTRACT; + + VMHELPCOUNTDEF* pHelperFuncCount = (VMHELPCOUNTDEF*)helperFuncCount; +/* + push rcx + mov rcx, &(pHelperFuncCount->count) + lock inc [rcx] + pop rcx +#ifdef _TARGET_AMD64_ + mov rax, + jmp rax +#else + jmp +#endif +*/ + + // push rcx + // mov rcx, &(pHelperFuncCount->count) + X86EmitPushReg(kECX); + X86EmitRegLoad(kECX, (UINT_PTR)(&(pHelperFuncCount->count))); + + // lock inc [rcx] + BYTE lock_inc_RCX[] = { 0xf0, 0xff, 0x01 }; + EmitBytes(lock_inc_RCX, sizeof(lock_inc_RCX)); + +#if defined(_TARGET_AMD64_) + // mov rax, + // pop rcx + // jmp rax +#else + // pop rcx + // jmp +#endif + X86EmitTailcallWithSinglePop(NewExternalCodeLabel(pJitHelper), kECX); +} +#endif // _DEBUG && (_TARGET_AMD64_ || _TARGET_X86_) && !FEATURE_PAL + +#ifndef FEATURE_IMPLICIT_TLS +//--------------------------------------------------------------- +// Emit code to store the current Thread structure in dstreg +// preservedRegSet is a set of registers to be preserved +// TRASHES EAX, EDX, ECX unless they are in preservedRegSet. 
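+// (On 32-bit Windows the fast path reads the slot straight out of the TEB,
+// roughly "mov <dstreg>, fs:[TlsSlots + idx*4]" for a slot index below
+// TLS_MINIMUM_AVAILABLE; the generic path falls back to calling TlsGetValue.)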
+// RESULTS dstreg = current Thread +//--------------------------------------------------------------- +VOID StubLinkerCPU::X86EmitTLSFetch(DWORD idx, X86Reg dstreg, unsigned preservedRegSet) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + // It doesn't make sense to have the destination register be preserved + PRECONDITION((preservedRegSet & (1<m_pvDatum->m_pMD) + X86EmitIndexRegLoad(kECX, regFrame, UMThkCallFrame::GetOffsetOfDatum()); + X86EmitIndexRegLoad(kECX, kECX, UMEntryThunk::GetOffsetOfMethodDesc()); + + // Push arguments and notify profiler + X86EmitPushImm32(COR_PRF_TRANSITION_CALL); // Reason + X86EmitPushReg(kECX); // MethodDesc* + X86EmitCall(NewExternalCodeLabel((LPVOID) ProfilerUnmanagedToManagedTransitionMD), 2*sizeof(void*)); + } + +#ifdef FEATURE_COMINTEROP + else if (pFrameVptr == ComMethodFrame::GetMethodFrameVPtr()) + { + // Load the methoddesc into ECX (Frame->m_pvDatum->m_pMD) + X86EmitIndexRegLoad(kECX, regFrame, ComMethodFrame::GetOffsetOfDatum()); + X86EmitIndexRegLoad(kECX, kECX, ComCallMethodDesc::GetOffsetOfMethodDesc()); + + // Push arguments and notify profiler + X86EmitPushImm32(COR_PRF_TRANSITION_CALL); // Reason + X86EmitPushReg(kECX); // MethodDesc* + X86EmitCall(NewExternalCodeLabel((LPVOID) ProfilerUnmanagedToManagedTransitionMD), 2*sizeof(void*)); + } +#endif // FEATURE_COMINTEROP + + // Unrecognized frame vtbl + else + { + _ASSERTE(!"Unrecognized vtble passed to EmitComMethodStubProlog with profiling turned on."); + } +} + + +VOID StubLinkerCPU::EmitProfilerComCallEpilog(TADDR pFrameVptr, X86Reg regFrame) +{ + CONTRACTL + { + STANDARD_VM_CHECK; +#ifdef FEATURE_COMINTEROP + PRECONDITION(pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr() || pFrameVptr == ComMethodFrame::GetMethodFrameVPtr()); +#else + PRECONDITION(pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()); +#endif // FEATURE_COMINTEROP + } + CONTRACTL_END; + + if (pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()) + { + // Load the methoddesc into ECX (UMThkCallFrame->m_pvDatum->m_pMD) + X86EmitIndexRegLoad(kECX, regFrame, UMThkCallFrame::GetOffsetOfDatum()); + X86EmitIndexRegLoad(kECX, kECX, UMEntryThunk::GetOffsetOfMethodDesc()); + + // Push arguments and notify profiler + X86EmitPushImm32(COR_PRF_TRANSITION_RETURN); // Reason + X86EmitPushReg(kECX); // MethodDesc* + X86EmitCall(NewExternalCodeLabel((LPVOID) ProfilerManagedToUnmanagedTransitionMD), 2*sizeof(void*)); + } + +#ifdef FEATURE_COMINTEROP + else if (pFrameVptr == ComMethodFrame::GetMethodFrameVPtr()) + { + // Load the methoddesc into ECX (Frame->m_pvDatum->m_pMD) + X86EmitIndexRegLoad(kECX, regFrame, ComMethodFrame::GetOffsetOfDatum()); + X86EmitIndexRegLoad(kECX, kECX, ComCallMethodDesc::GetOffsetOfMethodDesc()); + + // Push arguments and notify profiler + X86EmitPushImm32(COR_PRF_TRANSITION_RETURN); // Reason + X86EmitPushReg(kECX); // MethodDesc* + X86EmitCall(NewExternalCodeLabel((LPVOID) ProfilerManagedToUnmanagedTransitionMD), 2*sizeof(void*)); + } +#endif // FEATURE_COMINTEROP + + // Unrecognized frame vtbl + else + { + _ASSERTE(!"Unrecognized vtble passed to EmitComMethodStubEpilog with profiling turned on."); + } +} +#endif // PROFILING_SUPPORTED + + +//======================================================================== +// Prolog for entering managed code from COM +// pushes the appropriate frame ptr +// sets up a thread and returns a label that needs to be emitted by the caller +// At the end: +// ESI will hold the pointer to the ComMethodFrame or UMThkCallFrame +// EBX will hold the result of GetThread() +// EDI will 
hold the previous Frame ptr + +void StubLinkerCPU::EmitComMethodStubProlog(TADDR pFrameVptr, + CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, + BOOL bShouldProfile) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + PRECONDITION(rgRareLabels != NULL); + PRECONDITION(rgRareLabels[0] != NULL && rgRareLabels[1] != NULL && rgRareLabels[2] != NULL); + PRECONDITION(rgRejoinLabels != NULL); + PRECONDITION(rgRejoinLabels[0] != NULL && rgRejoinLabels[1] != NULL && rgRejoinLabels[2] != NULL); + } + CONTRACTL_END; + + // push ebp ;; save callee-saved register + // push ebx ;; save callee-saved register + // push esi ;; save callee-saved register + // push edi ;; save callee-saved register + X86EmitPushEBPframe(); + + X86EmitPushReg(kEBX); + X86EmitPushReg(kESI); + X86EmitPushReg(kEDI); + + // push eax ; datum + X86EmitPushReg(kEAX); + + // push edx ;leave room for m_next (edx is an arbitrary choice) + X86EmitPushReg(kEDX); + + // push IMM32 ; push Frame vptr + X86EmitPushImmPtr((LPVOID) pFrameVptr); + + X86EmitPushImmPtr((LPVOID)GetProcessGSCookie()); + + // lea esi, [esp+4] ;; set ESI -> new frame + X86EmitEspOffset(0x8d, kESI, 4); // lea ESI, [ESP+4] + + if (pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()) + { + // Preserve argument registers for thiscall/fastcall + X86EmitPushReg(kECX); + X86EmitPushReg(kEDX); + } + + // Emit Setup thread + EmitSetup(rgRareLabels[0]); // rareLabel for rare setup + EmitLabel(rgRejoinLabels[0]); // rejoin label for rare setup + +#ifdef PROFILING_SUPPORTED + // If profiling is active, emit code to notify profiler of transition + // Must do this before preemptive GC is disabled, so no problem if the + // profiler blocks. + if (CORProfilerTrackTransitions() && bShouldProfile) + { + EmitProfilerComCallProlog(pFrameVptr, /*Frame*/ kESI); + } +#endif // PROFILING_SUPPORTED + + //----------------------------------------------------------------------- + // Generate the inline part of disabling preemptive GC. It is critical + // that this part happen before we link in the frame. That's because + // we won't be able to unlink the frame from preemptive mode. And during + // shutdown, we cannot switch to cooperative mode under some circumstances + //----------------------------------------------------------------------- + EmitDisable(rgRareLabels[1], /*fCallIn=*/TRUE, kEBX); // rare disable gc + EmitLabel(rgRejoinLabels[1]); // rejoin for rare disable gc + + // If we take an SO after installing the new frame but before getting the exception + // handlers in place, we will have a corrupt frame stack. So probe-by-touch first for + // sufficient stack space to erect the handler. Because we know we will be touching + // that stack right away when install the handler, this probe-by-touch will not incur + // unnecessary cache misses. And this allows us to do the probe with one instruction. + + // Note that for Win64, the personality routine will handle unlinking the frame, so + // we don't need to probe in the Win64 stubs. The exception is ComToCLRWorker + // where we don't setup a personality routine. However, we push the frame inside + // that function and it is probe-protected with an entry point probe first, so we are + // OK there too. + + // We push two registers to setup the EH handler and none to setup the frame + // so probe for double that to give ourselves a small margin for error. 
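+    // (The single "mov eax, [esp-20h]" emitted next is that probe: reading just
+    // below ESP commits the page now, so the pushes that build the SEH record a
+    // few instructions later cannot take a stack-overflow fault after the Frame
+    // has already been linked in.)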
+ // mov eax, [esp+n] ;; probe for sufficient stack to setup EH + X86EmitEspOffset(0x8B, kEAX, -0x20); + // mov edi,[ebx + Thread.GetFrame()] ;; get previous frame + X86EmitIndexRegLoad(kEDI, kEBX, Thread::GetOffsetOfCurrentFrame()); + + // mov [esi + Frame.m_next], edi + X86EmitIndexRegStore(kESI, Frame::GetOffsetOfNextLink(), kEDI); + + // mov [ebx + Thread.GetFrame()], esi + X86EmitIndexRegStore(kEBX, Thread::GetOffsetOfCurrentFrame(), kESI); + + if (pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()) + { + // push UnmanagedToManagedExceptHandler + X86EmitPushImmPtr((LPVOID)UMThunkPrestubHandler); + + // mov eax, fs:[0] + static const BYTE codeSEH1[] = { 0x64, 0xA1, 0x0, 0x0, 0x0, 0x0}; + EmitBytes(codeSEH1, sizeof(codeSEH1)); + + // push eax + X86EmitPushReg(kEAX); + + // mov dword ptr fs:[0], esp + static const BYTE codeSEH2[] = { 0x64, 0x89, 0x25, 0x0, 0x0, 0x0, 0x0}; + EmitBytes(codeSEH2, sizeof(codeSEH2)); + } + +#if _DEBUG + if (Frame::ShouldLogTransitions()) + { + // call LogTransition + X86EmitPushReg(kESI); + X86EmitCall(NewExternalCodeLabel((LPVOID) Frame::LogTransition), sizeof(void*)); + } +#endif +} + +//======================================================================== +// Epilog for stubs that enter managed code from COM +// +// At this point of the stub, the state should be as follows: +// ESI holds the ComMethodFrame or UMThkCallFrame ptr +// EBX holds the result of GetThread() +// EDI holds the previous Frame ptr +// +void StubLinkerCPU::EmitComMethodStubEpilog(TADDR pFrameVptr, + CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, + BOOL bShouldProfile) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + PRECONDITION(rgRareLabels != NULL); + PRECONDITION(rgRareLabels[0] != NULL && rgRareLabels[1] != NULL && rgRareLabels[2] != NULL); + PRECONDITION(rgRejoinLabels != NULL); + PRECONDITION(rgRejoinLabels[0] != NULL && rgRejoinLabels[1] != NULL && rgRejoinLabels[2] != NULL); + } + CONTRACTL_END; + + EmitCheckGSCookie(kESI, UnmanagedToManagedFrame::GetOffsetOfGSCookie()); + + if (pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()) + { + // if we are using exceptions, unlink the SEH + // mov ecx,[esp] ;;pointer to the next exception record + X86EmitEspOffset(0x8b, kECX, 0); + + // mov dword ptr fs:[0], ecx + static const BYTE codeSEH[] = { 0x64, 0x89, 0x0D, 0x0, 0x0, 0x0, 0x0 }; + EmitBytes(codeSEH, sizeof(codeSEH)); + + X86EmitAddEsp(sizeof(EXCEPTION_REGISTRATION_RECORD)); + } + + // mov [ebx + Thread.GetFrame()], edi ;; restore previous frame + X86EmitIndexRegStore(kEBX, Thread::GetOffsetOfCurrentFrame(), kEDI); + + //----------------------------------------------------------------------- + // Generate the inline part of disabling preemptive GC + //----------------------------------------------------------------------- + EmitEnable(rgRareLabels[2]); // rare gc + EmitLabel(rgRejoinLabels[2]); // rejoin for rare gc + + if (pFrameVptr == UMThkCallFrame::GetMethodFrameVPtr()) + { + // Restore argument registers for thiscall/fastcall + X86EmitPopReg(kEDX); + X86EmitPopReg(kECX); + } + + // add esp, popstack + X86EmitAddEsp(sizeof(GSCookie) + UnmanagedToManagedFrame::GetOffsetOfCalleeSavedRegisters()); + + // pop edi ; restore callee-saved registers + // pop esi + // pop ebx + // pop ebp + X86EmitPopReg(kEDI); + X86EmitPopReg(kESI); + X86EmitPopReg(kEBX); + X86EmitPopReg(kEBP); + + // jmp eax //reexecute! + X86EmitR2ROp(0xff, (X86Reg)4, kEAX); + + // ret + // This will never be executed. 
It is just to help out stack-walking logic + // which disassembles the epilog to unwind the stack. A "ret" instruction + // indicates that no more code needs to be disassembled, if the stack-walker + // keeps on going past the previous "jmp eax". + X86EmitReturn(0); + + //----------------------------------------------------------------------- + // The out-of-line portion of enabling preemptive GC - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[2]); // label for rare enable gc + EmitRareEnable(rgRejoinLabels[2]); // emit rare enable gc + + //----------------------------------------------------------------------- + // The out-of-line portion of disabling preemptive GC - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[1]); // label for rare disable gc + EmitRareDisable(rgRejoinLabels[1]); // emit rare disable gc + + //----------------------------------------------------------------------- + // The out-of-line portion of setup thread - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[0]); // label for rare setup thread + EmitRareSetup(rgRejoinLabels[0], /*fThrow*/ TRUE); // emit rare setup thread +} + +//--------------------------------------------------------------- +// Emit code to store the setup current Thread structure in eax. +// TRASHES eax,ecx&edx. +// RESULTS ebx = current Thread +//--------------------------------------------------------------- +VOID StubLinkerCPU::EmitSetup(CodeLabel *pForwardRef) +{ + STANDARD_VM_CONTRACT; + +#ifdef FEATURE_IMPLICIT_TLS + DWORD idx = 0; + TLSACCESSMODE mode = TLSACCESS_GENERIC; +#else + DWORD idx = GetThreadTLSIndex(); + TLSACCESSMODE mode = GetTLSAccessMode(idx); +#endif + +#ifdef _DEBUG + { + static BOOL f = TRUE; + f = !f; + if (f) + { + mode = TLSACCESS_GENERIC; + } + } +#endif + + switch (mode) + { + case TLSACCESS_WNT: + { + unsigned __int32 tlsofs = offsetof(TEB, TlsSlots) + (idx * sizeof(void*)); + + static const BYTE code[] = {0x64,0x8b,0x1d}; // mov ebx, dword ptr fs:[IMM32] + EmitBytes(code, sizeof(code)); + Emit32(tlsofs); + } + break; + + case TLSACCESS_GENERIC: +#ifdef FEATURE_IMPLICIT_TLS + X86EmitCall(NewExternalCodeLabel((LPVOID) GetThread), sizeof(void*)); +#else + X86EmitPushImm32(idx); + + // call TLSGetValue + X86EmitCall(NewExternalCodeLabel((LPVOID) TlsGetValue), sizeof(void*)); +#endif + // mov ebx,eax + Emit16(0xc389); + break; + default: + _ASSERTE(0); + } + + // cmp ebx, 0 + static const BYTE b[] = { 0x83, 0xFB, 0x0}; + + EmitBytes(b, sizeof(b)); + + // jz RarePath + X86EmitCondJump(pForwardRef, X86CondCode::kJZ); + +#ifdef _DEBUG + X86EmitDebugTrashReg(kECX); + X86EmitDebugTrashReg(kEDX); +#endif + +} + +VOID StubLinkerCPU::EmitRareSetup(CodeLabel *pRejoinPoint, BOOL fThrow) +{ + STANDARD_VM_CONTRACT; + +#ifndef FEATURE_COMINTEROP + _ASSERTE(fThrow); +#else // !FEATURE_COMINTEROP + if (!fThrow) + { + X86EmitPushReg(kESI); + X86EmitCall(NewExternalCodeLabel((LPVOID) CreateThreadBlockReturnHr), sizeof(void*)); + } + else +#endif // !FEATURE_COMINTEROP + { + X86EmitCall(NewExternalCodeLabel((LPVOID) CreateThreadBlockThrow), 0); + } + + // mov ebx,eax + Emit16(0xc389); + X86EmitNearJump(pRejoinPoint); +} + +//======================================================================== +#endif // _TARGET_X86_ +//======================================================================== +#if defined(FEATURE_COMINTEROP) && 
defined(_TARGET_X86_) +//======================================================================== +// Epilog for stubs that enter managed code from COM +// +// On entry, ESI points to the Frame +// ESP points to below FramedMethodFrame::m_vc5Frame +// EBX hold GetThread() +// EDI holds the previous Frame + +void StubLinkerCPU::EmitSharedComMethodStubEpilog(TADDR pFrameVptr, + CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, + unsigned offsetRetThunk, + BOOL bShouldProfile) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + PRECONDITION(rgRareLabels != NULL); + PRECONDITION(rgRareLabels[0] != NULL && rgRareLabels[1] != NULL && rgRareLabels[2] != NULL); + PRECONDITION(rgRejoinLabels != NULL); + PRECONDITION(rgRejoinLabels[0] != NULL && rgRejoinLabels[1] != NULL && rgRejoinLabels[2] != NULL); + } + CONTRACTL_END; + + CodeLabel *NoEntryLabel; + NoEntryLabel = NewCodeLabel(); + + EmitCheckGSCookie(kESI, UnmanagedToManagedFrame::GetOffsetOfGSCookie()); + + // mov [ebx + Thread.GetFrame()], edi ;; restore previous frame + X86EmitIndexRegStore(kEBX, Thread::GetOffsetOfCurrentFrame(), kEDI); + + //----------------------------------------------------------------------- + // Generate the inline part of enabling preemptive GC + //----------------------------------------------------------------------- + EmitLabel(NoEntryLabel); // need to enable preemp mode even when we fail the disable as rare disable will return in coop mode + + EmitEnable(rgRareLabels[2]); // rare enable gc + EmitLabel(rgRejoinLabels[2]); // rejoin for rare enable gc + +#ifdef PROFILING_SUPPORTED + // If profiling is active, emit code to notify profiler of transition + if (CORProfilerTrackTransitions() && bShouldProfile) + { + // Save return value + X86EmitPushReg(kEAX); + X86EmitPushReg(kEDX); + + EmitProfilerComCallEpilog(pFrameVptr, kESI); + + // Restore return value + X86EmitPopReg(kEDX); + X86EmitPopReg(kEAX); + } +#endif // PROFILING_SUPPORTED + + X86EmitAddEsp(sizeof(GSCookie) + UnmanagedToManagedFrame::GetOffsetOfDatum()); + + // pop ecx + X86EmitPopReg(kECX); // pop the MethodDesc* + + // pop edi ; restore callee-saved registers + // pop esi + // pop ebx + // pop ebp + X86EmitPopReg(kEDI); + X86EmitPopReg(kESI); + X86EmitPopReg(kEBX); + X86EmitPopReg(kEBP); + + // add ecx, offsetRetThunk + X86EmitAddReg(kECX, offsetRetThunk); + + // jmp ecx + // This will jump to the "ret cbStackArgs" instruction in COMMETHOD_PREPAD. + static const BYTE bjmpecx[] = { 0xff, 0xe1 }; + EmitBytes(bjmpecx, sizeof(bjmpecx)); + + // ret + // This will never be executed. It is just to help out stack-walking logic + // which disassembles the epilog to unwind the stack. A "ret" instruction + // indicates that no more code needs to be disassembled, if the stack-walker + // keeps on going past the previous "jmp ecx". 
+ X86EmitReturn(0); + + //----------------------------------------------------------------------- + // The out-of-line portion of enabling preemptive GC - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[2]); // label for rare enable gc + EmitRareEnable(rgRejoinLabels[2]); // emit rare enable gc + + //----------------------------------------------------------------------- + // The out-of-line portion of disabling preemptive GC - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[1]); // label for rare disable gc + EmitRareDisableHRESULT(rgRejoinLabels[1], NoEntryLabel); + + //----------------------------------------------------------------------- + // The out-of-line portion of setup thread - rarely executed + //----------------------------------------------------------------------- + EmitLabel(rgRareLabels[0]); // label for rare setup thread + EmitRareSetup(rgRejoinLabels[0],/*fThrow*/ FALSE); // emit rare setup thread +} + +//======================================================================== +#endif // defined(FEATURE_COMINTEROP) && defined(_TARGET_X86_) + +#ifndef FEATURE_STUBS_AS_IL +/*============================================================================== + Pushes a TransitionFrame on the stack + If you make any changes to the prolog instruction sequence, be sure + to update UpdateRegdisplay, too!! This service should only be called from + within the runtime. It should not be called for any unmanaged -> managed calls in. + + At the end of the generated prolog stub code: + pFrame is in ESI/RSI. + the previous pFrame is in EDI/RDI + The current Thread* is in EBX/RBX. + For x86, ESP points to TransitionFrame + For amd64, ESP points to the space reserved for the outgoing argument registers +*/ + +VOID StubLinkerCPU::EmitMethodStubProlog(TADDR pFrameVptr, int transitionBlockOffset) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + X86EmitPushReg(kR15); // CalleeSavedRegisters + X86EmitPushReg(kR14); + X86EmitPushReg(kR13); + X86EmitPushReg(kR12); + X86EmitPushReg(kRBP); + X86EmitPushReg(kRBX); + X86EmitPushReg(kRSI); + X86EmitPushReg(kRDI); + + // Push m_datum + X86EmitPushReg(SCRATCH_REGISTER_X86REG); + + // push edx ;leave room for m_next (edx is an arbitrary choice) + X86EmitPushReg(kEDX); + + // push Frame vptr + X86EmitPushImmPtr((LPVOID) pFrameVptr); + + // mov rsi, rsp + X86EmitR2ROp(0x8b, kRSI, (X86Reg)4 /*kESP*/); + UnwindSetFramePointer(kRSI); + + // Save ArgumentRegisters + #define ARGUMENT_REGISTER(regname) X86EmitRegSave(k##regname, SecureDelegateFrame::GetOffsetOfTransitionBlock() + \ + sizeof(TransitionBlock) + offsetof(ArgumentRegisters, regname)); + ENUM_ARGUMENT_REGISTERS(); + #undef ARGUMENT_REGISTER + + _ASSERTE(((Frame*)&pFrameVptr)->GetGSCookiePtr() == PTR_GSCookie(PBYTE(&pFrameVptr) - sizeof(GSCookie))); + X86EmitPushImmPtr((LPVOID)GetProcessGSCookie()); + + // sub rsp, 4*sizeof(void*) ;; allocate callee scratch area and ensure rsp is 16-byte-aligned + const INT32 padding = sizeof(ArgumentRegisters) + ((sizeof(FramedMethodFrame) % (2 * sizeof(LPVOID))) ? 
0 : sizeof(LPVOID)); + X86EmitSubEsp(padding); +#endif // _TARGET_AMD64_ + +#ifdef _TARGET_X86_ + // push ebp ;; save callee-saved register + // mov ebp,esp + // push ebx ;; save callee-saved register + // push esi ;; save callee-saved register + // push edi ;; save callee-saved register + X86EmitPushEBPframe(); + + X86EmitPushReg(kEBX); + X86EmitPushReg(kESI); + X86EmitPushReg(kEDI); + + // Push & initialize ArgumentRegisters + #define ARGUMENT_REGISTER(regname) X86EmitPushReg(k##regname); + ENUM_ARGUMENT_REGISTERS(); + #undef ARGUMENT_REGISTER + + // Push m_datum + X86EmitPushReg(kEAX); + + // push edx ;leave room for m_next (edx is an arbitrary choice) + X86EmitPushReg(kEDX); + + // push Frame vptr + X86EmitPushImmPtr((LPVOID) pFrameVptr); + + // mov esi,esp + X86EmitMovRegSP(kESI); + + X86EmitPushImmPtr((LPVOID)GetProcessGSCookie()); +#endif // _TARGET_X86_ + + // ebx <-- GetThread() + // Trashes X86TLSFetch_TRASHABLE_REGS + X86EmitCurrentThreadFetch(kEBX, 0); + +#if _DEBUG + + // call ObjectRefFlush +#ifdef _TARGET_AMD64_ + + // mov rcx, rbx + X86EmitR2ROp(0x8b, kECX, kEBX); // arg in reg + +#else // !_TARGET_AMD64_ + X86EmitPushReg(kEBX); // arg on stack +#endif // _TARGET_AMD64_ + + // Make the call + X86EmitCall(NewExternalCodeLabel((LPVOID) Thread::ObjectRefFlush), sizeof(void*)); + +#endif // _DEBUG + + // mov edi,[ebx + Thread.GetFrame()] ;; get previous frame + X86EmitIndexRegLoad(kEDI, kEBX, Thread::GetOffsetOfCurrentFrame()); + + // mov [esi + Frame.m_next], edi + X86EmitIndexRegStore(kESI, Frame::GetOffsetOfNextLink(), kEDI); + + // mov [ebx + Thread.GetFrame()], esi + X86EmitIndexRegStore(kEBX, Thread::GetOffsetOfCurrentFrame(), kESI); + +#if _DEBUG + + if (Frame::ShouldLogTransitions()) + { + // call LogTransition +#ifdef _TARGET_AMD64_ + + // mov rcx, rsi + X86EmitR2ROp(0x8b, kECX, kESI); // arg in reg + +#else // !_TARGET_AMD64_ + X86EmitPushReg(kESI); // arg on stack +#endif // _TARGET_AMD64_ + + X86EmitCall(NewExternalCodeLabel((LPVOID) Frame::LogTransition), sizeof(void*)); + +#ifdef _TARGET_AMD64_ + // Reload parameter registers + // mov r, [esp+offs] + #define ARGUMENT_REGISTER(regname) X86EmitEspOffset(0x8b, k##regname, sizeof(ArgumentRegisters) + \ + sizeof(TransitionFrame) + offsetof(ArgumentRegisters, regname)); + ENUM_ARGUMENT_REGISTERS(); + #undef ARGUMENT_REGISTER + +#endif // _TARGET_AMD64_ + } + +#endif // _DEBUG + + +#ifdef _TARGET_AMD64_ + // OK for the debugger to examine the new frame now + // (Note that if it's not OK yet for some stub, another patch label + // can be emitted later which will override this one.) + EmitPatchLabel(); +#else + // For x86, the patch label can be specified only after the GSCookie is pushed + // Otherwise the debugger will see a Frame without a valid GSCookie +#endif +} + +/*============================================================================== + EmitMethodStubEpilog generates the part of the stub that will pop off the + Frame + + restoreArgRegs - indicates whether the argument registers need to be + restored from m_argumentRegisters + + At this point of the stub: + pFrame is in ESI/RSI. + the previous pFrame is in EDI/RDI + The current Thread* is in EBX/RBX. 
+ For x86, ESP points to the FramedMethodFrame::NegInfo +*/ + +VOID StubLinkerCPU::EmitMethodStubEpilog(WORD numArgBytes, int transitionBlockOffset) +{ + STANDARD_VM_CONTRACT; + + // mov [ebx + Thread.GetFrame()], edi ;; restore previous frame + X86EmitIndexRegStore(kEBX, Thread::GetOffsetOfCurrentFrame(), kEDI); + +#ifdef _TARGET_X86_ + // deallocate Frame + X86EmitAddEsp(sizeof(GSCookie) + transitionBlockOffset + TransitionBlock::GetOffsetOfCalleeSavedRegisters()); + +#elif defined(_TARGET_AMD64_) + // lea rsp, [rsi + ] + X86EmitOffsetModRM(0x8d, (X86Reg)4 /*kRSP*/, kRSI, transitionBlockOffset + TransitionBlock::GetOffsetOfCalleeSavedRegisters()); +#endif // _TARGET_AMD64_ + + // pop edi ; restore callee-saved registers + // pop esi + // pop ebx + // pop ebp + X86EmitPopReg(kEDI); + X86EmitPopReg(kESI); + X86EmitPopReg(kEBX); + X86EmitPopReg(kEBP); + +#ifdef _TARGET_AMD64_ + X86EmitPopReg(kR12); + X86EmitPopReg(kR13); + X86EmitPopReg(kR14); + X86EmitPopReg(kR15); +#endif + +#ifdef _TARGET_AMD64_ + // Caller deallocates argument space. (Bypasses ASSERT in + // X86EmitReturn.) + numArgBytes = 0; +#endif + + X86EmitReturn(numArgBytes); +} + + +// On entry, ESI should be pointing to the Frame + +VOID StubLinkerCPU::EmitCheckGSCookie(X86Reg frameReg, int gsCookieOffset) +{ + STANDARD_VM_CONTRACT; + +#ifdef _DEBUG + // cmp dword ptr[frameReg-gsCookieOffset], gsCookie +#ifdef _TARGET_X86_ + X86EmitCmpRegIndexImm32(frameReg, gsCookieOffset, GetProcessGSCookie()); +#else + X64EmitCmp32RegIndexImm32(frameReg, gsCookieOffset, (INT32)GetProcessGSCookie()); +#endif + + CodeLabel * pLabel = NewCodeLabel(); + X86EmitCondJump(pLabel, X86CondCode::kJE); + + X86EmitCall(NewExternalCodeLabel((LPVOID) JIT_FailFast), 0); + + EmitLabel(pLabel); +#endif +} +#endif // !FEATURE_STUBS_AS_IL + + +// This method unboxes the THIS pointer and then calls pRealMD +// If it's shared code for a method in a generic value class, then also extract the vtable pointer +// and pass it as an extra argument. 
Thus this stub generator really covers both +// - Unboxing, non-instantiating stubs +// - Unboxing, method-table-instantiating stubs +VOID StubLinkerCPU::EmitUnboxMethodStub(MethodDesc* pUnboxMD) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION(!pUnboxMD->IsStatic()); + } + CONTRACTL_END; + +#ifdef FEATURE_STUBS_AS_IL + _ASSERTE(!pUnboxMD->RequiresInstMethodTableArg()); +#else + if (pUnboxMD->RequiresInstMethodTableArg()) + { + EmitInstantiatingMethodStub(pUnboxMD, NULL); + return; + } +#endif + + // + // unboxing a value class simply means adding sizeof(void*) to the THIS pointer + // +#ifdef _TARGET_AMD64_ + X86EmitAddReg(THIS_kREG, sizeof(void*)); + + // Use direct call if possible + if (pUnboxMD->HasStableEntryPoint()) + { + X86EmitRegLoad(kRAX, pUnboxMD->GetStableEntryPoint());// MOV RAX, DWORD + } + else + { + X86EmitRegLoad(kRAX, (UINT_PTR)pUnboxMD->GetAddrOfSlot()); // MOV RAX, DWORD + + X86EmitIndexRegLoad(kRAX, kRAX); // MOV RAX, [RAX] + } + + Emit16(X86_INSTR_JMP_EAX); // JMP EAX +#else // _TARGET_AMD64_ + X86EmitAddReg(THIS_kREG, sizeof(void*)); + + // Use direct call if possible + if (pUnboxMD->HasStableEntryPoint()) + { + X86EmitNearJump(NewExternalCodeLabel((LPVOID) pUnboxMD->GetStableEntryPoint())); + } + else + { + // jmp [slot] + Emit16(0x25ff); + Emit32((DWORD)(size_t)pUnboxMD->GetAddrOfSlot()); + } +#endif //_TARGET_AMD64_ +} + + +#if defined(FEATURE_SHARE_GENERIC_CODE) && !defined(FEATURE_STUBS_AS_IL) +// The stub generated by this method passes an extra dictionary argument before jumping to +// shared-instantiation generic code. +// +// pMD is either +// * An InstantiatedMethodDesc for a generic method whose code is shared across instantiations. +// In this case, the extra argument is the InstantiatedMethodDesc for the instantiation-specific stub itself. +// or * A MethodDesc for a static method in a generic class whose code is shared across instantiations. +// In this case, the extra argument is the MethodTable pointer of the instantiated type. +// or * A MethodDesc for unboxing stub. In this case, the extra argument is null. +VOID StubLinkerCPU::EmitInstantiatingMethodStub(MethodDesc* pMD, void* extra) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + PRECONDITION(pMD->RequiresInstArg()); + } + CONTRACTL_END; + + MetaSig msig(pMD); + ArgIterator argit(&msig); + +#ifdef _TARGET_AMD64_ + int paramTypeArgOffset = argit.GetParamTypeArgOffset(); + int paramTypeArgIndex = TransitionBlock::GetArgumentIndexFromOffset(paramTypeArgOffset); + + CorElementType argTypes[5]; + + int firstRealArg = paramTypeArgIndex + 1; + int argNum = firstRealArg; + + // + // Compute types of the 4 register args and first stack arg + // + + CorElementType sigType; + while ((sigType = msig.NextArgNormalized()) != ELEMENT_TYPE_END) + { + argTypes[argNum++] = sigType; + if (argNum > 4) + break; + } + msig.Reset(); + + BOOL fUseInstantiatingMethodStubWorker = FALSE; + + if (argNum > 4) + { + // + // We will need to go through assembly helper. + // + fUseInstantiatingMethodStubWorker = TRUE; + + // Allocate space for frame before pushing the arguments for the assembly helper + X86EmitSubEsp((INT32)(AlignUp(sizeof(void *) /* extra stack param */ + sizeof(GSCookie) + sizeof(StubHelperFrame), 16) - sizeof(void *) /* return address */)); + + // + // Store extra arg stack arg param for the helper. + // + CorElementType argType = argTypes[--argNum]; + switch (argType) + { + case ELEMENT_TYPE_R4: + // movss dword ptr [rsp], xmm? 
+ X64EmitMovSSToMem(kXMM3, (X86Reg)4 /*kRSP*/); + break; + case ELEMENT_TYPE_R8: + // movsd qword ptr [rsp], xmm? + X64EmitMovSDToMem(kXMM3, (X86Reg)4 /*kRSP*/); + break; + default: + X86EmitIndexRegStoreRSP(0, kR9); + break; + } + } + + // + // Shuffle the register arguments + // + while (argNum > firstRealArg) + { + CorElementType argType = argTypes[--argNum]; + + switch (argType) + { + case ELEMENT_TYPE_R4: + case ELEMENT_TYPE_R8: + // mov xmm#, xmm#-1 + X64EmitMovXmmXmm((X86Reg)argNum, (X86Reg)(argNum - 1)); + break; + default: + //mov reg#, reg#-1 + X86EmitMovRegReg(c_argRegs[argNum], c_argRegs[argNum-1]); + break; + } + } + + // + // Setup the hidden instantiation argument + // + if (extra != NULL) + { + X86EmitRegLoad(c_argRegs[paramTypeArgIndex], (UINT_PTR)extra); + } + else + { + X86EmitIndexRegLoad(c_argRegs[paramTypeArgIndex], THIS_kREG); + + X86EmitAddReg(THIS_kREG, sizeof(void*)); + } + + // Use direct call if possible + if (pMD->HasStableEntryPoint()) + { + X86EmitRegLoad(kRAX, pMD->GetStableEntryPoint());// MOV RAX, DWORD + } + else + { + X86EmitRegLoad(kRAX, (UINT_PTR)pMD->GetAddrOfSlot()); // MOV RAX, DWORD + + X86EmitIndexRegLoad(kRAX, kRAX); // MOV RAX, [RAX] + } + + if (fUseInstantiatingMethodStubWorker) + { + X86EmitPushReg(kRAX); + + UINT cbStack = argit.SizeOfArgStack(); + _ASSERTE(cbStack > 0); + + X86EmitPushImm32((AlignUp(cbStack, 16) / sizeof(void*)) - 1); // -1 for extra stack arg + + X86EmitRegLoad(kRAX, GetEEFuncEntryPoint(InstantiatingMethodStubWorker));// MOV RAX, DWORD + } + else + { + _ASSERTE(argit.SizeOfArgStack() == 0); + } + + Emit16(X86_INSTR_JMP_EAX); + +#else + int paramTypeArgOffset = argit.GetParamTypeArgOffset(); + + // It's on the stack + if (TransitionBlock::IsStackArgumentOffset(paramTypeArgOffset)) + { + // Pop return address into AX + X86EmitPopReg(kEAX); + + if (extra != NULL) + { + // Push extra dictionary argument + X86EmitPushImmPtr(extra); + } + else + { + // Push the vtable pointer from "this" + X86EmitIndexPush(THIS_kREG, 0); + } + + // Put return address back + X86EmitPushReg(kEAX); + } + // It's in a register + else + { + X86Reg paramReg = GetX86ArgumentRegisterFromOffset(paramTypeArgOffset - TransitionBlock::GetOffsetOfArgumentRegisters()); + + if (extra != NULL) + { + X86EmitRegLoad(paramReg, (UINT_PTR)extra); + } + else + { + // Just extract the vtable pointer from "this" + X86EmitIndexRegLoad(paramReg, THIS_kREG); + } + } + + if (extra == NULL) + { + // Unboxing stub case. + X86EmitAddReg(THIS_kREG, sizeof(void*)); + } + + // Use direct call if possible + if (pMD->HasStableEntryPoint()) + { + X86EmitNearJump(NewExternalCodeLabel((LPVOID) pMD->GetStableEntryPoint())); + } + else + { + // jmp [slot] + Emit16(0x25ff); + Emit32((DWORD)(size_t)pMD->GetAddrOfSlot()); + } +#endif // +} +#endif // FEATURE_SHARE_GENERIC_CODE && FEATURE_STUBS_AS_IL + + +#if defined(_DEBUG) && defined(STUBLINKER_GENERATES_UNWIND_INFO) + +typedef BOOL GetModuleInformationProc( + HANDLE hProcess, + HMODULE hModule, + LPMODULEINFO lpmodinfo, + DWORD cb +); + +GetModuleInformationProc *g_pfnGetModuleInformation = NULL; + +extern "C" VOID __cdecl DebugCheckStubUnwindInfoWorker (CONTEXT *pStubContext) +{ + BEGIN_ENTRYPOINT_VOIDRET; + + LOG((LF_STUBS, LL_INFO1000000, "checking stub unwind info:\n")); + + // + // Make a copy of the CONTEXT. RtlVirtualUnwind will modify this copy. + // DebugCheckStubUnwindInfo will need to restore registers from the + // original CONTEXT. 
+ // + CONTEXT ctx = *pStubContext; + ctx.ContextFlags = (CONTEXT_CONTROL | CONTEXT_INTEGER); + + // + // Find the upper bound of the stack and address range of KERNEL32. This + // is where we expect the unwind to stop. + // + void *pvStackTop = GetThread()->GetCachedStackBase(); + + if (!g_pfnGetModuleInformation) + { + HMODULE hmodPSAPI = WszGetModuleHandle(W("PSAPI.DLL")); + + if (!hmodPSAPI) + { + hmodPSAPI = WszLoadLibrary(W("PSAPI.DLL")); + if (!hmodPSAPI) + { + _ASSERTE(!"unable to load PSAPI.DLL"); + goto ErrExit; + } + } + + g_pfnGetModuleInformation = (GetModuleInformationProc*)GetProcAddress(hmodPSAPI, "GetModuleInformation"); + if (!g_pfnGetModuleInformation) + { + _ASSERTE(!"can't find PSAPI!GetModuleInformation"); + goto ErrExit; + } + + // Intentionally leak hmodPSAPI. We don't want to + // LoadLibrary/FreeLibrary every time, this is slow + produces lots of + // debugger spew. This is just debugging code after all... + } + + HMODULE hmodKERNEL32 = WszGetModuleHandle(W("KERNEL32")); + _ASSERTE(hmodKERNEL32); + + MODULEINFO modinfoKERNEL32; + if (!g_pfnGetModuleInformation(GetCurrentProcess(), hmodKERNEL32, &modinfoKERNEL32, sizeof(modinfoKERNEL32))) + { + _ASSERTE(!"unable to get bounds of KERNEL32"); + goto ErrExit; + } + + // + // Unwind until IP is 0, sp is at the stack top, and callee IP is in kernel32. + // + + for (;;) + { + ULONG64 ControlPc = (ULONG64)GetIP(&ctx); + + LOG((LF_STUBS, LL_INFO1000000, "pc %p, sp %p\n", ControlPc, GetSP(&ctx))); + + ULONG64 ImageBase; + T_RUNTIME_FUNCTION *pFunctionEntry = RtlLookupFunctionEntry( + ControlPc, + &ImageBase, + NULL); + if (pFunctionEntry) + { + PVOID HandlerData; + ULONG64 EstablisherFrame; + + RtlVirtualUnwind( + 0, + ImageBase, + ControlPc, + pFunctionEntry, + &ctx, + &HandlerData, + &EstablisherFrame, + NULL); + + ULONG64 NewControlPc = (ULONG64)GetIP(&ctx); + + LOG((LF_STUBS, LL_INFO1000000, "function %p, image %p, new pc %p, new sp %p\n", pFunctionEntry, ImageBase, NewControlPc, GetSP(&ctx))); + + if (!NewControlPc) + { + if (dac_cast(GetSP(&ctx)) < (BYTE*)pvStackTop - 0x100) + { + _ASSERTE(!"SP did not end up at top of stack"); + goto ErrExit; + } + + if (!( ControlPc > (ULONG64)modinfoKERNEL32.lpBaseOfDll + && ControlPc < (ULONG64)modinfoKERNEL32.lpBaseOfDll + modinfoKERNEL32.SizeOfImage)) + { + _ASSERTE(!"PC did not end up in KERNEL32"); + goto ErrExit; + } + + break; + } + } + else + { + // Nested functions that do not use any stack space or nonvolatile + // registers are not required to have unwind info (ex. + // USER32!ZwUserCreateWindowEx). + ctx.Rip = *(ULONG64*)(ctx.Rsp); + ctx.Rsp += sizeof(ULONG64); + } + } +ErrExit: + + END_ENTRYPOINT_VOIDRET; + return; +} + +//virtual +VOID StubLinkerCPU::EmitUnwindInfoCheckWorker (CodeLabel *pCheckLabel) +{ + STANDARD_VM_CONTRACT; + X86EmitCall(pCheckLabel, 0); +} + +//virtual +VOID StubLinkerCPU::EmitUnwindInfoCheckSubfunction() +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + // X86EmitCall will generate "mov rax, target/jmp rax", so we have to save + // rax on the stack. DO NOT use X86EmitPushReg. That will induce infinite + // recursion, since the push may require more unwind info. This "push rax" + // will be accounted for by DebugCheckStubUnwindInfo's unwind info + // (considered part of its locals), so there doesn't have to be unwind + // info for it. 
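+    // (0x50 is the one-byte encoding of "push rax"; emitting the raw byte
+    // keeps this push out of the stub's unwind-info bookkeeping.)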
+ Emit8(0x50); +#endif + + X86EmitNearJump(NewExternalCodeLabel(DebugCheckStubUnwindInfo)); +} + +#endif // defined(_DEBUG) && defined(STUBLINKER_GENERATES_UNWIND_INFO) + + +#ifdef _TARGET_X86_ + +//----------------------------------------------------------------------- +// Generates the inline portion of the code to enable preemptive GC. Hopefully, +// the inline code is all that will execute most of the time. If this code +// path is entered at certain times, however, it will need to jump out to +// a separate out-of-line path which is more expensive. The "pForwardRef" +// label indicates the start of the out-of-line path. +// +// Assumptions: +// ebx = Thread +// Preserves +// all registers except ecx. +// +//----------------------------------------------------------------------- +VOID StubLinkerCPU::EmitEnable(CodeLabel *pForwardRef) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + PRECONDITION(4 == sizeof( ((Thread*)0)->m_State )); + PRECONDITION(4 == sizeof( ((Thread*)0)->m_fPreemptiveGCDisabled )); + } + CONTRACTL_END; + + // move byte ptr [ebx + Thread.m_fPreemptiveGCDisabled],0 + X86EmitOffsetModRM(0xc6, (X86Reg)0, kEBX, Thread::GetOffsetOfGCFlag()); + Emit8(0); + + _ASSERTE(FitsInI1(Thread::TS_CatchAtSafePoint)); + + // test byte ptr [ebx + Thread.m_State], TS_CatchAtSafePoint + X86EmitOffsetModRM(0xf6, (X86Reg)0, kEBX, Thread::GetOffsetOfState()); + Emit8(Thread::TS_CatchAtSafePoint); + + // jnz RarePath + X86EmitCondJump(pForwardRef, X86CondCode::kJNZ); + +#ifdef _DEBUG + X86EmitDebugTrashReg(kECX); +#endif + +} + + +//----------------------------------------------------------------------- +// Generates the out-of-line portion of the code to enable preemptive GC. +// After the work is done, the code jumps back to the "pRejoinPoint" +// which should be emitted right after the inline part is generated. +// +// Assumptions: +// ebx = Thread +// Preserves +// all registers except ecx. +// +//----------------------------------------------------------------------- +VOID StubLinkerCPU::EmitRareEnable(CodeLabel *pRejoinPoint) +{ + STANDARD_VM_CONTRACT; + + X86EmitCall(NewExternalCodeLabel((LPVOID) StubRareEnable), 0); +#ifdef _DEBUG + X86EmitDebugTrashReg(kECX); +#endif + if (pRejoinPoint) + { + X86EmitNearJump(pRejoinPoint); + } + +} + + +//----------------------------------------------------------------------- +// Generates the inline portion of the code to disable preemptive GC. Hopefully, +// the inline code is all that will execute most of the time. If this code +// path is entered at certain times, however, it will need to jump out to +// a separate out-of-line path which is more expensive. The "pForwardRef" +// label indicates the start of the out-of-line path. +// +// Assumptions: +// ebx = Thread +// Preserves +// all registers except ecx. +// +//----------------------------------------------------------------------- +VOID StubLinkerCPU::EmitDisable(CodeLabel *pForwardRef, BOOL fCallIn, X86Reg ThreadReg) +{ + CONTRACTL + { + STANDARD_VM_CHECK; + + PRECONDITION(4 == sizeof( ((Thread*)0)->m_fPreemptiveGCDisabled )); + PRECONDITION(4 == sizeof(g_TrapReturningThreads)); + } + CONTRACTL_END; + +#if defined(FEATURE_COMINTEROP) && defined(MDA_SUPPORTED) + // If we are checking whether the current thread is already holds the loader lock, vector + // such cases to the rare disable pathway, where we can check again. 
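+    // (The probe below is only reached when the thread is already in
+    // cooperative mode, i.e. m_fPreemptiveGCDisabled is set; otherwise the
+    // emitted code jumps straight to the NotReentrant label.)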
+ if (fCallIn && (NULL != MDA_GET_ASSISTANT(Reentrancy))) + { + CodeLabel *pNotReentrantLabel = NewCodeLabel(); + + // test byte ptr [ebx + Thread.m_fPreemptiveGCDisabled],1 + X86EmitOffsetModRM(0xf6, (X86Reg)0, ThreadReg, Thread::GetOffsetOfGCFlag()); + Emit8(1); + + // jz NotReentrant + X86EmitCondJump(pNotReentrantLabel, X86CondCode::kJZ); + + X86EmitPushReg(kEAX); + X86EmitPushReg(kEDX); + X86EmitPushReg(kECX); + + X86EmitCall(NewExternalCodeLabel((LPVOID) HasIllegalReentrancy), 0); + + // If the probe fires, we go ahead and allow the call anyway. At this point, there could be + // GC heap corruptions. So the probe detects the illegal case, but doesn't prevent it. + + X86EmitPopReg(kECX); + X86EmitPopReg(kEDX); + X86EmitPopReg(kEAX); + + EmitLabel(pNotReentrantLabel); + } +#endif + + // move byte ptr [ebx + Thread.m_fPreemptiveGCDisabled],1 + X86EmitOffsetModRM(0xc6, (X86Reg)0, ThreadReg, Thread::GetOffsetOfGCFlag()); + Emit8(1); + + // cmp dword ptr g_TrapReturningThreads, 0 + Emit16(0x3d83); + EmitPtr((void *)&g_TrapReturningThreads); + Emit8(0); + + // jnz RarePath + X86EmitCondJump(pForwardRef, X86CondCode::kJNZ); + +#if defined(FEATURE_COMINTEROP) && !defined(FEATURE_CORESYSTEM) + // If we are checking whether the current thread holds the loader lock, vector + // such cases to the rare disable pathway, where we can check again. + if (fCallIn && ShouldCheckLoaderLock()) + { + X86EmitPushReg(kEAX); + X86EmitPushReg(kEDX); + + if (ThreadReg == kECX) + X86EmitPushReg(kECX); + + // BOOL AuxUlibIsDLLSynchronizationHeld(BOOL *IsHeld) + // + // So we need to be sure that both the return value and the passed BOOL are both TRUE. + // If either is FALSE, then the call failed or the lock is not held. Either way, the + // probe should not fire. + + X86EmitPushReg(kEDX); // BOOL temp + Emit8(0x54); // push ESP because arg is &temp + X86EmitCall(NewExternalCodeLabel((LPVOID) AuxUlibIsDLLSynchronizationHeld), 0); + + // callee has popped. + X86EmitPopReg(kEDX); // recover temp + + CodeLabel *pPopLabel = NewCodeLabel(); + + Emit16(0xc085); // test eax, eax + X86EmitCondJump(pPopLabel, X86CondCode::kJZ); + + Emit16(0xd285); // test edx, edx + + EmitLabel(pPopLabel); // retain the conditional flags across the pops + + if (ThreadReg == kECX) + X86EmitPopReg(kECX); + + X86EmitPopReg(kEDX); + X86EmitPopReg(kEAX); + + X86EmitCondJump(pForwardRef, X86CondCode::kJNZ); + } +#endif + +#ifdef _DEBUG + if (ThreadReg != kECX) + X86EmitDebugTrashReg(kECX); +#endif + +} + + +//----------------------------------------------------------------------- +// Generates the out-of-line portion of the code to disable preemptive GC. +// After the work is done, the code jumps back to the "pRejoinPoint" +// which should be emitted right after the inline part is generated. However, +// if we cannot execute managed code at this time, an exception is thrown +// which cannot be caught by managed code. +// +// Assumptions: +// ebx = Thread +// Preserves +// all registers except ecx, eax. +// +//----------------------------------------------------------------------- +VOID StubLinkerCPU::EmitRareDisable(CodeLabel *pRejoinPoint) +{ + STANDARD_VM_CONTRACT; + + X86EmitCall(NewExternalCodeLabel((LPVOID) StubRareDisableTHROW), 0); + +#ifdef _DEBUG + X86EmitDebugTrashReg(kECX); +#endif + X86EmitNearJump(pRejoinPoint); +} + +#ifdef FEATURE_COMINTEROP +//----------------------------------------------------------------------- +// Generates the out-of-line portion of the code to disable preemptive GC. 
+// After the work is done, the code normally jumps back to the "pRejoinPoint" +// which should be emitted right after the inline part is generated. However, +// if we cannot execute managed code at this time, an HRESULT is returned +// via the ExitPoint. +// +// Assumptions: +// ebx = Thread +// Preserves +// all registers except ecx, eax. +// +//----------------------------------------------------------------------- +VOID StubLinkerCPU::EmitRareDisableHRESULT(CodeLabel *pRejoinPoint, CodeLabel *pExitPoint) +{ + STANDARD_VM_CONTRACT; + + X86EmitCall(NewExternalCodeLabel((LPVOID) StubRareDisableHR), 0); + +#ifdef _DEBUG + X86EmitDebugTrashReg(kECX); +#endif + + // test eax, eax ;; test the result of StubRareDisableHR + Emit16(0xc085); + + // JZ pRejoinPoint + X86EmitCondJump(pRejoinPoint, X86CondCode::kJZ); + + X86EmitNearJump(pExitPoint); +} +#endif // FEATURE_COMINTEROP + +#endif // _TARGET_X86_ + +#endif // CROSSGEN_COMPILE + + +VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray) +{ + STANDARD_VM_CONTRACT; + +#ifdef _TARGET_AMD64_ + + // mov SCRATCHREG,rsp + X86_64BitOperands(); + Emit8(0x8b); + Emit8(0304 | (SCRATCH_REGISTER_X86REG << 3)); + + // save the real target in r11, will jump to it later. r10 is used below. + // Windows: mov r11, rcx + // Unix: mov r11, rdi + X86EmitMovRegReg(kR11, THIS_kREG); + +#ifdef UNIX_AMD64_ABI + for (ShuffleEntry* pEntry = pShuffleEntryArray; pEntry->srcofs != ShuffleEntry::SENTINEL; pEntry++) + { + if (pEntry->srcofs & ShuffleEntry::REGMASK) + { + // If source is present in register then destination must also be a register + _ASSERTE(pEntry->dstofs & ShuffleEntry::REGMASK); + // Both the srcofs and dstofs must be of the same kind of registers - float or general purpose. + _ASSERTE((pEntry->dstofs & ShuffleEntry::FPREGMASK) == (pEntry->srcofs & ShuffleEntry::FPREGMASK)); + + int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK; + int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK; + + if (pEntry->srcofs & ShuffleEntry::FPREGMASK) + { + // movdqa dstReg, srcReg + X64EmitMovXmmXmm((X86Reg)(kXMM0 + dstRegIndex), (X86Reg)(kXMM0 + srcRegIndex)); + } + else + { + // mov dstReg, srcReg + X86EmitMovRegReg(c_argRegs[dstRegIndex], c_argRegs[srcRegIndex]); + } + } + else if (pEntry->dstofs & ShuffleEntry::REGMASK) + { + // source must be on the stack + _ASSERTE(!(pEntry->srcofs & ShuffleEntry::REGMASK)); + + int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK; + int srcOffset = (pEntry->srcofs + 1) * sizeof(void*); + + if (pEntry->dstofs & ShuffleEntry::FPREGMASK) + { + if (pEntry->dstofs & ShuffleEntry::FPSINGLEMASK) + { + // movss dstReg, [rax + src] + X64EmitMovSSFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset); + } + else + { + // movsd dstReg, [rax + src] + X64EmitMovSDFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset); + } + } + else + { + // mov dstreg, [rax + src] + X86EmitIndexRegLoad(c_argRegs[dstRegIndex], SCRATCH_REGISTER_X86REG, srcOffset); + } + } + else + { + // source must be on the stack + _ASSERTE(!(pEntry->srcofs & ShuffleEntry::REGMASK)); + + // dest must be on the stack + _ASSERTE(!(pEntry->dstofs & ShuffleEntry::REGMASK)); + + // mov r10, [rax + src] + X86EmitIndexRegLoad (kR10, SCRATCH_REGISTER_X86REG, (pEntry->srcofs + 1) * sizeof(void*)); + + // mov [rax + dst], r10 + X86EmitIndexRegStore (SCRATCH_REGISTER_X86REG, (pEntry->dstofs + 1) * sizeof(void*), kR10); + } + } +#else // UNIX_AMD64_ABI + UINT step = 1; + + if (pShuffleEntryArray->argtype == 
ELEMENT_TYPE_END) + { + // Special handling of open instance methods with return buffer. Move "this" + // by two slots, and leave the "retbufptr" between the two slots intact. + + // mov rcx, r8 + X86EmitMovRegReg(kRCX, kR8); + + // Skip this entry + pShuffleEntryArray++; + + // Skip this entry and leave retbufptr intact + step += 2; + } + + // Now shuffle the args by one position: + // steps 1-3 : reg args (rcx, rdx, r8) + // step 4 : stack->reg arg (r9) + // step >4 : stack args + + for(; + pShuffleEntryArray->srcofs != ShuffleEntry::SENTINEL; + step++, pShuffleEntryArray++) + { + switch (step) + { + case 1: + case 2: + case 3: + switch (pShuffleEntryArray->argtype) + { + case ELEMENT_TYPE_R4: + case ELEMENT_TYPE_R8: + // mov xmm-1#, xmm# + X64EmitMovXmmXmm((X86Reg)(step - 1), (X86Reg)(step)); + break; + default: + // mov argRegs[step-1], argRegs[step] + X86EmitMovRegReg(c_argRegs[step-1], c_argRegs[step]); + break; + } + break; + + case 4: + { + switch (pShuffleEntryArray->argtype) + { + case ELEMENT_TYPE_R4: + X64EmitMovSSFromMem(kXMM3, kRAX, 0x28); + break; + + case ELEMENT_TYPE_R8: + X64EmitMovSDFromMem(kXMM3, kRAX, 0x28); + break; + + default: + // mov r9, [rax + 28h] + X86EmitIndexRegLoad (kR9, SCRATCH_REGISTER_X86REG, 5*sizeof(void*)); + } + break; + } + default: + + // mov r10, [rax + (step+1)*sizeof(void*)] + X86EmitIndexRegLoad (kR10, SCRATCH_REGISTER_X86REG, (step+1)*sizeof(void*)); + + // mov [rax + step*sizeof(void*)], r10 + X86EmitIndexRegStore (SCRATCH_REGISTER_X86REG, step*sizeof(void*), kR10); + } + } +#endif // UNIX_AMD64_ABI + + // mov r10, [r11 + Delegate._methodptraux] + X86EmitIndexRegLoad(kR10, kR11, DelegateObject::GetOffsetOfMethodPtrAux()); + // add r11, DelegateObject::GetOffsetOfMethodPtrAux() - load the indirection cell into r11 + X86EmitAddReg(kR11, DelegateObject::GetOffsetOfMethodPtrAux()); + // Now jump to real target + // jmp r10 + X86EmitR2ROp(0xff, (X86Reg)4, kR10); + +#else // _TARGET_AMD64_ + + UINT espadjust = 0; + BOOL haveMemMemMove = FALSE; + + ShuffleEntry *pWalk = NULL; + for (pWalk = pShuffleEntryArray; pWalk->srcofs != ShuffleEntry::SENTINEL; pWalk++) + { + if (!(pWalk->dstofs & ShuffleEntry::REGMASK) && + !(pWalk->srcofs & ShuffleEntry::REGMASK) && + pWalk->srcofs != pWalk->dstofs) + { + haveMemMemMove = TRUE; + espadjust = sizeof(void*); + break; + } + } + + if (haveMemMemMove) + { + // push ecx + X86EmitPushReg(THIS_kREG); + } + else + { + // mov eax, ecx + Emit8(0x8b); + Emit8(0300 | SCRATCH_REGISTER_X86REG << 3 | THIS_kREG); + } + + UINT16 emptySpot = 0x4 | ShuffleEntry::REGMASK; + + while (true) + { + for (pWalk = pShuffleEntryArray; pWalk->srcofs != ShuffleEntry::SENTINEL; pWalk++) + if (pWalk->dstofs == emptySpot) + break; + + if (pWalk->srcofs == ShuffleEntry::SENTINEL) + break; + + if ((pWalk->dstofs & ShuffleEntry::REGMASK)) + { + if (pWalk->srcofs & ShuffleEntry::REGMASK) + { + // mov , + Emit8(0x8b); + Emit8(static_cast(0300 | + (GetX86ArgumentRegisterFromOffset( pWalk->dstofs & ShuffleEntry::OFSMASK ) << 3) | + (GetX86ArgumentRegisterFromOffset( pWalk->srcofs & ShuffleEntry::OFSMASK )))); + } + else + { + X86EmitEspOffset(0x8b, GetX86ArgumentRegisterFromOffset( pWalk->dstofs & ShuffleEntry::OFSMASK ), pWalk->srcofs+espadjust); + } + } + else + { + // if the destination is not a register, the source shouldn't be either. 
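+            // The memory-to-memory case goes through the scratch register,
+            // emitting roughly:
+            //     mov eax, [esp + srcofs]
+            //     mov [esp + dstofs], eax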
+ _ASSERTE(!(pWalk->srcofs & ShuffleEntry::REGMASK)); + if (pWalk->srcofs != pWalk->dstofs) + { + X86EmitEspOffset(0x8b, kEAX, pWalk->srcofs+espadjust); + X86EmitEspOffset(0x89, kEAX, pWalk->dstofs+espadjust); + } + } + emptySpot = pWalk->srcofs; + } + + // Capture the stacksizedelta while we're at the end of the list. + _ASSERTE(pWalk->srcofs == ShuffleEntry::SENTINEL); + + if (haveMemMemMove) + X86EmitPopReg(SCRATCH_REGISTER_X86REG); + + if (pWalk->stacksizedelta) + X86EmitAddEsp(pWalk->stacksizedelta); + + // Now jump to real target + // JMP [SCRATCHREG] + // we need to jump indirect so that for virtual delegates eax contains a pointer to the indirection cell + X86EmitAddReg(SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfMethodPtrAux()); + static const BYTE bjmpeax[] = { 0xff, 0x20 }; + EmitBytes(bjmpeax, sizeof(bjmpeax)); + +#endif // _TARGET_AMD64_ +} + + +#if !defined(CROSSGEN_COMPILE) && !defined(FEATURE_STUBS_AS_IL) + +//=========================================================================== +// Computes hash code for MulticastDelegate.Invoke() +UINT_PTR StubLinkerCPU::HashMulticastInvoke(MetaSig* pSig) +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + } + CONTRACTL_END; + + ArgIterator argit(pSig); + + UINT numStackBytes = argit.SizeOfArgStack(); + + if (numStackBytes > 0x7FFF) + COMPlusThrow(kNotSupportedException, W("NotSupported_TooManyArgs")); + +#ifdef _TARGET_AMD64_ + // Generate a hash key as follows: + // UINT Arg0Type:2; // R4 (1), R8 (2), other (3) + // UINT Arg1Type:2; // R4 (1), R8 (2), other (3) + // UINT Arg2Type:2; // R4 (1), R8 (2), other (3) + // UINT Arg3Type:2; // R4 (1), R8 (2), other (3) + // UINT NumArgs:24; // number of arguments + // (This should cover all the prestub variations) + + _ASSERTE(!(numStackBytes & 7)); + UINT hash = (numStackBytes / sizeof(void*)) << 8; + + UINT argNum = 0; + + // NextArg() doesn't take into account the "this" pointer. + // That's why we have to special case it here. 
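+    // Example (illustrative): a delegate Invoke(float, double) returning void,
+    // with no return buffer and no stack arguments, hashes to 0x27 on AMD64:
+    // "this" sets bits 0-1 to 3, the float sets bits 2-3 to 1, the double sets
+    // bits 4-5 to 2, and bits 8 and up hold zero stack slots.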
+ if (argit.HasThis()) + { + hash |= 3 << (2*argNum); + argNum++; + } + + if (argit.HasRetBuffArg()) + { + hash |= 3 << (2*argNum); + argNum++; + } + + for (; argNum < 4; argNum++) + { + switch (pSig->NextArgNormalized()) + { + case ELEMENT_TYPE_END: + argNum = 4; + break; + case ELEMENT_TYPE_R4: + hash |= 1 << (2*argNum); + break; + case ELEMENT_TYPE_R8: + hash |= 2 << (2*argNum); + break; + default: + hash |= 3 << (2*argNum); + break; + } + } + +#else // _TARGET_AMD64_ + + // check if the function is returning a float, in which case the stub has to take + // care of popping the floating point stack except for the last invocation + + _ASSERTE(!(numStackBytes & 3)); + + UINT hash = numStackBytes; + + if (CorTypeInfo::IsFloat(pSig->GetReturnType())) + { + hash |= 2; + } +#endif // _TARGET_AMD64_ + + return hash; +} + +#ifdef _TARGET_X86_ +//=========================================================================== +// Emits code for MulticastDelegate.Invoke() +VOID StubLinkerCPU::EmitDelegateInvoke() +{ + STANDARD_VM_CONTRACT; + + CodeLabel *pNullLabel = NewCodeLabel(); + + // test THISREG, THISREG + X86EmitR2ROp(0x85, THIS_kREG, THIS_kREG); + + // jz null + X86EmitCondJump(pNullLabel, X86CondCode::kJZ); + + // mov SCRATCHREG, [THISREG + Delegate.FP] ; Save target stub in register + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfMethodPtr()); + + // mov THISREG, [THISREG + Delegate.OR] ; replace "this" pointer + X86EmitIndexRegLoad(THIS_kREG, THIS_kREG, DelegateObject::GetOffsetOfTarget()); + + // jmp SCRATCHREG + Emit16(0xe0ff | (SCRATCH_REGISTER_X86REG<<8)); + + // Do a null throw + EmitLabel(pNullLabel); + + // mov ECX, CORINFO_NullReferenceException + Emit8(0xb8+kECX); + Emit32(CORINFO_NullReferenceException); + + X86EmitCall(NewExternalCodeLabel(GetEEFuncEntryPoint(JIT_InternalThrowFromHelper)), 0); + + X86EmitReturn(0); +} +#endif // _TARGET_X86_ + +VOID StubLinkerCPU::EmitMulticastInvoke(UINT_PTR hash) +{ + STANDARD_VM_CONTRACT; + + int thisRegOffset = MulticastFrame::GetOffsetOfTransitionBlock() + + TransitionBlock::GetOffsetOfArgumentRegisters() + offsetof(ArgumentRegisters, THIS_REG); + + // push the methoddesc on the stack + // mov eax, [ecx + offsetof(_methodAuxPtr)] + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfMethodPtrAux()); + + // Push a MulticastFrame on the stack. + EmitMethodStubProlog(MulticastFrame::GetMethodFrameVPtr(), MulticastFrame::GetOffsetOfTransitionBlock()); + +#ifdef _TARGET_X86_ + // Frame is ready to be inspected by debugger for patch location + EmitPatchLabel(); +#else // _TARGET_AMD64_ + + // Save register arguments in their home locations. + // Non-FP registers are already saved by EmitMethodStubProlog. + // (Assumes Sig.NextArg() does not enum RetBuffArg or "this".) + + int argNum = 0; + __int32 argOfs = MulticastFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + CorElementType argTypes[4]; + CorElementType argType; + + // 'this' + argOfs += sizeof(void*); + argTypes[argNum] = ELEMENT_TYPE_I8; + argNum++; + + do + { + argType = ELEMENT_TYPE_END; + + switch ((hash >> (2 * argNum)) & 3) + { + case 0: + argType = ELEMENT_TYPE_END; + break; + case 1: + argType = ELEMENT_TYPE_R4; + + // movss dword ptr [rsp + argOfs], xmm? + X64EmitMovSSToMem((X86Reg)argNum, kRSI, argOfs); + break; + case 2: + argType = ELEMENT_TYPE_R8; + + // movsd qword ptr [rsp + argOfs], xmm? 
+ X64EmitMovSDToMem((X86Reg)argNum, kRSI, argOfs); + break; + default: + argType = ELEMENT_TYPE_I; + break; + } + + argOfs += sizeof(void*); + argTypes[argNum] = argType; + argNum++; + } + while (argNum < 4 && ELEMENT_TYPE_END != argType); + + _ASSERTE(4 == argNum || ELEMENT_TYPE_END == argTypes[argNum-1]); + +#endif // _TARGET_AMD64_ + + // TODO: on AMD64, pick different regs for locals so don't need the pushes + + // push edi ;; Save EDI (want to use it as loop index) + X86EmitPushReg(kEDI); + + // xor edi,edi ;; Loop counter: EDI=0,1,2... + X86EmitZeroOutReg(kEDI); + + CodeLabel *pLoopLabel = NewCodeLabel(); + CodeLabel *pEndLoopLabel = NewCodeLabel(); + + EmitLabel(pLoopLabel); + + // Entry: + // EDI == iteration counter + + // mov ecx, [esi + this] ;; get delegate + X86EmitIndexRegLoad(THIS_kREG, kESI, thisRegOffset); + + // cmp edi,[ecx]._invocationCount + X86EmitOp(0x3b, kEDI, THIS_kREG, DelegateObject::GetOffsetOfInvocationCount()); + + // je ENDLOOP + X86EmitCondJump(pEndLoopLabel, X86CondCode::kJZ); + +#ifdef _TARGET_AMD64_ + + INT32 numStackBytes = (INT32)((hash >> 8) * sizeof(void *)); + + INT32 stackUsed, numStackArgs, ofs; + + // Push any stack args, plus an extra location + // for rsp alignment if needed + + numStackArgs = numStackBytes / sizeof(void*); + + // 1 push above, so stack is currently misaligned + const unsigned STACK_ALIGN_ADJUST = 8; + + if (!numStackArgs) + { + // sub rsp, 28h ;; 4 reg arg home locs + rsp alignment + stackUsed = 0x20 + STACK_ALIGN_ADJUST; + X86EmitSubEsp(stackUsed); + } + else + { + stackUsed = numStackArgs * sizeof(void*); + + // If the stack is misaligned, then an odd number of arguments + // will naturally align the stack. + if ( ((numStackArgs & 1) == 0) + != (STACK_ALIGN_ADJUST == 0)) + { + X86EmitPushReg(kRAX); + stackUsed += sizeof(void*); + } + + ofs = MulticastFrame::GetOffsetOfTransitionBlock() + + TransitionBlock::GetOffsetOfArgs() + sizeof(ArgumentRegisters) + numStackBytes; + + while (numStackArgs--) + { + ofs -= sizeof(void*); + + // push [rsi + ofs] ;; Push stack args + X86EmitIndexPush(kESI, ofs); + } + + // sub rsp, 20h ;; Create 4 reg arg home locations + X86EmitSubEsp(0x20); + + stackUsed += 0x20; + } + + for( + argNum = 0, argOfs = MulticastFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + argNum < 4 && argTypes[argNum] != ELEMENT_TYPE_END; + argNum++, argOfs += sizeof(void*) + ) + { + switch (argTypes[argNum]) + { + case ELEMENT_TYPE_R4: + // movss xmm?, dword ptr [rsi + argOfs] + X64EmitMovSSFromMem((X86Reg)argNum, kRSI, argOfs); + break; + case ELEMENT_TYPE_R8: + // movsd xmm?, qword ptr [rsi + argOfs] + X64EmitMovSDFromMem((X86Reg)argNum, kRSI, argOfs); + break; + default: + if (c_argRegs[argNum] != THIS_kREG) + { + // mov r*, [rsi + dstOfs] + X86EmitIndexRegLoad(c_argRegs[argNum], kESI,argOfs); + } + break; + } // switch + } + + // mov SCRATCHREG, [rcx+Delegate._invocationList] ;;fetch invocation list + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfInvocationList()); + + // mov SCRATCHREG, [SCRATCHREG+m_Array+rdi*8] ;; index into invocation list + X86EmitOp(0x8b, kEAX, SCRATCH_REGISTER_X86REG, static_cast(PtrArray::GetDataOffset()), kEDI, sizeof(void*), k64BitOp); + + // mov THISREG, [SCRATCHREG+Delegate.object] ;;replace "this" pointer + X86EmitIndexRegLoad(THIS_kREG, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfTarget()); + + // call [SCRATCHREG+Delegate.target] ;; call current subscriber + X86EmitOffsetModRM(0xff, (X86Reg)2, 
SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfMethodPtr()); + + // add rsp, stackUsed ;; Clean up stack + X86EmitAddEsp(stackUsed); + + // inc edi + Emit16(0xC7FF); + +#else // _TARGET_AMD64_ + + UINT16 numStackBytes = static_cast(hash & ~3); + + // ..repush & reenregister args.. + INT32 ofs = numStackBytes + MulticastFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + while (ofs != MulticastFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs()) + { + ofs -= sizeof(void*); + X86EmitIndexPush(kESI, ofs); + } + + #define ARGUMENT_REGISTER(regname) if (k##regname != THIS_kREG) { X86EmitIndexRegLoad(k##regname, kESI, \ + offsetof(ArgumentRegisters, regname) + MulticastFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgumentRegisters()); } + + ENUM_ARGUMENT_REGISTERS_BACKWARD(); + + #undef ARGUMENT_REGISTER + + // mov SCRATCHREG, [ecx+Delegate._invocationList] ;;fetch invocation list + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfInvocationList()); + + // mov SCRATCHREG, [SCRATCHREG+m_Array+edi*4] ;; index into invocation list + X86EmitOp(0x8b, kEAX, SCRATCH_REGISTER_X86REG, PtrArray::GetDataOffset(), kEDI, sizeof(void*)); + + // mov THISREG, [SCRATCHREG+Delegate.object] ;;replace "this" pointer + X86EmitIndexRegLoad(THIS_kREG, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfTarget()); + + // call [SCRATCHREG+Delegate.target] ;; call current subscriber + X86EmitOffsetModRM(0xff, (X86Reg)2, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfMethodPtr()); + INDEBUG(Emit8(0x90)); // Emit a nop after the call in debug so that + // we know that this is a call that can directly call + // managed code + + // inc edi + Emit8(0x47); + + if (hash & 2) // CorTypeInfo::IsFloat(pSig->GetReturnType()) + { + // if the return value is a float/double check if we just did the last call - if not, + // emit the pop of the float stack + + // mov SCRATCHREG, [esi + this] ;; get delegate + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, kESI, thisRegOffset); + + // cmp edi,[SCRATCHREG]._invocationCount + X86EmitOffsetModRM(0x3b, kEDI, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfInvocationCount()); + + CodeLabel *pNoFloatStackPopLabel = NewCodeLabel(); + + // je NOFLOATSTACKPOP + X86EmitCondJump(pNoFloatStackPopLabel, X86CondCode::kJZ); + + // fstp 0 + Emit16(0xd8dd); + + // NoFloatStackPopLabel: + EmitLabel(pNoFloatStackPopLabel); + } + +#endif // _TARGET_AMD64_ + + // The debugger may need to stop here, so grab the offset of this code. + EmitPatchLabel(); + + // jmp LOOP + X86EmitNearJump(pLoopLabel); + + //ENDLOOP: + EmitLabel(pEndLoopLabel); + + // pop edi ;; Restore edi + X86EmitPopReg(kEDI); + + EmitCheckGSCookie(kESI, MulticastFrame::GetOffsetOfGSCookie()); + + // Epilog + EmitMethodStubEpilog(numStackBytes, MulticastFrame::GetOffsetOfTransitionBlock()); +} + +VOID StubLinkerCPU::EmitSecureDelegateInvoke(UINT_PTR hash) +{ + STANDARD_VM_CONTRACT; + + int thisRegOffset = SecureDelegateFrame::GetOffsetOfTransitionBlock() + + TransitionBlock::GetOffsetOfArgumentRegisters() + offsetof(ArgumentRegisters, THIS_REG); + + // push the methoddesc on the stack + // mov eax, [ecx + offsetof(_invocationCount)] + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfInvocationCount()); + + // Push a SecureDelegateFrame on the stack. 
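+    // (EmitMethodStubProlog saves the callee-saved registers and links the
+    // new frame into the thread's frame chain; the EmitMethodStubEpilog call
+    // at the end of this stub undoes both.)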
+ EmitMethodStubProlog(SecureDelegateFrame::GetMethodFrameVPtr(), SecureDelegateFrame::GetOffsetOfTransitionBlock()); + +#ifdef _TARGET_X86_ + // Frame is ready to be inspected by debugger for patch location + EmitPatchLabel(); +#else // _TARGET_AMD64_ + + // Save register arguments in their home locations. + // Non-FP registers are already saved by EmitMethodStubProlog. + // (Assumes Sig.NextArg() does not enum RetBuffArg or "this".) + + int argNum = 0; + __int32 argOfs = SecureDelegateFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + CorElementType argTypes[4]; + CorElementType argType; + + // 'this' + argOfs += sizeof(void*); + argTypes[argNum] = ELEMENT_TYPE_I8; + argNum++; + + do + { + argType = ELEMENT_TYPE_END; + + switch ((hash >> (2 * argNum)) & 3) + { + case 0: + argType = ELEMENT_TYPE_END; + break; + case 1: + argType = ELEMENT_TYPE_R4; + + // movss dword ptr [rsp + argOfs], xmm? + X64EmitMovSSToMem((X86Reg)argNum, kRSI, argOfs); + break; + case 2: + argType = ELEMENT_TYPE_R8; + + // movsd qword ptr [rsp + argOfs], xmm? + X64EmitMovSSToMem((X86Reg)argNum, kRSI, argOfs); + break; + default: + argType = ELEMENT_TYPE_I; + break; + } + + argOfs += sizeof(void*); + argTypes[argNum] = argType; + argNum++; + } + while (argNum < 4 && ELEMENT_TYPE_END != argType); + + _ASSERTE(4 == argNum || ELEMENT_TYPE_END == argTypes[argNum-1]); + +#endif // _TARGET_AMD64_ + + // mov ecx, [esi + this] ;; get delegate + X86EmitIndexRegLoad(THIS_kREG, kESI, thisRegOffset); + +#ifdef _TARGET_AMD64_ + + INT32 numStackBytes = (INT32)((hash >> 8) * sizeof(void *)); + + INT32 stackUsed, numStackArgs, ofs; + + // Push any stack args, plus an extra location + // for rsp alignment if needed + + numStackArgs = numStackBytes / sizeof(void*); + + // 1 push above, so stack is currently misaligned + const unsigned STACK_ALIGN_ADJUST = 0; + + if (!numStackArgs) + { + // sub rsp, 28h ;; 4 reg arg home locs + rsp alignment + stackUsed = 0x20 + STACK_ALIGN_ADJUST; + X86EmitSubEsp(stackUsed); + } + else + { + stackUsed = numStackArgs * sizeof(void*); + + // If the stack is misaligned, then an odd number of arguments + // will naturally align the stack. 
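+        // (Here STACK_ALIGN_ADJUST is 0, so the extra "push rax" below is
+        // emitted only when the number of stack-arg pushes is odd.)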
+ if ( ((numStackArgs & 1) == 0) + != (STACK_ALIGN_ADJUST == 0)) + { + X86EmitPushReg(kRAX); + stackUsed += sizeof(void*); + } + + ofs = SecureDelegateFrame::GetOffsetOfTransitionBlock() + + TransitionBlock::GetOffsetOfArgs() + sizeof(ArgumentRegisters) + numStackBytes; + + while (numStackArgs--) + { + ofs -= sizeof(void*); + + // push [rsi + ofs] ;; Push stack args + X86EmitIndexPush(kESI, ofs); + } + + // sub rsp, 20h ;; Create 4 reg arg home locations + X86EmitSubEsp(0x20); + + stackUsed += 0x20; + } + + int thisArgNum = 0; + + for( + argNum = 0, argOfs = SecureDelegateFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + argNum < 4 && argTypes[argNum] != ELEMENT_TYPE_END; + argNum++, argOfs += sizeof(void*) + ) + { + switch (argTypes[argNum]) + { + case ELEMENT_TYPE_R4: + // movss xmm?, dword ptr [rsi + argOfs] + X64EmitMovSSFromMem((X86Reg)argNum, kRSI, argOfs); + break; + case ELEMENT_TYPE_R8: + // movsd xmm?, qword ptr [rsi + argOfs] + X64EmitMovSDFromMem((X86Reg)argNum, kRSI, argOfs); + break; + default: + if (c_argRegs[argNum] != THIS_kREG) + { + // mov r*, [rsi + dstOfs] + X86EmitIndexRegLoad(c_argRegs[argNum], kESI,argOfs); + } + break; + } // switch + } + + // mov SCRATCHREG, [rcx+Delegate._invocationList] ;;fetch the inner delegate + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfInvocationList()); + + // mov THISREG, [SCRATCHREG+Delegate.object] ;;replace "this" pointer + X86EmitIndexRegLoad(c_argRegs[thisArgNum], SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfTarget()); + + // call [SCRATCHREG+Delegate.target] ;; call current subscriber + X86EmitOffsetModRM(0xff, (X86Reg)2, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfMethodPtr()); + + // add rsp, stackUsed ;; Clean up stack + X86EmitAddEsp(stackUsed); + +#else // _TARGET_AMD64_ + + UINT16 numStackBytes = static_cast(hash & ~3); + + // ..repush & reenregister args.. + INT32 ofs = numStackBytes + SecureDelegateFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs(); + while (ofs != SecureDelegateFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgs()) + { + ofs -= sizeof(void*); + X86EmitIndexPush(kESI, ofs); + } + + #define ARGUMENT_REGISTER(regname) if (k##regname != THIS_kREG) { X86EmitIndexRegLoad(k##regname, kESI, \ + offsetof(ArgumentRegisters, regname) + SecureDelegateFrame::GetOffsetOfTransitionBlock() + TransitionBlock::GetOffsetOfArgumentRegisters()); } + + ENUM_ARGUMENT_REGISTERS_BACKWARD(); + + #undef ARGUMENT_REGISTER + + // mov SCRATCHREG, [ecx+Delegate._invocationList] ;;fetch the inner delegate + X86EmitIndexRegLoad(SCRATCH_REGISTER_X86REG, THIS_kREG, DelegateObject::GetOffsetOfInvocationList()); + + // mov THISREG, [SCRATCHREG+Delegate.object] ;;replace "this" pointer + X86EmitIndexRegLoad(THIS_kREG, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfTarget()); + + // call [SCRATCHREG+Delegate.target] ;; call current subscriber + X86EmitOffsetModRM(0xff, (X86Reg)2, SCRATCH_REGISTER_X86REG, DelegateObject::GetOffsetOfMethodPtr()); + INDEBUG(Emit8(0x90)); // Emit a nop after the call in debug so that + // we know that this is a call that can directly call + // managed code + +#endif // _TARGET_AMD64_ + + // The debugger may need to stop here, so grab the offset of this code. 
+ EmitPatchLabel(); + + EmitCheckGSCookie(kESI, SecureDelegateFrame::GetOffsetOfGSCookie()); + + // Epilog + EmitMethodStubEpilog(numStackBytes, SecureDelegateFrame::GetOffsetOfTransitionBlock()); +} + +#ifndef FEATURE_ARRAYSTUB_AS_IL + +// Little helper to generate code to move nbytes bytes of non Ref memory + +void generate_noref_copy (unsigned nbytes, StubLinkerCPU* sl) +{ + CONTRACTL + { + THROWS; + GC_NOTRIGGER; + INJECT_FAULT(COMPlusThrowOM();); + } + CONTRACTL_END; + + // If the size is pointer-aligned, we'll use movsd + if (IS_ALIGNED(nbytes, sizeof(void*))) + { + // If there are less than 4 pointers to copy, "unroll" the "rep movsd" + if (nbytes <= 3*sizeof(void*)) + { + while (nbytes > 0) + { + // movsd + sl->X86_64BitOperands(); + sl->Emit8(0xa5); + + nbytes -= sizeof(void*); + } + } + else + { + // mov ECX, size / 4 + sl->Emit8(0xb8+kECX); + sl->Emit32(nbytes / sizeof(void*)); + + // repe movsd + sl->Emit8(0xf3); + sl->X86_64BitOperands(); + sl->Emit8(0xa5); + } + } + else + { + // mov ECX, size + sl->Emit8(0xb8+kECX); + sl->Emit32(nbytes); + + // repe movsb + sl->Emit16(0xa4f3); + } +} + + +X86Reg LoadArrayOpArg ( + UINT32 idxloc, + StubLinkerCPU *psl, + X86Reg kRegIfFromMem, + UINT ofsadjust + AMD64_ARG(StubLinkerCPU::X86OperandSize OperandSize = StubLinkerCPU::k64BitOp) + ) +{ + STANDARD_VM_CONTRACT; + + if (!TransitionBlock::IsStackArgumentOffset(idxloc)) + return GetX86ArgumentRegisterFromOffset(idxloc - TransitionBlock::GetOffsetOfArgumentRegisters()); + + psl->X86EmitEspOffset(0x8b, kRegIfFromMem, idxloc + ofsadjust AMD64_ARG(OperandSize)); + return kRegIfFromMem; +} + +VOID StubLinkerCPU::EmitArrayOpStubThrow(unsigned exConst, unsigned cbRetArg) +{ + STANDARD_VM_CONTRACT; + + //ArrayOpStub*Exception + X86EmitPopReg(kESI); + X86EmitPopReg(kEDI); + + //mov CORINFO_NullReferenceException_ASM, %ecx + Emit8(0xb8 | kECX); + Emit32(exConst); + //InternalExceptionWorker + + X86EmitPopReg(kEDX); + // add pArrayOpScript->m_cbretpop, %esp (was add %eax, %esp) + Emit8(0x81); + Emit8(0xc0 | 0x4); + Emit32(cbRetArg); + X86EmitPushReg(kEDX); + X86EmitNearJump(NewExternalCodeLabel((PVOID)JIT_InternalThrow)); +} + +//=========================================================================== +// Emits code to do an array operation. +#ifdef _PREFAST_ +#pragma warning(push) +#pragma warning(disable:21000) // Suppress PREFast warning about overly large function +#endif +VOID StubLinkerCPU::EmitArrayOpStub(const ArrayOpScript* pArrayOpScript) +{ + STANDARD_VM_CONTRACT; + + // This is the offset to the parameters/what's already pushed on the stack: + // return address. + const INT locsize = sizeof(void*); + + // ArrayOpScript's stack offsets are built using ArgIterator, which + // assumes a TransitionBlock has been pushed, which is not the case + // here. rsp + ofsadjust should point at the first argument. Any further + // stack modifications below need to adjust ofsadjust appropriately. + // baseofsadjust needs to be the stack adjustment at the entry point - + // this is used further below to compute how much stack space was used. 
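+    // For example, on x86 the two callee-saved pushes emitted below add
+    // 2*sizeof(void*) back into ofsadjust so that esp + ofsadjust keeps
+    // pointing at the first argument.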
+ + INT ofsadjust = locsize - (INT)sizeof(TransitionBlock); + + // Register usage + // + // x86 AMD64 + // Inputs: + // managed array THIS_kREG (ecx) THIS_kREG (rcx) + // index 0 edx rdx + // index 1/value r8 + // index 2/value r9 + // expected element type for LOADADDR eax rax rdx + // Working registers: + // total (accumulates unscaled offset) edi r10 + // factor (accumulates the slice factor) esi r11 + X86Reg kArrayRefReg = THIS_kREG; +#ifdef _TARGET_AMD64_ + const X86Reg kArrayMTReg = kR10; + const X86Reg kTotalReg = kR10; + const X86Reg kFactorReg = kR11; +#else + const X86Reg kArrayMTReg = kESI; + const X86Reg kTotalReg = kEDI; + const X86Reg kFactorReg = kESI; +#endif + +#ifdef _TARGET_AMD64_ + // Simplifying assumption for fNeedPrologue. + _ASSERTE(!pArrayOpScript->m_gcDesc || (pArrayOpScript->m_flags & ArrayOpScript::NEEDSWRITEBARRIER)); + // Simplifying assumption for saving rsi and rdi. + _ASSERTE(!(pArrayOpScript->m_flags & ArrayOpScript::HASRETVALBUFFER) || ArgIterator::IsArgPassedByRef(pArrayOpScript->m_elemsize)); + + // Cases where we need to make calls + BOOL fNeedScratchArea = ( (pArrayOpScript->m_flags & (ArrayOpScript::NEEDSTYPECHECK | ArrayOpScript::NEEDSWRITEBARRIER)) + && ( pArrayOpScript->m_op == ArrayOpScript::STORE + || ( pArrayOpScript->m_op == ArrayOpScript::LOAD + && (pArrayOpScript->m_flags & ArrayOpScript::HASRETVALBUFFER)))); + + // Cases where we need to copy large values + BOOL fNeedRSIRDI = ( ArgIterator::IsArgPassedByRef(pArrayOpScript->m_elemsize) + && ArrayOpScript::LOADADDR != pArrayOpScript->m_op); + + BOOL fNeedPrologue = ( fNeedScratchArea + || fNeedRSIRDI); +#endif + + X86Reg kValueReg; + + CodeLabel *Epilog = NewCodeLabel(); + CodeLabel *Inner_nullexception = NewCodeLabel(); + CodeLabel *Inner_rangeexception = NewCodeLabel(); + CodeLabel *Inner_typeMismatchexception = NULL; + + // + // Set up the stack frame. + // + // + // x86: + // value + // + // ... + // + // return address + // saved edi + // esp -> saved esi + // + // + // AMD64: + // value, if rank > 2 + // ... + // + 0x48 more indices + // + 0x40 r9 home + // + 0x38 r8 home + // + 0x30 rdx home + // + 0x28 rcx home + // + 0x20 return address + // + 0x18 scratch area (callee's r9) + // + 0x10 scratch area (callee's r8) + // + 8 scratch area (callee's rdx) + // rsp -> scratch area (callee's rcx) + // + // If the element type is a value class w/ object references, then rsi + // and rdi will also be saved above the scratch area: + // + // ... + // + 0x28 saved rsi + // + 0x20 saved rdi + // + 0x18 scratch area (callee's r9) + // + 0x10 scratch area (callee's r8) + // + 8 scratch area (callee's rdx) + // rsp -> scratch area (callee's rcx) + // + // And if no call or movsb is necessary, then the scratch area sits + // directly under the MethodDesc*. + + BOOL fSavedESI = FALSE; + BOOL fSavedEDI = FALSE; + +#ifdef _TARGET_AMD64_ + if (fNeedPrologue) + { + // Save argument registers if we'll be making a call before using + // them. Note that in this case the element value will always be an + // object type, and never be in an xmm register. 
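+        // (The stores below spill rcx, rdx, r8 (and r9 when rank >= 2) into
+        // their caller-allocated home slots at [rsp+08h]..[rsp+20h]; rsp
+        // still points at the return address at this point.)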
+ + if ( (pArrayOpScript->m_flags & ArrayOpScript::NEEDSTYPECHECK) + && ArrayOpScript::STORE == pArrayOpScript->m_op) + { + // mov [rsp+0x08], rcx + X86EmitEspOffset(0x89, kRCX, 0x08); + X86EmitEspOffset(0x89, kRDX, 0x10); + X86EmitEspOffset(0x89, kR8, 0x18); + + if (pArrayOpScript->m_rank >= 2) + X86EmitEspOffset(0x89, kR9, 0x20); + } + + if (fNeedRSIRDI) + { + X86EmitPushReg(kRSI); + X86EmitPushReg(kRDI); + + fSavedESI = fSavedEDI = TRUE; + + ofsadjust += 0x10; + } + + if (fNeedScratchArea) + { + // Callee scratch area (0x8 for aligned esp) + X86EmitSubEsp(sizeof(ArgumentRegisters) + 0x8); + ofsadjust += sizeof(ArgumentRegisters) + 0x8; + } + } +#else + // Preserve the callee-saved registers + // NOTE: if you change the sequence of these pushes, you must also update: + // ArrayOpStubNullException + // ArrayOpStubRangeException + // ArrayOpStubTypeMismatchException + _ASSERTE( kTotalReg == kEDI); + X86EmitPushReg(kTotalReg); + _ASSERTE( kFactorReg == kESI); + X86EmitPushReg(kFactorReg); + + fSavedESI = fSavedEDI = TRUE; + + ofsadjust += 2*sizeof(void*); +#endif + + // Check for null. + X86EmitR2ROp(0x85, kArrayRefReg, kArrayRefReg); // TEST ECX, ECX + X86EmitCondJump(Inner_nullexception, X86CondCode::kJZ); // jz Inner_nullexception + + // Do Type Check if needed + if (pArrayOpScript->m_flags & ArrayOpScript::NEEDSTYPECHECK) + { + if (pArrayOpScript->m_op == ArrayOpScript::STORE) + { + // Get the value to be stored. + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, kEAX, ofsadjust); + + X86EmitR2ROp(0x85, kValueReg, kValueReg); // TEST kValueReg, kValueReg + CodeLabel *CheckPassed = NewCodeLabel(); + X86EmitCondJump(CheckPassed, X86CondCode::kJZ); // storing NULL is OK + + // mov EAX, element type ; possibly trashes kValueReg + X86EmitOp(0x8b, kArrayMTReg, kArrayRefReg, 0 AMD64_ARG(k64BitOp)); // mov ESI/R10, [kArrayRefReg] + + X86EmitOp(0x8b, kEAX, kValueReg, 0 AMD64_ARG(k64BitOp)); // mov EAX, [kValueReg] ; possibly trashes kValueReg + // cmp EAX, [ESI/R10+m_ElementType] + + X86EmitOp(0x3b, kEAX, kArrayMTReg, MethodTable::GetOffsetOfArrayElementTypeHandle() AMD64_ARG(k64BitOp)); + X86EmitCondJump(CheckPassed, X86CondCode::kJZ); // Exact match is OK + + X86EmitRegLoad(kEAX, (UINT_PTR)g_pObjectClass); // mov EAX, g_pObjectMethodTable + // cmp EAX, [ESI/R10+m_ElementType] + + X86EmitOp(0x3b, kEAX, kArrayMTReg, MethodTable::GetOffsetOfArrayElementTypeHandle() AMD64_ARG(k64BitOp)); + X86EmitCondJump(CheckPassed, X86CondCode::kJZ); // Assigning to array of object is OK + + // Try to call the fast helper first ( ObjIsInstanceOfNoGC ). + // If that fails we will fall back to calling the slow helper ( ArrayStoreCheck ) that erects a frame. 
+ // See also JitInterfaceX86::JIT_Stelem_Ref + +#ifdef _TARGET_AMD64_ + // RCX contains pointer to object to check (Object*) + // RDX contains array type handle + + // mov RCX, [rsp+offsetToObject] ; RCX = Object* + X86EmitEspOffset(0x8b, kRCX, ofsadjust + pArrayOpScript->m_fValLoc); + + // get Array TypeHandle + // mov RDX, [RSP+offsetOfTypeHandle] + + X86EmitEspOffset(0x8b, kRDX, ofsadjust + + TransitionBlock::GetOffsetOfArgumentRegisters() + + FIELD_OFFSET(ArgumentRegisters, THIS_REG)); + + // mov RDX, [kArrayMTReg+offsetof(MethodTable, m_ElementType)] + X86EmitIndexRegLoad(kRDX, kArrayMTReg, MethodTable::GetOffsetOfArrayElementTypeHandle()); + +#else + X86EmitPushReg(kEDX); // Save EDX + X86EmitPushReg(kECX); // Pass array object + + X86EmitIndexPush(kArrayMTReg, MethodTable::GetOffsetOfArrayElementTypeHandle()); // push [kArrayMTReg + m_ElementType] ; Array element type handle + + // get address of value to store + _ASSERTE(TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)); // on x86, value will never get a register + X86EmitSPIndexPush(pArrayOpScript->m_fValLoc + ofsadjust + 3*sizeof(void*)); // push [ESP+offset] ; the object pointer + +#endif //_AMD64 + + + // emit a call to the fast helper + // One side effect of this is that we are going to generate a "jnz Epilog" and we DON'T need it + // in the fast path, however there are no side effects in emitting + // it in the fast path anyway. the reason for that is that it makes + // the cleanup code much easier ( we have only 1 place to cleanup the stack and + // restore it to the original state ) + X86EmitCall(NewExternalCodeLabel((LPVOID)ObjIsInstanceOfNoGC), 0); + X86EmitCmpRegImm32( kEAX, TypeHandle::CanCast); // CMP EAX, CanCast ; if ObjIsInstanceOfNoGC returns CanCast, we will go the fast path + CodeLabel * Cleanup = NewCodeLabel(); + X86EmitCondJump(Cleanup, X86CondCode::kJZ); + +#ifdef _TARGET_AMD64_ + // get address of value to store + // lea rcx, [rsp+offs] + X86EmitEspOffset(0x8d, kRCX, ofsadjust + pArrayOpScript->m_fValLoc); + + // get address of 'this'/rcx + // lea rdx, [rsp+offs] + X86EmitEspOffset(0x8d, kRDX, ofsadjust + + TransitionBlock::GetOffsetOfArgumentRegisters() + + FIELD_OFFSET(ArgumentRegisters, THIS_REG)); + +#else + // The stack is already setup correctly for the slow helper. 
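+            // Both arguments end up being passed by address: ecx gets the
+            // address of the value slot on the stack and edx gets the address
+            // of the saved array reference pushed above.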
+ _ASSERTE(TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)); // on x86, value will never get a register + X86EmitEspOffset(0x8d, kECX, pArrayOpScript->m_fValLoc + ofsadjust + 2*sizeof(void*)); // lea ECX, [ESP+offset] + + // get address of 'this' + X86EmitEspOffset(0x8d, kEDX, 0); // lea EDX, [ESP] ; (address of ECX) + + +#endif + AMD64_ONLY(_ASSERTE(fNeedScratchArea)); + X86EmitCall(NewExternalCodeLabel((LPVOID)ArrayStoreCheck), 0); + + EmitLabel(Cleanup); +#ifdef _TARGET_AMD64_ + X86EmitEspOffset(0x8b, kRCX, 0x00 + ofsadjust + TransitionBlock::GetOffsetOfArgumentRegisters()); + X86EmitEspOffset(0x8b, kRDX, 0x08 + ofsadjust + TransitionBlock::GetOffsetOfArgumentRegisters()); + X86EmitEspOffset(0x8b, kR8, 0x10 + ofsadjust + TransitionBlock::GetOffsetOfArgumentRegisters()); + + if (pArrayOpScript->m_rank >= 2) + X86EmitEspOffset(0x8b, kR9, 0x18 + ofsadjust + TransitionBlock::GetOffsetOfArgumentRegisters()); +#else + X86EmitPopReg(kECX); // restore regs + X86EmitPopReg(kEDX); + + + X86EmitR2ROp(0x3B, kEAX, kEAX); // CMP EAX, EAX + X86EmitCondJump(Epilog, X86CondCode::kJNZ); // This branch never taken, but epilog walker uses it +#endif + + EmitLabel(CheckPassed); + } + else + { + _ASSERTE(pArrayOpScript->m_op == ArrayOpScript::LOADADDR); + + // Load up the hidden type parameter into 'typeReg' + X86Reg typeReg = LoadArrayOpArg(pArrayOpScript->m_typeParamOffs, this, kEAX, ofsadjust); + + // 'typeReg' holds the typeHandle for the ARRAY. This must be a ArrayTypeDesc*, so + // mask off the low two bits to get the TypeDesc* + X86EmitR2ROp(0x83, (X86Reg)4, typeReg); // AND typeReg, 0xFFFFFFFC + Emit8(0xFC); + + // If 'typeReg' is NULL then we're executing the readonly ::Address and no type check is + // needed. + CodeLabel *Inner_passedTypeCheck = NewCodeLabel(); + + X86EmitCondJump(Inner_passedTypeCheck, X86CondCode::kJZ); + + // Get the parameter of the parameterize type + // mov typeReg, [typeReg.m_Arg] + X86EmitOp(0x8b, typeReg, typeReg, offsetof(ParamTypeDesc, m_Arg) AMD64_ARG(k64BitOp)); + + // Compare this against the element type of the array. + // mov ESI/R10, [kArrayRefReg] + X86EmitOp(0x8b, kArrayMTReg, kArrayRefReg, 0 AMD64_ARG(k64BitOp)); + // cmp typeReg, [ESI/R10+m_ElementType]; + X86EmitOp(0x3b, typeReg, kArrayMTReg, MethodTable::GetOffsetOfArrayElementTypeHandle() AMD64_ARG(k64BitOp)); + + // Throw error if not equal + Inner_typeMismatchexception = NewCodeLabel(); + X86EmitCondJump(Inner_typeMismatchexception, X86CondCode::kJNZ); + EmitLabel(Inner_passedTypeCheck); + } + } + + CodeLabel* DoneCheckLabel = 0; + if (pArrayOpScript->m_rank == 1 && pArrayOpScript->m_fHasLowerBounds) + { + DoneCheckLabel = NewCodeLabel(); + CodeLabel* NotSZArrayLabel = NewCodeLabel(); + + // for rank1 arrays, we might actually have two different layouts depending on + // if we are ELEMENT_TYPE_ARRAY or ELEMENT_TYPE_SZARRAY. + + // mov EAX, [ARRAY] // EAX holds the method table + X86_64BitOperands(); + X86EmitOp(0x8b, kEAX, kArrayRefReg); + + // test [EAX + m_dwFlags], enum_flag_Category_IfArrayThenSzArray + X86_64BitOperands(); + X86EmitOffsetModRM(0xf7, (X86Reg)0, kEAX, MethodTable::GetOffsetOfFlags()); + Emit32(MethodTable::GetIfArrayThenSzArrayFlag()); + + // jz NotSZArrayLabel + X86EmitCondJump(NotSZArrayLabel, X86CondCode::kJZ); + + //Load the passed-in index into the scratch register. 
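+        // (An SZARRAY has a single zero-based index, so it is range-checked
+        // directly against the component count here; the shared loop below
+        // handles lower bounds for the general case.)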
+ const ArrayOpIndexSpec *pai = pArrayOpScript->GetArrayOpIndexSpecs(); + X86Reg idxReg = LoadArrayOpArg(pai->m_idxloc, this, SCRATCH_REGISTER_X86REG, ofsadjust); + + // cmp idxReg, [kArrayRefReg + LENGTH] + X86EmitOp(0x3b, idxReg, kArrayRefReg, ArrayBase::GetOffsetOfNumComponents()); + + // jae Inner_rangeexception + X86EmitCondJump(Inner_rangeexception, X86CondCode::kJAE); + + // if we cared efficiency of this, this move can be optimized + X86EmitR2ROp(0x8b, kTotalReg, idxReg AMD64_ARG(k32BitOp)); + + // sub ARRAY. 8 // 8 is accounts for the Lower bound and Dim count in the ARRAY + X86EmitSubReg(kArrayRefReg, 8); // adjust this pointer so that indexing works out for SZARRAY + + X86EmitNearJump(DoneCheckLabel); + EmitLabel(NotSZArrayLabel); + } + + // For each index, range-check and mix into accumulated total. + UINT idx = pArrayOpScript->m_rank; + BOOL firstTime = TRUE; + while (idx--) + { + const ArrayOpIndexSpec *pai = pArrayOpScript->GetArrayOpIndexSpecs() + idx; + + //Load the passed-in index into the scratch register. + X86Reg srcreg = LoadArrayOpArg(pai->m_idxloc, this, SCRATCH_REGISTER_X86REG, ofsadjust AMD64_ARG(k32BitOp)); + if (SCRATCH_REGISTER_X86REG != srcreg) + X86EmitR2ROp(0x8b, SCRATCH_REGISTER_X86REG, srcreg AMD64_ARG(k32BitOp)); + + // sub SCRATCH, dword ptr [kArrayRefReg + LOWERBOUND] + if (pArrayOpScript->m_fHasLowerBounds) + { + X86EmitOp(0x2b, SCRATCH_REGISTER_X86REG, kArrayRefReg, pai->m_lboundofs); + } + + // cmp SCRATCH, dword ptr [kArrayRefReg + LENGTH] + X86EmitOp(0x3b, SCRATCH_REGISTER_X86REG, kArrayRefReg, pai->m_lengthofs); + + // jae Inner_rangeexception + X86EmitCondJump(Inner_rangeexception, X86CondCode::kJAE); + + + // SCRATCH == idx - LOWERBOUND + // + // imul SCRATCH, FACTOR + if (!firstTime) + { + //Can skip the first time since FACTOR==1 + X86EmitR2ROp(0xaf0f, SCRATCH_REGISTER_X86REG, kFactorReg AMD64_ARG(k32BitOp)); + } + + // TOTAL += SCRATCH + if (firstTime) + { + // First time, we must zero-init TOTAL. Since + // zero-initing and then adding is just equivalent to a + // "mov", emit a "mov" + // mov TOTAL, SCRATCH + X86EmitR2ROp(0x8b, kTotalReg, SCRATCH_REGISTER_X86REG AMD64_ARG(k32BitOp)); + } + else + { + // add TOTAL, SCRATCH + X86EmitR2ROp(0x03, kTotalReg, SCRATCH_REGISTER_X86REG AMD64_ARG(k32BitOp)); + } + + // FACTOR *= [kArrayRefReg + LENGTH] + if (idx != 0) + { + // No need to update FACTOR on the last iteration + // since we won't use it again + + if (firstTime) + { + // must init FACTOR to 1 first: hence, + // the "imul" becomes a "mov" + // mov FACTOR, [kArrayRefReg + LENGTH] + X86EmitOp(0x8b, kFactorReg, kArrayRefReg, pai->m_lengthofs); + } + else + { + // imul FACTOR, [kArrayRefReg + LENGTH] + X86EmitOp(0xaf0f, kFactorReg, kArrayRefReg, pai->m_lengthofs); + } + } + + firstTime = FALSE; + } + + if (DoneCheckLabel != 0) + EmitLabel(DoneCheckLabel); + + // Pass these values to X86EmitArrayOp() to generate the element address. + X86Reg elemBaseReg = kArrayRefReg; + X86Reg elemScaledReg = kTotalReg; + UINT32 elemSize = pArrayOpScript->m_elemsize; + UINT32 elemOfs = pArrayOpScript->m_ofsoffirst; + + if (!(elemSize == 1 || elemSize == 2 || elemSize == 4 || elemSize == 8)) + { + switch (elemSize) + { + // No way to express this as a SIB byte. Fold the scale + // into TOTAL. 
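+        // (SIB scale factors are limited to 1, 2, 4 or 8, so larger element
+        // sizes are applied to TOTAL directly and elemScale collapses to 1.)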
+ + case 16: + // shl TOTAL,4 + X86EmitR2ROp(0xc1, (X86Reg)4, kTotalReg AMD64_ARG(k32BitOp)); + Emit8(4); + break; + + case 32: + // shl TOTAL,5 + X86EmitR2ROp(0xc1, (X86Reg)4, kTotalReg AMD64_ARG(k32BitOp)); + Emit8(5); + break; + + case 64: + // shl TOTAL,6 + X86EmitR2ROp(0xc1, (X86Reg)4, kTotalReg AMD64_ARG(k32BitOp)); + Emit8(6); + break; + + default: + // imul TOTAL, elemScale + X86EmitR2ROp(0x69, kTotalReg, kTotalReg AMD64_ARG(k32BitOp)); + Emit32(elemSize); + break; + } + elemSize = 1; + } + + _ASSERTE(FitsInU1(elemSize)); + BYTE elemScale = static_cast(elemSize); + + // Now, do the operation: + + switch (pArrayOpScript->m_op) + { + case ArrayOpScript::LOADADDR: + // lea eax, ELEMADDR + X86EmitOp(0x8d, kEAX, elemBaseReg, elemOfs, elemScaledReg, elemScale AMD64_ARG(k64BitOp)); + break; + + case ArrayOpScript::LOAD: + if (pArrayOpScript->m_flags & ArrayOpScript::HASRETVALBUFFER) + { + // Ensure that these registers have been saved! + _ASSERTE(fSavedESI && fSavedEDI); + + //lea esi, ELEMADDR + X86EmitOp(0x8d, kESI, elemBaseReg, elemOfs, elemScaledReg, elemScale AMD64_ARG(k64BitOp)); + + _ASSERTE(!TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fRetBufLoc)); + // mov edi, retbufptr + X86EmitR2ROp(0x8b, kEDI, GetX86ArgumentRegisterFromOffset(pArrayOpScript->m_fRetBufLoc - TransitionBlock::GetOffsetOfArgumentRegisters())); + +COPY_VALUE_CLASS: + { + size_t size = pArrayOpScript->m_elemsize; + size_t total = 0; + if(pArrayOpScript->m_gcDesc) + { + CGCDescSeries* cur = pArrayOpScript->m_gcDesc->GetHighestSeries(); + if ((cur->startoffset-elemOfs) > 0) + generate_noref_copy ((unsigned) (cur->startoffset - elemOfs), this); + total += cur->startoffset - elemOfs; + + SSIZE_T cnt = (SSIZE_T) pArrayOpScript->m_gcDesc->GetNumSeries(); + // special array encoding + _ASSERTE(cnt < 0); + + for (SSIZE_T __i = 0; __i > cnt; __i--) + { + HALF_SIZE_T skip = cur->val_serie[__i].skip; + HALF_SIZE_T nptrs = cur->val_serie[__i].nptrs; + total += nptrs*sizeof (DWORD*); + do + { + AMD64_ONLY(_ASSERTE(fNeedScratchArea)); + + X86EmitCall(NewExternalCodeLabel((LPVOID) JIT_ByRefWriteBarrier), 0); + } while (--nptrs); + if (skip > 0) + { + //check if we are at the end of the series + if (__i == (cnt + 1)) + skip = skip - (HALF_SIZE_T)(cur->startoffset - elemOfs); + if (skip > 0) + generate_noref_copy (skip, this); + } + total += skip; + } + + _ASSERTE (size == total); + } + else + { + // no ref anywhere, just copy the bytes. + _ASSERTE (size); + generate_noref_copy ((unsigned)size, this); + } + } + } + else + { + switch (pArrayOpScript->m_elemsize) + { + case 1: + // mov[zs]x eax, byte ptr ELEMADDR + X86EmitOp(pArrayOpScript->m_signed ? 0xbe0f : 0xb60f, kEAX, elemBaseReg, elemOfs, elemScaledReg, elemScale); + break; + + case 2: + // mov[zs]x eax, word ptr ELEMADDR + X86EmitOp(pArrayOpScript->m_signed ? 
0xbf0f : 0xb70f, kEAX, elemBaseReg, elemOfs, elemScaledReg, elemScale); + break; + + case 4: + if (pArrayOpScript->m_flags & ArrayOpScript::ISFPUTYPE) + { +#ifdef _TARGET_AMD64_ + // movss xmm0, dword ptr ELEMADDR + Emit8(0xf3); + X86EmitOp(0x100f, (X86Reg)0, elemBaseReg, elemOfs, elemScaledReg, elemScale); +#else // !_TARGET_AMD64_ + // fld dword ptr ELEMADDR + X86EmitOp(0xd9, (X86Reg)0, elemBaseReg, elemOfs, elemScaledReg, elemScale); +#endif // !_TARGET_AMD64_ + } + else + { + // mov eax, ELEMADDR + X86EmitOp(0x8b, kEAX, elemBaseReg, elemOfs, elemScaledReg, elemScale); + } + break; + + case 8: + if (pArrayOpScript->m_flags & ArrayOpScript::ISFPUTYPE) + { +#ifdef _TARGET_AMD64_ + // movsd xmm0, qword ptr ELEMADDR + Emit8(0xf2); + X86EmitOp(0x100f, (X86Reg)0, elemBaseReg, elemOfs, elemScaledReg, elemScale); +#else // !_TARGET_AMD64_ + // fld qword ptr ELEMADDR + X86EmitOp(0xdd, (X86Reg)0, elemBaseReg, elemOfs, elemScaledReg, elemScale); +#endif // !_TARGET_AMD64_ + } + else + { + // mov eax, ELEMADDR + X86EmitOp(0x8b, kEAX, elemBaseReg, elemOfs, elemScaledReg, elemScale AMD64_ARG(k64BitOp)); +#ifdef _TARGET_X86_ + // mov edx, ELEMADDR + 4 + X86EmitOp(0x8b, kEDX, elemBaseReg, elemOfs + 4, elemScaledReg, elemScale); +#endif + } + break; + + default: + _ASSERTE(0); + } + } + + break; + + case ArrayOpScript::STORE: + + switch (pArrayOpScript->m_elemsize) + { + case 1: + // mov SCRATCH, [esp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, SCRATCH_REGISTER_X86REG, ofsadjust); + // mov byte ptr ELEMADDR, SCRATCH.b + X86EmitOp(0x88, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale); + break; + case 2: + // mov SCRATCH, [esp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, SCRATCH_REGISTER_X86REG, ofsadjust); + // mov word ptr ELEMADDR, SCRATCH.w + Emit8(0x66); + X86EmitOp(0x89, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale); + break; + case 4: +#ifndef _TARGET_AMD64_ + if (pArrayOpScript->m_flags & ArrayOpScript::NEEDSWRITEBARRIER) + { + // mov SCRATCH, [esp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, SCRATCH_REGISTER_X86REG, ofsadjust); + + _ASSERTE(SCRATCH_REGISTER_X86REG == kEAX); // value to store is already in EAX where we want it. + // lea edx, ELEMADDR + X86EmitOp(0x8d, kEDX, elemBaseReg, elemOfs, elemScaledReg, elemScale); + + // call JIT_Writeable_Thunks_Buf.WriteBarrierReg[0] (== EAX) + X86EmitCall(NewExternalCodeLabel((LPVOID) &JIT_WriteBarrierEAX), 0); + } + else +#else // _TARGET_AMD64_ + if (pArrayOpScript->m_flags & ArrayOpScript::ISFPUTYPE) + { + if (!TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)) + { + kValueReg = (X86Reg)TransitionBlock::GetArgumentIndexFromOffset(pArrayOpScript->m_fValLoc); + } + else + { + kValueReg = (X86Reg)0; // xmm0 + + // movss xmm0, dword ptr [rsp+??] + Emit8(0xf3); + X86EmitOp(0x100f, kValueReg, (X86Reg)4 /*rsp*/, ofsadjust + pArrayOpScript->m_fValLoc); + } + + // movss dword ptr ELEMADDR, xmm? 
+ Emit8(0xf3); + X86EmitOp(0x110f, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale); + } + else +#endif // _TARGET_AMD64_ + { + // mov SCRATCH, [esp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, SCRATCH_REGISTER_X86REG, ofsadjust AMD64_ARG(k32BitOp)); + + // mov ELEMADDR, SCRATCH + X86EmitOp(0x89, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale); + } + break; + + case 8: + + if (!(pArrayOpScript->m_flags & ArrayOpScript::NEEDSWRITEBARRIER)) + { +#ifdef _TARGET_AMD64_ + if (pArrayOpScript->m_flags & ArrayOpScript::ISFPUTYPE) + { + if (!TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)) + { + kValueReg = (X86Reg)TransitionBlock::GetArgumentIndexFromOffset(pArrayOpScript->m_fValLoc); + } + else + { + kValueReg = (X86Reg)0; // xmm0 + + // movsd xmm0, qword ptr [rsp+??] + Emit8(0xf2); + X86EmitOp(0x100f, kValueReg, (X86Reg)4 /*rsp*/, ofsadjust + pArrayOpScript->m_fValLoc); + } + + // movsd qword ptr ELEMADDR, xmm? + Emit8(0xf2); + X86EmitOp(0x110f, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale); + } + else + { + // mov SCRATCH, [esp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, SCRATCH_REGISTER_X86REG, ofsadjust); + + // mov ELEMADDR, SCRATCH + X86EmitOp(0x89, kValueReg, elemBaseReg, elemOfs, elemScaledReg, elemScale, k64BitOp); + } +#else // !_TARGET_AMD64_ + _ASSERTE(TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)); // on x86, value will never get a register: so too lazy to implement that case + // mov SCRATCH, [esp + valoffset] + X86EmitEspOffset(0x8b, SCRATCH_REGISTER_X86REG, pArrayOpScript->m_fValLoc + ofsadjust); + // mov ELEMADDR, SCRATCH + X86EmitOp(0x89, SCRATCH_REGISTER_X86REG, elemBaseReg, elemOfs, elemScaledReg, elemScale); + + _ASSERTE(TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)); // on x86, value will never get a register: so too lazy to implement that case + // mov SCRATCH, [esp + valoffset + 4] + X86EmitEspOffset(0x8b, SCRATCH_REGISTER_X86REG, pArrayOpScript->m_fValLoc + ofsadjust + 4); + // mov ELEMADDR+4, SCRATCH + X86EmitOp(0x89, SCRATCH_REGISTER_X86REG, elemBaseReg, elemOfs+4, elemScaledReg, elemScale); +#endif // !_TARGET_AMD64_ + break; + } +#ifdef _TARGET_AMD64_ + else + { + _ASSERTE(SCRATCH_REGISTER_X86REG == kEAX); // value to store is already in EAX where we want it. + // lea rcx, ELEMADDR + X86EmitOp(0x8d, kRCX, elemBaseReg, elemOfs, elemScaledReg, elemScale, k64BitOp); + + // mov rdx, [rsp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, kRDX, ofsadjust); + _ASSERT(kRCX != kValueReg); + if (kRDX != kValueReg) + X86EmitR2ROp(0x8b, kRDX, kValueReg); + + _ASSERTE(fNeedScratchArea); + X86EmitCall(NewExternalCodeLabel((PVOID)JIT_WriteBarrier), 0); + break; + } +#endif // _TARGET_AMD64_ + // FALL THROUGH (on x86) + default: + // Ensure that these registers have been saved! 
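+            // (ESI/EDI are needed because this path sets them up as the source and
+            // destination pointers and then jumps back to the shared COPY_VALUE_CLASS
+            // copy loop used by the LOAD case.)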
+ _ASSERTE(fSavedESI && fSavedEDI); + +#ifdef _TARGET_AMD64_ + // mov rsi, [rsp + valoffset] + kValueReg = LoadArrayOpArg(pArrayOpScript->m_fValLoc, this, kRSI, ofsadjust); + if (kRSI != kValueReg) + X86EmitR2ROp(0x8b, kRSI, kValueReg); +#else // !_TARGET_AMD64_ + _ASSERTE(TransitionBlock::IsStackArgumentOffset(pArrayOpScript->m_fValLoc)); + // lea esi, [esp + valoffset] + X86EmitEspOffset(0x8d, kESI, pArrayOpScript->m_fValLoc + ofsadjust); +#endif // !_TARGET_AMD64_ + + // lea edi, ELEMADDR + X86EmitOp(0x8d, kEDI, elemBaseReg, elemOfs, elemScaledReg, elemScale AMD64_ARG(k64BitOp)); + goto COPY_VALUE_CLASS; + } + break; + + default: + _ASSERTE(0); + } + + EmitLabel(Epilog); + +#ifdef _TARGET_AMD64_ + if (fNeedPrologue) + { + if (fNeedScratchArea) + { + // Throw away scratch area + X86EmitAddEsp(sizeof(ArgumentRegisters) + 0x8); + } + + if (fSavedEDI) + X86EmitPopReg(kRDI); + + if (fSavedESI) + X86EmitPopReg(kRSI); + } + + X86EmitReturn(0); +#else // !_TARGET_AMD64_ + // Restore the callee-saved registers + X86EmitPopReg(kFactorReg); + X86EmitPopReg(kTotalReg); + + // ret N + X86EmitReturn(pArrayOpScript->m_cbretpop); +#endif // !_TARGET_AMD64_ + + // Exception points must clean up the stack for all those extra args. + // kFactorReg and kTotalReg will be popped by the jump targets. + + void *pvExceptionThrowFn; + +#if defined(_TARGET_AMD64_) +#define ARRAYOP_EXCEPTION_HELPERS(base) { (PVOID)base, (PVOID)base##_RSIRDI, (PVOID)base##_ScratchArea, (PVOID)base##_RSIRDI_ScratchArea } + static void *rgNullExceptionHelpers[] = ARRAYOP_EXCEPTION_HELPERS(ArrayOpStubNullException); + static void *rgRangeExceptionHelpers[] = ARRAYOP_EXCEPTION_HELPERS(ArrayOpStubRangeException); + static void *rgTypeMismatchExceptionHelpers[] = ARRAYOP_EXCEPTION_HELPERS(ArrayOpStubTypeMismatchException); +#undef ARRAYOP_EXCEPTION_HELPERS + + UINT iExceptionHelper = (fNeedRSIRDI ? 1 : 0) + (fNeedScratchArea ? 
2 : 0); +#endif // defined(_TARGET_AMD64_) + + EmitLabel(Inner_nullexception); + +#ifndef _TARGET_AMD64_ + pvExceptionThrowFn = (LPVOID)ArrayOpStubNullException; + + Emit8(0xb8); // mov EAX, + Emit32(pArrayOpScript->m_cbretpop); +#else //_TARGET_AMD64_ + pvExceptionThrowFn = rgNullExceptionHelpers[iExceptionHelper]; +#endif //!_TARGET_AMD64_ + X86EmitNearJump(NewExternalCodeLabel(pvExceptionThrowFn)); + + EmitLabel(Inner_rangeexception); +#ifndef _TARGET_AMD64_ + pvExceptionThrowFn = (LPVOID)ArrayOpStubRangeException; + Emit8(0xb8); // mov EAX, + Emit32(pArrayOpScript->m_cbretpop); +#else //_TARGET_AMD64_ + pvExceptionThrowFn = rgRangeExceptionHelpers[iExceptionHelper]; +#endif //!_TARGET_AMD64_ + X86EmitNearJump(NewExternalCodeLabel(pvExceptionThrowFn)); + + if (Inner_typeMismatchexception != NULL) + { + EmitLabel(Inner_typeMismatchexception); +#ifndef _TARGET_AMD64_ + pvExceptionThrowFn = (LPVOID)ArrayOpStubTypeMismatchException; + Emit8(0xb8); // mov EAX, + Emit32(pArrayOpScript->m_cbretpop); +#else //_TARGET_AMD64_ + pvExceptionThrowFn = rgTypeMismatchExceptionHelpers[iExceptionHelper]; +#endif //!_TARGET_AMD64_ + X86EmitNearJump(NewExternalCodeLabel(pvExceptionThrowFn)); + } +} +#ifdef _PREFAST_ +#pragma warning(pop) +#endif + +#endif // FEATURE_ARRAYSTUB_AS_IL + +//=========================================================================== +// Emits code to break into debugger +VOID StubLinkerCPU::EmitDebugBreak() +{ + STANDARD_VM_CONTRACT; + + // int3 + Emit8(0xCC); +} + +#if defined(FEATURE_COMINTEROP) && defined(_TARGET_X86_) + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning (disable : 4740) // There is inline asm code in this function, which disables + // global optimizations. +#pragma warning (disable : 4731) +#endif // _MSC_VER +Thread* __stdcall CreateThreadBlockReturnHr(ComMethodFrame *pFrame) +{ + + WRAPPER_NO_CONTRACT; + + Thread *pThread = NULL; + + HRESULT hr = S_OK; + + // This means that a thread is FIRST coming in from outside the EE. + BEGIN_ENTRYPOINT_THROWS; + pThread = SetupThreadNoThrow(&hr); + END_ENTRYPOINT_THROWS; + + if (pThread == NULL) { + // Unwind stack, and return hr + // NOTE: assumes __stdcall + // Note that this code does not handle the rare COM signatures that do not return HRESULT + // compute the callee pop stack bytes + UINT numArgStackBytes = pFrame->GetNumCallerStackBytes(); + unsigned frameSize = sizeof(Frame) + sizeof(LPVOID); + LPBYTE iEsp = ((LPBYTE)pFrame) + ComMethodFrame::GetOffsetOfCalleeSavedRegisters(); + __asm + { + mov eax, hr + mov edx, numArgStackBytes + //***************************************** + // reset the stack pointer + // none of the locals above can be used in the asm below + // if we wack the stack pointer + mov esp, iEsp + // pop callee saved registers + pop edi + pop esi + pop ebx + pop ebp + pop ecx ; //return address + // pop the callee cleanup stack args + add esp, edx ;// callee cleanup of args + jmp ecx; // jump to the address to continue execution + + // We will never get here. This "ret" is just so that code-disassembling + // profilers know to stop disassembling any further + ret + } + } + + return pThread; +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // defined(FEATURE_COMINTEROP) && defined(_TARGET_X86_) + +#endif // !defined(CROSSGEN_COMPILE) && !defined(FEATURE_STUBS_AS_IL) + +#endif // !DACCESS_COMPILE + + +#ifdef _TARGET_AMD64_ + +// +// TailCallFrame Object Scanning +// +// This handles scanning/promotion of GC objects that were +// protected by the TailCallHelper routine. 
Note that the objects +// being protected is somewhat dynamic and is dependent upon the +// the callee... +// + +void TailCallFrame::GcScanRoots(promote_func *fn, ScanContext* sc) +{ + WRAPPER_NO_CONTRACT; + + if (m_pGCLayout != NULL) + { + struct FrameOffsetDecoder { + private: + TADDR prevOffset; + TADDR rangeEnd; + BOOL maybeInterior; + BOOL atEnd; + PTR_SBYTE pbOffsets; + + DWORD ReadNumber() { + signed char i; + DWORD offset = 0; + while ((i = *pbOffsets++) >= 0) + { + offset = (offset << 7) | i; + } + offset = (offset << 7) | (i & 0x7F); + return offset; + } + + public: + FrameOffsetDecoder(PTR_GSCookie _base, TADDR offsets) + : prevOffset(dac_cast(_base)), rangeEnd(~0LL), atEnd(FALSE), pbOffsets(dac_cast(offsets)) { maybeInterior = FALSE;} + + bool MoveNext() { + LIMITED_METHOD_CONTRACT; + + if (rangeEnd < prevOffset) + { + prevOffset -= sizeof(void*); + return true; + } + if (atEnd) return false; + DWORD offset = ReadNumber(); + atEnd = (offset & 1); + BOOL range = (offset & 2); + maybeInterior = (offset & 0x80000000); + + offset &= 0x7FFFFFFC; + +#ifdef _WIN64 + offset <<= 1; +#endif + offset += sizeof(void*); + _ASSERTE(prevOffset > offset); + prevOffset -= offset; + + if (range) + { + _ASSERTE(!atEnd); + _ASSERTE(!maybeInterior); + DWORD offsetEnd = ReadNumber(); + atEnd = (offsetEnd & 1); + offsetEnd = (offsetEnd & ~1) << 1; + // range encoding starts with a range of 3 (2 is better to encode as + // 2 offsets), so 0 == 2 (the last offset in the range) + offsetEnd += sizeof(void*) * 2; + rangeEnd = prevOffset - offsetEnd; + } + + return true; + } + + BOOL MaybeInterior() const { return maybeInterior; } + + PTR_PTR_Object Current() const { return PTR_PTR_Object(prevOffset); } + + } decoder(GetGSCookiePtr(), m_pGCLayout); + + while (decoder.MoveNext()) + { + PTR_PTR_Object ppRef = decoder.Current(); + + LOG((LF_GC, INFO3, "Tail Call Frame Promoting" FMT_ADDR "to", + DBG_ADDR(OBJECTREF_TO_UNCHECKED_OBJECTREF(*ppRef)) )); + if (decoder.MaybeInterior()) + PromoteCarefully(fn, ppRef, sc, GC_CALL_INTERIOR|CHECK_APP_DOMAIN); + else + (*fn)(ppRef, sc, 0); + LOG((LF_GC, INFO3, FMT_ADDR "\n", DBG_ADDR(OBJECTREF_TO_UNCHECKED_OBJECTREF(*ppRef)) )); + } + } +} + +#ifndef DACCESS_COMPILE +static void EncodeOneGCOffset(CPUSTUBLINKER *pSl, ULONG delta, BOOL maybeInterior, BOOL range, BOOL last) +{ + CONTRACTL + { + THROWS; // From the stublinker + MODE_ANY; + GC_NOTRIGGER; + } + CONTRACTL_END; + + // Everything should be pointer aligned + // but we use a high bit for interior, and the 0 bit to denote the end of the list + // we use the 1 bit to denote a range + _ASSERTE((delta % sizeof(void*)) == 0); + +#if defined(_WIN64) + // For 64-bit, we have 3 bits of alignment, so we allow larger frames + // by shifting and gaining a free high-bit. + ULONG encodedDelta = delta >> 1; +#else + // For 32-bit, we just limit our frame size to <2GB. (I know, such a bummer!) 
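+    // The encoded value is emitted below as 7-bit groups, most-significant group
+    // first; the final byte has its high bit set, which is what ReadNumber() in
+    // TailCallFrame::GcScanRoots uses to find the end of each number.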
+ ULONG encodedDelta = delta; +#endif + _ASSERTE((encodedDelta & 0x80000003) == 0); + if (last) + { + encodedDelta |= 1; + } + + if (range) + { + encodedDelta |= 2; + } + else if (maybeInterior) + { + _ASSERTE(!range); + encodedDelta |= 0x80000000; + } + + BYTE bytes[5]; + UINT index = 5; + bytes[--index] = (BYTE)((encodedDelta & 0x7F) | 0x80); + encodedDelta >>= 7; + while (encodedDelta > 0) + { + bytes[--index] = (BYTE)(encodedDelta & 0x7F); + encodedDelta >>= 7; + } + pSl->EmitBytes(&bytes[index], 5 - index); +} + +static void EncodeGCOffsets(CPUSTUBLINKER *pSl, /* const */ ULONGARRAY & gcOffsets) +{ + CONTRACTL + { + THROWS; + MODE_ANY; + GC_NOTRIGGER; + } + CONTRACTL_END; + + _ASSERTE(gcOffsets.Count() > 0); + + ULONG prevOffset = 0; + int i = 0; + BOOL last = FALSE; + do { + ULONG offset = gcOffsets[i]; + // Everything should be pointer aligned + // but we use the 0-bit to mean maybeInterior, for byrefs. + _ASSERTE(((offset % sizeof(void*)) == 0) || ((offset % sizeof(void*)) == 1)); + BOOL maybeInterior = (offset & 1); + offset &= ~1; + + // Encode just deltas because they're smaller (and the list should be sorted) + _ASSERTE(offset >= (prevOffset + sizeof(void*))); + ULONG delta = offset - (prevOffset + sizeof(void*)); + if (!maybeInterior && gcOffsets.Count() > i + 2) + { + // Check for a potential range. + // Only do it if we have 3 or more pointers in a row + ULONG rangeOffset = offset; + int j = i + 1; + do { + ULONG nextOffset = gcOffsets[j]; + // interior pointers can't be in ranges + if (nextOffset & 1) + break; + // ranges must be saturated + if (nextOffset != (rangeOffset + sizeof(void*))) + break; + j++; + rangeOffset = nextOffset; + } while(j < gcOffsets.Count()); + + if (j > (i + 2)) + { + EncodeOneGCOffset(pSl, delta, FALSE, TRUE, last); + i = j - 1; + _ASSERTE(rangeOffset >= (offset + (sizeof(void*) * 2))); + delta = rangeOffset - (offset + (sizeof(void*) * 2)); + offset = rangeOffset; + } + } + last = (++i == gcOffsets.Count()); + + + EncodeOneGCOffset(pSl, delta, maybeInterior, FALSE, last); + + prevOffset = offset; + } while (!last); +} + +static void AppendGCLayout(ULONGARRAY &gcLayout, size_t baseOffset, BOOL fIsTypedRef, TypeHandle VMClsHnd) +{ + STANDARD_VM_CONTRACT; + + _ASSERTE((baseOffset % 16) == 0); + _ASSERTE(FitsInU4(baseOffset)); + + if (fIsTypedRef) + { + *gcLayout.AppendThrowing() = (ULONG)(baseOffset | 1); // "| 1" to mark it as an interior pointer + } + else if (!VMClsHnd.IsNativeValueType()) + { + MethodTable* pMT = VMClsHnd.GetMethodTable(); + _ASSERTE(pMT); + _ASSERTE(pMT->IsValueType()); + + // walk the GC descriptors, reporting the correct offsets + if (pMT->ContainsPointers()) + { + // size of instance when unboxed must be adjusted for the syncblock + // index and the VTable pointer. + DWORD size = pMT->GetBaseSize(); + + // we don't include this term in our 'ppstop' calculation below. + _ASSERTE(pMT->GetComponentSize() == 0); + + CGCDesc* map = CGCDesc::GetCGCDescFromMT(pMT); + CGCDescSeries* cur = map->GetLowestSeries(); + CGCDescSeries* last = map->GetHighestSeries(); + + _ASSERTE(cur <= last); + do + { + // offset to embedded references in this series must be + // adjusted by the VTable pointer, when in the unboxed state. 
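+                // (GetSeriesOffset() is measured from the start of the boxed object,
+                // whose first slot is the MethodTable pointer; hence the
+                // sizeof(void *) adjustment on the next line.)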
+ size_t adjustOffset = cur->GetSeriesOffset() - sizeof(void *); + + _ASSERTE(baseOffset >= adjustOffset); + size_t start = baseOffset - adjustOffset; + size_t stop = start - (cur->GetSeriesSize() + size); + for (size_t off = stop + sizeof(void*); off <= start; off += sizeof(void*)) + { + _ASSERTE(gcLayout.Count() == 0 || off > gcLayout[gcLayout.Count() - 1]); + _ASSERTE(FitsInU4(off)); + *gcLayout.AppendThrowing() = (ULONG)off; + } + cur++; + + } while (cur <= last); + } + } +} + +Stub * StubLinkerCPU::CreateTailCallCopyArgsThunk(CORINFO_SIG_INFO * pSig, + CorInfoHelperTailCallSpecialHandling flags) +{ + STANDARD_VM_CONTRACT; + + CPUSTUBLINKER sl; + CPUSTUBLINKER* pSl = &sl; + + // Generates a function that looks like this: + // size_t CopyArguments(va_list args, (RCX) + // CONTEXT *pCtx, (RDX) + // DWORD64 *pvStack, (R8) + // size_t cbStack) (R9) + // { + // if (pCtx != NULL) { + // foreach (arg in args) { + // copy into pCtx or pvStack + // } + // } + // return ; + // } + // + + CodeLabel *pNullLabel = pSl->NewCodeLabel(); + + // test rdx, rdx + pSl->X86EmitR2ROp(0x85, kRDX, kRDX); + + // jz NullLabel + pSl->X86EmitCondJump(pNullLabel, X86CondCode::kJZ); + + UINT nArgSlot = 0; + UINT totalArgs = pSig->totalILArgs() + ((pSig->isVarArg() || pSig->hasTypeArg()) ? 1 : 0); + bool fR10Loaded = false; + UINT cbArg; + static const UINT rgcbArgRegCtxtOffsets[4] = { offsetof(CONTEXT, Rcx), offsetof(CONTEXT, Rdx), + offsetof(CONTEXT, R8), offsetof(CONTEXT, R9) }; + static const UINT rgcbFpArgRegCtxtOffsets[4] = { offsetof(CONTEXT, Xmm0.Low), offsetof(CONTEXT, Xmm1.Low), + offsetof(CONTEXT, Xmm2.Low), offsetof(CONTEXT, Xmm3.Low) }; + + ULONGARRAY gcLayout; + + // On input to the function R9 contains the size of the buffer + // The first time this macro runs, R10 is loaded with the 'top' of the Frame + // and R9 is changed to point to the 'top' of the copy buffer. + // Then both R9 and R10 are decremented by the size of the struct we're copying + // So R10 is the value to put in the argument slot, and R9 is where the data + // should be copied to (or zeroed out in the case of the return buffer). +#define LOAD_STRUCT_OFFSET_IF_NEEDED(cbSize) \ + { \ + _ASSERTE(cbSize > 0); \ + _ASSERTE(FitsInI4(cbSize)); \ + __int32 offset = (__int32)cbSize; \ + if (!fR10Loaded) { \ + /* mov r10, [rdx + offset of RSP] */ \ + pSl->X86EmitIndexRegLoad(kR10, kRDX, offsetof(CONTEXT, Rsp)); \ + /* add an extra 8 because RSP is pointing at the return address */ \ + offset -= 8; \ + /* add r10, r9 */ \ + pSl->X86EmitAddRegReg(kR10, kR9); \ + /* add r9, r8 */ \ + pSl->X86EmitAddRegReg(kR9, kR8); \ + fR10Loaded = true; \ + } \ + /* sub r10, offset */ \ + pSl->X86EmitSubReg(kR10, offset); \ + /* sub r9, cbSize */ \ + pSl->X86EmitSubReg(kR9, cbSize); \ + } + + + if (flags & CORINFO_TAILCALL_STUB_DISPATCH_ARG) { + // This is set for stub dispatch + // The JIT placed an extra argument in the list that needs to + // get shoved into R11, and not counted. 
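+        // (On AMD64 the stub-dispatch calling convention passes the indirection
+        // cell address in R11, so it has to be restored into the context rather
+        // than counted as a normal argument slot.)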
+ // pCtx->R11 = va_arg(args, DWORD64); + + // mov rax, [rcx] + pSl->X86EmitIndexRegLoad(kRAX, kRCX, 0); + // add rcx, 8 + pSl->X86EmitAddReg(kRCX, 8); + // mov [rdx + offset of R11], rax + pSl->X86EmitIndexRegStore(kRDX, offsetof(CONTEXT, R11), kRAX); + } + + ULONG cbStructOffset = 0; + + // First comes the 'this' pointer + if (pSig->hasThis()) { + // mov rax, [rcx] + pSl->X86EmitIndexRegLoad(kRAX, kRCX, 0); + // add rcx, 8 + pSl->X86EmitAddReg(kRCX, 8); + // mov [rdx + offset of RCX/RDX], rax + pSl->X86EmitIndexRegStore(kRDX, rgcbArgRegCtxtOffsets[nArgSlot++], kRAX); + } + + // Next the return buffer + cbArg = 0; + TypeHandle th(pSig->retTypeClass); + if ((pSig->retType == CORINFO_TYPE_REFANY) || (pSig->retType == CORINFO_TYPE_VALUECLASS)) { + cbArg = th.GetSize(); + } + + if (ArgIterator::IsArgPassedByRef(cbArg)) { + totalArgs++; + + // We always reserve space for the return buffer, and we always zero it out, + // so the GC won't complain, but if it's already pointing above the frame, + // then we need to pass it in (so it will get passed out). + // Otherwise we assume the caller is returning void, so we just pass in + // dummy space to be overwritten. + UINT cbUsed = (cbArg + 0xF) & ~0xF; + LOAD_STRUCT_OFFSET_IF_NEEDED(cbUsed); + // now emit a 'memset(r9, 0, cbUsed)' + { + // xorps xmm0, xmm0 + pSl->X86EmitR2ROp(X86_INSTR_XORPS, kXMM0, kXMM0); + if (cbUsed <= 4 * 16) { + // movaps [r9], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 0); + if (16 < cbUsed) { + // movaps [r9 + 16], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 16); + if (32 < cbUsed) { + // movaps [r9 + 32], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 32); + if (48 < cbUsed) { + // movaps [r9 + 48], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 48); + } + } + } + } + else { + // a loop (one double-quadword at a time) + pSl->X86EmitZeroOutReg(kR11); + // LoopLabel: + CodeLabel *pLoopLabel = pSl->NewCodeLabel(); + pSl->EmitLabel(pLoopLabel); + // movaps [r9 + r11], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 0, kR11, 1); + // add r11, 16 + pSl->X86EmitAddReg(kR11, 16); + // cmp r11, cbUsed + pSl->X86EmitCmpRegImm32(kR11, cbUsed); + // jl LoopLabel + pSl->X86EmitCondJump(pLoopLabel, X86CondCode::kJL); + } + } + cbStructOffset += cbUsed; + AppendGCLayout(gcLayout, cbStructOffset, pSig->retType == CORINFO_TYPE_REFANY, th); + + // mov rax, [rcx] + pSl->X86EmitIndexRegLoad(kRAX, kRCX, 0); + // add rcx, 8 + pSl->X86EmitAddReg(kRCX, 8); + // cmp rax, [rdx + offset of R12] + pSl->X86EmitOffsetModRM(0x3B, kRAX, kRDX, offsetof(CONTEXT, R12)); + + CodeLabel *pSkipLabel = pSl->NewCodeLabel(); + // jnb SkipLabel + pSl->X86EmitCondJump(pSkipLabel, X86CondCode::kJNB); + + // Also check the lower bound of the stack in case the return buffer is on the GC heap + // and the GC heap is below the stack + // cmp rax, rsp + pSl->X86EmitR2ROp(0x3B, kRAX, (X86Reg)4 /*kRSP*/); + // jna SkipLabel + pSl->X86EmitCondJump(pSkipLabel, X86CondCode::kJB); + // mov rax, r10 + pSl->X86EmitMovRegReg(kRAX, kR10); + // SkipLabel: + pSl->EmitLabel(pSkipLabel); + // mov [rdx + offset of RCX], rax + pSl->X86EmitIndexRegStore(kRDX, rgcbArgRegCtxtOffsets[nArgSlot++], kRAX); + } + + // VarArgs Cookie *or* Generics Instantiation Parameter + if (pSig->hasTypeArg() || pSig->isVarArg()) { + // mov rax, [rcx] + pSl->X86EmitIndexRegLoad(kRAX, kRCX, 0); + // add rcx, 8 + pSl->X86EmitAddReg(kRCX, 8); + // mov [rdx + offset of RCX/RDX], rax + pSl->X86EmitIndexRegStore(kRDX, rgcbArgRegCtxtOffsets[nArgSlot++], kRAX); + 
} + + _ASSERTE(nArgSlot <= 4); + + // Now for *all* the 'real' arguments + SigPointer ptr((PCCOR_SIGNATURE)pSig->args); + Module * module = GetModule(pSig->scope); + Instantiation classInst((TypeHandle*)pSig->sigInst.classInst, pSig->sigInst.classInstCount); + Instantiation methodInst((TypeHandle*)pSig->sigInst.methInst, pSig->sigInst.methInstCount); + SigTypeContext typeCtxt(classInst, methodInst); + + for( ;nArgSlot < totalArgs; ptr.SkipExactlyOne()) { + CorElementType et = ptr.PeekElemTypeNormalized(module, &typeCtxt); + if (et == ELEMENT_TYPE_SENTINEL) + continue; + + // mov rax, [rcx] + pSl->X86EmitIndexRegLoad(kRAX, kRCX, 0); + // add rcx, 8 + pSl->X86EmitAddReg(kRCX, 8); + switch (et) { + case ELEMENT_TYPE_INTERNAL: + // TODO + _ASSERTE(!"Shouldn't see ELEMENT_TYPE_INTERNAL"); + break; + case ELEMENT_TYPE_TYPEDBYREF: + case ELEMENT_TYPE_VALUETYPE: + th = ptr.GetTypeHandleThrowing(module, &typeCtxt, ClassLoader::LoadTypes, CLASS_LOAD_UNRESTOREDTYPEKEY); + _ASSERTE(!th.IsNull()); + g_IBCLogger.LogEEClassAndMethodTableAccess(th.GetMethodTable()); + cbArg = (UINT)th.GetSize(); + if (ArgIterator::IsArgPassedByRef(cbArg)) { + UINT cbUsed = (cbArg + 0xF) & ~0xF; + LOAD_STRUCT_OFFSET_IF_NEEDED(cbUsed); + // rax has the source pointer + // r9 has the intermediate copy location + // r10 has the final destination + if (nArgSlot < 4) { + pSl->X86EmitIndexRegStore(kRDX, rgcbArgRegCtxtOffsets[nArgSlot++], kR10); + } + else { + pSl->X86EmitIndexRegStore(kR8, 8 * nArgSlot++, kR10); + } + // now emit a 'memcpy(rax, r9, cbUsed)' + // These structs are supposed to be 16-byte aligned, but + // Reflection puts them on the GC heap, which is only 8-byte + // aligned. It also means we have to be careful about not + // copying too much (because we might cross a page boundary) + UINT cbUsed16 = (cbArg + 7) & ~0xF; + _ASSERTE((cbUsed16 == cbUsed) || ((cbUsed16 + 16) == cbUsed)); + + if (cbArg <= 192) { + // Unrolled version (6 x 16 bytes in parallel) + UINT offset = 0; + while (offset < cbUsed16) { + // movups xmm0, [rax + offset] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM0, kRAX, offset); + if (offset + 16 < cbUsed16) { + // movups xmm1, [rax + offset + 16] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM1, kRAX, offset + 16); + if (offset + 32 < cbUsed16) { + // movups xmm2, [rax + offset + 32] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM2, kRAX, offset + 32); + if (offset + 48 < cbUsed16) { + // movups xmm3, [rax + offset + 48] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM3, kRAX, offset + 48); + if (offset + 64 < cbUsed16) { + // movups xmm4, [rax + offset + 64] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM4, kRAX, offset + 64); + if (offset + 80 < cbUsed16) { + // movups xmm5, [rax + offset + 80] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM5, kRAX, offset + 80); + } + } + } + } + } + // movaps [r9 + offset], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, offset); + offset += 16; + if (offset < cbUsed16) { + // movaps [r9 + 16], xmm1 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM1, kR9, offset); + offset += 16; + if (offset < cbUsed16) { + // movaps [r9 + 32], xmm2 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM2, kR9, offset); + offset += 16; + if (offset < cbUsed16) { + // movaps [r9 + 48], xmm3 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM3, kR9, offset); + offset += 16; + if (offset < cbUsed16) { + // movaps [r9 + 64], xmm4 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM4, kR9, offset); + offset += 16; + if (offset < cbUsed16) { + // movaps [r9 + 80], xmm5 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM5, 
kR9, offset); + offset += 16; + } + } + } + } + } + } + // Copy the last 8 bytes if needed + if (cbUsed > cbUsed16) { + _ASSERTE(cbUsed16 < cbArg); + // movlps xmm0, [rax + offset] + pSl->X86EmitOp(X86_INSTR_MOVLPS_R_RM, kXMM0, kRAX, offset); + // movlps [r9 + offset], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVLPS_RM_R, kXMM0, kR9, offset); + } + } + else { + // a loop (one double-quadword at a time) + pSl->X86EmitZeroOutReg(kR11); + // LoopLabel: + CodeLabel *pLoopLabel = pSl->NewCodeLabel(); + pSl->EmitLabel(pLoopLabel); + // movups xmm0, [rax + r11] + pSl->X86EmitOp(X86_INSTR_MOVUPS_R_RM, kXMM0, kRAX, 0, kR11, 1); + // movaps [r9 + r11], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVAPS_RM_R, kXMM0, kR9, 0, kR11, 1); + // add r11, 16 + pSl->X86EmitAddReg(kR11, 16); + // cmp r11, cbUsed16 + pSl->X86EmitCmpRegImm32(kR11, cbUsed16); + // jl LoopLabel + pSl->X86EmitCondJump(pLoopLabel, X86CondCode::kJL); + if (cbArg > cbUsed16) { + _ASSERTE(cbUsed16 + 8 >= cbArg); + // movlps xmm0, [rax + r11] + pSl->X86EmitOp(X86_INSTR_MOVLPS_R_RM, kXMM0, kRAX, 0, kR11, 1); + // movlps [r9 + r11], xmm0 + pSl->X86EmitOp(X86_INSTR_MOVLPS_RM_R, kXMM0, kR9, 0, kR11, 1); + } + } + cbStructOffset += cbUsed; + AppendGCLayout(gcLayout, cbStructOffset, et == ELEMENT_TYPE_TYPEDBYREF, th); + break; + } + + // + // Explicit Fall-Through for non-IsArgPassedByRef + // + + default: + if (nArgSlot < 4) { + pSl->X86EmitIndexRegStore(kRDX, rgcbArgRegCtxtOffsets[nArgSlot], kRAX); + if ((et == ELEMENT_TYPE_R4) || (et == ELEMENT_TYPE_R8)) { + pSl->X86EmitIndexRegStore(kRDX, rgcbFpArgRegCtxtOffsets[nArgSlot], kRAX); + } + } + else { + pSl->X86EmitIndexRegStore(kR8, 8 * nArgSlot, kRAX); + } + nArgSlot++; + break; + } + } + +#undef LOAD_STRUCT_OFFSET_IF_NEEDED + + // Keep our 4 shadow slots and even number of slots (to keep 16-byte aligned) + if (nArgSlot < 4) + nArgSlot = 4; + else if (nArgSlot & 1) + nArgSlot++; + + _ASSERTE((cbStructOffset % 16) == 0); + + // xor eax, eax + pSl->X86EmitZeroOutReg(kRAX); + // ret + pSl->X86EmitReturn(0); + + // NullLabel: + pSl->EmitLabel(pNullLabel); + + CodeLabel *pGCLayoutLabel = NULL; + if (gcLayout.Count() == 0) { + // xor eax, eax + pSl->X86EmitZeroOutReg(kRAX); + } + else { + // lea rax, [rip + offset to gclayout] + pGCLayoutLabel = pSl->NewCodeLabel(); + pSl->X86EmitLeaRIP(pGCLayoutLabel, kRAX); + } + // mov [r9], rax + pSl->X86EmitIndexRegStore(kR9, 0, kRAX); + // mov rax, cbStackNeeded + pSl->X86EmitRegLoad(kRAX, cbStructOffset + nArgSlot * 8); + // ret + pSl->X86EmitReturn(0); + + if (gcLayout.Count() > 0) { + // GCLayout: + pSl->EmitLabel(pGCLayoutLabel); + EncodeGCOffsets(pSl, gcLayout); + } + + return pSl->Link(); +} +#endif // DACCESS_COMPILE + +#endif // _TARGET_AMD64_ + + +#ifdef HAS_FIXUP_PRECODE + +#ifdef HAS_FIXUP_PRECODE_CHUNKS +TADDR FixupPrecode::GetMethodDesc() +{ + LIMITED_METHOD_CONTRACT; + SUPPORTS_DAC; + + // This lookup is also manually inlined in PrecodeFixupThunk assembly code + TADDR base = *PTR_TADDR(GetBase()); + if (base == NULL) + return NULL; + return base + (m_MethodDescChunkIndex * MethodDesc::ALIGNMENT); +} +#endif + +#ifdef DACCESS_COMPILE +void FixupPrecode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) +{ + SUPPORTS_DAC; + DacEnumMemoryRegion(dac_cast(this), sizeof(FixupPrecode)); + + DacEnumMemoryRegion(GetBase(), sizeof(TADDR)); +} +#endif // DACCESS_COMPILE + +#endif // HAS_FIXUP_PRECODE + +#ifndef DACCESS_COMPILE + +BOOL rel32SetInterlocked(/*PINT32*/ PVOID pRel32, TADDR target, TADDR expected, MethodDesc* pMD) +{ + CONTRACTL + { + THROWS; // Creating a JumpStub 
could throw OutOfMemory + GC_TRIGGERS; + } + CONTRACTL_END; + + BYTE* callAddrAdj = (BYTE*)pRel32 + 4; + INT32 expectedRel32 = static_cast((BYTE*)expected - callAddrAdj); + + INT32 targetRel32 = rel32UsingJumpStub((INT32*)pRel32, target, pMD); + + _ASSERTE(IS_ALIGNED(pRel32, sizeof(INT32))); + return FastInterlockCompareExchange((LONG*)pRel32, (LONG)targetRel32, (LONG)expectedRel32) == (LONG)expectedRel32; +} + +void StubPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator /* = NULL */, + BYTE type /* = StubPrecode::Type */, TADDR target /* = NULL */) +{ + WRAPPER_NO_CONTRACT; + + IN_WIN64(m_movR10 = X86_INSTR_MOV_R10_IMM64); // mov r10, pMethodDesc + IN_WIN32(m_movEAX = X86_INSTR_MOV_EAX_IMM32); // mov eax, pMethodDesc + m_pMethodDesc = (TADDR)pMD; + IN_WIN32(m_mov_rm_r = X86_INSTR_MOV_RM_R); // mov reg,reg + m_type = type; + m_jmp = X86_INSTR_JMP_REL32; // jmp rel32 + + if (pLoaderAllocator != NULL) + { + // Use pMD == NULL in all precode initialization methods to allocate the initial jump stub in non-dynamic heap + // that has the same lifetime like as the precode itself + if (target == NULL) + target = GetPreStubEntryPoint(); + m_rel32 = rel32UsingJumpStub(&m_rel32, target, NULL /* pMD */, pLoaderAllocator); + } +} + +#ifdef HAS_NDIRECT_IMPORT_PRECODE + +void NDirectImportPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) +{ + WRAPPER_NO_CONTRACT; + StubPrecode::Init(pMD, pLoaderAllocator, NDirectImportPrecode::Type, GetEEFuncEntryPoint(NDirectImportThunk)); +} + +#endif // HAS_NDIRECT_IMPORT_PRECODE + + +#ifdef HAS_REMOTING_PRECODE + +void RemotingPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator /* = NULL */) +{ + WRAPPER_NO_CONTRACT; + + IN_WIN64(m_movR10 = X86_INSTR_MOV_R10_IMM64); // mov r10, pMethodDesc + IN_WIN32(m_movEAX = X86_INSTR_MOV_EAX_IMM32); // mov eax, pMethodDesc + m_pMethodDesc = (TADDR)pMD; + m_type = PRECODE_REMOTING; // nop + m_call = X86_INSTR_CALL_REL32; + m_jmp = X86_INSTR_JMP_REL32; // jmp rel32 + + if (pLoaderAllocator != NULL) + { + m_callRel32 = rel32UsingJumpStub(&m_callRel32, + GetEEFuncEntryPoint(PrecodeRemotingThunk), NULL /* pMD */, pLoaderAllocator); + m_rel32 = rel32UsingJumpStub(&m_rel32, + GetPreStubEntryPoint(), NULL /* pMD */, pLoaderAllocator); + } +} + +#endif // HAS_REMOTING_PRECODE + + +#ifdef HAS_FIXUP_PRECODE +void FixupPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex /*=0*/, int iPrecodeChunkIndex /*=0*/) +{ + WRAPPER_NO_CONTRACT; + + m_op = X86_INSTR_CALL_REL32; // call PrecodeFixupThunk + m_type = FixupPrecode::TypePrestub; + + // Initialize chunk indices only if they are not initialized yet. This is necessary to make MethodDesc::Reset work. 
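+    // (Reset re-runs Init on a precode that is already placed in its chunk, so the
+    // indices that locate the MethodDescChunk base relative to this precode must
+    // survive re-initialization.)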
+ if (m_PrecodeChunkIndex == 0) + { + _ASSERTE(FitsInU1(iPrecodeChunkIndex)); + m_PrecodeChunkIndex = static_cast(iPrecodeChunkIndex); + } + + if (iMethodDescChunkIndex != -1) + { + if (m_MethodDescChunkIndex == 0) + { + _ASSERTE(FitsInU1(iMethodDescChunkIndex)); + m_MethodDescChunkIndex = static_cast(iMethodDescChunkIndex); + } + + if (*(void**)GetBase() == NULL) + *(void**)GetBase() = (BYTE*)pMD - (iMethodDescChunkIndex * MethodDesc::ALIGNMENT); + } + + _ASSERTE(GetMethodDesc() == (TADDR)pMD); + + if (pLoaderAllocator != NULL) + { + m_rel32 = rel32UsingJumpStub(&m_rel32, + GetEEFuncEntryPoint(PrecodeFixupThunk), NULL /* pMD */, pLoaderAllocator); + } +} + +BOOL FixupPrecode::SetTargetInterlocked(TADDR target, TADDR expected) +{ + CONTRACTL + { + THROWS; // Creating a JumpStub could throw OutOfMemory + GC_TRIGGERS; + } + CONTRACTL_END; + + INT64 oldValue = *(INT64*)this; + BYTE* pOldValue = (BYTE*)&oldValue; + + if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] != FixupPrecode::TypePrestub) + return FALSE; + + MethodDesc * pMD = (MethodDesc*)GetMethodDesc(); + g_IBCLogger.LogMethodPrecodeWriteAccess(pMD); + + INT64 newValue = oldValue; + BYTE* pNewValue = (BYTE*)&newValue; + + pNewValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] = FixupPrecode::Type; + + pOldValue[offsetof(FixupPrecode,m_op)] = X86_INSTR_CALL_REL32; + pNewValue[offsetof(FixupPrecode,m_op)] = X86_INSTR_JMP_REL32; + + *(INT32*)(&pNewValue[offsetof(FixupPrecode,m_rel32)]) = rel32UsingJumpStub(&m_rel32, target, pMD); + + _ASSERTE(IS_ALIGNED(this, sizeof(INT64))); + EnsureWritableExecutablePages(this, sizeof(INT64)); + return FastInterlockCompareExchangeLong((INT64*) this, newValue, oldValue) == oldValue; +} + +#ifdef FEATURE_NATIVE_IMAGE_GENERATION +// Partial initialization. Used to save regrouped chunks. +void FixupPrecode::InitForSave(int iPrecodeChunkIndex) +{ + m_op = X86_INSTR_CALL_REL32; // call PrecodeFixupThunk + m_type = FixupPrecode::TypePrestub; + + _ASSERTE(FitsInU1(iPrecodeChunkIndex)); + m_PrecodeChunkIndex = static_cast(iPrecodeChunkIndex); + + // The rest is initialized in code:FixupPrecode::Fixup +} + +void FixupPrecode::Fixup(DataImage *image, MethodDesc * pMD) +{ + STANDARD_VM_CONTRACT; + + // Note that GetMethodDesc() does not return the correct value because of + // regrouping of MethodDescs into hot and cold blocks. That's why the caller + // has to supply the actual MethodDesc + + SSIZE_T mdChunkOffset; + ZapNode * pMDChunkNode = image->GetNodeForStructure(pMD, &mdChunkOffset); + ZapNode * pHelperThunk = image->GetHelperThunk(CORINFO_HELP_EE_PRECODE_FIXUP); + + image->FixupFieldToNode(this, offsetof(FixupPrecode, m_rel32), + pHelperThunk, 0, IMAGE_REL_BASED_REL32); + + // Set the actual chunk index + FixupPrecode * pNewPrecode = (FixupPrecode *)image->GetImagePointer(this); + + size_t mdOffset = mdChunkOffset - sizeof(MethodDescChunk); + size_t chunkIndex = mdOffset / MethodDesc::ALIGNMENT; + _ASSERTE(FitsInU1(chunkIndex)); + pNewPrecode->m_MethodDescChunkIndex = (BYTE) chunkIndex; + + // Fixup the base of MethodDescChunk + if (m_PrecodeChunkIndex == 0) + { + image->FixupFieldToNode(this, (BYTE *)GetBase() - (BYTE *)this, + pMDChunkNode, sizeof(MethodDescChunk)); + } +} +#endif // FEATURE_NATIVE_IMAGE_GENERATION + +#endif // HAS_FIXUP_PRECODE + +#endif // !DACCESS_COMPILE + + +#ifdef HAS_THISPTR_RETBUF_PRECODE + +// rel32 jmp target that points back to the jump (infinite loop). 
+// Used to mark uninitialized ThisPtrRetBufPrecode target +#define REL32_JMP_SELF (-5) + +#ifndef DACCESS_COMPILE +void ThisPtrRetBufPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) +{ + WRAPPER_NO_CONTRACT; + + IN_WIN64(m_nop1 = X86_INSTR_NOP;) // nop +#ifdef UNIX_AMD64_ABI + m_prefix1 = 0x48; + m_movScratchArg0 = 0xC78B; // mov rax,rdi + m_prefix2 = 0x48; + m_movArg0Arg1 = 0xFE8B; // mov rdi,rsi + m_prefix3 = 0x48; + m_movArg1Scratch = 0xF08B; // mov rsi,rax +#else + IN_WIN64(m_prefix1 = 0x48;) + m_movScratchArg0 = 0xC889; // mov r/eax,r/ecx + IN_WIN64(m_prefix2 = 0x48;) + m_movArg0Arg1 = 0xD189; // mov r/ecx,r/edx + IN_WIN64(m_prefix3 = 0x48;) + m_movArg1Scratch = 0xC289; // mov r/edx,r/eax +#endif + m_nop2 = X86_INSTR_NOP; // nop + m_jmp = X86_INSTR_JMP_REL32; // jmp rel32 + m_pMethodDesc = (TADDR)pMD; + + // This precode is never patched lazily - avoid unnecessary jump stub allocation + m_rel32 = REL32_JMP_SELF; +} + +BOOL ThisPtrRetBufPrecode::SetTargetInterlocked(TADDR target, TADDR expected) +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + } + CONTRACTL_END; + + // This precode is never patched lazily - the interlocked semantics is not required. + _ASSERTE(m_rel32 == REL32_JMP_SELF); + + // Use pMD == NULL to allocate the jump stub in non-dynamic heap that has the same lifetime as the precode itself + m_rel32 = rel32UsingJumpStub(&m_rel32, target, NULL /* pMD */, ((MethodDesc *)GetMethodDesc())->GetLoaderAllocatorForCode()); + + return TRUE; +} +#endif // !DACCESS_COMPILE + +PCODE ThisPtrRetBufPrecode::GetTarget() +{ + LIMITED_METHOD_DAC_CONTRACT; + + // This precode is never patched lazily - pretend that the uninitialized m_rel32 points to prestub + if (m_rel32 == REL32_JMP_SELF) + return GetPreStubEntryPoint(); + + return rel32Decode(PTR_HOST_MEMBER_TADDR(ThisPtrRetBufPrecode, this, m_rel32)); +} + +#endif // HAS_THISPTR_RETBUF_PRECODE diff --git a/src/vm/i386/stublinkerx86.h b/src/vm/i386/stublinkerx86.h new file mode 100644 index 0000000000..237fc794d4 --- /dev/null +++ b/src/vm/i386/stublinkerx86.h @@ -0,0 +1,781 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#ifndef STUBLINKERX86_H_ +#define STUBLINKERX86_H_ + +#include "stublink.h" + +struct ArrayOpScript; +class MetaSig; + +//======================================================================= + +#define X86_INSTR_CALL_REL32 0xE8 // call rel32 +#define X86_INSTR_CALL_IND 0x15FF // call dword ptr[addr32] +#define X86_INSTR_CALL_IND_EAX 0x10FF // call dword ptr[eax] +#define X86_INSTR_CALL_IND_EAX_OFFSET 0x50FF // call dword ptr[eax + offset] ; where offset follows these 2 bytes +#define X86_INSTR_CALL_EAX 0xD0FF // call eax +#define X86_INSTR_JMP_REL32 0xE9 // jmp rel32 +#define X86_INSTR_JMP_IND 0x25FF // jmp dword ptr[addr32] +#define X86_INSTR_JMP_EAX 0xE0FF // jmp eax +#define X86_INSTR_MOV_EAX_IMM32 0xB8 // mov eax, imm32 +#define X86_INSTR_MOV_EAX_ECX_IND 0x018b // mov eax, [ecx] +#define X86_INSTR_CMP_IND_ECX_IMM32 0x3981 // cmp [ecx], imm32 +#define X86_INSTR_MOV_RM_R 0x89 // mov r/m,reg + +#define X86_INSTR_MOV_AL 0xB0 // mov al, imm8 +#define X86_INSTR_JMP_REL8 0xEB // jmp short rel8 + +#define X86_INSTR_NOP 0x90 // nop +#define X86_INSTR_NOP3_1 0x9090 // 1st word of 3-byte nop +#define X86_INSTR_NOP3_3 0x90 // 3rd byte of 3-byte nop +#define X86_INSTR_INT3 0xCC // int 3 +#define X86_INSTR_HLT 0xF4 // hlt + +#define X86_INSTR_MOVAPS_R_RM 0x280F // movaps xmm1, xmm2/mem128 +#define X86_INSTR_MOVAPS_RM_R 0x290F // movaps xmm1/mem128, xmm2 +#define X86_INSTR_MOVLPS_R_RM 0x120F // movlps xmm1, xmm2/mem128 +#define X86_INSTR_MOVLPS_RM_R 0x130F // movlps xmm1/mem128, xmm2 +#define X86_INSTR_MOVUPS_R_RM 0x100F // movups xmm1, xmm2/mem128 +#define X86_INSTR_MOVUPS_RM_R 0x110F // movups xmm1/mem128, xmm2 +#define X86_INSTR_XORPS 0x570F // xorps xmm1, xmm2/mem128 + +#ifdef _TARGET_AMD64_ +#define X86_INSTR_MOV_R10_IMM64 0xBA49 // mov r10, imm64 +#endif + +//---------------------------------------------------------------------- +// Encodes X86 registers. The numbers are chosen to match Intel's opcode +// encoding. +//---------------------------------------------------------------------- +enum X86Reg +{ + kEAX = 0, + kECX = 1, + kEDX = 2, + kEBX = 3, + // kESP intentionally omitted because of its irregular treatment in MOD/RM + kEBP = 5, + kESI = 6, + kEDI = 7, + +#ifdef _TARGET_X86_ + NumX86Regs = 8, +#endif // _TARGET_X86_ + + kXMM0 = 0, + kXMM1 = 1, + kXMM2 = 2, + kXMM3 = 3, + kXMM4 = 4, + kXMM5 = 5, +#if defined(_TARGET_AMD64_) + kXMM6 = 6, + kXMM7 = 7, + kXMM8 = 8, + kXMM9 = 9, + kXMM10 = 10, + kXMM11 = 11, + kXMM12 = 12, + kXMM13 = 13, + kXMM14 = 14, + kXMM15 = 15, + // Integer registers commence here + kRAX = 0, + kRCX = 1, + kRDX = 2, + kRBX = 3, + // kRSP intentionally omitted because of its irregular treatment in MOD/RM + kRBP = 5, + kRSI = 6, + kRDI = 7, + kR8 = 8, + kR9 = 9, + kR10 = 10, + kR11 = 11, + kR12 = 12, + kR13 = 13, + kR14 = 14, + kR15 = 15, + NumX86Regs = 16, + +#endif // _TARGET_AMD64_ + + // We use "push ecx" instead of "sub esp, sizeof(LPVOID)" + kDummyPushReg = kECX +}; + + +// Use this only if you are absolutely sure that the instruction format +// handles it. This is not declared as X86Reg so that users are forced +// to add a cast and think about what exactly they are doing. +const int kESP_Unsafe = 4; + +//---------------------------------------------------------------------- +// Encodes X86 conditional jumps. The numbers are chosen to match +// Intel's opcode encoding. 
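+// (For example, kJZ == 0x4 corresponds to the one-byte form 0x74 and the
+// rel32 form 0x0F 0x84.)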
+//---------------------------------------------------------------------- +class X86CondCode { + public: + enum cc { + kJA = 0x7, + kJAE = 0x3, + kJB = 0x2, + kJBE = 0x6, + kJC = 0x2, + kJE = 0x4, + kJZ = 0x4, + kJG = 0xf, + kJGE = 0xd, + kJL = 0xc, + kJLE = 0xe, + kJNA = 0x6, + kJNAE = 0x2, + kJNB = 0x3, + kJNBE = 0x7, + kJNC = 0x3, + kJNE = 0x5, + kJNG = 0xe, + kJNGE = 0xc, + kJNL = 0xd, + kJNLE = 0xf, + kJNO = 0x1, + kJNP = 0xb, + kJNS = 0x9, + kJNZ = 0x5, + kJO = 0x0, + kJP = 0xa, + kJPE = 0xa, + kJPO = 0xb, + kJS = 0x8, + }; +}; + +//---------------------------------------------------------------------- +// StubLinker with extensions for generating X86 code. +//---------------------------------------------------------------------- +class StubLinkerCPU : public StubLinker +{ + public: + +#ifdef _TARGET_AMD64_ + enum X86OperandSize + { + k32BitOp, + k64BitOp, + }; +#endif + + VOID X86EmitAddReg(X86Reg reg, INT32 imm32); + VOID X86EmitAddRegReg(X86Reg destreg, X86Reg srcReg); + VOID X86EmitSubReg(X86Reg reg, INT32 imm32); + VOID X86EmitSubRegReg(X86Reg destreg, X86Reg srcReg); + + VOID X86EmitMovRegReg(X86Reg destReg, X86Reg srcReg); + VOID X86EmitMovSPReg(X86Reg srcReg); + VOID X86EmitMovRegSP(X86Reg destReg); + + VOID X86EmitPushReg(X86Reg reg); + VOID X86EmitPopReg(X86Reg reg); + VOID X86EmitPushRegs(unsigned regSet); + VOID X86EmitPopRegs(unsigned regSet); + VOID X86EmitPushImm32(UINT value); + VOID X86EmitPushImm32(CodeLabel &pTarget); + VOID X86EmitPushImm8(BYTE value); + VOID X86EmitPushImmPtr(LPVOID value WIN64_ARG(X86Reg tmpReg = kR10)); + + VOID X86EmitCmpRegImm32(X86Reg reg, INT32 imm32); // cmp reg, imm32 + VOID X86EmitCmpRegIndexImm32(X86Reg reg, INT32 offs, INT32 imm32); // cmp [reg+offs], imm32 +#ifdef _TARGET_AMD64_ + VOID X64EmitCmp32RegIndexImm32(X86Reg reg, INT32 offs, INT32 imm32); // cmp dword ptr [reg+offs], imm32 + + VOID X64EmitMovXmmXmm(X86Reg destXmmreg, X86Reg srcXmmReg); + VOID X64EmitMovdqaFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovdqaToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovSDFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovSDToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovSSFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + + VOID X64EmitMovXmmWorker(BYTE prefix, BYTE opcode, X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); +#endif + + VOID X86EmitZeroOutReg(X86Reg reg); + VOID X86EmitJumpReg(X86Reg reg); + + VOID X86EmitOffsetModRM(BYTE opcode, X86Reg altreg, X86Reg indexreg, __int32 ofs); + VOID X86EmitOffsetModRmSIB(BYTE opcode, X86Reg opcodeOrReg, X86Reg baseReg, X86Reg indexReg, __int32 scale, __int32 ofs); + + VOID X86EmitTailcallWithESPAdjust(CodeLabel *pTarget, INT32 imm32); + VOID X86EmitTailcallWithSinglePop(CodeLabel *pTarget, X86Reg reg); + + VOID X86EmitNearJump(CodeLabel *pTarget); + VOID X86EmitCondJump(CodeLabel *pTarget, X86CondCode::cc condcode); + VOID X86EmitCall(CodeLabel *target, int iArgBytes); + VOID X86EmitReturn(WORD wArgBytes); +#ifdef _TARGET_AMD64_ + VOID X86EmitLeaRIP(CodeLabel *target, X86Reg reg); +#endif + + static const unsigned X86TLSFetch_TRASHABLE_REGS = (1<COM+ interop or N/Direct + VOID EmitProfilerComCallProlog(TADDR pFrameVptr, X86Reg regFrame); + VOID EmitProfilerComCallEpilog(TADDR pFrameVptr, X86Reg regFrame); + + + + // Emits the most efficient form of the operation: + // + // opcode altreg, [basereg + scaledreg*scale + 
ofs] + // + // or + // + // opcode [basereg + scaledreg*scale + ofs], altreg + // + // (the opcode determines which comes first.) + // + // + // Limitations: + // + // scale must be 0,1,2,4 or 8. + // if scale == 0, scaledreg is ignored. + // basereg and altreg may be equal to 4 (ESP) but scaledreg cannot + // for some opcodes, "altreg" may actually select an operation + // rather than a second register argument. + // + + VOID X86EmitOp(WORD opcode, + X86Reg altreg, + X86Reg basereg, + __int32 ofs = 0, + X86Reg scaledreg = (X86Reg)0, + BYTE scale = 0 + AMD64_ARG(X86OperandSize OperandSize = k32BitOp) + ); + +#ifdef _TARGET_AMD64_ + FORCEINLINE + VOID X86EmitOp(WORD opcode, + X86Reg altreg, + X86Reg basereg, + __int32 ofs, + X86OperandSize OperandSize + ) + { + X86EmitOp(opcode, altreg, basereg, ofs, (X86Reg)0, 0, OperandSize); + } +#endif // _TARGET_AMD64_ + + // Emits + // + // opcode altreg, modrmreg + // + // or + // + // opcode modrmreg, altreg + // + // (the opcode determines which one comes first) + // + // For single-operand opcodes, "altreg" actually selects + // an operation rather than a register. + + VOID X86EmitR2ROp(WORD opcode, + X86Reg altreg, + X86Reg modrmreg + AMD64_ARG(X86OperandSize OperandSize = k64BitOp) + ); + + VOID X86EmitRegLoad(X86Reg reg, UINT_PTR imm); + + VOID X86EmitRegSave(X86Reg altreg, __int32 ofs) + { + LIMITED_METHOD_CONTRACT; + X86EmitEspOffset(0x89, altreg, ofs); + // X86Reg values never are outside a byte. + UnwindSavedReg(static_cast(altreg), ofs); + } + + VOID X86_64BitOperands () + { + WRAPPER_NO_CONTRACT; +#ifdef _TARGET_AMD64_ + Emit8(0x48); +#endif + } + + VOID EmitEnable(CodeLabel *pForwardRef); + VOID EmitRareEnable(CodeLabel *pRejoinPoint); + + VOID EmitDisable(CodeLabel *pForwardRef, BOOL fCallIn, X86Reg ThreadReg); + VOID EmitRareDisable(CodeLabel *pRejoinPoint); + VOID EmitRareDisableHRESULT(CodeLabel *pRejoinPoint, CodeLabel *pExitPoint); + + VOID EmitSetup(CodeLabel *pForwardRef); + VOID EmitRareSetup(CodeLabel* pRejoinPoint, BOOL fThrow); + VOID EmitCheckGSCookie(X86Reg frameReg, int gsCookieOffset); + +#ifdef _TARGET_X86_ + void EmitComMethodStubProlog(TADDR pFrameVptr, CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, BOOL bShouldProfile); + + void EmitComMethodStubEpilog(TADDR pFrameVptr, CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, BOOL bShouldProfile); +#endif + + VOID EmitMethodStubProlog(TADDR pFrameVptr, int transitionBlockOffset); + VOID EmitMethodStubEpilog(WORD numArgBytes, int transitionBlockOffset); + + VOID EmitUnboxMethodStub(MethodDesc* pRealMD); +#if defined(FEATURE_SHARE_GENERIC_CODE) + VOID EmitInstantiatingMethodStub(MethodDesc* pSharedMD, void* extra); +#endif // FEATURE_SHARE_GENERIC_CODE + +#if defined(FEATURE_COMINTEROP) && defined(_TARGET_X86_) + //======================================================================== + // shared Epilog for stubs that enter managed code from COM + // uses a return thunk within the method desc + void EmitSharedComMethodStubEpilog(TADDR pFrameVptr, + CodeLabel** rgRareLabels, + CodeLabel** rgRejoinLabels, + unsigned offsetReturnThunk, + BOOL bShouldProfile); +#endif // FEATURE_COMINTEROP && _TARGET_X86_ + + //=========================================================================== + // Computes hash code for MulticastDelegate.Invoke() + static UINT_PTR HashMulticastInvoke(MetaSig* pSig); + + //=========================================================================== + // Emits code for Delegate.Invoke() any delegate type + VOID EmitDelegateInvoke(); + + 
//=========================================================================== + // Emits code for MulticastDelegate.Invoke() - sig specific + VOID EmitMulticastInvoke(UINT_PTR hash); + + //=========================================================================== + // Emits code for Delegate.Invoke() on delegates that recorded creator assembly + VOID EmitSecureDelegateInvoke(UINT_PTR hash); + + //=========================================================================== + // Emits code to adjust for a static delegate target. + VOID EmitShuffleThunk(struct ShuffleEntry *pShuffleEntryArray); + + + //=========================================================================== + // Emits code to do an array operation. + VOID EmitArrayOpStub(const ArrayOpScript*); + + //Worker function to emit throw helpers for array ops. + VOID EmitArrayOpStubThrow(unsigned exConst, unsigned cbRetArg); + + //=========================================================================== + // Emits code to break into debugger + VOID EmitDebugBreak(); + +#if defined(_DEBUG) && (defined(_TARGET_AMD64_) || defined(_TARGET_X86_)) && !defined(FEATURE_PAL) + //=========================================================================== + // Emits code to log JITHelper access + void EmitJITHelperLoggingThunk(PCODE pJitHelper, LPVOID helperFuncCount); +#endif + +#ifdef _DEBUG + VOID X86EmitDebugTrashReg(X86Reg reg); +#endif + +#if defined(_DEBUG) && defined(STUBLINKER_GENERATES_UNWIND_INFO) && !defined(CROSSGEN_COMPILE) + virtual VOID EmitUnwindInfoCheckWorker (CodeLabel *pCheckLabel); + virtual VOID EmitUnwindInfoCheckSubfunction(); +#endif + +#ifdef _TARGET_AMD64_ + + static Stub * CreateTailCallCopyArgsThunk(CORINFO_SIG_INFO * pSig, + CorInfoHelperTailCallSpecialHandling flags); + +#endif // _TARGET_AMD64_ + + private: + VOID X86EmitSubEspWorker(INT32 imm32); + + public: + static void Init(); + +}; + +inline TADDR rel32Decode(/*PTR_INT32*/ TADDR pRel32) +{ + LIMITED_METHOD_CONTRACT; + SUPPORTS_DAC; + return pRel32 + 4 + *PTR_INT32(pRel32); +} + +BOOL rel32SetInterlocked(/*PINT32*/ PVOID pRel32, TADDR target, TADDR expected, MethodDesc* pMD); + +//------------------------------------------------------------------------ +// +// Precode definitions +// +//------------------------------------------------------------------------ + +EXTERN_C VOID STDCALL PrecodeFixupThunk(); + +#ifdef _WIN64 + +#define OFFSETOF_PRECODE_TYPE 0 +#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 +#define OFFSETOF_PRECODE_TYPE_MOV_R10 10 + +#define SIZEOF_PRECODE_BASE 16 + +#else + +EXTERN_C VOID STDCALL PrecodeRemotingThunk(); + +#define OFFSETOF_PRECODE_TYPE 5 +#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 +#define OFFSETOF_PRECODE_TYPE_MOV_RM_R 6 + +#define SIZEOF_PRECODE_BASE 8 + +#endif // _WIN64 + + +#include + +// Invalid precode type +struct InvalidPrecode { + // int3 + static const int Type = 0xCC; +}; + + +// Regular precode +struct StubPrecode { + +#ifdef _WIN64 + static const BYTE Type = 0x40; + // mov r10,pMethodDesc + // inc eax + // jmp Stub +#else + static const BYTE Type = 0xED; + // mov eax,pMethodDesc + // mov ebp,ebp + // jmp Stub +#endif // _WIN64 + + IN_WIN64(USHORT m_movR10;) + IN_WIN32(BYTE m_movEAX;) + TADDR m_pMethodDesc; + IN_WIN32(BYTE m_mov_rm_r;) + BYTE m_type; + BYTE m_jmp; + INT32 m_rel32; + + void Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator = NULL, BYTE type = StubPrecode::Type, TADDR target = NULL); + + TADDR GetMethodDesc() + { + LIMITED_METHOD_DAC_CONTRACT; + + return m_pMethodDesc; + } + + PCODE 
GetTarget() + { + LIMITED_METHOD_DAC_CONTRACT; + + return rel32Decode(PTR_HOST_MEMBER_TADDR(StubPrecode, this, m_rel32)); + } + + BOOL SetTargetInterlocked(TADDR target, TADDR expected) + { + CONTRACTL + { + THROWS; + GC_TRIGGERS; + } + CONTRACTL_END; + + EnsureWritableExecutablePages(&m_rel32); + return rel32SetInterlocked(&m_rel32, target, expected, (MethodDesc*)GetMethodDesc()); + } +}; +IN_WIN64(static_assert_no_msg(offsetof(StubPrecode, m_movR10) == OFFSETOF_PRECODE_TYPE);) +IN_WIN64(static_assert_no_msg(offsetof(StubPrecode, m_type) == OFFSETOF_PRECODE_TYPE_MOV_R10);) +IN_WIN32(static_assert_no_msg(offsetof(StubPrecode, m_mov_rm_r) == OFFSETOF_PRECODE_TYPE);) +IN_WIN32(static_assert_no_msg(offsetof(StubPrecode, m_type) == OFFSETOF_PRECODE_TYPE_MOV_RM_R);) +typedef DPTR(StubPrecode) PTR_StubPrecode; + + +#ifdef HAS_NDIRECT_IMPORT_PRECODE + +// NDirect import precode +// (This is fake precode. VTable slot does not point to it.) +struct NDirectImportPrecode : StubPrecode { + +#ifdef _WIN64 + static const int Type = 0x48; + // mov r10,pMethodDesc + // dec eax + // jmp NDirectImportThunk +#else + static const int Type = 0xC0; + // mov eax,pMethodDesc + // mov eax,eax + // jmp NDirectImportThunk +#endif // _WIN64 + + void Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); + + LPVOID GetEntrypoint() + { + LIMITED_METHOD_CONTRACT; + return this; + } +}; +typedef DPTR(NDirectImportPrecode) PTR_NDirectImportPrecode; + +#endif // HAS_NDIRECT_IMPORT_PRECODE + + +#ifdef HAS_REMOTING_PRECODE + +// Precode with embedded remoting interceptor +struct RemotingPrecode { + +#ifdef _WIN64 + static const int Type = XXX; // NYI + // mov r10,pMethodDesc + // call PrecodeRemotingThunk + // jmp Prestub/Stub/NativeCode +#else + static const int Type = 0x90; + // mov eax,pMethodDesc + // nop + // call PrecodeRemotingThunk + // jmp Prestub/Stub/NativeCode +#endif // _WIN64 + + IN_WIN64(USHORT m_movR10;) + IN_WIN32(BYTE m_movEAX;) + TADDR m_pMethodDesc; + BYTE m_type; + BYTE m_call; + INT32 m_callRel32; + BYTE m_jmp; + INT32 m_rel32; + + void Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator = NULL); + + TADDR GetMethodDesc() + { + LIMITED_METHOD_CONTRACT; + SUPPORTS_DAC; + + return m_pMethodDesc; + } + + PCODE GetTarget() + { + LIMITED_METHOD_DAC_CONTRACT; + + return rel32Decode(PTR_HOST_MEMBER_TADDR(RemotingPrecode, this, m_rel32)); + } + + BOOL SetTargetInterlocked(TADDR target, TADDR expected) + { + CONTRACTL + { + THROWS; + GC_TRIGGERS; + } + CONTRACTL_END; + + EnsureWritableExecutablePages(&m_rel32); + return rel32SetInterlocked(&m_rel32, target, expected, (MethodDesc*)GetMethodDesc()); + } +}; +IN_WIN64(static_assert_no_msg(offsetof(RemotingPrecode, m_movR10) == OFFSETOF_PRECODE_TYPE);) +IN_WIN64(static_assert_no_msg(offsetof(RemotingPrecode, m_type) == OFFSETOF_PRECODE_TYPE_MOV_R10);) +IN_WIN32(static_assert_no_msg(offsetof(RemotingPrecode, m_type) == OFFSETOF_PRECODE_TYPE);) +typedef DPTR(RemotingPrecode) PTR_RemotingPrecode; + +#endif // HAS_REMOTING_PRECODE + + +#ifdef HAS_FIXUP_PRECODE + +// Fixup precode is used in ngen images when the prestub does just one time fixup. +// The fixup precode is simple jump once patched. It does not have the two instruction overhead of regular precode. +struct FixupPrecode { + + static const int TypePrestub = 0x5E; + // The entrypoint has to be 8-byte aligned so that the "call PrecodeFixupThunk" can be patched to "jmp NativeCode" atomically. 
+ // call PrecodeFixupThunk + // db TypePrestub (pop esi) + // db MethodDescChunkIndex + // db PrecodeChunkIndex + + static const int Type = 0x5F; + // After it has been patched to point to native code + // jmp NativeCode + // db Type (pop edi) + + BYTE m_op; + INT32 m_rel32; + BYTE m_type; + BYTE m_MethodDescChunkIndex; + BYTE m_PrecodeChunkIndex; +#ifdef HAS_FIXUP_PRECODE_CHUNKS + // Fixup precode chunk is associated with MethodDescChunk. The layout of the fixup precode chunk is: + // + // FixupPrecode Entrypoint PrecodeChunkIndex = 2 + // FixupPrecode Entrypoint PrecodeChunkIndex = 1 + // FixupPrecode Entrypoint PrecodeChunkIndex = 0 + // TADDR Base of MethodDescChunk +#else + TADDR m_pMethodDesc; +#endif + + void Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex = 0, int iPrecodeChunkIndex = 0); + +#ifdef HAS_FIXUP_PRECODE_CHUNKS + TADDR GetBase() + { + LIMITED_METHOD_CONTRACT; + SUPPORTS_DAC; + + return dac_cast(this) + (m_PrecodeChunkIndex + 1) * sizeof(FixupPrecode); + } + + TADDR GetMethodDesc(); +#else // HAS_FIXUP_PRECODE_CHUNKS + TADDR GetMethodDesc() + { + LIMITED_METHOD_CONTRACT; + return m_pMethodDesc; + } +#endif // HAS_FIXUP_PRECODE_CHUNKS + + PCODE GetTarget() + { + LIMITED_METHOD_DAC_CONTRACT; + + return rel32Decode(PTR_HOST_MEMBER_TADDR(FixupPrecode, this, m_rel32)); + } + + BOOL SetTargetInterlocked(TADDR target, TADDR expected); + + static BOOL IsFixupPrecodeByASM(TADDR addr) + { + LIMITED_METHOD_CONTRACT; + + return *dac_cast(addr) == X86_INSTR_JMP_REL32; + } + +#ifdef FEATURE_PREJIT + // Partial initialization. Used to save regrouped chunks. + void InitForSave(int iPrecodeChunkIndex); + + void Fixup(DataImage *image, MethodDesc * pMD); +#endif + +#ifdef DACCESS_COMPILE + void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); +#endif +}; +IN_WIN32(static_assert_no_msg(offsetof(FixupPrecode, m_type) == OFFSETOF_PRECODE_TYPE)); +IN_WIN64(static_assert_no_msg(offsetof(FixupPrecode, m_op) == OFFSETOF_PRECODE_TYPE);) +IN_WIN64(static_assert_no_msg(offsetof(FixupPrecode, m_type) == OFFSETOF_PRECODE_TYPE_CALL_OR_JMP);) + +typedef DPTR(FixupPrecode) PTR_FixupPrecode; + +#endif // HAS_FIXUP_PRECODE + +#ifdef HAS_THISPTR_RETBUF_PRECODE + +// Precode to stuffle this and retbuf for closed delegates over static methods with return buffer +struct ThisPtrRetBufPrecode { + +#ifdef _WIN64 + static const int Type = 0x90; +#else + static const int Type = 0xC2; +#endif // _WIN64 + + // mov regScratch,regArg0 + // mov regArg0,regArg1 + // mov regArg1,regScratch + // nop + // jmp EntryPoint + // dw pMethodDesc + + IN_WIN64(BYTE m_nop1;) + IN_WIN64(BYTE m_prefix1;) + WORD m_movScratchArg0; + IN_WIN64(BYTE m_prefix2;) + WORD m_movArg0Arg1; + IN_WIN64(BYTE m_prefix3;) + WORD m_movArg1Scratch; + BYTE m_nop2; + BYTE m_jmp; + INT32 m_rel32; + TADDR m_pMethodDesc; + + void Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); + + TADDR GetMethodDesc() + { + LIMITED_METHOD_CONTRACT; + SUPPORTS_DAC; + + return m_pMethodDesc; + } + + PCODE GetTarget(); + + BOOL SetTargetInterlocked(TADDR target, TADDR expected); +}; +IN_WIN32(static_assert_no_msg(offsetof(ThisPtrRetBufPrecode, m_movArg1Scratch) + 1 == OFFSETOF_PRECODE_TYPE);) +typedef DPTR(ThisPtrRetBufPrecode) PTR_ThisPtrRetBufPrecode; + +#endif // HAS_THISPTR_RETBUF_PRECODE + +#include + +#endif // STUBLINKERX86_H_ diff --git a/src/vm/i386/virtualcallstubcpu.hpp b/src/vm/i386/virtualcallstubcpu.hpp new file mode 100644 index 0000000000..33ce8199b9 --- /dev/null +++ b/src/vm/i386/virtualcallstubcpu.hpp @@ -0,0 
+1,1077 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// +// File: virtualcallstubcpu.hpp +// + + +// + +// +// ============================================================================ + +#ifndef _VIRTUAL_CALL_STUB_X86_H +#define _VIRTUAL_CALL_STUB_X86_H + +#ifdef DECLARE_DATA +#include "asmconstants.h" +#ifdef FEATURE_REMOTING +#include "remoting.h" +#endif +#endif + +#include // Since we are placing code, we want byte packing of the structs + +#define USES_LOOKUP_STUBS 1 + +/********************************************************************************************* +Stubs that contain code are all part of larger structs called Holders. There is a +Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are +essentially an implementation trick that allowed rearranging the code sequences more +easily while trying out different alternatives, and for dealing with any alignment +issues in a way that was mostly immune to the actually code sequences. These Holders +should be revisited when the stub code sequences are fixed, since in many cases they +add extra space to a stub that is not really needed. + +Stubs are placed in cache and hash tables. Since unaligned access of data in memory +is very slow, the keys used in those tables should be aligned. The things used as keys +typically also occur in the generated code, e.g. a token as an immediate part of an instruction. +For now, to avoid alignment computations as different code strategies are tried out, the key +fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction +streams aligned so that the immediate fields fall on aligned boundaries. +*/ + +#if USES_LOOKUP_STUBS + +struct LookupStub; +struct LookupHolder; + +/*LookupStub************************************************************************************** +Virtual and interface call sites are initially setup to point at LookupStubs. +This is because the runtime type of the pointer is not yet known, +so the target cannot be resolved. Note: if the jit is able to determine the runtime type +of the pointer, it should be generating a direct call not a virtual or interface call. +This stub pushes a lookup token onto the stack to identify the sought after method, and then +jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and +transfer of control to the appropriate target method implementation, perhaps patching of the call site +along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs +get quickly changed to point to another kind of stub. 
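+
+For reference, the non-logging code sequence described by the fields below (and filled
+in by LookupHolder::InitializeStatic/Initialize later in this file) is just eleven bytes:
+
+    50                  push eax            ; save siteAddrForRegisterIndirect
+    68 xx xx xx xx      push dispatchToken  ; _token
+    e9 xx xx xx xx      jmp  resolveWorker  ; _resolveWorkerDispl, pc-relative
+
+Note that a lookup stub never examines the object itself; it only identifies the call
+site and token and hands control to the resolver.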
+*/ +struct LookupStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + +private: + friend struct LookupHolder; + + // DispatchStub:: _entryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _entryPoint [2]; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call + // 68 push + size_t _token; // xx xx xx xx 32-bit constant +#ifdef STUB_LOGGING + BYTE cntr2[2]; // ff 05 inc + size_t* c_lookup; // xx xx xx xx [call_lookup_counter] +#endif //STUB_LOGGING + BYTE part2 [1]; // e9 jmp + DISPL _resolveWorkerDispl;// xx xx xx xx pc-rel displ +}; + +/* LookupHolders are the containers for LookupStubs, they provide for any alignment of +stubs as necessary. In the case of LookupStubs, alignment is necessary since +LookupStubs are placed in a hash table keyed by token. */ +struct LookupHolder +{ + static void InitializeStatic(); + + void Initialize(PCODE resolveWorkerTarget, size_t dispatchToken); + + LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static LookupHolder* FromLookupEntry(PCODE lookupEntry); + +private: + friend struct LookupStub; + + BYTE align[(sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*)))%sizeof(void*)]; + LookupStub _stub; + BYTE pad[sizeof(void*) - + ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + + (sizeof(LookupStub)) + ) % sizeof(void*)]; //complete DWORD + + static_assert_no_msg((sizeof(void*) - + ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + + (sizeof(LookupStub)) + ) % sizeof(void*)) != 0); +}; + +#endif // USES_LOOKUP_STUBS + +struct DispatchStub; +struct DispatchHolder; + +/*DispatchStub************************************************************************************** +Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. +A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). +If the calling frame does in fact have the type be of the expected type, then +control is transfered to the target address, the method implementation. If not, +then control is transfered to the fail address, a fail stub (see below) where a polymorphic +lookup is done to find the correct address to go to. + +implementation note: Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched +to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important +that the branch prediction staticly predict this, which means it must be a forward jump. The alternative +is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" +is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier +to control the placement of the stubs than control the placement of the jitted code and the stubs. 
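+
+Stripped of encodings, the non-logging stub below is simply:
+
+    cmp  dword ptr [ecx], _expectedMT   ; doubles as the null-"this" fault point
+    jne  failTarget                     ; forward jump, statically predicted not-taken
+    jmp  implTarget
+
+i.e. one inlined type check guarding a direct jump to the method implementation.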
*/ +struct DispatchStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + + inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } + inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_implDispl + sizeof(DISPL) + _implDispl; } + inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } + +private: + friend struct DispatchHolder; + + // DispatchStub:: _entryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call +#ifndef STUB_LOGGING + BYTE _entryPoint [2]; // 81 39 cmp [ecx], ; This is the place where we are going to fault on null this. + size_t _expectedMT; // xx xx xx xx expectedMT ; If you change it, change also AdjustContextForVirtualStub in excep.cpp!!! + BYTE jmpOp1[2]; // 0f 85 jne + DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons + BYTE jmpOp2; // e9 jmp + DISPL _implDispl; // xx xx xx xx implTarget +#else //STUB_LOGGING + BYTE _entryPoint [2]; // ff 05 inc + size_t* d_call; // xx xx xx xx [call_mono_counter] + BYTE cmpOp [2]; // 81 39 cmp [ecx], + size_t _expectedMT; // xx xx xx xx expectedMT + BYTE jmpOp1[2]; // 0f 84 je + DISPL _implDispl; // xx xx xx xx implTarget ;during logging, perf is not so important + BYTE fail [2]; // ff 05 inc + size_t* d_miss; // xx xx xx xx [miss_mono_counter] + BYTE jmpFail; // e9 jmp + DISPL _failDispl; // xx xx xx xx failEntry +#endif //STUB_LOGGING +}; + +/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of +stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both +are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, +since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently +o(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify +alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. +While the token field can be logically gotten by following the failure target to the failEntryPoint +of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. +This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct +for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when +they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). +*/ + +/* @workaround for ee resolution - Since the EE does not currently have a resolver function that +does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are +using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable +is in fact written. Hence we have moved target out into the holder and aligned it so we can +atomically update it. 
When we get a resolver function that does what we want, we can drop this field, +and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ +struct DispatchHolder +{ + static void InitializeStatic(); + + void Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT); + + DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); + +private: + //force expectedMT to be aligned since used as key in hash tables. +#ifndef STUB_LOGGING + BYTE align[(sizeof(void*)-(offsetof(DispatchStub,_expectedMT)%sizeof(void*)))%sizeof(void*)]; +#endif + DispatchStub _stub; + BYTE pad[(sizeof(void*)-(sizeof(DispatchStub)%sizeof(void*))+offsetof(DispatchStub,_expectedMT))%sizeof(void*)]; //complete DWORD +}; + +struct ResolveStub; +struct ResolveHolder; + +/*ResolveStub************************************************************************************** +Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only +one resolver stub built for any given token, even though there may be many call sites that +use that token and many distinct types that are used in the calling call frames. A resolver stub +actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their +expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should +be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, +even though they are actually allocated as a single contiguous block of memory. These pieces are: + +A ResolveStub has two entry points: + +FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does +a check to see how often we are actually failing. If failures are frequent, control transfers to the +patch piece to cause the call site to be changed from a mostly monomorphic callsite +(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control +transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter +every time it is entered. The ee at various times will add a large chunk to the counter. + +ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s + and the token identifying the (contract,method) pair desired. If found, control is transfered +to the method implementation. If not found in the cache, the token is pushed and the ee is entered via +the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since +there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. +The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, +as well as its speed. It turns out it is very important to make the hash function sensitive to all +of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before +making any changes to the code sequences here, it is very important to measure and tune them as perf +can vary greatly, in unexpected ways, with seeming minor changes. + +Implementation note - Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. 
Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that this stub is called in highly polymorphic cases, but the cache should have been sized +and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should +mostly be going down the cache hit route, and it is important that this be statically predicted as so. +Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically +gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries +is important. */ + +struct ResolveStub +{ + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } + + inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } + +private: + friend struct ResolveHolder; + + // ResolveStub::_failEntryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _failEntryPoint [2]; // 83 2d sub + INT32* _pCounter; // xx xx xx xx [counter], + BYTE part0 [2]; // 01 01 + // 7c jl + BYTE toPatcher; // xx backpatcher ;must be forward jump, for perf reasons + // ;fall into the resolver stub + + // ResolveStub::_resolveEntryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _resolveEntryPoint[6]; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call + // 8b 01 mov eax,[ecx] ;get the method table from the "this" pointer. This is the place + // ; where we are going to fault on null this. If you change it, + // ; change also AdjustContextForVirtualStub in excep.cpp!!! 
+ // 52 push edx + // 8b d0 mov edx, eax + BYTE part1 [6]; // c1 e8 0C shr eax,12 ;we are adding upper bits into lower bits of mt + // 03 c2 add eax,edx + // 35 xor eax, + UINT32 _hashedToken; // xx xx xx xx hashedToken ;along with pre-hashed token + BYTE part2 [1]; // 25 and eax, + size_t mask; // xx xx xx xx cache_mask + BYTE part3 [2]; // 8b 80 mov eax, [eax+ + size_t _cacheAddress; // xx xx xx xx lookupCache] +#ifdef STUB_LOGGING + BYTE cntr1[2]; // ff 05 inc + size_t* c_call; // xx xx xx xx [call_cache_counter] +#endif //STUB_LOGGING + BYTE part4 [2]; // 3b 10 cmp edx,[eax+ + // BYTE mtOffset; // ResolverCacheElem.pMT] + BYTE part5 [1]; // 75 jne + BYTE toMiss1; // xx miss ;must be forward jump, for perf reasons + BYTE part6 [2]; // 81 78 cmp [eax+ + BYTE tokenOffset; // xx ResolverCacheElem.token], + size_t _token; // xx xx xx xx token + BYTE part7 [1]; // 75 jne + BYTE toMiss2; // xx miss ;must be forward jump, for perf reasons + BYTE part8 [2]; // 8B 40 xx mov eax,[eax+ + BYTE targetOffset; // ResolverCacheElem.target] + BYTE part9 [6]; // 5a pop edx + // 83 c4 04 add esp,4 ;throw away siteAddrForRegisterIndirect - we don't need it now + // ff e0 jmp eax + // miss: + BYTE miss [1]; // 5a pop edx ; don't pop siteAddrForRegisterIndirect - leave it on the stack for use by ResolveWorkerChainLookupAsmStub and/or ResolveWorkerAsmStub + BYTE _slowEntryPoint[1]; // 68 push + size_t _tokenPush; // xx xx xx xx token +#ifdef STUB_LOGGING + BYTE cntr2[2]; // ff 05 inc + size_t* c_miss; // xx xx xx xx [miss_cache_counter] +#endif //STUB_LOGGING + BYTE part10 [1]; // e9 jmp + DISPL _resolveWorkerDispl; // xx xx xx xx resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub + BYTE patch[1]; // e8 call + DISPL _backpatcherDispl; // xx xx xx xx backpatcherWorker == BackPatchWorkerAsmStub + BYTE part11 [1]; // eb jmp + BYTE toResolveStub; // xx resolveStub, i.e. go back to _resolveEntryPoint +}; + +/* ResolveHolders are the containers for ResolveStubs, They provide +for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by +the token for which they are built. Efficiency of access requires that this token be aligned. +For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that +any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder +is not needed. */ +struct ResolveHolder +{ + static void InitializeStatic(); + + void Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr); + + ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static ResolveHolder* FromFailEntry(PCODE failEntry); + static ResolveHolder* FromResolveEntry(PCODE resolveEntry); + +private: + //align _token in resolve stub + + BYTE align[(sizeof(void*)-((offsetof(ResolveStub,_token))%sizeof(void*)))%sizeof(void*) +#ifdef STUB_LOGGING // This turns out to be zero-sized in stub_logging case, and is an error. So round up. + +sizeof(void*) +#endif + ]; + + ResolveStub _stub; + +//#ifdef STUB_LOGGING // This turns out to be zero-sized in non stub_logging case, and is an error. 
So remove + BYTE pad[(sizeof(void*)-((sizeof(ResolveStub))%sizeof(void*))+offsetof(ResolveStub,_token))%sizeof(void*)]; //fill out DWORD +//#endif +}; +#include + + +#ifdef DECLARE_DATA + +#ifndef DACCESS_COMPILE + +#ifdef _MSC_VER + +#ifdef CHAIN_LOOKUP +/* This will perform a chained lookup of the entry if the initial cache lookup fails + + Entry stack: + dispatch token + siteAddrForRegisterIndirect (used only if this is a RegisterIndirect dispatch call) + return address of caller to stub + Also, EAX contains the pointer to the first ResolveCacheElem pointer for the calculated + bucket in the cache table. +*/ +__declspec (naked) void ResolveWorkerChainLookupAsmStub() +{ + enum + { + e_token_size = 4, + e_indirect_addr_size = 4, + e_caller_ret_addr_size = 4, + }; + enum + { + // this is the part of the stack that is present as we enter this function: + e_token = 0, + e_indirect_addr = e_token + e_token_size, + e_caller_ret_addr = e_indirect_addr + e_indirect_addr_size, + e_ret_esp = e_caller_ret_addr + e_caller_ret_addr_size, + }; + enum + { + e_spilled_reg_size = 8, + }; + + // main loop setup + __asm { +#ifdef STUB_LOGGING + inc g_chained_lookup_call_counter +#endif + // spill regs + push edx + push ecx + // move the token into edx + mov edx,[esp+e_spilled_reg_size+e_token] + // move the MT into ecx + mov ecx,[ecx] + } + main_loop: + __asm { + // get the next entry in the chain (don't bother checking the first entry again) + mov eax,[eax+e_resolveCacheElem_offset_next] + // test if we hit a terminating NULL + test eax,eax + jz fail + // compare the MT of the ResolveCacheElem + cmp ecx,[eax+e_resolveCacheElem_offset_mt] + jne main_loop + // compare the token of the ResolveCacheElem + cmp edx,[eax+e_resolveCacheElem_offset_token] + jne main_loop + // success + // decrement success counter and move entry to start if necessary + sub g_dispatch_cache_chain_success_counter,1 + //@TODO: Perhaps this should be a jl for better branch prediction? + jge nopromote + // be quick to reset the counter so we don't get a bunch of contending threads + add g_dispatch_cache_chain_success_counter,CALL_STUB_CACHE_INITIAL_SUCCESS_COUNT + // promote the entry to the beginning of the chain + mov ecx,eax + call VirtualCallStubManager::PromoteChainEntry + } + nopromote: + __asm { + // clean up the stack and jump to the target + pop ecx + pop edx + add esp,(e_caller_ret_addr - e_token) + mov eax,[eax+e_resolveCacheElem_offset_target] + jmp eax + } + fail: + __asm { +#ifdef STUB_LOGGING + inc g_chained_lookup_miss_counter +#endif + // restore registers + pop ecx + pop edx + jmp ResolveWorkerAsmStub + } +} +#endif + +/* Call the resolver, it will return where we are supposed to go. + There is a little stack magic here, in that we are entered with one + of the arguments for the resolver (the token) on the stack already. + We just push the other arguments, in the call frame and the call site pointer, + and call the resolver. + + On return we have the stack frame restored to the way it was when the ResolveStub + was called, i.e. as it was at the actual call site. The return value from + the resolver is the address we need to transfer control to, simulating a direct + call from the original call site. If we get passed back NULL, it means that the + resolution failed, an unimpelemented method is being called. 
+ + Entry stack: + dispatch token + siteAddrForRegisterIndirect (used only if this is a RegisterIndirect dispatch call) + return address of caller to stub + + Call stack: + pointer to TransitionBlock + call site + dispatch token + TransitionBlock + ArgumentRegisters (ecx, edx) + CalleeSavedRegisters (ebp, ebx, esi, edi) + return address of caller to stub + */ +__declspec (naked) void ResolveWorkerAsmStub() +{ + CANNOT_HAVE_CONTRACT; + + __asm { + // + // The stub arguments are where we want to setup the TransitionBlock. We will + // setup the TransitionBlock later once we can trash them + // + // push ebp-frame + // push ebp + // mov ebp,esp + + // save CalleeSavedRegisters + // push ebx + + push esi + push edi + + // push ArgumentRegisters + push ecx + push edx + + mov esi, esp + + push [esi + 4*4] // dispatch token + push [esi + 5*4] // siteAddrForRegisterIndirect + push esi // pTransitionBlock + + // Setup up proper EBP frame now that the stub arguments can be trashed + mov [esi + 4*4],ebx + mov [esi + 5*4],ebp + lea ebp, [esi + 5*4] + + // Make the call + call VSD_ResolveWorker + + // From here on, mustn't trash eax + + // pop ArgumentRegisters + pop edx + pop ecx + + // pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp + + // Now jump to the target + jmp eax // continue on into the method + } +} + +#ifdef FEATURE_REMOTING +/* For an in-context dispatch, we will find the target. This + is the slow path, and erects a MachState structure for + creating a HelperMethodFrame + + Entry stack: + dispatch token + return address of caller to stub + + Call stack: + pointer to StubDispatchFrame + call site + dispatch token + StubDispatchFrame + GSCookie + negspace + vptr + datum + ArgumentRegisters (ecx, edx) + CalleeSavedRegisters (ebp, ebx, esi, edi) + return address of caller to stub +*/ +__declspec (naked) void InContextTPDispatchAsmStub() +{ + CANNOT_HAVE_CONTRACT; + + __asm { + // Pop dispatch token + pop eax + + // push ebp-frame + push ebp + mov ebp,esp + + // save CalleeSavedRegisters + push ebx + push esi + push edi + + // push ArgumentRegisters + push ecx + push edx + + mov esi, esp + + push eax // token + push esi // pTransitionContext + + // Make the call + call VSD_GetTargetForTPWorker + + // From here on, mustn't trash eax + + // pop ArgumentRegisters + pop edx + pop ecx + + // pop CalleeSavedRegisters + pop edi + pop esi + pop ebx + pop ebp + + // Now jump to the target + jmp eax // continue on into the method + } +} + +/* For an in-context dispatch, we will try to find the target in + the resolve cache. If this fails, we will jump to the full + version of InContextTPDispatchAsmStub + + Entry stack: + dispatch slot number of interface MD + caller return address + ECX: this object +*/ +__declspec (naked) void InContextTPQuickDispatchAsmStub() +{ + CANNOT_HAVE_CONTRACT; + + __asm { + // Spill registers + push ecx + push edx + + // Arg 2 - token + mov eax, [esp + 8] + push eax + + // Arg 1 - this + push ecx + + // Make the call + call VSD_GetTargetForTPWorkerQuick + + // Restore registers + pop edx + pop ecx + + // Test to see if we found a target + test eax, eax + jnz TargetFound + + // If no target, jump to the slow worker + jmp InContextTPDispatchAsmStub + + TargetFound: + // We got a target, so pop off the token and jump to it + add esp,4 + jmp eax + } +} +#endif // FEATURE_REMOTING + +/* Call the callsite back patcher. The fail stub piece of the resolver is being +call too often, i.e. dispatch stubs are failing the expect MT test too often. 
+In this stub wraps the call to the BackPatchWorker to take care of any stack magic +needed. +*/ +__declspec (naked) void BackPatchWorkerAsmStub() +{ + CANNOT_HAVE_CONTRACT; + + __asm { + push EBP + mov ebp,esp + push EAX // it may contain siteAddrForRegisterIndirect + push ECX + push EDX + push EAX // push any indirect call address as the second arg to BackPatchWorker + push [EBP+8] // and push return address as the first arg to BackPatchWorker + call VirtualCallStubManager::BackPatchWorkerStatic + pop EDX + pop ECX + pop EAX + mov esp,ebp + pop ebp + ret + } +} + +#endif // _MSC_VER + +#ifdef _DEBUG +// +// This function verifies that a pointer to an indirection cell lives inside a delegate object. +// In the delegate case the indirection cell is held by the delegate itself in _methodPtrAux, when the delegate Invoke is +// called the shuffle thunk is first invoked and that will call into the virtual dispatch stub. +// Before control is given to the virtual dispatch stub a pointer to the indirection cell (thus an interior pointer to the delegate) +// is pushed in EAX +// +BOOL isDelegateCall(BYTE *interiorPtr) +{ + LIMITED_METHOD_CONTRACT; + + if (GCHeap::GetGCHeap()->IsHeapPointer((void*)interiorPtr)) + { + Object *delegate = (Object*)(interiorPtr - DelegateObject::GetOffsetOfMethodPtrAux()); + VALIDATEOBJECTREF(ObjectToOBJECTREF(delegate)); + _ASSERTE(delegate->GetMethodTable()->IsDelegate()); + + return TRUE; + } + return FALSE; +} +#endif + +StubCallSite::StubCallSite(TADDR siteAddrForRegisterIndirect, PCODE returnAddr) +{ + LIMITED_METHOD_CONTRACT; + + // Not used + // if (isCallRelative(returnAddr)) + // { + // m_siteAddr = returnAddr - sizeof(DISPL); + // } + // else + if (isCallRelativeIndirect((BYTE *)returnAddr)) + { + m_siteAddr = *dac_cast(returnAddr - sizeof(PCODE)); + } + else + { + _ASSERTE(isCallRegisterIndirect((BYTE *)returnAddr) || isDelegateCall((BYTE *)siteAddrForRegisterIndirect)); + m_siteAddr = dac_cast(siteAddrForRegisterIndirect); + } +} + +// the special return address for VSD tailcalls +extern "C" void STDCALL JIT_TailCallReturnFromVSD(); + +PCODE StubCallSite::GetCallerAddress() +{ + LIMITED_METHOD_CONTRACT; + if (m_returnAddr != (PCODE)JIT_TailCallReturnFromVSD) + return m_returnAddr; + + // Find the tailcallframe in the frame chain and get the actual caller from the first TailCallFrame + return TailCallFrame::FindTailCallFrame(GetThread()->GetFrame())->GetCallerAddress(); +} + +#ifdef STUB_LOGGING +extern size_t g_lookup_inline_counter; +extern size_t g_mono_call_counter; +extern size_t g_mono_miss_counter; +extern size_t g_poly_call_counter; +extern size_t g_poly_miss_counter; +#endif + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. 
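+
+Note that the branch displacements in these templates are pc-relative: each Initialize
+routine computes them as (target - (address of the DISPL field + sizeof(DISPL))), i.e.
+relative to the end of the displacement itself. For example, a jmp whose 4-byte
+displacement sits at 0x1000 and whose target is 0x2000 stores 0x2000 - (0x1000 + 4) =
+0x0FFC. This also means the fields can only be filled in once the holder occupies its
+final address.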
+*/ +LookupStub lookupInit; + +void LookupHolder::InitializeStatic() +{ + static_assert_no_msg(((offsetof(LookupStub, _token)+offsetof(LookupHolder, _stub)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); + + lookupInit._entryPoint [0] = 0x50; + lookupInit._entryPoint [1] = 0x68; + static_assert_no_msg(sizeof(lookupInit._entryPoint) == 2); + lookupInit._token = 0xcccccccc; +#ifdef STUB_LOGGING + lookupInit.cntr2 [0] = 0xff; + lookupInit.cntr2 [1] = 0x05; + static_assert_no_msg(sizeof(lookupInit.cntr2) == 2); + lookupInit.c_lookup = &g_call_lookup_counter; +#endif //STUB_LOGGING + lookupInit.part2 [0] = 0xe9; + static_assert_no_msg(sizeof(lookupInit.part2) == 1); + lookupInit._resolveWorkerDispl = 0xcccccccc; +} + +void LookupHolder::Initialize(PCODE resolveWorkerTarget, size_t dispatchToken) +{ + _stub = lookupInit; + + //fill in the stub specific fields + //@TODO: Get rid of this duplication of data. + _stub._token = dispatchToken; + _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &_stub._resolveWorkerDispl + sizeof(DISPL)); +} + +LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) +{ + LIMITED_METHOD_CONTRACT; + LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); + // _ASSERTE(lookupHolder->_stub._entryPoint[0] == lookupInit._entryPoint[0]); + return lookupHolder; +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ +DispatchStub dispatchInit; + +void DispatchHolder::InitializeStatic() +{ + // Check that _expectedMT is aligned in the DispatchHolder + static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub,_expectedMT)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(DispatchHolder) % sizeof(void*)) == 0); + +#ifndef STUB_LOGGING + dispatchInit._entryPoint [0] = 0x81; + dispatchInit._entryPoint [1] = 0x39; + static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); + + dispatchInit._expectedMT = 0xcccccccc; + dispatchInit.jmpOp1 [0] = 0x0f; + dispatchInit.jmpOp1 [1] = 0x85; + static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); + + dispatchInit._failDispl = 0xcccccccc; + dispatchInit.jmpOp2 = 0xe9; + dispatchInit._implDispl = 0xcccccccc; +#else //STUB_LOGGING + dispatchInit._entryPoint [0] = 0xff; + dispatchInit._entryPoint [1] = 0x05; + static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); + + dispatchInit.d_call = &g_mono_call_counter; + dispatchInit.cmpOp [0] = 0x81; + dispatchInit.cmpOp [1] = 0x39; + static_assert_no_msg(sizeof(dispatchInit.cmpOp) == 2); + + dispatchInit._expectedMT = 0xcccccccc; + dispatchInit.jmpOp1 [0] = 0x0f; + dispatchInit.jmpOp1 [1] = 0x84; + static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); + + dispatchInit._implDispl = 0xcccccccc; + dispatchInit.fail [0] = 0xff; + dispatchInit.fail [1] = 0x05; + static_assert_no_msg(sizeof(dispatchInit.fail) == 2); + + dispatchInit.d_miss = &g_mono_miss_counter; + dispatchInit.jmpFail = 0xe9; + dispatchInit._failDispl = 0xcccccccc; +#endif //STUB_LOGGING +}; + +void DispatchHolder::Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT) +{ + _stub = dispatchInit; + + //fill in the stub specific fields + _stub._expectedMT = (size_t) expectedMT; + _stub._failDispl = failTarget - ((PCODE) &_stub._failDispl + sizeof(DISPL)); + _stub._implDispl = implTarget - ((PCODE) &_stub._implDispl + 
sizeof(DISPL)); +} + +DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) +{ + LIMITED_METHOD_CONTRACT; + DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); + // _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]); + return dispatchHolder; +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ + +ResolveStub resolveInit; + +void ResolveHolder::InitializeStatic() +{ + //Check that _token is aligned in ResolveHolder + static_assert_no_msg(((offsetof(ResolveHolder, _stub) + offsetof(ResolveStub, _token)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); + + resolveInit._failEntryPoint [0] = 0x83; + resolveInit._failEntryPoint [1] = 0x2d; + static_assert_no_msg(sizeof(resolveInit._failEntryPoint) == 2); + + resolveInit._pCounter = (INT32 *) (size_t) 0xcccccccc; + resolveInit.part0 [0] = 0x01; + resolveInit.part0 [1] = 0x7c; + static_assert_no_msg(sizeof(resolveInit.part0) == 2); + + resolveInit.toPatcher = (offsetof(ResolveStub, patch) - (offsetof(ResolveStub, toPatcher) + 1)) & 0xFF; + + resolveInit._resolveEntryPoint [0] = 0x50; + resolveInit._resolveEntryPoint [1] = 0x8b; + resolveInit._resolveEntryPoint [2] = 0x01; + resolveInit._resolveEntryPoint [3] = 0x52; + resolveInit._resolveEntryPoint [4] = 0x8b; + resolveInit._resolveEntryPoint [5] = 0xd0; + static_assert_no_msg(sizeof(resolveInit._resolveEntryPoint) == 6); + + resolveInit.part1 [0] = 0xc1; + resolveInit.part1 [1] = 0xe8; + resolveInit.part1 [2] = CALL_STUB_CACHE_NUM_BITS; + resolveInit.part1 [3] = 0x03; + resolveInit.part1 [4] = 0xc2; + resolveInit.part1 [5] = 0x35; + static_assert_no_msg(sizeof(resolveInit.part1) == 6); + + resolveInit._hashedToken = 0xcccccccc; + resolveInit.part2 [0] = 0x25; + static_assert_no_msg(sizeof(resolveInit.part2) == 1); + + resolveInit.mask = (CALL_STUB_CACHE_MASK << LOG2_PTRSIZE); + resolveInit.part3 [0] = 0x8b; + resolveInit.part3 [1] = 0x80;; + static_assert_no_msg(sizeof(resolveInit.part3) == 2); + + resolveInit._cacheAddress = 0xcccccccc; +#ifdef STUB_LOGGING + resolveInit.cntr1 [0] = 0xff; + resolveInit.cntr1 [1] = 0x05; + static_assert_no_msg(sizeof(resolveInit.cntr1) == 2); + + resolveInit.c_call = &g_poly_call_counter; +#endif //STUB_LOGGING + resolveInit.part4 [0] = 0x3b; + resolveInit.part4 [1] = 0x10; + static_assert_no_msg(sizeof(resolveInit.part4) == 2); + + // resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; + static_assert_no_msg(offsetof(ResolveCacheElem,pMT) == 0); + + resolveInit.part5 [0] = 0x75; + static_assert_no_msg(sizeof(resolveInit.part5) == 1); + + resolveInit.toMiss1 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1); + + resolveInit.part6 [0] = 0x81; + resolveInit.part6 [1] = 0x78; + static_assert_no_msg(sizeof(resolveInit.part6) == 2); + + resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; + + resolveInit._token = 0xcccccccc; + + resolveInit.part7 [0] = 0x75; + static_assert_no_msg(sizeof(resolveInit.part7) == 1); + + resolveInit.part8 [0] = 0x8b; + resolveInit.part8 [1] = 0x40; + static_assert_no_msg(sizeof(resolveInit.part8) == 2); + + resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; + + resolveInit.toMiss2 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1); + + resolveInit.part9 
[0] = 0x5a; + resolveInit.part9 [1] = 0x83; + resolveInit.part9 [2] = 0xc4; + resolveInit.part9 [3] = 0x04; + resolveInit.part9 [4] = 0xff; + resolveInit.part9 [5] = 0xe0; + static_assert_no_msg(sizeof(resolveInit.part9) == 6); + + resolveInit.miss [0] = 0x5a; +// resolveInit.miss [1] = 0xb8; +// resolveInit._hashedTokenMov = 0xcccccccc; + resolveInit._slowEntryPoint [0] = 0x68; + resolveInit._tokenPush = 0xcccccccc; +#ifdef STUB_LOGGING + resolveInit.cntr2 [0] = 0xff; + resolveInit.cntr2 [1] = 0x05; + resolveInit.c_miss = &g_poly_miss_counter; +#endif //STUB_LOGGING + resolveInit.part10 [0] = 0xe9; + resolveInit._resolveWorkerDispl = 0xcccccccc; + + resolveInit.patch [0] = 0xe8; + resolveInit._backpatcherDispl = 0xcccccccc; + resolveInit.part11 [0] = 0xeb; + resolveInit.toResolveStub = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub) + 1)) & 0xFF; +}; + +void ResolveHolder::Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr) +{ + _stub = resolveInit; + + //fill in the stub specific fields + _stub._pCounter = counterAddr; + _stub._hashedToken = hashedToken << LOG2_PTRSIZE; + _stub._cacheAddress = (size_t) cacheAddr; + _stub._token = dispatchToken; +// _stub._hashedTokenMov = hashedToken; + _stub._tokenPush = dispatchToken; + _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &_stub._resolveWorkerDispl + sizeof(DISPL)); + _stub._backpatcherDispl = patcherTarget - ((PCODE) &_stub._backpatcherDispl + sizeof(DISPL)); +} + +ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + +ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + +#endif // DACCESS_COMPILE + +VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE stubStartAddress) +{ + SUPPORTS_DAC; +#ifdef DACCESS_COMPILE + + return SK_BREAKPOINT; // Dac always uses the slower lookup + +#else + + StubKind stubKind = SK_UNKNOWN; + + EX_TRY + { + // If stubStartAddress is completely bogus, then this might AV, + // so we protect it with SEH. An AV here is OK. + AVInRuntimeImplOkayHolder AVOkay; + + WORD firstWord = *((WORD*) stubStartAddress); + +#ifndef STUB_LOGGING + if (firstWord == 0x3981) +#else //STUB_LOGGING + if (firstWord == 0x05ff) +#endif + { + stubKind = SK_DISPATCH; + } + else if (firstWord == 0x6850) + { + stubKind = SK_LOOKUP; + } + else if (firstWord == 0x8b50) + { + stubKind = SK_RESOLVE; + } + else + { + BYTE firstByte = ((BYTE*) stubStartAddress)[0]; + BYTE secondByte = ((BYTE*) stubStartAddress)[1]; + + if ((firstByte == X86_INSTR_INT3) || + (secondByte == X86_INSTR_INT3)) + { + stubKind = SK_BREAKPOINT; + } + } + } + EX_CATCH + { + stubKind = SK_UNKNOWN; + } + EX_END_CATCH(SwallowAllExceptions); + + return stubKind; + +#endif // DACCESS_COMPILE +} + +#endif //DECLARE_DATA + +#endif // _VIRTUAL_CALL_STUB_X86_H -- cgit v1.2.3
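
A minimal standalone sketch of the pc-relative "rel32" arithmetic that rel32Decode and the
various *Holder::Initialize routines above rely on. The function names, the fixed-width
integer types standing in for TADDR/DISPL, and the example addresses are invented for
illustration only; this is not part of the runtime sources.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Decode an x86 rel32: the branch target is the address just past the 4-byte
// displacement plus the stored (signed) displacement, mirroring rel32Decode above.
static uint32_t Rel32Decode(uint32_t displAddr, int32_t stored)
{
    return (uint32_t)(displAddr + sizeof(int32_t) + (uint32_t)stored);
}

// Encode a displacement the same way the Initialize routines do:
// target minus the address of the byte that follows the displacement.
static int32_t Rel32Encode(uint32_t displAddr, uint32_t target)
{
    return (int32_t)(target - (displAddr + sizeof(int32_t)));
}

int main()
{
    const uint32_t displAddr = 0x00401000u; // hypothetical address of the DISPL field
    const uint32_t target    = 0x00402000u; // hypothetical branch target

    int32_t displ = Rel32Encode(displAddr, target);
    std::printf("stored displacement = 0x%08x\n", (uint32_t)displ); // prints 0x00000ffc
    assert(Rel32Decode(displAddr, displ) == target);                // round-trips
    return 0;
}

Running the sketch simply shows the encode/decode round trip; negative displacements
(backward branches) work the same way through two's-complement wraparound.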