Diffstat (limited to 'src/vm/i386/virtualcallstubcpu.hpp')
 src/vm/i386/virtualcallstubcpu.hpp | 1077 +++++++++++++++++++++++++++++++++++
 1 file changed, 1077 insertions(+), 0 deletions(-)
diff --git a/src/vm/i386/virtualcallstubcpu.hpp b/src/vm/i386/virtualcallstubcpu.hpp
new file mode 100644
index 0000000000..33ce8199b9
--- /dev/null
+++ b/src/vm/i386/virtualcallstubcpu.hpp
@@ -0,0 +1,1077 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+// File: virtualcallstubcpu.hpp
+//
+
+
+//
+
+//
+// ============================================================================
+
+#ifndef _VIRTUAL_CALL_STUB_X86_H
+#define _VIRTUAL_CALL_STUB_X86_H
+
+#ifdef DECLARE_DATA
+#include "asmconstants.h"
+#ifdef FEATURE_REMOTING
+#include "remoting.h"
+#endif
+#endif
+
+#include <pshpack1.h>  // Since we are placing code, we want byte packing of the structs
+
+#define USES_LOOKUP_STUBS 1
+
+/*********************************************************************************************
+Stubs that contain code are all part of larger structs called Holders. There is a
+Holder for each kind of stub, i.e. XXXStub is contained within XXXHolder. Holders are
+essentially an implementation trick that allowed rearranging the code sequences more
+easily while trying out different alternatives, and for dealing with any alignment
+issues in a way that was mostly immune to the actual code sequences. These Holders
+should be revisited when the stub code sequences are fixed, since in many cases they
+add extra space to a stub that is not really needed.
+
+Stubs are placed in cache and hash tables. Since unaligned access of data in memory
+is very slow, the keys used in those tables should be aligned. The things used as keys
+typically also occur in the generated code, e.g. a token as an immediate part of an instruction.
+For now, to avoid alignment computations as different code strategies are tried out, the key
+fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction
+streams aligned so that the immediate fields fall on aligned boundaries.
+*/
+
+#if USES_LOOKUP_STUBS
+
+struct LookupStub;
+struct LookupHolder;
+
+/*LookupStub**************************************************************************************
+Virtual and interface call sites are initially set up to point at LookupStubs.
+This is because the runtime type of the <this> pointer is not yet known,
+so the target cannot be resolved. Note: if the jit is able to determine the runtime type
+of the <this> pointer, it should be generating a direct call, not a virtual or interface call.
+This stub pushes a lookup token onto the stack to identify the sought-after method, and then
+jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and
+transfer of control to the appropriate target method implementation, perhaps patching of the call site
+along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs
+get quickly changed to point to another kind of stub.
+*/
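All three stub kinds below chain to their targets through E9 jmp rel32 instructions. As a minimal sketch of the displacement arithmetic the Initialize routines in this file rely on (the helper name is illustrative, not part of this file):

    // The CPU resolves a rel32 jmp/call relative to the end of the displacement
    // field, so the writer subtracts that end address from the desired target.
    inline DISPL PcRelDispl(PCODE target, DISPL* pDisplField)
    {
        return (DISPL)(target - ((PCODE)pDisplField + sizeof(DISPL)));
    }

LookupHolder::Initialize, DispatchHolder::Initialize and ResolveHolder::Initialize below all inline exactly this computation when patching their displacement fields.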
+struct LookupStub
+{
+    inline PCODE entryPoint()  { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; }
+    inline size_t token()      { LIMITED_METHOD_CONTRACT; return _token; }
+    inline size_t size()       { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); }
+
+private:
+    friend struct LookupHolder;
+
+    // LookupStub::_entryPoint expects:
+    //       ecx: object (the "this" pointer)
+    //       eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call
+    BYTE    _entryPoint [2];    // 50           push  eax       ;save siteAddrForRegisterIndirect - this may be an indirect call
+                                // 68           push
+    size_t  _token;             // xx xx xx xx        32-bit constant
+#ifdef STUB_LOGGING
+    BYTE    cntr2[2];           // ff 05        inc
+    size_t* c_lookup;           // xx xx xx xx        [call_lookup_counter]
+#endif //STUB_LOGGING
+    BYTE    part2 [1];          // e9           jmp
+    DISPL   _resolveWorkerDispl;// xx xx xx xx        pc-rel displ
+};
+
+/* LookupHolders are the containers for LookupStubs, they provide for any alignment of
+stubs as necessary. In the case of LookupStubs, alignment is necessary since
+LookupStubs are placed in a hash table keyed by token. */
+struct LookupHolder
+{
+    static void InitializeStatic();
+
+    void Initialize(PCODE resolveWorkerTarget, size_t dispatchToken);
+
+    LookupStub* stub()  { LIMITED_METHOD_CONTRACT; return &_stub; }
+
+    static LookupHolder* FromLookupEntry(PCODE lookupEntry);
+
+private:
+    friend struct LookupStub;
+
+    BYTE align[(sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*)))%sizeof(void*)];
+    LookupStub _stub;
+    BYTE pad[sizeof(void*) -
+             ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) +
+              (sizeof(LookupStub))
+             ) % sizeof(void*)];    //complete DWORD
+
+    static_assert_no_msg((sizeof(void*) -
+                          ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) +
+                           (sizeof(LookupStub))
+                          ) % sizeof(void*)) != 0);
+};
+
+#endif // USES_LOOKUP_STUBS
+
+struct DispatchStub;
+struct DispatchHolder;
+
+/*DispatchStub**************************************************************************************
+Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs.
+A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure).
+If the calling frame's <this> object is in fact of the expected type, then
+control is transferred to the target address, the method implementation. If not,
+then control is transferred to the fail address, a fail stub (see below) where a polymorphic
+lookup is done to find the correct address to go to.
+
+implementation note: Order, choice of instructions, and branch directions
+should be carefully tuned since it can have an inordinate effect on performance. Particular
+attention needs to be paid to the effects on the BTB and branch prediction, both in the small
+and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions.
+Note that since this stub is only used for mostly monomorphic callsites (ones that are not get patched
+to something else), the conditional jump "jne failure" is mostly not taken, and hence it is important
+that branch prediction statically predicts this, which means it must be a forward jump. The alternative
+is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget"
+is statically predicted as taken, i.e. a backward jump. The current choice was taken since it was easier
+to control the placement of the stubs than control the placement of the jitted code and the stubs.
+*/
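Expressed in C, the non-logging code sequence documented in the struct below amounts to the following (an illustrative sketch, not code from this file):

    // What the 32-bit DispatchStub executes, restated in C form.
    PCODE Dispatch(Object* pThis, size_t expectedMT, PCODE implTarget, PCODE failTarget)
    {
        if (*(size_t*)pThis == expectedMT)  // cmp [ecx], expectedMT; faults here on a null this
            return implTarget;              // "jne failure" not taken (forward, predicted not-taken)
        return failTarget;                  // failure path: on to the ResolveStub
    }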
+struct DispatchStub
+{
+    inline PCODE  entryPoint()  { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; }
+
+    inline size_t expectedMT()  { LIMITED_METHOD_CONTRACT; return _expectedMT; }
+    inline PCODE  implTarget()  { LIMITED_METHOD_CONTRACT; return (PCODE) &_implDispl + sizeof(DISPL) + _implDispl; }
+    inline PCODE  failTarget()  { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; }
+    inline size_t size()        { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); }
+
+private:
+    friend struct DispatchHolder;
+
+    // DispatchStub::_entryPoint expects:
+    //       ecx: object (the "this" pointer)
+    //       eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call
+#ifndef STUB_LOGGING
+    BYTE    _entryPoint [2];    // 81 39        cmp  [ecx],     ; This is the place where we are going to fault on null this.
+    size_t  _expectedMT;        // xx xx xx xx  expectedMT      ; If you change it, change also AdjustContextForVirtualStub in excep.cpp!!!
+    BYTE    jmpOp1[2];          // 0f 85        jne
+    DISPL   _failDispl;         // xx xx xx xx  failEntry       ;must be forward jmp for perf reasons
+    BYTE    jmpOp2;             // e9           jmp
+    DISPL   _implDispl;         // xx xx xx xx  implTarget
+#else //STUB_LOGGING
+    BYTE    _entryPoint [2];    // ff 05        inc
+    size_t* d_call;             // xx xx xx xx  [call_mono_counter]
+    BYTE    cmpOp [2];          // 81 39        cmp  [ecx],
+    size_t  _expectedMT;        // xx xx xx xx  expectedMT
+    BYTE    jmpOp1[2];          // 0f 84        je
+    DISPL   _implDispl;         // xx xx xx xx  implTarget      ;during logging, perf is not so important
+    BYTE    fail [2];           // ff 05        inc
+    size_t* d_miss;             // xx xx xx xx  [miss_mono_counter]
+    BYTE    jmpFail;            // e9           jmp
+    DISPL   _failDispl;         // xx xx xx xx  failEntry
+#endif //STUB_LOGGING
+};
+
+/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of
+stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both
+are the pair expectedMT and token. Efficiency of the hash table is not a big issue,
+since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently
+(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify
+alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field.
+While the token field can be logically gotten by following the failure target to the failEntryPoint
+of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here.
+This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct
+for the cache composed of the triplet (expectedMT, token, target) and some sort of reclamation scheme when
+they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid).
+*/
+
+/* @workaround for ee resolution - Since the EE does not currently have a resolver function that
+does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are
+using dispatch stubs to simulate what we want. That means that inlineTarget, which should be immutable,
+is in fact written. Hence we have moved target out into the holder and aligned it so we can
+atomically update it.
+When we get a resolver function that does what we want, we can drop this field,
+and live with just the inlineTarget field in the stub itself, since immutability will hold.*/
+struct DispatchHolder
+{
+    static void InitializeStatic();
+
+    void Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT);
+
+    DispatchStub* stub()  { LIMITED_METHOD_CONTRACT; return &_stub; }
+
+    static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry);
+
+private:
+    //force expectedMT to be aligned since used as key in hash tables.
+#ifndef STUB_LOGGING
+    BYTE align[(sizeof(void*)-(offsetof(DispatchStub,_expectedMT)%sizeof(void*)))%sizeof(void*)];
+#endif
+    DispatchStub _stub;
+    BYTE pad[(sizeof(void*)-(sizeof(DispatchStub)%sizeof(void*))+offsetof(DispatchStub,_expectedMT))%sizeof(void*)];    //complete DWORD
+};
+
+struct ResolveStub;
+struct ResolveHolder;
+
+/*ResolveStub**************************************************************************************
+Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only
+one resolver stub built for any given token, even though there may be many call sites that
+use that token and many distinct <this> types that are used in the calling call frames. A resolver stub
+actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their
+expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should
+be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces,
+even though they are actually allocated as a single contiguous block of memory. These pieces are:
+
+A ResolveStub has two entry points:
+
+FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does
+a check to see how often we are actually failing. If failures are frequent, control transfers to the
+patch piece to cause the call site to be changed from a mostly monomorphic callsite
+(calls dispatch stub) to a polymorphic callsite (calls resolve stub). If failures are rare, control
+transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter
+every time it is entered. The ee at various times will add a large chunk to the counter.
+
+ResolveEntry - does a lookup in a cache by hashing the actual type of the calling frame's
+<this> and the token identifying the (contract,method) pair desired. If found, control is transferred
+to the method implementation. If not found in the cache, the token is pushed and the ee is entered via
+the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since
+there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed.
+The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used,
+as well as its speed. It turns out it is very important to make the hash function sensitive to all
+of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before
+making any changes to the code sequences here, it is very important to measure and tune them as perf
+can vary greatly, in unexpected ways, with seemingly minor changes.
+
+Implementation note - Order, choice of instructions, and branch directions
+should be carefully tuned since it can have an inordinate effect on performance. Particular
+attention needs to be paid to the effects on the BTB and branch prediction, both in the small
+and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions.
+Note that this stub is called in highly polymorphic cases, but the cache should have been sized
+and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should
+mostly be going down the cache hit route, and it is important that this be statically predicted as so.
+Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically
+gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries
+is important. */
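The cache probe performed by the resolve entry point can be sketched in C as follows; the field names mirror the ResolverCacheElem comments in the struct below, and the sketch is an illustration of the hashing scheme, not code from this file:

    // Hash the method table together with the pre-hashed token, mask down to a
    // byte offset into the cache, and compare both keys of the selected element.
    PCODE ProbeResolveCache(Object* pThis, UINT32 hashedToken, size_t token,
                            BYTE* cacheAddress, size_t mask)
    {
        size_t mt   = *(size_t*)pThis;                    // mov eax,[ecx]
        size_t hash = (((mt >> CALL_STUB_CACHE_NUM_BITS)  // shr/add folds upper MT bits into lower
                        + mt) ^ hashedToken) & mask;      // xor in the token hash, mask to the cache
        ResolveCacheElem* e = *(ResolveCacheElem**)(cacheAddress + hash);
        if ((size_t)e->pMT == mt && e->token == token)    // the two forward jne's below
            return (PCODE)e->target;                      // jmp eax
        return NULL;                                      // fall through to the miss path
    }

Note that _hashedToken is stored pre-shifted by LOG2_PTRSIZE (see ResolveHolder::Initialize below), so the masked result is already a pointer-aligned byte offset.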
+
+struct ResolveStub
+{
+    inline PCODE failEntryPoint()     { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; }
+    inline PCODE resolveEntryPoint()  { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; }
+    inline PCODE slowEntryPoint()     { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; }
+
+    inline INT32* pCounter()          { LIMITED_METHOD_CONTRACT; return _pCounter; }
+    inline UINT32 hashedToken()       { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; }
+    inline size_t cacheAddress()      { LIMITED_METHOD_CONTRACT; return _cacheAddress; }
+    inline size_t token()             { LIMITED_METHOD_CONTRACT; return _token; }
+    inline size_t size()              { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); }
+
+private:
+    friend struct ResolveHolder;
+
+    // ResolveStub::_failEntryPoint expects:
+    //       ecx: object (the "this" pointer)
+    //       eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call
+    BYTE    _failEntryPoint [2];    // 83 2d        sub
+    INT32*  _pCounter;              // xx xx xx xx  [counter],
+    BYTE    part0 [2];              // 01           01
+                                    // 7c           jl
+    BYTE    toPatcher;              // xx           backpatcher     ;must be forward jump, for perf reasons
+                                    //                              ;fall into the resolver stub
+
+    // ResolveStub::_resolveEntryPoint expects:
+    //       ecx: object (the "this" pointer)
+    //       eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call
+    BYTE    _resolveEntryPoint[6];  // 50           push  eax       ;save siteAddrForRegisterIndirect - this may be an indirect call
+                                    // 8b 01        mov   eax,[ecx] ;get the method table from the "this" pointer. This is the place
+                                    //                              ; where we are going to fault on null this. If you change it,
+                                    //                              ; change also AdjustContextForVirtualStub in excep.cpp!!!
+                                    // 52           push  edx
+                                    // 8b d0        mov   edx, eax
+    BYTE    part1 [6];              // c1 e8 0C     shr   eax,12    ;we are adding upper bits into lower bits of mt
+                                    // 03 c2        add   eax,edx
+                                    // 35           xor   eax,
+    UINT32  _hashedToken;           // xx xx xx xx  hashedToken     ;along with pre-hashed token
+    BYTE    part2 [1];              // 25           and   eax,
+    size_t  mask;                   // xx xx xx xx  cache_mask
+    BYTE    part3 [2];              // 8b 80        mov   eax, [eax+
+    size_t  _cacheAddress;          // xx xx xx xx  lookupCache]
+#ifdef STUB_LOGGING
+    BYTE    cntr1[2];               // ff 05        inc
+    size_t* c_call;                 // xx xx xx xx  [call_cache_counter]
+#endif //STUB_LOGGING
+    BYTE    part4 [2];              // 3b 10        cmp   edx,[eax+
+    // BYTE mtOffset;               //              ResolverCacheElem.pMT]
+    BYTE    part5 [1];              // 75           jne
+    BYTE    toMiss1;                // xx           miss            ;must be forward jump, for perf reasons
+    BYTE    part6 [2];              // 81 78        cmp   [eax+
+    BYTE    tokenOffset;            // xx           ResolverCacheElem.token],
+    size_t  _token;                 // xx xx xx xx  token
+    BYTE    part7 [1];              // 75           jne
+    BYTE    toMiss2;                // xx           miss            ;must be forward jump, for perf reasons
+    BYTE    part8 [2];              // 8B 40 xx     mov   eax,[eax+
+    BYTE    targetOffset;           //              ResolverCacheElem.target]
+    BYTE    part9 [6];              // 5a           pop   edx
+                                    // 83 c4 04     add   esp,4     ;throw away siteAddrForRegisterIndirect - we don't need it now
+                                    // ff e0        jmp   eax
+                                    //         miss:
+    BYTE    miss [1];               // 5a           pop   edx       ; don't pop siteAddrForRegisterIndirect - leave it on the stack for use by ResolveWorkerChainLookupAsmStub and/or ResolveWorkerAsmStub
+    BYTE    _slowEntryPoint[1];     // 68           push
+    size_t  _tokenPush;             // xx xx xx xx  token
+#ifdef STUB_LOGGING
+    BYTE    cntr2[2];               // ff 05        inc
+    size_t* c_miss;                 // xx xx xx xx  [miss_cache_counter]
+#endif //STUB_LOGGING
+    BYTE    part10 [1];             // e9           jmp
+    DISPL   _resolveWorkerDispl;    // xx xx xx xx  resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub
+    BYTE    patch[1];               // e8           call
+    DISPL   _backpatcherDispl;      // xx xx xx xx  backpatcherWorker == BackPatchWorkerAsmStub
+    BYTE    part11 [1];             // eb           jmp
+    BYTE    toResolveStub;          // xx           resolveStub, i.e. go back to _resolveEntryPoint
+};
+
+/* ResolveHolders are the containers for ResolveStubs; they provide
+for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by
+the token for which they are built. Efficiency of access requires that this token be aligned.
+For now, we have copied that field into the ResolveHolder itself; if the resolve stub is arranged such that
+any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder
+is not needed. */
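To make the align/pad arithmetic used by these holders concrete, here is a worked example, under the assumption of a 32-bit build (sizeof(void*) == 4) and an arbitrary illustrative field offset; it is not literal code from this file:

    // Suppose offsetof(ResolveStub, _token) % 4 == 3. Then:
    //   align[] is (4 - 3) % 4 = 1 byte, sliding the stub so that _token lands
    //   on a 4-byte boundary inside the holder, and
    //   pad[] is (4 - sizeof(ResolveStub) % 4 + offsetof(ResolveStub, _token)) % 4,
    //   rounding the holder back out to a whole number of pointers.
    // InitializeStatic() below asserts the intended invariant directly:
    //   ((offsetof(ResolveHolder, _stub) + offsetof(ResolveStub, _token)) % sizeof(void*)) == 0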
+struct ResolveHolder
+{
+    static void InitializeStatic();
+
+    void Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget,
+                    size_t dispatchToken, UINT32 hashedToken,
+                    void * cacheAddr, INT32 * counterAddr);
+
+    ResolveStub* stub()  { LIMITED_METHOD_CONTRACT; return &_stub; }
+
+    static ResolveHolder* FromFailEntry(PCODE failEntry);
+    static ResolveHolder* FromResolveEntry(PCODE resolveEntry);
+
+private:
+    //align _token in resolve stub
+
+    BYTE align[(sizeof(void*)-((offsetof(ResolveStub,_token))%sizeof(void*)))%sizeof(void*)
+#ifdef STUB_LOGGING // This turns out to be zero-sized in stub_logging case, and is an error. So round up.
+               +sizeof(void*)
+#endif
+              ];
+
+    ResolveStub _stub;
+
+//#ifdef STUB_LOGGING // This turns out to be zero-sized in non stub_logging case, and is an error. So remove
+    BYTE pad[(sizeof(void*)-((sizeof(ResolveStub))%sizeof(void*))+offsetof(ResolveStub,_token))%sizeof(void*)]; //fill out DWORD
+//#endif
+};
+
+#include <poppack.h>
+
+
+#ifdef DECLARE_DATA
+
+#ifndef DACCESS_COMPILE
+
+#ifdef _MSC_VER
+
+#ifdef CHAIN_LOOKUP
+/* This will perform a chained lookup of the entry if the initial cache lookup fails
+
+   Entry stack:
+            dispatch token
+            siteAddrForRegisterIndirect (used only if this is a RegisterIndirect dispatch call)
+            return address of caller to stub
+        Also, EAX contains the pointer to the first ResolveCacheElem pointer for the calculated
+        bucket in the cache table.
+*/
+__declspec (naked) void ResolveWorkerChainLookupAsmStub()
+{
+    enum
+    {
+        e_token_size           = 4,
+        e_indirect_addr_size   = 4,
+        e_caller_ret_addr_size = 4,
+    };
+    enum
+    {
+        // this is the part of the stack that is present as we enter this function:
+        e_token           = 0,
+        e_indirect_addr   = e_token + e_token_size,
+        e_caller_ret_addr = e_indirect_addr + e_indirect_addr_size,
+        e_ret_esp         = e_caller_ret_addr + e_caller_ret_addr_size,
+    };
+    enum
+    {
+        e_spilled_reg_size = 8,
+    };
+
+    // main loop setup
+    __asm {
+#ifdef STUB_LOGGING
+        inc     g_chained_lookup_call_counter
+#endif
+        // spill regs
+        push    edx
+        push    ecx
+        // move the token into edx
+        mov     edx,[esp+e_spilled_reg_size+e_token]
+        // move the MT into ecx
+        mov     ecx,[ecx]
+    }
+    main_loop:
+    __asm {
+        // get the next entry in the chain (don't bother checking the first entry again)
+        mov     eax,[eax+e_resolveCacheElem_offset_next]
+        // test if we hit a terminating NULL
+        test    eax,eax
+        jz      fail
+        // compare the MT of the ResolveCacheElem
+        cmp     ecx,[eax+e_resolveCacheElem_offset_mt]
+        jne     main_loop
+        // compare the token of the ResolveCacheElem
+        cmp     edx,[eax+e_resolveCacheElem_offset_token]
+        jne     main_loop
+        // success
+        // decrement success counter and move entry to start if necessary
+        sub     g_dispatch_cache_chain_success_counter,1
+        //@TODO: Perhaps this should be a jl for better branch prediction?
+        jge     nopromote
+        // be quick to reset the counter so we don't get a bunch of contending threads
+        add     g_dispatch_cache_chain_success_counter,CALL_STUB_CACHE_INITIAL_SUCCESS_COUNT
+        // promote the entry to the beginning of the chain
+        mov     ecx,eax
+        call    VirtualCallStubManager::PromoteChainEntry
+    }
+    nopromote:
+    __asm {
+        // clean up the stack and jump to the target
+        pop     ecx
+        pop     edx
+        add     esp,(e_caller_ret_addr - e_token)
+        mov     eax,[eax+e_resolveCacheElem_offset_target]
+        jmp     eax
+    }
+    fail:
+    __asm {
+#ifdef STUB_LOGGING
+        inc     g_chained_lookup_miss_counter
+#endif
+        // restore registers
+        pop     ecx
+        pop     edx
+        jmp     ResolveWorkerAsmStub
+    }
+}
+#endif
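The chain walk above, restated as C (a sketch only; the pNext/pMT/token field names mirror the e_resolveCacheElem_* offsets the assembly uses, and the counter/promotion bookkeeping is elided):

    ResolveCacheElem* ChainLookup(ResolveCacheElem* pBucket, MethodTable* pMT, size_t token)
    {
        // The stub skips the first element: the inline cache probe already rejected it.
        for (ResolveCacheElem* e = pBucket->pNext; e != NULL; e = e->pNext)
        {
            if (e->pMT == pMT && e->token == token)
                return e;   // hit: the stub then jumps to e->target, occasionally
                            // promoting the entry via PromoteChainEntry first
        }
        return NULL;        // miss: the stub tail-jumps to ResolveWorkerAsmStub
    }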
+
+/* Call the resolver, it will return where we are supposed to go.
+   There is a little stack magic here, in that we are entered with one
+   of the arguments for the resolver (the token) on the stack already.
+   We just push the other arguments, <this> in the call frame and the call site pointer,
+   and call the resolver.
+
+   On return we have the stack frame restored to the way it was when the ResolveStub
+   was called, i.e. as it was at the actual call site. The return value from
+   the resolver is the address we need to transfer control to, simulating a direct
+   call from the original call site. If we get passed back NULL, it means that the
+   resolution failed, an unimplemented method is being called.
+
+   Entry stack:
+            dispatch token
+            siteAddrForRegisterIndirect (used only if this is a RegisterIndirect dispatch call)
+            return address of caller to stub
+
+   Call stack:
+            pointer to TransitionBlock
+            call site
+            dispatch token
+            TransitionBlock
+                ArgumentRegisters (ecx, edx)
+                CalleeSavedRegisters (ebp, ebx, esi, edi)
+            return address of caller to stub
+*/
+__declspec (naked) void ResolveWorkerAsmStub()
+{
+    CANNOT_HAVE_CONTRACT;
+
+    __asm {
+        //
+        // The stub arguments are where we want to setup the TransitionBlock. We will
+        // setup the TransitionBlock later once we can trash them
+        //
+        // push ebp-frame
+        // push ebp
+        // mov ebp,esp
+
+        // save CalleeSavedRegisters
+        // push ebx
+
+        push    esi
+        push    edi
+
+        // push ArgumentRegisters
+        push    ecx
+        push    edx
+
+        mov     esi, esp
+
+        push    [esi + 4*4]     // dispatch token
+        push    [esi + 5*4]     // siteAddrForRegisterIndirect
+        push    esi             // pTransitionBlock
+
+        // Setup up proper EBP frame now that the stub arguments can be trashed
+        mov     [esi + 4*4],ebx
+        mov     [esi + 5*4],ebp
+        lea     ebp, [esi + 5*4]
+
+        // Make the call
+        call    VSD_ResolveWorker
+
+        // From here on, mustn't trash eax
+
+        // pop ArgumentRegisters
+        pop     edx
+        pop     ecx
+
+        // pop CalleeSavedRegisters
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+
+        // Now jump to the target
+        jmp     eax             // continue on into the method
+    }
+}
+
+#ifdef FEATURE_REMOTING
+/* For an in-context dispatch, we will find the target. This
+   is the slow path, and erects a MachState structure for
+   creating a HelperMethodFrame
+
+   Entry stack:
+            dispatch token
+            return address of caller to stub
+
+   Call stack:
+            pointer to StubDispatchFrame
+            call site
+            dispatch token
+            StubDispatchFrame
+                GSCookie
+                negspace
+                vptr
+                datum
+                ArgumentRegisters (ecx, edx)
+                CalleeSavedRegisters (ebp, ebx, esi, edi)
+            return address of caller to stub
+*/
+__declspec (naked) void InContextTPDispatchAsmStub()
+{
+    CANNOT_HAVE_CONTRACT;
+
+    __asm {
+        // Pop dispatch token
+        pop     eax
+
+        // push ebp-frame
+        push    ebp
+        mov     ebp,esp
+
+        // save CalleeSavedRegisters
+        push    ebx
+        push    esi
+        push    edi
+
+        // push ArgumentRegisters
+        push    ecx
+        push    edx
+
+        mov     esi, esp
+
+        push    eax         // token
+        push    esi         // pTransitionContext
+
+        // Make the call
+        call    VSD_GetTargetForTPWorker
+
+        // From here on, mustn't trash eax
+
+        // pop ArgumentRegisters
+        pop     edx
+        pop     ecx
+
+        // pop CalleeSavedRegisters
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+
+        // Now jump to the target
+        jmp     eax         // continue on into the method
+    }
+}
+
+/* For an in-context dispatch, we will try to find the target in
+   the resolve cache. If this fails, we will jump to the full
+   version of InContextTPDispatchAsmStub
+
+   Entry stack:
+            dispatch slot number of interface MD
+            caller return address
+   ECX: this object
+*/
+__declspec (naked) void InContextTPQuickDispatchAsmStub()
+{
+    CANNOT_HAVE_CONTRACT;
+
+    __asm {
+        // Spill registers
+        push    ecx
+        push    edx
+
+        // Arg 2 - token
+        mov     eax, [esp + 8]
+        push    eax
+
+        // Arg 1 - this
+        push    ecx
+
+        // Make the call
+        call    VSD_GetTargetForTPWorkerQuick
+
+        // Restore registers
+        pop     edx
+        pop     ecx
+
+        // Test to see if we found a target
+        test    eax, eax
+        jnz     TargetFound
+
+        // If no target, jump to the slow worker
+        jmp     InContextTPDispatchAsmStub
+
+    TargetFound:
+        // We got a target, so pop off the token and jump to it
+        add     esp,4
+        jmp     eax
+    }
+}
+#endif // FEATURE_REMOTING
+
+/* Call the callsite back patcher. The fail stub piece of the resolver is being
+called too often, i.e. dispatch stubs are failing the expected MT test too often.
+This stub wraps the call to the BackPatchWorker to take care of any stack magic
+needed.
+*/
+__declspec (naked) void BackPatchWorkerAsmStub()
+{
+    CANNOT_HAVE_CONTRACT;
+
+    __asm {
+        push    EBP
+        mov     ebp,esp
+        push    EAX         // it may contain siteAddrForRegisterIndirect
+        push    ECX
+        push    EDX
+        push    EAX         // push any indirect call address as the second arg to BackPatchWorker
+        push    [EBP+8]     // and push return address as the first arg to BackPatchWorker
+        call    VirtualCallStubManager::BackPatchWorkerStatic
+        pop     EDX
+        pop     ECX
+        pop     EAX
+        mov     esp,ebp
+        pop     ebp
+        ret
+    }
+}
+
+#endif // _MSC_VER
+
+#ifdef _DEBUG
+//
+// This function verifies that a pointer to an indirection cell lives inside a delegate object.
+// In the delegate case the indirection cell is held by the delegate itself in _methodPtrAux; when the delegate Invoke is
+// called, the shuffle thunk is first invoked and that will call into the virtual dispatch stub.
+// Before control is given to the virtual dispatch stub, a pointer to the indirection cell (thus an interior pointer to the delegate)
+// is pushed in EAX
+//
+BOOL isDelegateCall(BYTE *interiorPtr)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    if (GCHeap::GetGCHeap()->IsHeapPointer((void*)interiorPtr))
+    {
+        Object *delegate = (Object*)(interiorPtr - DelegateObject::GetOffsetOfMethodPtrAux());
+        VALIDATEOBJECTREF(ObjectToOBJECTREF(delegate));
+        _ASSERTE(delegate->GetMethodTable()->IsDelegate());
+
+        return TRUE;
+    }
+    return FALSE;
+}
+#endif
+
+StubCallSite::StubCallSite(TADDR siteAddrForRegisterIndirect, PCODE returnAddr)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    // Not used
+    // if (isCallRelative(returnAddr))
+    // {
+    //      m_siteAddr = returnAddr - sizeof(DISPL);
+    // }
+    // else
+    if (isCallRelativeIndirect((BYTE *)returnAddr))
+    {
+        m_siteAddr = *dac_cast<PTR_PTR_PCODE>(returnAddr - sizeof(PCODE));
+    }
+    else
+    {
+        _ASSERTE(isCallRegisterIndirect((BYTE *)returnAddr) || isDelegateCall((BYTE *)siteAddrForRegisterIndirect));
+        m_siteAddr = dac_cast<PTR_PCODE>(siteAddrForRegisterIndirect);
+    }
+}
+
+// the special return address for VSD tailcalls
+extern "C" void STDCALL JIT_TailCallReturnFromVSD();
+
+PCODE StubCallSite::GetCallerAddress()
+{
+    LIMITED_METHOD_CONTRACT;
+    if (m_returnAddr != (PCODE)JIT_TailCallReturnFromVSD)
+        return m_returnAddr;
+
+    // Find the tailcallframe in the frame chain and get the actual caller from the first TailCallFrame
+    return TailCallFrame::FindTailCallFrame(GetThread()->GetFrame())->GetCallerAddress();
+}
+
+#ifdef STUB_LOGGING
+extern size_t g_lookup_inline_counter;
+extern size_t g_mono_call_counter;
+extern size_t g_mono_miss_counter;
+extern size_t g_poly_call_counter;
+extern size_t g_poly_miss_counter;
+#endif
+
+/* Template used to generate the stub. We generate a stub by allocating a block of
+   memory, copying the template over it, and just updating the specific fields that need
+   to be changed.
+*/
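The lifecycle implied by this fill-in-the-template pattern, sketched end to end (the allocation call shown is hypothetical; real callers live in VirtualCallStubManager and carve holders out of executable stub heaps):

    // 1. InitializeStatic() fills the file-scope template (opcodes + 0xcccccccc placeholders).
    // 2. A holder is carved out of executable memory (allocator name is hypothetical).
    LookupHolder* pHolder = (LookupHolder*)AllocateFromStubHeap(sizeof(LookupHolder));
    // 3. Initialize() copies the template and patches the per-site fields.
    pHolder->Initialize(resolveWorkerTarget, dispatchToken);
    // 4. The entry point is what gets wired into the call site.
    PCODE entry = pHolder->stub()->entryPoint();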
+LookupStub lookupInit;
+
+void LookupHolder::InitializeStatic()
+{
+    static_assert_no_msg(((offsetof(LookupStub, _token)+offsetof(LookupHolder, _stub)) % sizeof(void*)) == 0);
+    static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0);
+
+    lookupInit._entryPoint [0]     = 0x50;
+    lookupInit._entryPoint [1]     = 0x68;
+    static_assert_no_msg(sizeof(lookupInit._entryPoint) == 2);
+    lookupInit._token              = 0xcccccccc;
+#ifdef STUB_LOGGING
+    lookupInit.cntr2 [0]           = 0xff;
+    lookupInit.cntr2 [1]           = 0x05;
+    static_assert_no_msg(sizeof(lookupInit.cntr2) == 2);
+    lookupInit.c_lookup            = &g_call_lookup_counter;
+#endif //STUB_LOGGING
+    lookupInit.part2 [0]           = 0xe9;
+    static_assert_no_msg(sizeof(lookupInit.part2) == 1);
+    lookupInit._resolveWorkerDispl = 0xcccccccc;
+}
+
+void LookupHolder::Initialize(PCODE resolveWorkerTarget, size_t dispatchToken)
+{
+    _stub = lookupInit;
+
+    //fill in the stub specific fields
+    //@TODO: Get rid of this duplication of data.
+    _stub._token              = dispatchToken;
+    _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &_stub._resolveWorkerDispl + sizeof(DISPL));
+}
+
+LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry)
+{
+    LIMITED_METHOD_CONTRACT;
+    LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) );
+    //    _ASSERTE(lookupHolder->_stub._entryPoint[0] == lookupInit._entryPoint[0]);
+    return lookupHolder;
+}
+
+
+/* Template used to generate the stub. We generate a stub by allocating a block of
+   memory, copying the template over it, and just updating the specific fields that need
+   to be changed.
+*/
+DispatchStub dispatchInit;
+
+void DispatchHolder::InitializeStatic()
+{
+    // Check that _expectedMT is aligned in the DispatchHolder
+    static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub,_expectedMT)) % sizeof(void*)) == 0);
+    static_assert_no_msg((sizeof(DispatchHolder) % sizeof(void*)) == 0);
+
+#ifndef STUB_LOGGING
+    dispatchInit._entryPoint [0] = 0x81;
+    dispatchInit._entryPoint [1] = 0x39;
+    static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2);
+
+    dispatchInit._expectedMT     = 0xcccccccc;
+    dispatchInit.jmpOp1 [0]      = 0x0f;
+    dispatchInit.jmpOp1 [1]      = 0x85;
+    static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2);
+
+    dispatchInit._failDispl      = 0xcccccccc;
+    dispatchInit.jmpOp2          = 0xe9;
+    dispatchInit._implDispl      = 0xcccccccc;
+#else //STUB_LOGGING
+    dispatchInit._entryPoint [0] = 0xff;
+    dispatchInit._entryPoint [1] = 0x05;
+    static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2);
+
+    dispatchInit.d_call          = &g_mono_call_counter;
+    dispatchInit.cmpOp [0]       = 0x81;
+    dispatchInit.cmpOp [1]       = 0x39;
+    static_assert_no_msg(sizeof(dispatchInit.cmpOp) == 2);
+
+    dispatchInit._expectedMT     = 0xcccccccc;
+    dispatchInit.jmpOp1 [0]      = 0x0f;
+    dispatchInit.jmpOp1 [1]      = 0x84;
+    static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2);
+
+    dispatchInit._implDispl      = 0xcccccccc;
+    dispatchInit.fail [0]        = 0xff;
+    dispatchInit.fail [1]        = 0x05;
+    static_assert_no_msg(sizeof(dispatchInit.fail) == 2);
+
+    dispatchInit.d_miss          = &g_mono_miss_counter;
+    dispatchInit.jmpFail         = 0xe9;
+    dispatchInit._failDispl      = 0xcccccccc;
+#endif //STUB_LOGGING
+}
+
+void DispatchHolder::Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT)
+{
+    _stub = dispatchInit;
+
+    //fill in the stub specific fields
+    _stub._expectedMT = (size_t) expectedMT;
+    _stub._failDispl  = failTarget - ((PCODE) &_stub._failDispl + sizeof(DISPL));
+    _stub._implDispl  = implTarget - ((PCODE) &_stub._implDispl + sizeof(DISPL));
+}
+
+DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry)
+{
+    LIMITED_METHOD_CONTRACT;
+    DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) );
+    //    _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]);
+    return dispatchHolder;
+}
+
+
+/* Template used to generate the stub. We generate a stub by allocating a block of
+   memory, copying the template over it, and just updating the specific fields that need
+   to be changed.
+*/
+
+ResolveStub resolveInit;
+
+void ResolveHolder::InitializeStatic()
+{
+    //Check that _token is aligned in ResolveHolder
+    static_assert_no_msg(((offsetof(ResolveHolder, _stub) + offsetof(ResolveStub, _token)) % sizeof(void*)) == 0);
+    static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0);
+
+    resolveInit._failEntryPoint [0]    = 0x83;
+    resolveInit._failEntryPoint [1]    = 0x2d;
+    static_assert_no_msg(sizeof(resolveInit._failEntryPoint) == 2);
+
+    resolveInit._pCounter              = (INT32 *) (size_t) 0xcccccccc;
+    resolveInit.part0 [0]              = 0x01;
+    resolveInit.part0 [1]              = 0x7c;
+    static_assert_no_msg(sizeof(resolveInit.part0) == 2);
+
+    resolveInit.toPatcher              = (offsetof(ResolveStub, patch) - (offsetof(ResolveStub, toPatcher) + 1)) & 0xFF;
+
+    resolveInit._resolveEntryPoint [0] = 0x50;
+    resolveInit._resolveEntryPoint [1] = 0x8b;
+    resolveInit._resolveEntryPoint [2] = 0x01;
+    resolveInit._resolveEntryPoint [3] = 0x52;
+    resolveInit._resolveEntryPoint [4] = 0x8b;
+    resolveInit._resolveEntryPoint [5] = 0xd0;
+    static_assert_no_msg(sizeof(resolveInit._resolveEntryPoint) == 6);
+
+    resolveInit.part1 [0]              = 0xc1;
+    resolveInit.part1 [1]              = 0xe8;
+    resolveInit.part1 [2]              = CALL_STUB_CACHE_NUM_BITS;
+    resolveInit.part1 [3]              = 0x03;
+    resolveInit.part1 [4]              = 0xc2;
+    resolveInit.part1 [5]              = 0x35;
+    static_assert_no_msg(sizeof(resolveInit.part1) == 6);
+
+    resolveInit._hashedToken           = 0xcccccccc;
+    resolveInit.part2 [0]              = 0x25;
+    static_assert_no_msg(sizeof(resolveInit.part2) == 1);
+
+    resolveInit.mask                   = (CALL_STUB_CACHE_MASK << LOG2_PTRSIZE);
+    resolveInit.part3 [0]              = 0x8b;
+    resolveInit.part3 [1]              = 0x80;
+    static_assert_no_msg(sizeof(resolveInit.part3) == 2);
+
+    resolveInit._cacheAddress          = 0xcccccccc;
+#ifdef STUB_LOGGING
+    resolveInit.cntr1 [0]              = 0xff;
+    resolveInit.cntr1 [1]              = 0x05;
+    static_assert_no_msg(sizeof(resolveInit.cntr1) == 2);
+
+    resolveInit.c_call                 = &g_poly_call_counter;
+#endif //STUB_LOGGING
+    resolveInit.part4 [0]              = 0x3b;
+    resolveInit.part4 [1]              = 0x10;
+    static_assert_no_msg(sizeof(resolveInit.part4) == 2);
+
+    // resolveInit.mtOffset            = offsetof(ResolveCacheElem,pMT) & 0xFF;
+    static_assert_no_msg(offsetof(ResolveCacheElem,pMT) == 0);
+
+    resolveInit.part5 [0]              = 0x75;
+    static_assert_no_msg(sizeof(resolveInit.part5) == 1);
+
+    resolveInit.toMiss1                = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1);
+
+    resolveInit.part6 [0]              = 0x81;
+    resolveInit.part6 [1]              = 0x78;
+    static_assert_no_msg(sizeof(resolveInit.part6) == 2);
+
+    resolveInit.tokenOffset            = offsetof(ResolveCacheElem,token) & 0xFF;
+
+    resolveInit._token                 = 0xcccccccc;
+
+    resolveInit.part7 [0]              = 0x75;
+    static_assert_no_msg(sizeof(resolveInit.part7) == 1);
+
+    resolveInit.part8 [0]              = 0x8b;
+    resolveInit.part8 [1]              = 0x40;
+    static_assert_no_msg(sizeof(resolveInit.part8) == 2);
+
+    resolveInit.targetOffset           = offsetof(ResolveCacheElem,target) & 0xFF;
+
+    resolveInit.toMiss2                = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1);
+
+    resolveInit.part9 [0]              = 0x5a;
+    resolveInit.part9 [1]              = 0x83;
+    resolveInit.part9 [2]              = 0xc4;
+    resolveInit.part9 [3]              = 0x04;
+    resolveInit.part9 [4]              = 0xff;
+    resolveInit.part9 [5]              = 0xe0;
+    static_assert_no_msg(sizeof(resolveInit.part9) == 6);
+
+    resolveInit.miss [0]               = 0x5a;
+//  resolveInit.miss [1]               = 0xb8;
+//  resolveInit._hashedTokenMov        = 0xcccccccc;
+    resolveInit._slowEntryPoint [0]    = 0x68;
+    resolveInit._tokenPush             = 0xcccccccc;
+#ifdef STUB_LOGGING
+    resolveInit.cntr2 [0]              = 0xff;
+    resolveInit.cntr2 [1]              = 0x05;
+    resolveInit.c_miss                 = &g_poly_miss_counter;
+#endif //STUB_LOGGING
+    resolveInit.part10 [0]             = 0xe9;
+    resolveInit._resolveWorkerDispl    = 0xcccccccc;
+
+    resolveInit.patch [0]              = 0xe8;
+    resolveInit._backpatcherDispl      = 0xcccccccc;
+    resolveInit.part11 [0]             = 0xeb;
+    resolveInit.toResolveStub          = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub) + 1)) & 0xFF;
+}
+
+void ResolveHolder::Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget,
+                               size_t dispatchToken, UINT32 hashedToken,
+                               void * cacheAddr, INT32 * counterAddr)
+{
+    _stub = resolveInit;
+
+    //fill in the stub specific fields
+    _stub._pCounter           = counterAddr;
+    _stub._hashedToken        = hashedToken << LOG2_PTRSIZE;
+    _stub._cacheAddress       = (size_t) cacheAddr;
+    _stub._token              = dispatchToken;
+//  _stub._hashedTokenMov     = hashedToken;
+    _stub._tokenPush          = dispatchToken;
+    _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &_stub._resolveWorkerDispl + sizeof(DISPL));
+    _stub._backpatcherDispl   = patcherTarget      - ((PCODE) &_stub._backpatcherDispl   + sizeof(DISPL));
+}
+
+ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry)
+{
+    LIMITED_METHOD_CONTRACT;
+    ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) );
+    //    _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]);
+    return resolveHolder;
+}
+
+ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry)
+{
+    LIMITED_METHOD_CONTRACT;
+    ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) );
+    //    _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]);
+    return resolveHolder;
+}
+
+#endif // DACCESS_COMPILE
+
+VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE stubStartAddress)
+{
+    SUPPORTS_DAC;
+#ifdef DACCESS_COMPILE
+
+    return SK_BREAKPOINT;  // Dac always uses the slower lookup
+
+#else
+
+    StubKind stubKind = SK_UNKNOWN;
+
+    EX_TRY
+    {
+        // If stubStartAddress is completely bogus, then this might AV,
+        // so we protect it with SEH. An AV here is OK.
+        AVInRuntimeImplOkayHolder AVOkay;
+
+        WORD firstWord = *((WORD*) stubStartAddress);
+
+#ifndef STUB_LOGGING
+        if (firstWord == 0x3981)        // bytes 81 39: "cmp [ecx],imm32", DispatchStub's first instruction
+#else //STUB_LOGGING
+        if (firstWord == 0x05ff)        // bytes ff 05: "inc [counter]", the logging DispatchStub's first instruction
+#endif
+        {
+            stubKind = SK_DISPATCH;
+        }
+        else if (firstWord == 0x6850)   // bytes 50 68: "push eax; push imm32", LookupStub's first instructions
+        {
+            stubKind = SK_LOOKUP;
+        }
+        else if (firstWord == 0x8b50)   // bytes 50 8b: "push eax; mov eax,[ecx]", ResolveStub's resolve entry point
+        {
+            stubKind = SK_RESOLVE;
+        }
+        else
+        {
+            BYTE firstByte  = ((BYTE*) stubStartAddress)[0];
+            BYTE secondByte = ((BYTE*) stubStartAddress)[1];
+
+            if ((firstByte  == X86_INSTR_INT3) ||
+                (secondByte == X86_INSTR_INT3))
+            {
+                stubKind = SK_BREAKPOINT;
+            }
+        }
+    }
+    EX_CATCH
+    {
+        stubKind = SK_UNKNOWN;
+    }
+    EX_END_CATCH(SwallowAllExceptions);
+
+    return stubKind;
+
+#endif // DACCESS_COMPILE
+}
+
+#endif //DECLARE_DATA
+
+#endif // _VIRTUAL_CALL_STUB_X86_H