From 850164ee70077e0970d7ab4e4bf2ca51809b92e8 Mon Sep 17 00:00:00 2001
From: noahfalk
Date: Fri, 24 Mar 2017 22:43:47 -0700
Subject: Tiered Compilation step 1

Tiered compilation is a new feature we are experimenting with that aims to
improve startup times. Initially we jit methods non-optimized, then switch
to an optimized version once the method has been called a number of times.
More details about the current feature operation are in the comments of
TieredCompilation.cpp.

This is only the first step in a longer process building the feature. The
primary goal for now is to avoid regressing any runtime behavior in the
shipping configuration in which the COMPLUS variable is OFF, while putting
enough code in place that we can measure performance in the daily builds and
make incremental progress visible to collaborators and reviewers. The design
of the TieredCompilationManager is likely to change substantively, and the
call counter may also change.
---
 src/vm/CMakeLists.txt         |   4 +-
 src/vm/appdomain.cpp          |  16 ++
 src/vm/appdomain.hpp          |  28 ++++
 src/vm/callcounter.cpp        |  98 +++++++++++
 src/vm/callcounter.h          |  87 ++++++++++
 src/vm/ceeload.cpp            |   4 +-
 src/vm/eeconfig.cpp           |   8 +
 src/vm/eeconfig.h             |   9 +
 src/vm/i386/stublinkerx86.cpp |  28 +++-
 src/vm/method.cpp             |   2 +-
 src/vm/method.hpp             |  52 ++++++
 src/vm/methodtablebuilder.cpp |  10 ++
 src/vm/precode.cpp            |   4 +-
 src/vm/precode.h              |   2 +-
 src/vm/prestub.cpp            |  60 ++++++-
 src/vm/tieredcompilation.cpp  | 377 ++++++++++++++++++++++++++++++++++++++++++
 src/vm/tieredcompilation.h    |  51 ++++++
 17 files changed, 823 insertions(+), 17 deletions(-)
 create mode 100644 src/vm/callcounter.cpp
 create mode 100644 src/vm/callcounter.h
 create mode 100644 src/vm/tieredcompilation.cpp
 create mode 100644 src/vm/tieredcompilation.h

diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt
index 21f1659c6f..861d68c9c9 100644
--- a/src/vm/CMakeLists.txt
+++ b/src/vm/CMakeLists.txt
@@ -97,6 +97,7 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON
     securitydescriptorassembly.cpp
     sigformat.cpp
     siginfo.cpp
+    spinlock.cpp
     stackwalk.cpp
     stublink.cpp
     stubmgr.cpp
@@ -137,6 +138,7 @@ set(VM_SOURCES_WKS
     assemblynative.cpp
     assemblyspec.cpp
     cachelinealloc.cpp
+    callcounter.cpp
     callhelpers.cpp
     ceemain.cpp
     clrconfignative.cpp
@@ -220,7 +222,6 @@ set(VM_SOURCES_WKS
     sha1.cpp
     simplerwlock.cpp
     sourceline.cpp
-    spinlock.cpp
     stackingallocator.cpp
     stringliteralmap.cpp
     stubcache.cpp
@@ -230,6 +231,7 @@ set(VM_SOURCES_WKS
     synch.cpp
     synchronizationcontextnative.cpp
     testhookmgr.cpp
+    tieredcompilation.cpp
    threaddebugblockinginfo.cpp
    threadsuspend.cpp
    typeparse.cpp
diff --git a/src/vm/appdomain.cpp b/src/vm/appdomain.cpp
index 8e79018982..5664740b5d 100644
--- a/src/vm/appdomain.cpp
+++ b/src/vm/appdomain.cpp
@@ -4333,6 +4333,10 @@ void AppDomain::Init()
     }
 #endif //FEATURE_COMINTEROP
 
+#ifdef FEATURE_TIERED_COMPILATION
+    m_callCounter.SetTieredCompilationManager(GetTieredCompilationManager());
+    m_tieredCompilationManager.Init(GetId());
+#endif
 #endif // CROSSGEN_COMPILE
 } // AppDomain::Init
@@ -8254,6 +8258,18 @@ void AppDomain::Exit(BOOL fRunFinalizers, BOOL fAsyncExit)
         }
     }
 
+    // Tell the tiered compilation manager to stop initiating any new work for background
+    // jit optimization. It's possible the standard thread unwind mechanisms would pre-emptively
+    // evacuate the jit threadpool worker threads from the domain on their own, but I see no reason
+    // to take the risk of relying on them when we can easily augment with a cooperative
+    // shutdown check. This notification only initiates the process of evacuating the threads
+    // and then the UnwindThreads() call below is where blocking will occur to ensure the threads
+    // have exited the domain.
+    //
+#ifdef FEATURE_TIERED_COMPILATION
+    m_tieredCompilationManager.OnAppDomainShutdown();
+#endif
+
     //
     // Set up blocks so no threads can enter except for the finalizer and the thread
     // doing the unload.
diff --git a/src/vm/appdomain.hpp b/src/vm/appdomain.hpp
index d6023dd3b2..ed83eb3e2f 100644
--- a/src/vm/appdomain.hpp
+++ b/src/vm/appdomain.hpp
@@ -43,6 +43,11 @@
 
 #include "appxutil.h"
 
+#ifdef FEATURE_TIERED_COMPILATION
+#include "tieredcompilation.h"
+#include "callcounter.h"
+#endif
+
 class BaseDomain;
 class SystemDomain;
 class SharedDomain;
@@ -3823,6 +3828,29 @@ public:
 
 #endif
 
+#if defined(FEATURE_TIERED_COMPILATION)
+
+public:
+    TieredCompilationManager * GetTieredCompilationManager()
+    {
+        LIMITED_METHOD_CONTRACT;
+        return &m_tieredCompilationManager;
+    }
+
+private:
+    TieredCompilationManager m_tieredCompilationManager;
+
+public:
+    CallCounter * GetCallCounter()
+    {
+        LIMITED_METHOD_CONTRACT;
+        return &m_callCounter;
+    }
+
+private:
+    CallCounter m_callCounter;
+#endif
+
 #ifdef FEATURE_COMINTEROP
 
 private:
diff --git a/src/vm/callcounter.cpp b/src/vm/callcounter.cpp
new file mode 100644
index 0000000000..90013c79fb
--- /dev/null
+++ b/src/vm/callcounter.cpp
@@ -0,0 +1,98 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+// ===========================================================================
+// File: CallCounter.CPP
+//
+// ===========================================================================
+
+
+
+#include "common.h"
+#include "excep.h"
+#include "log.h"
+#include "tieredcompilation.h"
+#include "callcounter.h"
+
+#ifdef FEATURE_TIERED_COMPILATION
+
+CallCounter::CallCounter()
+{
+    LIMITED_METHOD_CONTRACT;
+
+    m_lock.Init(LOCK_TYPE_DEFAULT);
+}
+
+// Init our connection to the tiered compilation manager during
+// AppDomain startup. This pointer will remain valid for the lifetime
+// of the AppDomain.
+void CallCounter::SetTieredCompilationManager(TieredCompilationManager* pTieredCompilationManager)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        CAN_TAKE_LOCK;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    m_pTieredCompilationManager.Store(pTieredCompilationManager);
+}
+
+// This is called by the prestub each time the method is invoked in a particular
+// AppDomain (the AppDomain for which AppDomain.GetCallCounter() == this). These
+// calls continue until we backpatch the prestub to avoid future calls. This allows
+// us to track the number of calls to each method and use it as a trigger for tiered
+// compilation.
+//
+// Returns TRUE if no future invocations are needed (we reached the count we cared about)
+// and FALSE otherwise. It is permissible to keep calling even when TRUE was previously
+// returned, and multi-threaded race conditions will surely cause this to occur.
+BOOL CallCounter::OnMethodCalled(MethodDesc* pMethodDesc)
+{
+    STANDARD_VM_CONTRACT;
+
+    _ASSERTE(pMethodDesc->IsEligibleForTieredCompilation());
+
+    // PERF: This is a simple-to-implement, but not very performant, call counter.
+    // Currently this is only called until we reach a fixed call count and then
+    // disabled. It's likely we'll want to improve this at some point, but
+    // it's not as bad as you might expect.
+    // Allocating a counter inline in the
+    // MethodDesc or at some location computable from the MethodDesc should
+    // eliminate 1 pointer per-method (the MethodDesc* key) and the CPU
+    // overhead to acquire the lock/search the dictionary. Depending on where it
+    // is we may also be able to reduce it to a 1 byte counter without wasting the
+    // following bytes for alignment. Further work to inline the OnMethodCalled
+    // callback directly into the jitted code would eliminate the CPU overhead of
+    // leaving the prestub unpatched, but may not be good overall as it increases
+    // the size of the jitted code.
+
+
+    TieredCompilationManager* pCallCounterSink = NULL;
+    int callCount;
+    {
+        // Be careful if you convert to something fully lock/interlocked-free that
+        // you correctly handle what happens when some N simultaneous calls don't
+        // all increment the counter. The slight drift is probably negligible for tuning
+        // but TieredCompilationManager::OnMethodCalled() doesn't expect multiple calls
+        // each claiming to be exactly the threshold call count needed to trigger
+        // optimization.
+        SpinLockHolder holder(&m_lock);
+        CallCounterEntry* pEntry = const_cast<CallCounterEntry*>(m_methodToCallCount.LookupPtr(pMethodDesc));
+        if (pEntry == NULL)
+        {
+            callCount = 1;
+            m_methodToCallCount.Add(CallCounterEntry(pMethodDesc, callCount));
+        }
+        else
+        {
+            pEntry->callCount++;
+            callCount = pEntry->callCount;
+        }
+    }
+
+    return m_pTieredCompilationManager.Load()->OnMethodCalled(pMethodDesc, callCount);
+}
+
+#endif // FEATURE_TIERED_COMPILATION
diff --git a/src/vm/callcounter.h b/src/vm/callcounter.h
new file mode 100644
index 0000000000..82d14b76d9
--- /dev/null
+++ b/src/vm/callcounter.h
@@ -0,0 +1,87 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+// ===========================================================================
+// File: CallCounter.h
+//
+// ===========================================================================
+
+
+#ifndef CALL_COUNTER_H
+#define CALL_COUNTER_H
+
+#ifdef FEATURE_TIERED_COMPILATION
+
+// One entry in our dictionary mapping methods to the number of times they
+// have been invoked
+struct CallCounterEntry
+{
+    CallCounterEntry() {}
+    CallCounterEntry(const MethodDesc* m, const int c)
+        : pMethod(m), callCount(c) {}
+
+    const MethodDesc* pMethod;
+    int callCount;
+};
+
+class CallCounterHashTraits : public DefaultSHashTraits<CallCounterEntry>
+{
+public:
+    typedef typename DefaultSHashTraits<CallCounterEntry>::element_t element_t;
+    typedef typename DefaultSHashTraits<CallCounterEntry>::count_t count_t;
+
+    typedef const MethodDesc* key_t;
+
+    static key_t GetKey(element_t e)
+    {
+        LIMITED_METHOD_CONTRACT;
+        return e.pMethod;
+    }
+    static BOOL Equals(key_t k1, key_t k2)
+    {
+        LIMITED_METHOD_CONTRACT;
+        return k1 == k2;
+    }
+    static count_t Hash(key_t k)
+    {
+        LIMITED_METHOD_CONTRACT;
+        return (count_t)(size_t)k;
+    }
+
+    static const element_t Null() { LIMITED_METHOD_CONTRACT; return element_t(NULL, 0); }
+    static const element_t Deleted() { LIMITED_METHOD_CONTRACT; return element_t((const MethodDesc*)-1, 0); }
+    static bool IsNull(const element_t &e) { LIMITED_METHOD_CONTRACT; return e.pMethod == NULL; }
+    static bool IsDeleted(const element_t &e) { return e.pMethod == (const MethodDesc*)-1; }
+};
+
+typedef SHash<NoRemoveSHashTraits<CallCounterHashTraits>> CallCounterHash;
+
+
+// This is a per-appdomain cache of call counts for all code in that AppDomain.
+// Each method invocation should trigger a call to OnMethodCalled (until it is disabled per-method)
+// and the CallCounter will forward the call to the TieredCompilationManager including the
+// current call count.
+class CallCounter
+{
+public:
+#if defined(DACCESS_COMPILE) || defined(CROSSGEN_COMPILE)
+    CallCounter() {}
+#else
+    CallCounter();
+#endif
+
+    void SetTieredCompilationManager(TieredCompilationManager* pTieredCompilationManager);
+    BOOL OnMethodCalled(MethodDesc* pMethodDesc);
+
+private:
+
+    VolatilePtr<TieredCompilationManager> m_pTieredCompilationManager;
+
+    // fields protected by lock
+    SpinLock m_lock;
+    CallCounterHash m_methodToCallCount;
+};
+
+#endif // FEATURE_TIERED_COMPILATION
+
+#endif // CALL_COUNTER_H
diff --git a/src/vm/ceeload.cpp b/src/vm/ceeload.cpp
index 710195d809..0845d8608f 100644
--- a/src/vm/ceeload.cpp
+++ b/src/vm/ceeload.cpp
@@ -6511,7 +6511,9 @@ MethodDesc *Module::FindMethod(mdToken pMethod)
         CONTRACT_VIOLATION(ThrowsViolation);
         char szMethodName [MAX_CLASSNAME_LENGTH];
         CEEInfo::findNameOfToken(this, pMethod, szMethodName, COUNTOF (szMethodName));
-        LOG((LF_IJW, LL_INFO10, "Failed to find Method: %s for Vtable Fixup\n", szMethodName));
+        // This used to be LF_IJW, but was changed to LF_INTEROP to reclaim a bit in our log facilities.
+        // IJW itself is not supported in coreclr so this code should never be run.
+        LOG((LF_INTEROP, LL_INFO10, "Failed to find Method: %s for Vtable Fixup\n", szMethodName));
 #endif // _DEBUG
     }
     EX_END_CATCH(SwallowAllExceptions)
diff --git a/src/vm/eeconfig.cpp b/src/vm/eeconfig.cpp
index 8b980ef17b..812d1df671 100644
--- a/src/vm/eeconfig.cpp
+++ b/src/vm/eeconfig.cpp
@@ -402,6 +402,10 @@ HRESULT EEConfig::Init()
 #if defined(_DEBUG)
     bDiagnosticSuspend = false;
 #endif
+
+#if defined(FEATURE_TIERED_COMPILATION)
+    fTieredCompilation = false;
+#endif
 
     // After initialization, register the code:#GetConfigValueCallback method with code:CLRConfig to let
     // CLRConfig access config files. This is needed because CLRConfig lives outside the VM and can't
@@ -1277,6 +1281,10 @@ HRESULT EEConfig::sync()
 
     dwSleepOnExit = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_SleepOnExit);
 
+#if defined(FEATURE_TIERED_COMPILATION)
+    fTieredCompilation = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredCompilation) != 0;
+#endif
+
     return hr;
 }
diff --git a/src/vm/eeconfig.h b/src/vm/eeconfig.h
index 2c0d5ce20c..e97385e3da 100644
--- a/src/vm/eeconfig.h
+++ b/src/vm/eeconfig.h
@@ -294,6 +294,11 @@ public:
     bool AddRejitNops(void) const {LIMITED_METHOD_DAC_CONTRACT; return fAddRejitNops; }
     bool JitMinOpts(void) const {LIMITED_METHOD_CONTRACT; return fJitMinOpts; }
 
+    // Tiered Compilation config
+#if defined(FEATURE_TIERED_COMPILATION)
+    bool TieredCompilation(void) const {LIMITED_METHOD_CONTRACT; return fTieredCompilation; }
+#endif
+
     BOOL PInvokeRestoreEsp(BOOL fDefault) const
     {
         LIMITED_METHOD_CONTRACT;
@@ -1182,6 +1187,10 @@ private:
     //----------------------------------------------------------------
     DWORD testThreadAbort;
 #endif
+#if defined(FEATURE_TIERED_COMPILATION)
+    bool fTieredCompilation;
+#endif
+
 public:
     HRESULT GetConfiguration_DontUse_(__in_z LPCWSTR pKey, ConfigSearch direction, __deref_out_opt LPCWSTR* value);
diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp
index d9c613064d..d951314676 100644
--- a/src/vm/i386/stublinkerx86.cpp
+++ b/src/vm/i386/stublinkerx86.cpp
@@ -6698,20 +6698,34 @@ BOOL FixupPrecode::SetTargetInterlocked(TADDR target, TADDR expected)
     INT64 oldValue = *(INT64*)this;
     BYTE* pOldValue = (BYTE*)&oldValue;
 
-    if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] != FixupPrecode::TypePrestub)
-        return FALSE;
-
     MethodDesc * pMD = (MethodDesc*)GetMethodDesc();
     g_IBCLogger.LogMethodPrecodeWriteAccess(pMD);
 
     INT64 newValue = oldValue;
     BYTE* pNewValue = (BYTE*)&newValue;
 
-    pNewValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] = FixupPrecode::Type;
-
-    pOldValue[offsetof(FixupPrecode,m_op)] = X86_INSTR_CALL_REL32;
-    pNewValue[offsetof(FixupPrecode,m_op)] = X86_INSTR_JMP_REL32;
+    if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] == FixupPrecode::TypePrestub)
+    {
+        pNewValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] = FixupPrecode::Type;
+
+        pOldValue[offsetof(FixupPrecode, m_op)] = X86_INSTR_CALL_REL32;
+        pNewValue[offsetof(FixupPrecode, m_op)] = X86_INSTR_JMP_REL32;
+    }
+    else if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] == FixupPrecode::Type)
+    {
+#ifdef FEATURE_TIERED_COMPILATION
+        // No change needed, jmp is already in place
+#else
+        // Setting the target more than once is unexpected
+        return FALSE;
+#endif
+    }
+    else
+    {
+        // Pre-existing code doesn't conform to the expectations for a FixupPrecode
+        return FALSE;
+    }
 
     *(INT32*)(&pNewValue[offsetof(FixupPrecode, m_rel32)]) =
 #ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS
         pMD->IsLCGMethod() ?
diff --git a/src/vm/method.cpp b/src/vm/method.cpp
index bfd9c73e32..834ab63d58 100644
--- a/src/vm/method.cpp
+++ b/src/vm/method.cpp
@@ -2300,7 +2300,7 @@ PCODE MethodDesc::TryGetMultiCallableAddrOfCode(CORINFO_ACCESS_FLAGS accessFlags
         }
         else
         {
-            if (IsPointingToNativeCode())
+            if (IsPointingToStableNativeCode())
                 return GetNativeCode();
         }
diff --git a/src/vm/method.hpp b/src/vm/method.hpp
index bbcb012d54..9545da2248 100644
--- a/src/vm/method.hpp
+++ b/src/vm/method.hpp
@@ -1290,6 +1290,58 @@ public:
 
 public:
 
+#ifdef FEATURE_TIERED_COMPILATION
+    // Is this method allowed to be recompiled and the entrypoint redirected so that we
+    // can optimize its performance? Eligibility is invariant for the lifetime of a method.
+    BOOL IsEligibleForTieredCompilation()
+    {
+        LIMITED_METHOD_DAC_CONTRACT;
+
+        // This policy will need to change some more before the tiered compilation feature
+        // can be properly supported across a broad range of scenarios. For instance it
+        // wouldn't interact correctly with debugging or profiling at the moment because we
+        // enable it too aggressively and it conflicts with the operations of those features.
+
+        // Keep in-sync with MethodTableBuilder::NeedsNativeCodeSlot(bmtMDMethod * pMDMethod)
+        // In the future we might want mutable vtable slots too, but that would require
+        // more work around the runtime to prevent those mutable pointers from leaking
+        return g_pConfig->TieredCompilation() &&
+            !GetModule()->HasNativeOrReadyToRunImage() &&
+            !IsEnCMethod() &&
+            HasNativeCodeSlot();
+
+    }
+#endif
+
+    // Does this method force the NativeCodeSlot to stay fixed after it
+    // is first initialized to native code? Consumers of the native code
+    // pointer need to be very careful about if and when they cache it
+    // if it is not stable.
+    //
+    // The stability of the native code pointer is separate from the
+    // stability of the entrypoint. A stable entrypoint can be a precode
+    // which dispatches to an unstable native code pointer.
+    BOOL IsNativeCodeStableAfterInit()
+    {
+        LIMITED_METHOD_DAC_CONTRACT;
+        return
+#ifdef FEATURE_TIERED_COMPILATION
+            !IsEligibleForTieredCompilation() &&
+#endif
+            !IsEnCMethod();
+    }
+
+    // Is this method currently pointing to native code that will never change?
+    BOOL IsPointingToStableNativeCode()
+    {
+        LIMITED_METHOD_DAC_CONTRACT;
+
+        if (!IsNativeCodeStableAfterInit())
+            return FALSE;
+
+        return IsPointingToNativeCode();
+    }
+
     // Note: We are skipping the prestub based on additional information from the JIT.
     // (e.g. that the call is on same this ptr or that the this ptr is not null).
     // Thus we can end up with a running NGENed method for which IsPointingToNativeCode is false!
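To make the relationship between these three predicates concrete, here is a minimal sketch of how a hypothetical VM-internal consumer might use them to decide whether a native code address may be cached. TryGetCacheableCode is an invented name for illustration and is not part of this change; it mirrors the TryGetMultiCallableAddrOfCode edit above, where IsPointingToNativeCode was replaced by the stricter IsPointingToStableNativeCode:

    // Illustrative only. A cached code pointer must never go stale, so mere
    // existence of native code (IsPointingToNativeCode) is not enough; the
    // method must also promise never to republish its code.
    PCODE TryGetCacheableCode(MethodDesc * pMD)
    {
        if (pMD->IsPointingToStableNativeCode())
        {
            // Stable: neither tiered compilation nor EnC will ever redirect
            // this address, so it is safe to burn into a cache.
            return pMD->GetNativeCode();
        }

        // Eligible for tiered compilation (or EnC): the native code slot may
        // be rewritten later, so callers must keep going through the stable
        // entrypoint (e.g. the precode) instead of caching the raw address.
        return NULL;
    }
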
diff --git a/src/vm/methodtablebuilder.cpp b/src/vm/methodtablebuilder.cpp
index 5244f6c30a..503c13af5b 100644
--- a/src/vm/methodtablebuilder.cpp
+++ b/src/vm/methodtablebuilder.cpp
@@ -7032,6 +7032,16 @@ MethodTableBuilder::NeedsNativeCodeSlot(bmtMDMethod * pMDMethod)
     LIMITED_METHOD_CONTRACT;
 
+#ifdef FEATURE_TIERED_COMPILATION
+    // Keep in-sync with MethodDesc::IsEligibleForTieredCompilation()
+    if (g_pConfig->TieredCompilation() &&
+        !GetModule()->HasNativeOrReadyToRunImage() &&
+        (pMDMethod->GetMethodType() == METHOD_TYPE_NORMAL || pMDMethod->GetMethodType() == METHOD_TYPE_INSTANTIATED))
+    {
+        return TRUE;
+    }
+#endif
+
     return GetModule()->IsEditAndContinueEnabled();
 }
diff --git a/src/vm/precode.cpp b/src/vm/precode.cpp
index 180e1709c5..9707b2756b 100644
--- a/src/vm/precode.cpp
+++ b/src/vm/precode.cpp
@@ -421,14 +421,14 @@ void Precode::Init(PrecodeType t, MethodDesc* pMD, LoaderAllocator *pLoaderAlloc
     _ASSERTE(IsValidType(GetType()));
 }
 
-BOOL Precode::SetTargetInterlocked(PCODE target)
+BOOL Precode::SetTargetInterlocked(PCODE target, BOOL fOnlyRedirectFromPrestub)
 {
     WRAPPER_NO_CONTRACT;
 
     PCODE expected = GetTarget();
     BOOL ret = FALSE;
 
-    if (!IsPointingToPrestub(expected))
+    if (fOnlyRedirectFromPrestub && !IsPointingToPrestub(expected))
         return FALSE;
 
     g_IBCLogger.LogMethodPrecodeWriteAccess(GetMethodDesc());
diff --git a/src/vm/precode.h b/src/vm/precode.h
index 0afa762647..7dd4cd22f0 100644
--- a/src/vm/precode.h
+++ b/src/vm/precode.h
@@ -256,7 +256,7 @@ public:
     void Init(PrecodeType t, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator);
 
 #ifndef DACCESS_COMPILE
-    BOOL SetTargetInterlocked(PCODE target);
+    BOOL SetTargetInterlocked(PCODE target, BOOL fOnlyRedirectFromPrestub = TRUE);
 
     // Reset precode to point to prestub
     void Reset();
diff --git a/src/vm/prestub.cpp b/src/vm/prestub.cpp
index 87b36fa275..e6467710dd 100644
--- a/src/vm/prestub.cpp
+++ b/src/vm/prestub.cpp
@@ -48,6 +48,10 @@
 #include "perfmap.h"
 #endif
 
+#ifdef FEATURE_TIERED_COMPILATION
+#include "callcounter.h"
+#endif
+
 #ifndef DACCESS_COMPILE
 
 EXTERN_C void STDCALL ThePreStub();
@@ -267,10 +271,12 @@ PCODE MethodDesc::MakeJitWorker(COR_ILMETHOD_DECODER* ILHeader, CORJIT_FLAGS fla
 
     PCODE pCode = NULL;
     ULONG sizeOfCode = 0;
+#if defined(FEATURE_INTERPRETER) || defined(FEATURE_TIERED_COMPILATION)
+    BOOL fStable = TRUE;  // True iff the new code address (to be stored in pCode), is a stable entry point.
+#endif
 #ifdef FEATURE_INTERPRETER
     PCODE pPreviousInterpStub = NULL;
     BOOL fInterpreted = FALSE;
-    BOOL fStable = TRUE;  // True iff the new code address (to be stored in pCode), is a stable entry point.
 #endif
 
 #ifdef FEATURE_MULTICOREJIT
     bool fBackgroundThread = flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_MCJIT_BACKGROUND);
 #endif
 
+    // If this is the first stage of a tiered compilation progression, use min-opt, otherwise
+    // use default compilation options
+#ifdef FEATURE_TIERED_COMPILATION
+    if (!IsEligibleForTieredCompilation())
+    {
+        fStable = TRUE;
+    }
+    else
+    {
+        fStable = FALSE;
+        flags.Add(CORJIT_FLAGS(CORJIT_FLAGS::CORJIT_FLAG_MIN_OPT));
+    }
+#endif
+
     {
         // Enter the global lock which protects the list of all functions being JITd
         ListLockHolder pJitLock (GetDomain()->GetJitLock());
@@ -1283,6 +1303,22 @@ PCODE MethodDesc::DoPrestub(MethodTable *pDispatchingMT)
     if (!IsPointingToPrestub())
 #endif
     {
+        // If we are counting calls for tiered compilation, leave the prestub
+        // in place so that we can continue intercepting method invocations.
+        // Only when the TieredCompilationManager has received enough call
+        // notifications for this method do we back-patch it.
+#ifdef FEATURE_TIERED_COMPILATION
+        PCODE pNativeCode = GetNativeCode();
+        if (pNativeCode && IsEligibleForTieredCompilation())
+        {
+            CallCounter * pCallCounter = GetAppDomain()->GetCallCounter();
+            BOOL doBackPatch = pCallCounter->OnMethodCalled(this);
+            if (!doBackPatch)
+            {
+                return pNativeCode;
+            }
+        }
+#endif
         LOG((LF_CLASSLOADER, LL_INFO10000,
                 "    In PreStubWorker, method already jitted, backpatching call point\n"));
 
@@ -1308,8 +1344,8 @@ PCODE MethodDesc::DoPrestub(MethodTable *pDispatchingMT)
     else if (IsIL() || IsNoMetadata())
     {
         // remember if we need to backpatch the MethodTable slot
-        BOOL fBackpatch           = !fRemotingIntercepted
-                                    && !IsEnCMethod();
+        BOOL fBackpatch           = !fRemotingIntercepted
+                                    && IsNativeCodeStableAfterInit();
 
 #ifdef FEATURE_PREJIT
         //
@@ -1583,6 +1619,22 @@ PCODE MethodDesc::DoPrestub(MethodTable *pDispatchingMT)
         MemoryBarrier();
 #endif
 
+        // If we are counting calls for tiered compilation, leave the prestub
+        // in place so that we can continue intercepting method invocations.
+        // Only when the TieredCompilationManager has received enough call
+        // notifications for this method do we back-patch it.
+#ifdef FEATURE_TIERED_COMPILATION
+        if (pCode && IsEligibleForTieredCompilation())
+        {
+            CallCounter * pCallCounter = GetAppDomain()->GetCallCounter();
+            BOOL doBackPatch = pCallCounter->OnMethodCalled(this);
+            if (!doBackPatch)
+            {
+                return pCode;
+            }
+        }
+#endif
+
         if (pCode != NULL)
         {
             if (HasPrecode())
@@ -1712,7 +1764,7 @@ static PCODE PatchNonVirtualExternalMethod(MethodDesc * pMD, PCODE pCode, PTR_CO
     //
 #ifdef HAS_FIXUP_PRECODE
     if (pMD->HasPrecode() && pMD->GetPrecode()->GetType() == PRECODE_FIXUP
-        && !pMD->IsEnCMethod()
+        && pMD->IsNativeCodeStableAfterInit()
 #ifndef HAS_REMOTING_PRECODE
         && !pMD->IsRemotingInterceptedViaPrestub()
 #endif
diff --git a/src/vm/tieredcompilation.cpp b/src/vm/tieredcompilation.cpp
new file mode 100644
index 0000000000..64378dbfc2
--- /dev/null
+++ b/src/vm/tieredcompilation.cpp
@@ -0,0 +1,377 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+// ===========================================================================
+// File: TieredCompilation.CPP
+//
+// ===========================================================================
+
+
+
+#include "common.h"
+#include "excep.h"
+#include "log.h"
+#include "win32threadpool.h"
+#include "tieredcompilation.h"
+
+// TieredCompilationManager determines which methods should be recompiled and
+// how they should be recompiled to best optimize the running code. It then
+// handles logistics of getting new code created and installed.
+//
+//
+// # Current feature state
+//
+// This feature is incomplete and currently experimental. To enable it
+// you need to set COMPLUS_EXPERIMENTAL_TieredCompilation = 1. When the environment
+// variable is unset the runtime should work as normal, but when it is set there are
+// anticipated incompatibilities and limited cross-cutting test coverage so far.
+//   Profiler - Anticipated to be incompatible with ReJIT, untested in general
+//   ETW - Anticipated to be incompatible with the ReJIT id of the MethodJitted
+//         rundown events
+//   Managed debugging - Anticipated to be incompatible with breakpoints/stepping
+//                       that are active when a method is recompiled.
+//
+//
+// Testing that has been done so far largely consists of regression testing with
+// the environment variable off + functional/perf testing of the Music Store ASP.NET
+// workload as a basic example that the feature can work. Running the coreclr repo
+// tests with the env var on generates about a dozen failures in JIT tests. The issues
+// are likely related to assertions about optimization behavior but haven't been
+// properly investigated yet.
+//
+// If you decide to try this out on a new workload and run into trouble a quick note
+// on github is appreciated, but this code may have high churn for a while to come and
+// there will be no sense investing a lot of time investigating only to have it rendered
+// moot by changes. I aim to keep this comment updated as things change.
+//
+//
+// # Important entrypoints in this code:
+//
+//
+// a) .ctor and Init(...) -   called once during AppDomain initialization
+// b) OnMethodCalled(...) -   called when a method is being invoked. When a method
+//                            has been called enough times this is currently the only
+//                            trigger that initiates re-compilation.
+// c) OnAppDomainShutdown() - called during AppDomain::Exit() to begin the process
+//                            of stopping tiered compilation. After this point no more
+//                            background optimization work will be initiated but in-progress
+//                            work still needs to complete.
+//
+// # Overall workflow
+//
+// Methods initially call into OnMethodCalled() and once the call count exceeds
+// a fixed limit we queue work on to our internal list of methods needing to
+// be recompiled (m_methodsToOptimize). If there is currently no thread
+// servicing our queue asynchronously then we use the runtime threadpool
+// QueueUserWorkItem to recruit one. During the callback for each threadpool work
+// item we handle as many methods as possible in a fixed period of time, then
+// queue another threadpool work item if m_methodsToOptimize hasn't been drained.
+//
+// The background thread enters at StaticOptimizeMethodsCallback(), enters the
+// appdomain, and then begins calling OptimizeMethod on each method in the
+// queue. For each method we jit it, then update the precode so that future
+// entrypoint callers will run the new code.
+//
+// # Error handling
+//
+// The overall principle is: don't swallow terminal failures that may have corrupted the
+// process (AV for example), but otherwise for any transient issue or functional limitation
+// that prevents us from optimizing, log it for diagnostics and then back out gracefully,
+// continuing to run the less optimal code. The feature should be constructed so that
+// errors are limited to OS resource exhaustion or poorly behaved managed code
+// (for example within an AssemblyResolve event or static constructor triggered by the JIT).
+
+#ifdef FEATURE_TIERED_COMPILATION
+
+// Called at AppDomain construction
+TieredCompilationManager::TieredCompilationManager() :
+    m_isAppDomainShuttingDown(FALSE),
+    m_countOptimizationThreadsRunning(0),
+    m_callCountOptimizationThreshhold(30),
+    m_optimizationQuantumMs(50)
+{
+    LIMITED_METHOD_CONTRACT;
+    m_lock.Init(LOCK_TYPE_DEFAULT);
+}
+
+// Called at AppDomain Init
+void TieredCompilationManager::Init(ADID appDomainId)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        CAN_TAKE_LOCK;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    SpinLockHolder holder(&m_lock);
+    m_domainId = appDomainId;
+}
+
+// Called each time code in this AppDomain has been run. This is our sole entrypoint to begin
+// tiered compilation for now. Returns TRUE if no more notifications are necessary, but
+// more notifications may come anyway.
+//
+// currentCallCount is pre-incremented, that is to say the value is 1 on first call for a given
+// method.
+BOOL TieredCompilationManager::OnMethodCalled(MethodDesc* pMethodDesc, DWORD currentCallCount)
+{
+    STANDARD_VM_CONTRACT;
+
+    if (currentCallCount < m_callCountOptimizationThreshhold)
+    {
+        return FALSE; // continue notifications for this method
+    }
+    else if (currentCallCount > m_callCountOptimizationThreshhold)
+    {
+        return TRUE; // stop notifications for this method
+    }
+
+    // Insert the method into the optimization queue and trigger a thread to service
+    // the queue if needed.
+    //
+    // Terminal exceptions escape as exceptions, but all other errors should gracefully
+    // return to the caller. Non-terminal error conditions should be rare (i.e. OOM,
+    // OS failure to create a thread) and we consider it reasonable for some methods
+    // to go unoptimized or have their optimization arbitrarily delayed under these
+    // circumstances. Note an error here could affect concurrent threads running this
+    // code. Those threads will observe m_countOptimizationThreadsRunning > 0 and return,
+    // then QueueUserWorkItem fails on this thread, lowering the count and leaving them
+    // unserviced. Synchronous retries appear unlikely to offer any material improvement
+    // and complicating the code to narrow an already rare error case isn't desirable.
+    {
+        SListElem<MethodDesc*>* pMethodListItem = new (nothrow) SListElem<MethodDesc*>(pMethodDesc);
+        SpinLockHolder holder(&m_lock);
+        if (pMethodListItem != NULL)
+        {
+            m_methodsToOptimize.InsertTail(pMethodListItem);
+        }
+
+        if (0 == m_countOptimizationThreadsRunning && !m_isAppDomainShuttingDown)
+        {
+            // Our current policy throttles at 1 thread, but in the future we
+            // could experiment with more parallelism.
+            m_countOptimizationThreadsRunning++;
+        }
+        else
+        {
+            return TRUE; // stop notifications for this method
+        }
+    }
+
+    EX_TRY
+    {
+        if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
+        {
+            SpinLockHolder holder(&m_lock);
+            m_countOptimizationThreadsRunning--;
+            STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run), method=%pM\n",
+                pMethodDesc);
+        }
+    }
+    EX_CATCH
+    {
+        SpinLockHolder holder(&m_lock);
+        m_countOptimizationThreadsRunning--;
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+            "Exception queuing work item to threadpool, hr=0x%x, method=%pM\n",
+            GET_EXCEPTION()->GetHR(), pMethodDesc);
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+
+    return TRUE; // stop notifications for this method
+}
+
+void TieredCompilationManager::OnAppDomainShutdown()
+{
+    SpinLockHolder holder(&m_lock);
+    m_isAppDomainShuttingDown = TRUE;
+}
+
+// This is the initial entrypoint for the background thread, called by
+// the threadpool.
+DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
+{
+    STANDARD_VM_CONTRACT;
+
+    TieredCompilationManager * pTieredCompilationManager = (TieredCompilationManager *)args;
+    pTieredCompilationManager->OptimizeMethodsCallback();
+
+    return 0;
+}
+
+// This method will process one or more methods from the optimization queue
+// on a background thread. Each such method will be jitted with code
+// optimizations enabled and then installed as the active implementation
+// of the method entrypoint.
+//
+// We need to be careful not to work for too long in a single invocation
+// of this method or we could starve the threadpool and force
+// it to create unnecessary additional threads.
+void TieredCompilationManager::OptimizeMethodsCallback()
+{
+    STANDARD_VM_CONTRACT;
+
+    // This app domain shutdown check isn't required for correctness
+    // but it should reduce some unneeded exceptions trying
+    // to enter a closed AppDomain
+    {
+        SpinLockHolder holder(&m_lock);
+        if (m_isAppDomainShuttingDown)
+        {
+            m_countOptimizationThreadsRunning--;
+            return;
+        }
+    }
+
+    ULONGLONG startTickCount = CLRGetTickCount64();
+    MethodDesc* pMethod = NULL;
+    EX_TRY
+    {
+        ENTER_DOMAIN_ID(m_domainId);
+        {
+            while (true)
+            {
+                {
+                    SpinLockHolder holder(&m_lock);
+                    pMethod = GetNextMethodToOptimize();
+                    if (pMethod == NULL ||
+                        m_isAppDomainShuttingDown)
+                    {
+                        m_countOptimizationThreadsRunning--;
+                        break;
+                    }
+
+                }
+                OptimizeMethod(pMethod);
+
+                // If we have been running for too long, return the thread to the threadpool and queue another event.
+                // This gives the threadpool a chance to service other requests on this thread before returning to
+                // this work.
+                ULONGLONG currentTickCount = CLRGetTickCount64();
+                if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
+                {
+                    if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
+                    {
+                        SpinLockHolder holder(&m_lock);
+                        m_countOptimizationThreadsRunning--;
+                        STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OptimizeMethodsCallback: "
+                            "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
+                    }
+                    break;
+                }
+            }
+        }
+        END_DOMAIN_TRANSITION;
+    }
+    EX_CATCH
+    {
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
+            "Unhandled exception during method optimization, hr=0x%x, last method=%pM\n",
+            GET_EXCEPTION()->GetHR(), pMethod);
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+}
+
+// Jit compiles and installs new optimized code for a method.
+// Called on a background thread.
+void TieredCompilationManager::OptimizeMethod(MethodDesc* pMethod)
+{
+    STANDARD_VM_CONTRACT;
+
+    _ASSERTE(pMethod->IsEligibleForTieredCompilation());
+    PCODE pJittedCode = CompileMethod(pMethod);
+    if (pJittedCode != NULL)
+    {
+        InstallMethodCode(pMethod, pJittedCode);
+    }
+}
+
+// Compiles new optimized code for a method.
+// Called on a background thread.
+PCODE TieredCompilationManager::CompileMethod(MethodDesc* pMethod)
+{
+    STANDARD_VM_CONTRACT;
+
+    PCODE pCode = NULL;
+    ULONG sizeOfCode = 0;
+    EX_TRY
+    {
+        CORJIT_FLAGS flags = CORJIT_FLAGS(CORJIT_FLAGS::CORJIT_FLAG_MCJIT_BACKGROUND);
+        flags.Add(CORJIT_FLAGS(CORJIT_FLAGS::CORJIT_FLAG_SPEED_OPT));
+
+        if (pMethod->IsDynamicMethod())
+        {
+            ILStubResolver* pResolver = pMethod->AsDynamicMethodDesc()->GetILStubResolver();
+            flags.Add(pResolver->GetJitFlags());
+            COR_ILMETHOD_DECODER* pILheader = pResolver->GetILHeader();
+            pCode = UnsafeJitFunction(pMethod, pILheader, flags, &sizeOfCode);
+        }
+        else
+        {
+            COR_ILMETHOD_DECODER::DecoderStatus status;
+            COR_ILMETHOD_DECODER header(pMethod->GetILHeader(), pMethod->GetModule()->GetMDImport(), &status);
+            pCode = UnsafeJitFunction(pMethod, &header, flags, &sizeOfCode);
+        }
+    }
+    EX_CATCH
+    {
+        // Failing to jit should be rare but acceptable. We will leave whatever code already exists in place.
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_INFO10, "TieredCompilationManager::CompileMethod: Method %pM failed to jit, hr=0x%x\n",
+            pMethod, GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions)
+
+    return pCode;
+}
+
+// Updates the MethodDesc and precode so that future invocations of a method will
+// execute the native code pointed to by pCode.
+// Called on a background thread.
+void TieredCompilationManager::InstallMethodCode(MethodDesc* pMethod, PCODE pCode)
+{
+    STANDARD_VM_CONTRACT;
+
+    _ASSERTE(!pMethod->IsNativeCodeStableAfterInit());
+
+    PCODE pExistingCode = pMethod->GetNativeCode();
+    if (!pMethod->SetNativeCodeInterlocked(pCode, pExistingCode))
+    {
+        // We aren't there yet, but when the feature is finished we shouldn't be racing
+        // against any other code mutator and there would be no reason for this to fail.
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_INFO10, "TieredCompilationManager::InstallMethodCode: Method %pM failed to update native code slot. Code=%pK\n",
+            pMethod, pCode);
+    }
+    else
+    {
+        Precode* pPrecode = pMethod->GetPrecode();
+        if (!pPrecode->SetTargetInterlocked(pCode, FALSE))
+        {
+            // We aren't there yet, but when the feature is finished we shouldn't be racing
+            // against any other code mutator and there would be no reason for this to fail.
+            STRESS_LOG2(LF_TIEREDCOMPILATION, LL_INFO10, "TieredCompilationManager::InstallMethodCode: Method %pM failed to update precode. Code=%pK\n",
+                pMethod, pCode);
+        }
+    }
+}
+
+// Dequeues the next method in the optimization queue.
+// This should be called with m_lock already held and runs
+// on the background thread.
+MethodDesc* TieredCompilationManager::GetNextMethodToOptimize()
+{
+    STANDARD_VM_CONTRACT;
+
+    SListElem<MethodDesc*>* pElem = m_methodsToOptimize.RemoveHead();
+    if (pElem != NULL)
+    {
+        MethodDesc* pMD = pElem->GetValue();
+        delete pElem;
+        return pMD;
+    }
+    return NULL;
+}
+
+#endif // FEATURE_TIERED_COMPILATION
diff --git a/src/vm/tieredcompilation.h b/src/vm/tieredcompilation.h
new file mode 100644
index 0000000000..71236c5374
--- /dev/null
+++ b/src/vm/tieredcompilation.h
@@ -0,0 +1,51 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+// ===========================================================================
+// File: TieredCompilation.h
+//
+// ===========================================================================
+
+
+#ifndef TIERED_COMPILATION_H
+#define TIERED_COMPILATION_H
+
+#ifdef FEATURE_TIERED_COMPILATION
+
+// TieredCompilationManager determines which methods should be recompiled and
+// how they should be recompiled to best optimize the running code. It then
+// handles logistics of getting new code created and installed.
+class TieredCompilationManager
+{
+public:
+#if defined(DACCESS_COMPILE) || defined(CROSSGEN_COMPILE)
+    TieredCompilationManager() {}
+#else
+    TieredCompilationManager();
+#endif
+
+    void Init(ADID appDomainId);
+    BOOL OnMethodCalled(MethodDesc* pMethodDesc, DWORD currentCallCount);
+    void OnAppDomainShutdown();
+
+private:
+
+    static DWORD StaticOptimizeMethodsCallback(void* args);
+    void OptimizeMethodsCallback();
+    void OptimizeMethod(MethodDesc* pMethod);
+    MethodDesc* GetNextMethodToOptimize();
+    PCODE CompileMethod(MethodDesc* pMethod);
+    void InstallMethodCode(MethodDesc* pMethod, PCODE pCode);
+
+    SpinLock m_lock;
+    SList<SListElem<MethodDesc*>> m_methodsToOptimize;
+    ADID m_domainId;
+    BOOL m_isAppDomainShuttingDown;
+    DWORD m_countOptimizationThreadsRunning;
+    DWORD m_callCountOptimizationThreshhold;
+    DWORD m_optimizationQuantumMs;
+};
+
+#endif // FEATURE_TIERED_COMPILATION
+
+#endif // TIERED_COMPILATION_H
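For readers who want the promotion logic at a glance, here is a simplified, standalone model of the counting flow this patch implements. All names here (SimpleMethod, OnMethodCalled in this sketch) are invented for illustration; the real code uses a SpinLock-protected SHash, the runtime threadpool, and precode back-patching as shown in the diff above:

    // Simplified model of the tier-up decision, ignoring locking, AppDomain
    // transitions, and the error handling present in the real patch.
    #include <cstdio>

    static const int CALL_COUNT_THRESHOLD = 30; // m_callCountOptimizationThreshhold above

    struct SimpleMethod
    {
        int  callCount = 0;      // tracked per method until promotion
        bool optimized = false;  // stands in for installing optimized code
    };

    // Returns true when the prestub may be backpatched (counting can stop).
    // Mirrors TieredCompilationManager::OnMethodCalled: counts below the
    // threshold keep notifications coming; exactly at the threshold we queue
    // the re-jit; above it we just say "stop notifying".
    bool OnMethodCalled(SimpleMethod & m)
    {
        m.callCount++; // pre-incremented: 1 on the first call, like the real counter
        if (m.callCount < CALL_COUNT_THRESHOLD)
            return false;           // keep routing calls through the prestub
        if (m.callCount == CALL_COUNT_THRESHOLD)
            m.optimized = true;     // stands in for queueing the background re-jit
        return true;                // stop notifications; backpatch the entrypoint
    }

    int main()
    {
        SimpleMethod m;
        for (int i = 0; i < 35; i++)
        {
            if (OnMethodCalled(m) && m.optimized)
            {
                printf("call %d: promoted to optimized code\n", m.callCount);
                break;
            }
        }
        return 0;
    }

Note the deliberate asymmetry the real comments call out: because the counter is incremented under a lock, exactly one caller observes the count equal to the threshold, so the queue insertion happens once even though many threads may race through OnMethodCalled.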