// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
// ===========================================================================
// File: TieredCompilation.CPP
//
// ===========================================================================

#include "common.h"
#include "excep.h"
#include "log.h"
#include "win32threadpool.h"
#include "threadsuspend.h"
#include "tieredcompilation.h"

// TieredCompilationManager determines which methods should be recompiled and
// how they should be recompiled to best optimize the running code. It then
// handles the logistics of getting new code created and installed.
//
//
// # Current feature state
//
// This feature is incomplete and currently experimental. To enable it
// you need to set COMPLUS_EXPERIMENTAL_TieredCompilation = 1. When the environment
// variable is unset the runtime should work as normal, but when it is set there are
// anticipated incompatibilities and limited cross-cutting test coverage so far.
//     Profiler          - Anticipated to be incompatible with ReJIT, untested in general
//     ETW               - Anticipated to be incompatible with the ReJIT id of the
//                         MethodJitted rundown events
//     Managed debugging - Anticipated to be incompatible with breakpoints/stepping that
//                         are active when a method is recompiled.
//
//
// Testing so far largely consists of regression testing with the environment variable
// off, plus functional/perf testing of the Music Store ASP.NET workload as a basic
// example that the feature can work. Running the coreclr repo tests with the env var
// on generates about a dozen failures in JIT tests. The issues are likely related to
// assertions about optimization behavior but haven't been properly investigated yet.
//
// If you decide to try this out on a new workload and run into trouble, a quick note
// on GitHub is appreciated, but this code may have high churn for a while to come and
// there is no sense investing a lot of time investigating only to have it rendered
// moot by changes. I aim to keep this comment updated as things change.
//
//
// # Important entrypoints in this code:
//
// a) .ctor and Init(...)   - called once during AppDomain initialization
// b) OnMethodCalled(...)   - called when a method is being invoked. When a method
//                            has been called enough times, this is currently the only
//                            trigger that initiates re-compilation.
// c) OnAppDomainShutdown() - called during AppDomain::Exit() to begin the process
//                            of stopping tiered compilation. After this point no more
//                            background optimization work will be initiated, but
//                            in-progress work still needs to complete.
//
// # Overall workflow
//
// Methods initially call into OnMethodCalled() and once the call count exceeds
// a fixed limit we queue work onto our internal list of methods needing to
// be recompiled (m_methodsToOptimize). If there is currently no thread
// servicing our queue asynchronously then we use the runtime threadpool's
// QueueUserWorkItem to recruit one. During the callback for each threadpool work
// item we handle as many methods as possible in a fixed period of time, then
// queue another threadpool work item if m_methodsToOptimize hasn't been drained.
//
// The background thread enters at StaticOptimizeMethodsCallback(), enters the
// appdomain, and then begins calling OptimizeMethod on each method in the
// queue. For each method we jit it, then update the precode so that future
// entrypoint callers will run the new code.
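//
// As an illustrative sketch only (the names below are the members and functions
// defined in this file), the life of one hot method looks roughly like this:
//
//   OnMethodCalled(pMD, count)        - count reaches m_callCountOptimizationThreshhold
//    -> AsyncPromoteMethodToTier1()   - adds a tier1 NativeCodeVersion to m_methodsToOptimize
//    -> ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, ...)
//    -> OptimizeMethodsCallback()     - drains the queue in m_optimizationQuantumMs slices
//       -> OptimizeMethod(version)    - CompileCodeVersion() then ActivateCodeVersion()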
//
// # Error handling
//
// The overall principle is: don't swallow terminal failures that may have corrupted the
// process (an AV, for example), but otherwise, for any transient issue or functional
// limitation that prevents us from optimizing, log it for diagnostics and then back out
// gracefully, continuing to run the less optimal code. The feature should be constructed
// so that errors are limited to OS resource exhaustion or poorly behaved managed code
// (for example within an AssemblyResolve event or static constructor triggered by the JIT).

#ifdef FEATURE_TIERED_COMPILATION

// Called at AppDomain construction
TieredCompilationManager::TieredCompilationManager() :
    m_isAppDomainShuttingDown(FALSE),
    m_countOptimizationThreadsRunning(0),
    m_callCountOptimizationThreshhold(30),
    m_optimizationQuantumMs(50)
{
    LIMITED_METHOD_CONTRACT;
    m_lock.Init(LOCK_TYPE_DEFAULT);
}

// Called at AppDomain Init
void TieredCompilationManager::Init(ADID appDomainId)
{
    CONTRACTL
    {
        NOTHROW;
        GC_NOTRIGGER;
        CAN_TAKE_LOCK;
        MODE_PREEMPTIVE;
    }
    CONTRACTL_END;

    SpinLockHolder holder(&m_lock);
    m_domainId = appDomainId;
}

// Called each time code in this AppDomain has been run. This is our sole entrypoint to begin
// tiered compilation for now. Returns TRUE if no more notifications are necessary, but
// more notifications may come anyway.
//
// currentCallCount is pre-incremented, that is to say the value is 1 on the first call for
// a given method.
BOOL TieredCompilationManager::OnMethodCalled(MethodDesc* pMethodDesc, DWORD currentCallCount)
{
    STANDARD_VM_CONTRACT;

    if (currentCallCount < m_callCountOptimizationThreshhold)
    {
        return FALSE; // continue notifications for this method
    }
    else if (currentCallCount > m_callCountOptimizationThreshhold)
    {
        return TRUE; // stop notifications for this method
    }

    AsyncPromoteMethodToTier1(pMethodDesc);
    return TRUE;
}

void TieredCompilationManager::AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc)
{
    STANDARD_VM_CONTRACT;

    NativeCodeVersion t1NativeCodeVersion;

    // Add an inactive native code entry in the versioning table to track the tier1
    // compilation we are going to create. This entry binds the compilation to a
    // particular version of the IL code regardless of any changes that may
    // occur between now and when jitting completes. If the IL does change in that
    // interval the new code entry won't be activated.
    {
        CodeVersionManager* pCodeVersionManager = pMethodDesc->GetCodeVersionManager();
        CodeVersionManager::TableLockHolder lock(pCodeVersionManager);
        ILCodeVersion ilVersion = pCodeVersionManager->GetActiveILCodeVersion(pMethodDesc);
        NativeCodeVersionCollection nativeVersions = ilVersion.GetNativeCodeVersions(pMethodDesc);
        for (NativeCodeVersionIterator cur = nativeVersions.Begin(), end = nativeVersions.End(); cur != end; cur++)
        {
            if (cur->GetOptimizationTier() == NativeCodeVersion::OptimizationTier1)
            {
                // we've already promoted
                return;
            }
        }
        if (FAILED(ilVersion.AddNativeCodeVersion(pMethodDesc, &t1NativeCodeVersion)))
        {
            // adding the native code version didn't work for some reason (presumably OOM);
            // just give up and continue on
            return;
        }
        t1NativeCodeVersion.SetOptimizationTier(NativeCodeVersion::OptimizationTier1);
    }

    // Insert the method into the optimization queue and trigger a thread to service
    // the queue if needed.
    //
    // Terminal exceptions escape as exceptions, but all other errors should gracefully
    // return to the caller. Non-terminal error conditions should be rare (i.e. OOM,
    // OS failure to create a thread) and we consider it reasonable for some methods
    // to go unoptimized or have their optimization arbitrarily delayed under these
    // circumstances. Note an error here could affect concurrent threads running this
    // code: those threads will observe m_countOptimizationThreadsRunning > 0 and return,
    // and if QueueUserWorkItem then fails on this thread, lowering the count, they are
    // left unserviced. Synchronous retries appear unlikely to offer any material
    // improvement, and complicating the code to narrow an already rare error case isn't
    // desirable.
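    //
    // Note on the shape of the code below: the worker slot is reserved under m_lock
    // (by incrementing m_countOptimizationThreadsRunning) before QueueUserWorkItem is
    // called outside the lock, and both failure paths release the reservation by
    // decrementing the count so that a later caller can recruit a thread again.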
    {
        SListElem<NativeCodeVersion>* pMethodListItem = new (nothrow) SListElem<NativeCodeVersion>(t1NativeCodeVersion);
        SpinLockHolder holder(&m_lock);
        if (pMethodListItem != NULL)
        {
            m_methodsToOptimize.InsertTail(pMethodListItem);
        }

        if (0 == m_countOptimizationThreadsRunning && !m_isAppDomainShuttingDown)
        {
            // Our current policy throttles at 1 thread, but in the future we
            // could experiment with more parallelism.
            m_countOptimizationThreadsRunning++;
        }
        else
        {
            return;
        }
    }

    EX_TRY
    {
        if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
        {
            SpinLockHolder holder(&m_lock);
            m_countOptimizationThreadsRunning--;
            STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::AsyncPromoteMethodToTier1: "
                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run), method=%pM\n",
                pMethodDesc);
        }
    }
    EX_CATCH
    {
        SpinLockHolder holder(&m_lock);
        m_countOptimizationThreadsRunning--;
        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::AsyncPromoteMethodToTier1: "
            "Exception queuing work item to threadpool, hr=0x%x, method=%pM\n",
            GET_EXCEPTION()->GetHR(),
            pMethodDesc);
    }
    EX_END_CATCH(RethrowTerminalExceptions);

    return;
}

void TieredCompilationManager::OnAppDomainShutdown()
{
    CONTRACTL
    {
        NOTHROW;
        GC_NOTRIGGER;
        MODE_ANY;
        CAN_TAKE_LOCK;
    }
    CONTRACTL_END;

    SpinLockHolder holder(&m_lock);
    m_isAppDomainShuttingDown = TRUE;
}

// This is the initial entrypoint for the background thread, called by
// the threadpool.
DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
{
    STANDARD_VM_CONTRACT;

    TieredCompilationManager * pTieredCompilationManager = (TieredCompilationManager *)args;
    pTieredCompilationManager->OptimizeMethodsCallback();
    return 0;
}

// This method will process one or more methods from the optimization queue
// on a background thread. Each such method will be jitted with code
// optimizations enabled and then installed as the active implementation
// of the method entrypoint.
//
// We need to be careful not to work for too long in a single invocation
// of this method or we could starve the threadpool and force
// it to create unnecessary additional threads.
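//
// As a rough sketch of the time-slicing policy implemented below (50ms is the
// default m_optimizationQuantumMs set in the constructor):
//
//   startTickCount = CLRGetTickCount64();
//   while (queue not empty)
//   {
//       OptimizeMethod(GetNextMethodToOptimize());
//       if (CLRGetTickCount64() >= startTickCount + m_optimizationQuantumMs)  // quantum elapsed
//       {
//           ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, ...); // resume later
//           break;
//       }
//   }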
void TieredCompilationManager::OptimizeMethodsCallback()
{
    STANDARD_VM_CONTRACT;

    // This app domain shutdown check isn't required for correctness
    // but it should reduce some unneeded exceptions trying
    // to enter a closed AppDomain
    {
        SpinLockHolder holder(&m_lock);
        if (m_isAppDomainShuttingDown)
        {
            m_countOptimizationThreadsRunning--;
            return;
        }
    }

    ULONGLONG startTickCount = CLRGetTickCount64();
    NativeCodeVersion nativeCodeVersion;
    EX_TRY
    {
        GCX_COOP();
        ENTER_DOMAIN_ID(m_domainId);
        {
            GCX_PREEMP();
            while (true)
            {
                {
                    SpinLockHolder holder(&m_lock);
                    nativeCodeVersion = GetNextMethodToOptimize();
                    if (nativeCodeVersion.IsNull() ||
                        m_isAppDomainShuttingDown)
                    {
                        m_countOptimizationThreadsRunning--;
                        break;
                    }
                }
                OptimizeMethod(nativeCodeVersion);

                // If we have been running for too long, return the thread to the threadpool
                // and queue another event. This gives the threadpool a chance to service
                // other requests on this thread before returning to this work.
                ULONGLONG currentTickCount = CLRGetTickCount64();
                if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
                {
                    if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
                    {
                        SpinLockHolder holder(&m_lock);
                        m_countOptimizationThreadsRunning--;
                        STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OptimizeMethodsCallback: "
                            "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
                    }
                    break;
                }
            }
        }
        END_DOMAIN_TRANSITION;
    }
    EX_CATCH
    {
        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
            "Unhandled exception during method optimization, hr=0x%x, last method=%pM\n",
            GET_EXCEPTION()->GetHR(),
            nativeCodeVersion.GetMethodDesc());
    }
    EX_END_CATCH(RethrowTerminalExceptions);
}

// Jit compiles and installs new optimized code for a method.
// Called on a background thread.
void TieredCompilationManager::OptimizeMethod(NativeCodeVersion nativeCodeVersion)
{
    STANDARD_VM_CONTRACT;

    _ASSERTE(nativeCodeVersion.GetMethodDesc()->IsEligibleForTieredCompilation());
    if (CompileCodeVersion(nativeCodeVersion))
    {
        ActivateCodeVersion(nativeCodeVersion);
    }
}

// Compiles new optimized code for a method.
// Called on a background thread.
BOOL TieredCompilationManager::CompileCodeVersion(NativeCodeVersion nativeCodeVersion)
{
    STANDARD_VM_CONTRACT;

    PCODE pCode = NULL;
    MethodDesc* pMethod = nativeCodeVersion.GetMethodDesc();
    EX_TRY
    {
        pCode = pMethod->PrepareCode(nativeCodeVersion);
    }
    EX_CATCH
    {
        // Failing to jit should be rare but acceptable. We will leave whatever code
        // already exists in place.
        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_INFO10, "TieredCompilationManager::CompileCodeVersion: "
            "Method %pM failed to jit, hr=0x%x\n",
            pMethod,
            GET_EXCEPTION()->GetHR());
    }
    EX_END_CATCH(RethrowTerminalExceptions)

    return pCode != NULL;
}

// Updates the MethodDesc and precode so that future invocations of a method will
// execute the native code pointed to by pCode.
// Called on a background thread.
void TieredCompilationManager::ActivateCodeVersion(NativeCodeVersion nativeCodeVersion)
{
    STANDARD_VM_CONTRACT;

    MethodDesc* pMethod = nativeCodeVersion.GetMethodDesc();
    CodeVersionManager* pCodeVersionManager = pMethod->GetCodeVersionManager();

    // If the ilParent version is active this will activate the native code version now.
    // Otherwise if the ilParent version becomes active again in the future the native
    // code version will activate then.
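    //
    // The publishing protocol below makes at most two attempts: first a cheap one under
    // the code version table lock alone (sufficient while precode publishing is the only
    // mechanism in use), and then, only if that attempt reports that a runtime suspend
    // is required, a retry with the EE suspended.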
    ILCodeVersion ilParent;
    HRESULT hr = S_OK;
    {
        // As long as we are exclusively using precode publishing for tiered compilation
        // methods, this first attempt should succeed.
        CodeVersionManager::TableLockHolder lock(pCodeVersionManager);
        ilParent = nativeCodeVersion.GetILCodeVersion();
        hr = ilParent.SetActiveNativeCodeVersion(nativeCodeVersion, FALSE);
    }
    if (hr == CORPROF_E_RUNTIME_SUSPEND_REQUIRED)
    {
        // If we start using jump-stamp publishing for tiered compilation, the first attempt
        // without the runtime suspended will fail and then this second attempt will
        // succeed.
        // Even though this works, performance is likely to be quite bad. Realistically
        // we are going to need batched updates to make tiered compilation + jump-stamp
        // viable. This fallback path is just here as a proof-of-concept.
        ThreadSuspend::SuspendEE(ThreadSuspend::SUSPEND_FOR_REJIT);
        {
            CodeVersionManager::TableLockHolder lock(pCodeVersionManager);
            hr = ilParent.SetActiveNativeCodeVersion(nativeCodeVersion, TRUE);
        }
        ThreadSuspend::RestartEE(FALSE, TRUE);
    }
    if (FAILED(hr))
    {
        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_INFO10, "TieredCompilationManager::ActivateCodeVersion: "
            "Method %pM failed to publish native code for native code version %d\n",
            pMethod,
            nativeCodeVersion.GetVersionId());
    }
}

// Dequeues the next method in the optimization queue.
// This should be called with m_lock already held and runs
// on the background thread.
NativeCodeVersion TieredCompilationManager::GetNextMethodToOptimize()
{
    STANDARD_VM_CONTRACT;

    SListElem<NativeCodeVersion>* pElem = m_methodsToOptimize.RemoveHead();
    if (pElem != NULL)
    {
        NativeCodeVersion nativeCodeVersion = pElem->GetValue();
        delete pElem;
        return nativeCodeVersion;
    }
    return NativeCodeVersion();
}

//static
CORJIT_FLAGS TieredCompilationManager::GetJitFlags(NativeCodeVersion nativeCodeVersion)
{
    LIMITED_METHOD_CONTRACT;

    CORJIT_FLAGS flags;
    if (!nativeCodeVersion.GetMethodDesc()->IsEligibleForTieredCompilation())
    {
#ifdef FEATURE_INTERPRETER
        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_MAKEFINALCODE);
#endif
        return flags;
    }

    if (nativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0)
    {
        // tier0: quick, minimally optimized jitting
        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
    }
    else
    {
        // tier1: fully optimized jitting for methods that have proven hot
        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER1);
#ifdef FEATURE_INTERPRETER
        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_MAKEFINALCODE);
#endif
    }
    return flags;
}

#endif // FEATURE_TIERED_COMPILATION