Fix profiler crash on shutdown (#22712)

Fixes issue #22176. Use the profiler evacuation counters to ensure that we don't callback into the profiler when it has already been released. Previously we only did this as part of the attach/detach feature, but this is required for correctness during standard shutdown given that managed threads are still running concurrently.
author: Noah Falk <noahfalk@users.noreply.github.com> 2019-05-21 21:52:18 -0700
committer: David Mason <davmason@microsoft.com> 2019-05-21 21:52:18 -0700
commit: 671772c20a27c050df3d7d11391ea4f7de05165c (patch)
tree: 6e20b07feb90c3ef3b7b5bc1634e445b3c7f5f98
parent: 29810a78e5b93d8da9fb921d096226d249fc75a5 (diff)
download: coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.tar.gz
coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.tar.bz2
coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.zip
8 files changed, 112 insertions, 104 deletions
diff --git a/src/vm/ceemain.cpp b/src/vm/ceemain.cpp
index 32dceea5e6..e03c78bcff 100644
--- a/src/vm/ceemain.cpp
+++ b/src/vm/ceemain.cpp
@@ -1605,6 +1605,15 @@ void STDMETHODCALLTYPE EEShutDownHelper(BOOL fIsDllUnloading)
         // profiler can make any last calls it needs to.  Do this only if we
         // are not detaching
 
+        // NOTE: We haven't stopped other threads at this point and nothing is stopping
+        // callbacks from coming into the profiler even after Shutdown() has been called.
+        // See https://github.com/dotnet/coreclr/issues/22176 for an example of how that
+        // happens.
+        // Callbacks will be prevented when ProfilingAPIUtility::Terminate() changes the state
+        // to detached, which occurs shortly afterwards. It might be kinder to make the detaching
+        // transition before calling Shutdown(), but if we do we'd have to be very careful not
+        // to break profilers that were relying on being able to call various APIs during
+        // Shutdown(). I suspect this isn't something we'll ever do unless we get complaints.
         if (CORProfilerPresent())
         {
             // If EEShutdown is not being called due to a ProcessDetach event, so
diff --git a/src/vm/eetoprofinterfaceimpl.cpp b/src/vm/eetoprofinterfaceimpl.cpp
index faae65f2eb..30b35a1cf6 100644
--- a/src/vm/eetoprofinterfaceimpl.cpp
+++ b/src/vm/eetoprofinterfaceimpl.cpp
@@ -99,13 +99,9 @@ enum ClrToProfEntrypointFlags
     kEE2PNoTrigger                      = 0x00000004,
 };
 
-#ifdef FEATURE_PROFAPI_ATTACH_DETACH
 #define ASSERT_EVAC_COUNTER_NONZERO()   \
     _ASSERTE((GetThreadNULLOk() == NULL) ||                                             \
              (GetThreadNULLOk()->GetProfilerEvacuationCounter() != 0U))
-#else // FEATURE_PROFAPI_ATTACH_DETACH
-#define ASSERT_EVAC_COUNTER_NONZERO()
-#endif // FEATURE_PROFAPI_ATTACH_DETACH
 
 #define CHECK_PROFILER_STATUS(ee2pFlags)                                                \
     /* If one of these asserts fires, perhaps you forgot to use                     */  \
diff --git a/src/vm/profdetach.cpp b/src/vm/profdetach.cpp
index 0181e83d7b..1594d42008 100644
--- a/src/vm/profdetach.cpp
+++ b/src/vm/profdetach.cpp
@@ -324,7 +324,7 @@ void ProfilingAPIDetach::ExecuteEvacuationLoop()
         // Give profiler a chance to return from its procs
         SleepWhileProfilerEvacuates();
     }
-    while (!IsProfilerEvacuated());
+    while (!ProfilingAPIUtility::IsProfilerEvacuated());
 
     UnloadProfiler();
 }
@@ -442,96 +442,7 @@ void ProfilingAPIDetach::SleepWhileProfilerEvacuates()
     ClrSleepEx((DWORD) ui64SleepMilliseconds, FALSE /* alertable */);
 }
 
-//---------------------------------------------------------------------------------------
-//
-// Performs the evacuation checks by grabbing the thread store lock, iterating through
-// all EE Threads, and querying each one's evacuation counter.  If they're all 0, the
-// profiler is ready to be unloaded.
-//
-// Return Value:
-//    Nonzero iff the profiler is fully evacuated and ready to be unloaded.
-//
-
-// static
-BOOL ProfilingAPIDetach::IsProfilerEvacuated()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_TRIGGERS;
-        MODE_ANY;
-        CAN_TAKE_LOCK;
-    }
-    CONTRACTL_END;
 
-    _ASSERTE(g_profControlBlock.curProfStatus.Get() == kProfStatusDetaching);
-
-    // Check evacuation counters on all the threads (see
-    // code:ProfilingAPIUtility::InitializeProfiling#LoadUnloadCallbackSynchronization
-    // for details). Doing this under the thread store lock not only ensures we can
-    // iterate through the Thread objects safely, but also forces us to serialize with
-    // the GC. The latter is important, as server GC enters the profiler on non-EE
-    // Threads, and so no evacuation counters might be incremented during server GC even
-    // though control could be entering the profiler.
-    {
-        ThreadStoreLockHolder TSLockHolder;
-
-        Thread * pThread = ThreadStore::GetAllThreadList(
-            NULL,   // cursor thread; always NULL to begin with
-            0,      // mask to AND with Thread::m_State to filter returned threads 
-            0);     // bits to match the result of the above AND.  (m_State & 0 == 0,
-                    // so we won't filter out any threads)
-        
-        // Note that, by not filtering out any of the threads, we're intentionally including
-        // stuff like TS_Dead or TS_Unstarted.  But that keeps us on the safe
-        // side.  If an EE Thread object exists, we want to check its counters to be
-        // absolutely certain it isn't executing in a profiler.
-
-        while (pThread != NULL)
-        {
-            // Note that pThread is still in motion as we check its evacuation counter.
-            // This is ok, because we've already changed the profiler status to
-            // kProfStatusDetaching and flushed CPU buffers. So at this point the counter
-            // will typically only go down to 0 (and not increment anymore), with one
-            // small exception (below). So if we get a read of 0 below, the counter will
-            // typically stay there. Specifically:
-            //     * pThread is most likely not about to increment its evacuation counter
-            //         from 0 to 1 because pThread sees that the status is
-            //         kProfStatusDetaching.
-            //     * Note that there is a small race where pThread might actually
-            //         increment its evac counter from 0 to 1 (if it dirty-read the
-            //         profiler status a tad too early), but that implies that when
-            //         pThread rechecks the profiler status (clean read) then pThread
-            //         will immediately decrement the evac counter back to 0 and avoid
-            //         calling into the EEToProfInterfaceImpl pointer.
-            // 
-            // (see
-            // code:ProfilingAPIUtility::InitializeProfiling#LoadUnloadCallbackSynchronization
-            // for details)
-            DWORD dwEvacCounter = pThread->GetProfilerEvacuationCounter();
-            if (dwEvacCounter != 0)
-            {
-                LOG((
-                    LF_CORPROF, 
-                    LL_INFO100, 
-                    "**PROF: Profiler not yet evacuated because OS Thread ID 0x%x has evac counter of %d (decimal).\n",
-                    pThread->GetOSThreadId(),
-                    dwEvacCounter));
-                return FALSE;
-            }
-
-            pThread = ThreadStore::GetAllThreadList(pThread, 0, 0);
-        }
-    }
-
-    // FUTURE: When rejit feature crew complete, add code to verify all rejitted
-    // functions are fully reverted and off of all stacks.  If this is very easy to
-    // verify (e.g., checking a single value), consider putting it above the loop
-    // above so we can early-out quicker if rejitted code is still around.
-
-    // We got this far without returning, so the profiler is fully evacuated
-    return TRUE;
-}
 
 // ---------------------------------------------------------------------------------------
 // After we've verified a detaching profiler has fully evacuated, call this to unload the
diff --git a/src/vm/profdetach.h b/src/vm/profdetach.h
index f3d13b8f9e..a3026b12f1 100644
--- a/src/vm/profdetach.h
+++ b/src/vm/profdetach.h
@@ -54,7 +54,6 @@ public:
     static HRESULT CreateDetachThread();
     static DWORD WINAPI ProfilingAPIDetachThreadStart(LPVOID lpParameter);
     static void ExecuteEvacuationLoop();
-    static BOOL IsProfilerEvacuated();
 
     static EEToProfInterfaceImpl * GetEEToProfPtr();
 
diff --git a/src/vm/profilinghelper.cpp b/src/vm/profilinghelper.cpp
index ba410b1a70..7a686b6c2f 100644
--- a/src/vm/profilinghelper.cpp
+++ b/src/vm/profilinghelper.cpp
@@ -94,7 +94,7 @@
 // use g_profControlBlock.pProfInterface. And if the reader clean-reads the status before
 // the buffers were flushed, then the reader will have incremented its evacuation counter
 // first, which the writer will be sure to see in (c).  For more details about how the
-// evacuation counters work, see code:ProfilingAPIDetach::IsProfilerEvacuated.
+// evacuation counters work, see code:ProfilingAPIUtility::IsProfilerEvacuated.
 // 
 // WHEN ARE BEGIN/END_PIN_PROFILER REQUIRED?
 // 
@@ -1246,6 +1246,97 @@ HRESULT ProfilingAPIUtility::LoadProfiler(
 
 //---------------------------------------------------------------------------------------
 //
+// Performs the evacuation checks by grabbing the thread store lock, iterating through
+// all EE Threads, and querying each one's evacuation counter.  If they're all 0, the
+// profiler is ready to be unloaded.
+//
+// Return Value:
+//    Nonzero iff the profiler is fully evacuated and ready to be unloaded.
+//
+
+// static
+BOOL ProfilingAPIUtility::IsProfilerEvacuated()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_TRIGGERS;
+        MODE_ANY;
+        CAN_TAKE_LOCK;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(g_profControlBlock.curProfStatus.Get() == kProfStatusDetaching);
+
+    // Check evacuation counters on all the threads (see
+    // code:ProfilingAPIUtility::InitializeProfiling#LoadUnloadCallbackSynchronization
+    // for details). Doing this under the thread store lock not only ensures we can
+    // iterate through the Thread objects safely, but also forces us to serialize with
+    // the GC. The latter is important, as server GC enters the profiler on non-EE
+    // Threads, and so no evacuation counters might be incremented during server GC even
+    // though control could be entering the profiler.
+    {
+        ThreadStoreLockHolder TSLockHolder;
+
+        Thread * pThread = ThreadStore::GetAllThreadList(
+            NULL,   // cursor thread; always NULL to begin with
+            0,      // mask to AND with Thread::m_State to filter returned threads 
+            0);     // bits to match the result of the above AND.  (m_State & 0 == 0,
+                    // so we won't filter out any threads)
+
+        // Note that, by not filtering out any of the threads, we're intentionally including
+        // stuff like TS_Dead or TS_Unstarted.  But that keeps us on the safe
+        // side.  If an EE Thread object exists, we want to check its counters to be
+        // absolutely certain it isn't executing in a profiler.
+
+        while (pThread != NULL)
+        {
+            // Note that pThread is still in motion as we check its evacuation counter.
+            // This is ok, because we've already changed the profiler status to
+            // kProfStatusDetaching and flushed CPU buffers. So at this point the counter
+            // will typically only go down to 0 (and not increment anymore), with one
+            // small exception (below). So if we get a read of 0 below, the counter will
+            // typically stay there. Specifically:
+            //     * pThread is most likely not about to increment its evacuation counter
+            //         from 0 to 1 because pThread sees that the status is
+            //         kProfStatusDetaching.
+            //     * Note that there is a small race where pThread might actually
+            //         increment its evac counter from 0 to 1 (if it dirty-read the
+            //         profiler status a tad too early), but that implies that when
+            //         pThread rechecks the profiler status (clean read) then pThread
+            //         will immediately decrement the evac counter back to 0 and avoid
+            //         calling into the EEToProfInterfaceImpl pointer.
+            // 
+            // (see
+            // code:ProfilingAPIUtility::InitializeProfiling#LoadUnloadCallbackSynchronization
+            // for details)
+            DWORD dwEvacCounter = pThread->GetProfilerEvacuationCounter();
+            if (dwEvacCounter != 0)
+            {
+                LOG((
+                    LF_CORPROF,
+                    LL_INFO100,
+                    "**PROF: Profiler not yet evacuated because OS Thread ID 0x%x has evac counter of %d (decimal).\n",
+                    pThread->GetOSThreadId(),
+                    dwEvacCounter));
+                return FALSE;
+            }
+
+            pThread = ThreadStore::GetAllThreadList(pThread, 0, 0);
+        }
+    }
+
+    // FUTURE: When rejit feature crew complete, add code to verify all rejitted
+    // functions are fully reverted and off of all stacks.  If this is very easy to
+    // verify (e.g., checking a single value), consider putting it above the loop
+    // above so we can early-out quicker if rejitted code is still around.
+
+    // We got this far without returning, so the profiler is fully evacuated
+    return TRUE;
+}
+
+//---------------------------------------------------------------------------------------
+//
 // This is the top-most level of profiling API teardown, and is called directly by
 // EEShutDownHelper() (in ceemain.cpp). This cleans up internal structures relating to
 // the Profiling API. If we're not in process teardown, then this also releases the
@@ -1300,6 +1391,7 @@ void ProfilingAPIUtility::TerminateProfiling()
             // them give a GetEEToProfPtr() equal to g_profControlBlock.pProfInterface.
             return;
         }
+#endif // FEATURE_PROFAPI_ATTACH_DETACH
 
         if (g_profControlBlock.curProfStatus.Get() == kProfStatusActive)
         {
@@ -1311,13 +1403,13 @@ void ProfilingAPIUtility::TerminateProfiling()
             // that the status has been changed to kProfStatusDetaching, no new threads will
             // attempt to enter the profiler. But use the detach evacuation counters to see
             // if other threads already began to enter the profiler.
-            if (!ProfilingAPIDetach::IsProfilerEvacuated())
+            if (!ProfilingAPIUtility::IsProfilerEvacuated())
             {
                 // Other threads might be entering the profiler, so just skip cleanup
                 return;
             }
         }
-#endif // FEATURE_PROFAPI_ATTACH_DETACH
+
 
         // If we have a profiler callback wrapper and / or info implementation
         // active, then terminate them.
diff --git a/src/vm/profilinghelper.h b/src/vm/profilinghelper.h
index 2682d8a6d1..6847a901d2 100644
--- a/src/vm/profilinghelper.h
+++ b/src/vm/profilinghelper.h
@@ -54,6 +54,7 @@ public:
         UINT cbClientData,
         DWORD dwConcurrentGCWaitTimeoutInMs);
 
+    static BOOL IsProfilerEvacuated();
     static void TerminateProfiling();
     static void LogProfError(int iStringResourceID, ...);
     static void LogProfInfo(int iStringResourceID, ...);
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index 4abc89155a..40e846bf40 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -1368,9 +1368,9 @@ Thread::Thread()
     m_debuggerCantStop = 0;
     m_fInteropDebuggingHijacked = FALSE;
     m_profilerCallbackState = 0;
-#ifdef FEATURE_PROFAPI_ATTACH_DETACH
+#ifdef PROFILING_SUPPORTED
     m_dwProfilerEvacuationCounter = 0;
-#endif // FEATURE_PROFAPI_ATTACH_DETACH
+#endif // PROFILING_SUPPORTED
 
     m_pProfilerFilterContext = NULL;
 
diff --git a/src/vm/threads.h b/src/vm/threads.h
index 6b7fe45368..2cbf0e0674 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -3848,7 +3848,7 @@ private:
     //---------------------------------------------------------------
     DWORD m_profilerCallbackState;
 
-#if defined(FEATURE_PROFAPI_ATTACH_DETACH) || defined(DATA_PROFAPI_ATTACH_DETACH)
+#if defined(PROFILING_SUPPORTED) || defined(PROFILING_SUPPORTED_DATA)
     //---------------------------------------------------------------
     // m_dwProfilerEvacuationCounter keeps track of how many profiler
     // callback calls remain on the stack
@@ -3856,7 +3856,7 @@ private:
     // Why volatile?
     // See code:ProfilingAPIUtility::InitializeProfiling#LoadUnloadCallbackSynchronization.
     Volatile<DWORD> m_dwProfilerEvacuationCounter;
-#endif // defined(FEATURE_PROFAPI_ATTACH_DETACH) || defined(DATA_PROFAPI_ATTACH_DETACH)
+#endif // defined(PROFILING_SUPPORTED) || defined(PROFILING_SUPPORTED_DATA)
 
 private:
     UINT32 m_workerThreadPoolCompletionCount;
@@ -4035,7 +4035,7 @@ public:
         return m_pProfilerFilterContext;
     }
 
-#ifdef FEATURE_PROFAPI_ATTACH_DETACH
+#ifdef PROFILING_SUPPORTED
 
     FORCEINLINE DWORD GetProfilerEvacuationCounter(void)
     {
@@ -4057,7 +4057,7 @@ public:
         m_dwProfilerEvacuationCounter--;
     }
 
-#endif // FEATURE_PROFAPI_ATTACH_DETACH
+#endif // PROFILING_SUPPORTED
 
     //-------------------------------------------------------------------------
     // The hijack lock enforces that a thread on which a profiler is currently
author	Noah Falk <noahfalk@users.noreply.github.com>	2019-05-21 21:52:18 -0700
committer	David Mason <davmason@microsoft.com>	2019-05-21 21:52:18 -0700
commit	671772c20a27c050df3d7d11391ea4f7de05165c (patch)
tree	6e20b07feb90c3ef3b7b5bc1634e445b3c7f5f98
parent	29810a78e5b93d8da9fb921d096226d249fc75a5 (diff)
download	coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.tar.gz coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.tar.bz2 coreclr-671772c20a27c050df3d7d11391ea4f7de05165c.zip