author     Andy Ayers <andya@microsoft.com>  2018-04-19 10:17:11 -0700
committer  GitHub <noreply@github.com>       2018-04-19 10:17:11 -0700
commit     571b1a7c84aa264afe6a33bd58eca8c9c10052ff (patch)
tree       238ff06b6a076af59bd9e1a28f115c6c41a32adf /src/vm
parent     204c11b8309bf243b9255ddf7f17820cd320cf4d (diff)
download   coreclr-571b1a7c84aa264afe6a33bd58eca8c9c10052ff.tar.gz
           coreclr-571b1a7c84aa264afe6a33bd58eca8c9c10052ff.tar.bz2
           coreclr-571b1a7c84aa264afe6a33bd58eca8c9c10052ff.zip
GCStress: try to reduce races and tolerate races better (#17330)
This change addresses races that cause spurious failures when running GC stress on multithreaded applications.

* Instruction update race

Threads that hit a gc cover interrupt where gc is not safe can race to overwrite the interrupt instruction and change it back to the original instruction. This can cause confusion when handling stress exceptions, as the exception code raised by the kernel may be determined by disassembling the instruction that caused the fault, and this instruction may now change between the time the fault is raised and the time the instruction is disassembled. When this happens the kernel may report an ACCESS_VIOLATION where there was actually an attempt to execute a privileged instruction.

x86 already had a tolerance mechanism here: when gc stress was active and the exception status was ACCESS_VIOLATION, the faulting instruction would be retried to see if it faults the same way again. In this change we extend this tolerance to cover x64 and also enable it regardless of the gc mode. We use the exception information to screen further, as these spurious AVs look like reads from address 0xFF..FF.

* Instrumentation vs execution race

The second race happens when one thread is jitting a method and another is about to call the method. The first thread finishes jitting and publishes the method code, then starts instrumenting the method for gc coverage. While this instrumentation is ongoing, the second thread calls the method and hits a gc interrupt instruction. The code that recognizes the fault as a gc coverage interrupt gets confused because the instrumentation is not yet complete -- in particular, the m_GcCover member of the MethodDesc is not yet set -- so the second thread triggers an assert.

The fix for this is to instrument for GcCoverage before publishing the code. Since multiple threads can be jitting a method concurrently, the instrument and publish steps are done under a lock to ensure that the instrumentation and code are consistent (come from the same thread). With this lock in place we have removed the secondary locking done in SetupGcCoverage, as it is no longer needed; only one thread can be instrumenting a given jitted method for GcCoverage. However, we retain a bailout clause that first checks whether m_GcCover is set and, if so, skips instrumentation, as there are prejit and rejit cases where we will retry instrumentation.

* Instruction cache flushes

In some cases when replacing the interrupt instruction with the original, the instruction cache was either not flushed or not flushed with sufficient length. This possibly led to an increased frequency of the above races.

No impact is expected for non-gc-stress scenarios, though some of the code changes are in common code paths.

Addresses the spurious GC stress failures seen in #17027 and #17610.
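The second fix above hinges on ordering: instrument first, then publish, with both steps under one lock. The sketch below models that ordering in isolation. The names here (Method, g_gc_cover_lock, instrument_for_gc_coverage) are invented for illustration only, standing in for MethodDesc, m_GCCoverCrst, and SetupGcCoverage in the prestub.cpp diff below; this is a minimal model, not the CoreCLR implementation.

    #include <atomic>
    #include <mutex>

    struct Method {
        std::atomic<void*> native_code{nullptr};   // published entry point
        void*              gc_cover_info{nullptr}; // set by instrumentation
    };

    std::mutex g_gc_cover_lock; // plays the role of MethodDesc::m_GCCoverCrst

    void instrument_for_gc_coverage(Method& m, void* code)
    {
        // The real code rewrites gc-safe-point instructions into interrupt
        // opcodes and saves the originals; here we only model the visible
        // side effect of instrumentation completing.
        m.gc_cover_info = code;
    }

    // Pre-fix ordering: publish first, instrument afterwards. A caller can
    // enter native_code, hit an interrupt opcode, and find gc_cover_info
    // still unset -- the assert described in the second bullet above.
    void publish_then_instrument(Method& m, void* code)
    {
        m.native_code.store(code, std::memory_order_release);
        instrument_for_gc_coverage(m, code); // too late for early callers
    }

    // Post-fix ordering: instrument and publish under one lock, so that any
    // published code is fully instrumented and both come from the same thread.
    void instrument_then_publish(Method& m, void* code)
    {
        std::lock_guard<std::mutex> hold(g_gc_cover_lock);
        if (m.native_code.load(std::memory_order_acquire) != nullptr)
            return; // another jitting thread already won the race
        instrument_for_gc_coverage(m, code);
        m.native_code.store(code, std::memory_order_release);
    }

With the fixed ordering, any thread that can observe native_code is guaranteed to observe a non-null gc_cover_info as well.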
Diffstat (limited to 'src/vm')
-rw-r--r--  src/vm/ceemain.cpp             4
-rw-r--r--  src/vm/dynamicmethod.cpp       5
-rw-r--r--  src/vm/excep.cpp              37
-rw-r--r--  src/vm/excep.h                 2
-rw-r--r--  src/vm/exceptionhandling.cpp   4
-rw-r--r--  src/vm/gccover.cpp            85
-rw-r--r--  src/vm/i386/excepx86.cpp       9
-rw-r--r--  src/vm/method.hpp              8
-rw-r--r--  src/vm/prestub.cpp            59
-rw-r--r--  src/vm/threads.cpp             4
-rw-r--r--  src/vm/threads.h               8
11 files changed, 119 insertions, 106 deletions
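The excep.cpp change below screens spurious AVs by their shape: a read (ExceptionInformation[0] == 0) from an all-ones address (ExceptionInformation[1] == ~0) at an IP that has not already been retried. The following stand-alone predicate models that check; the function and parameter names are illustrative assumptions, not the CoreCLR API.

    #include <cstdint>

    // accessType models ExceptionInformation[0] (0 == read);
    // faultAddress models ExceptionInformation[1], the address accessed.
    bool LooksLikeSpuriousGcStressAv(std::uintptr_t accessType,
                                     std::uintptr_t faultAddress,
                                     const void*    ip,
                                     const void*    lastAvIp)
    {
        return accessType == 0                        // a read, not a write
            && faultAddress == ~std::uintptr_t{0}     // the 0xFF..FF pattern
            && ip != lastAvIp;                        // not already retried here
    }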
diff --git a/src/vm/ceemain.cpp b/src/vm/ceemain.cpp
index 0f2efd64c0..1d14293441 100644
--- a/src/vm/ceemain.cpp
+++ b/src/vm/ceemain.cpp
@@ -1124,6 +1124,10 @@ void EEStartupHelper(COINITIEE fFlags)
#endif // _DEBUG
+#ifdef HAVE_GCCOVER
+ MethodDesc::Init();
+#endif
+
#endif // !CROSSGEN_COMPILE
ErrExit: ;
diff --git a/src/vm/dynamicmethod.cpp b/src/vm/dynamicmethod.cpp
index 806ab57805..7778924efb 100644
--- a/src/vm/dynamicmethod.cpp
+++ b/src/vm/dynamicmethod.cpp
@@ -280,6 +280,11 @@ DynamicMethodDesc* DynamicMethodTable::GetDynamicMethod(BYTE *psig, DWORD sigSiz
pNewMD->m_pszDebugClassName = (LPUTF8)"dynamicclass";
pNewMD->m_pszDebugMethodSignature = "DynamicMethod Signature not available";
#endif // _DEBUG
+
+#ifdef HAVE_GCCOVER
+ pNewMD->m_GcCover = NULL;
+#endif
+
pNewMD->SetNotInline(TRUE);
pNewMD->GetLCGMethodResolver()->Reset();
diff --git a/src/vm/excep.cpp b/src/vm/excep.cpp
index e5c024e89c..d4e00d635d 100644
--- a/src/vm/excep.cpp
+++ b/src/vm/excep.cpp
@@ -6811,34 +6811,39 @@ DWORD GetGcMarkerExceptionCode(LPVOID ip)
}
// Did we hit an DO_A_GC_HERE marker in JITted code?
-bool IsGcMarker(DWORD exceptionCode, CONTEXT *pContext)
+bool IsGcMarker(CONTEXT* pContext, EXCEPTION_RECORD *pExceptionRecord)
{
+ DWORD exceptionCode = pExceptionRecord->ExceptionCode;
#ifdef HAVE_GCCOVER
WRAPPER_NO_CONTRACT;
if (GCStress<cfg_any>::IsEnabled())
{
-#ifdef _TARGET_X86_
- // on x86 we can't suspend EE to update the GC marker instruction so
- // we update it directly without suspending. this can sometimes yield
- // a STATUS_ACCESS_VIOLATION instead of STATUS_CLR_GCCOVER_CODE. in
- // this case we let the AV through and retry the instruction. we'll
- // track the IP of the instruction that generated an AV so we don't
- // mix up a real AV with a "fake" AV.
- // see comments in function DoGcStress for more details on this race.
- // also make sure that the thread is actually in managed code since AVs
- // outside of of JIT code will never be potential GC markers
+#if defined(GCCOVER_TOLERATE_SPURIOUS_AV)
+
+ // We sometimes can't suspend the EE to update the GC marker instruction so
+ // we update it directly without suspending. This can sometimes yield
+ // a STATUS_ACCESS_VIOLATION instead of STATUS_CLR_GCCOVER_CODE. In
+ // this case we let the AV through and retry the instruction as hopefully
+ // the race will have resolved. We'll track the IP of the instruction
+ // that generated an AV so we don't mix up a real AV with a "fake" AV.
+ //
+ // See comments in function DoGcStress for more details on this race.
+ //
+ // Note these "fake" AVs will be reported by the kernel as reads from
+ // address 0xF...F so we also use that as a screen.
Thread* pThread = GetThread();
if (exceptionCode == STATUS_ACCESS_VIOLATION &&
GCStress<cfg_instr>::IsEnabled() &&
+ pExceptionRecord->ExceptionInformation[0] == 0 &&
+ pExceptionRecord->ExceptionInformation[1] == ~0 &&
pThread->GetLastAVAddress() != (LPVOID)GetIP(pContext) &&
- pThread->PreemptiveGCDisabled() &&
!IsIPInEE((LPVOID)GetIP(pContext)))
{
pThread->SetLastAVAddress((LPVOID)GetIP(pContext));
return true;
}
-#endif // _TARGET_X86_
+#endif // defined(GCCOVER_TOLERATE_SPURIOUS_AV)
if (exceptionCode == STATUS_CLR_GCCOVER_CODE)
{
@@ -7737,7 +7742,7 @@ VEH_ACTION WINAPI CLRVectoredExceptionHandlerPhase3(PEXCEPTION_POINTERS pExcepti
// NOTE: this is effectively ifdef (_TARGET_AMD64_ || _TARGET_ARM_), and does not actually trigger
// a GC. This will redirect the exception context to a stub which will
// push a frame and cause GC.
- if (IsGcMarker(exceptionCode, pContext))
+ if (IsGcMarker(pContext, pExceptionRecord))
{
return VEH_CONTINUE_EXECUTION;;
}
@@ -8161,11 +8166,11 @@ LONG WINAPI CLRVectoredExceptionHandlerShim(PEXCEPTION_POINTERS pExceptionInfo)
#ifdef USE_REDIRECT_FOR_GCSTRESS
// This is AMD64 & ARM specific as the macro above is defined for AMD64 & ARM only
- bIsGCMarker = IsGcMarker(dwCode, pExceptionInfo->ContextRecord);
+ bIsGCMarker = IsGcMarker(pExceptionInfo->ContextRecord, pExceptionInfo->ExceptionRecord);
#elif defined(_TARGET_X86_) && defined(HAVE_GCCOVER)
// This is the equivalent of the check done in COMPlusFrameHandler, incase the exception is
// seen by VEH first on x86.
- bIsGCMarker = IsGcMarker(dwCode, pExceptionInfo->ContextRecord);
+ bIsGCMarker = IsGcMarker(pExceptionInfo->ContextRecord, pExceptionInfo->ExceptionRecord);
#endif // USE_REDIRECT_FOR_GCSTRESS
// Do not update the TLS with exception details for exceptions pertaining to GCStress
diff --git a/src/vm/excep.h b/src/vm/excep.h
index 6df9a98452..8c49071a81 100644
--- a/src/vm/excep.h
+++ b/src/vm/excep.h
@@ -766,7 +766,7 @@ void CPFH_AdjustContextForThreadSuspensionRace(T_CONTEXT *pContext, Thread *pThr
#endif // _TARGET_X86_
DWORD GetGcMarkerExceptionCode(LPVOID ip);
-bool IsGcMarker(DWORD exceptionCode, T_CONTEXT *pContext);
+bool IsGcMarker(T_CONTEXT *pContext, EXCEPTION_RECORD *pExceptionRecord);
void InitSavedExceptionInfo();
diff --git a/src/vm/exceptionhandling.cpp b/src/vm/exceptionhandling.cpp
index 38a2a43419..f78f9c3e1e 100644
--- a/src/vm/exceptionhandling.cpp
+++ b/src/vm/exceptionhandling.cpp
@@ -936,7 +936,7 @@ ProcessCLRException(IN PEXCEPTION_RECORD pExceptionRecord
//
{
#ifndef USE_REDIRECT_FOR_GCSTRESS
- if (IsGcMarker(pExceptionRecord->ExceptionCode, pContextRecord))
+ if (IsGcMarker(pContextRecord, pExceptionRecord))
{
returnDisposition = ExceptionContinueExecution;
goto lExit;
@@ -5227,7 +5227,7 @@ BOOL HandleHardwareException(PAL_SEHException* ex)
// A hardware exception is handled only if it happened in a jitted code or
// in one of the JIT helper functions (JIT_MemSet, ...)
PCODE controlPc = GetIP(ex->GetContextRecord());
- if (ExecutionManager::IsManagedCode(controlPc) && IsGcMarker(ex->GetExceptionRecord()->ExceptionCode, ex->GetContextRecord()))
+ if (ExecutionManager::IsManagedCode(controlPc) && IsGcMarker(ex->GetContextRecord(), ex->GetExceptionRecord()))
{
// Exception was handled, let the signal handler return to the exception context. Some registers in the context can
// have been modified by the GC.
diff --git a/src/vm/gccover.cpp b/src/vm/gccover.cpp
index d61a168f47..ca91687887 100644
--- a/src/vm/gccover.cpp
+++ b/src/vm/gccover.cpp
@@ -144,69 +144,31 @@ void SetupGcCoverage(MethodDesc* pMD, BYTE* methodStartPtr) {
}
#endif
- if (pMD->m_GcCover)
- return;
-
+ // Ideally we would assert here that m_GcCover is NULL.
+ //
+ // However, we can't do that (at least not yet), because we may
+ // invoke this method more than once on a given
+ // MethodDesc. Examples include prejitted methods and rejitted
+ // methods.
//
- // In the gcstress=4 case, we can easily piggy-back onto the JITLock because we
- // have a JIT operation that needs to take that lock already. But in the case of
- // gcstress=8, we cannot do this because the code already exists, and if gccoverage
- // were not in the picture, we're happy to race to do the prestub work because all
- // threads end up with the same answer and don't leak any resources in the process.
- //
- // However, with gccoverage, we need to exclude all other threads from mucking with
- // the code while we fill in the breakpoints and make our shadow copy of the code.
+ // In the prejit case, we can't safely re-instrument an already
+ // instrumented method. By bailing out here, we will use the
+ // original instrumentation, which should still be valid as
+ // the method code has not changed.
//
+ // In the rejit case, the old method code may still be active and
+ // instrumented, so we need to preserve that gc cover info. By
+ // bailing out here we will skip instrumenting the rejitted native
+ // code, and since the rejitted method does not get instrumented
+ // we should be able to tolerate that the gc cover info does not
+ // match.
+ if (pMD->m_GcCover)
{
- BaseDomain* pDomain = pMD->GetDomain();
- // Enter the global lock which protects the list of all functions being JITd
- JitListLock::LockHolder pJitLock(pDomain->GetJitLock());
-
-
- // It is possible that another thread stepped in before we entered the global lock for the first time.
- if (pMD->m_GcCover)
- {
- // We came in to jit but someone beat us so return the jitted method!
- return;
- }
- else
- {
- const char *description = "jit lock (gc cover)";
-#ifdef _DEBUG
- description = pMD->m_pszDebugMethodName;
-#endif
- ReleaseHolder<JitListLockEntry> pEntry(JitListLockEntry::Find(pJitLock, pMD->GetInitialCodeVersion(), description));
-
- // We have an entry now, we can release the global lock
- pJitLock.Release();
-
- // Take the entry lock
- {
- JitListLockEntry::LockHolder pEntryLock(pEntry, FALSE);
-
- if (pEntryLock.DeadlockAwareAcquire())
- {
- // we have the lock...
- }
- else
- {
- // Note that at this point we don't have the lock, but that's OK because the
- // thread which does have the lock is blocked waiting for us.
- }
-
- if (pMD->m_GcCover)
- {
- return;
- }
-
- PCODE codeStart = (PCODE) methodStartPtr;
-
- SetupAndSprinkleBreakpointsForJittedMethod(pMD,
- codeStart
- );
- }
- }
+ return;
}
+
+ PCODE codeStart = (PCODE) methodStartPtr;
+ SetupAndSprinkleBreakpointsForJittedMethod(pMD, codeStart);
}
#ifdef FEATURE_PREJIT
@@ -1305,6 +1267,8 @@ void RemoveGcCoverageInterrupt(TADDR instrPtr, BYTE * savedInstrPtr)
#else
*(BYTE *)instrPtr = *savedInstrPtr;
#endif
+
+ FlushInstructionCache(GetCurrentProcess(), (LPCVOID)instrPtr, 4);
}
BOOL OnGcCoverageInterrupt(PCONTEXT regs)
@@ -1677,7 +1641,8 @@ void DoGcStress (PCONTEXT regs, MethodDesc *pMD)
}
// Must flush instruction cache before returning as instruction has been modified.
- FlushInstructionCache(GetCurrentProcess(), (LPCVOID)instrPtr, 6);
+ // Note this needs to reach beyond the call by up to 4 bytes.
+ FlushInstructionCache(GetCurrentProcess(), (LPCVOID)instrPtr, 10);
// It's not GC safe point, the GC Stress instruction is
// already commited and interrupt is already put at next instruction so we just return.
diff --git a/src/vm/i386/excepx86.cpp b/src/vm/i386/excepx86.cpp
index 9f19d47440..9f558c457e 100644
--- a/src/vm/i386/excepx86.cpp
+++ b/src/vm/i386/excepx86.cpp
@@ -1347,7 +1347,7 @@ CPFH_FirstPassHandler(EXCEPTION_RECORD *pExceptionRecord,
// Check to see if this exception is due to GCStress. Since the GCStress mechanism only injects these faults
// into managed code, we only need to check for them in CPFH_FirstPassHandler.
//
- if (IsGcMarker(exceptionCode, pContext))
+ if (IsGcMarker(pContext, pExceptionRecord))
{
return ExceptionContinueExecution;
}
@@ -1675,7 +1675,7 @@ EXCEPTION_HANDLER_IMPL(COMPlusFrameHandler)
// it is very easy to trash the last error. For example, a p/invoke called a native method
// which sets last error. Before we getting the last error in the IL stub, it is trashed here
DWORD dwLastError = GetLastError();
- fIsGCMarker = IsGcMarker(pExceptionRecord->ExceptionCode, pContext);
+ fIsGCMarker = IsGcMarker(pContext, pExceptionRecord);
if (!fIsGCMarker)
{
SaveCurrentExceptionInfo(pExceptionRecord, pContext);
@@ -3693,12 +3693,11 @@ AdjustContextForVirtualStub(
pExceptionRecord->ExceptionAddress = (PVOID)callsite;
SetIP(pContext, callsite);
-#ifdef HAVE_GCCOVER
+#if defined(GCCOVER_TOLERATE_SPURIOUS_AV)
// Modify LastAVAddress saved in thread to distinguish between fake & real AV
// See comments in IsGcMarker in file excep.cpp for more details
pThread->SetLastAVAddress((LPVOID)GetIP(pContext));
-#endif
-
+#endif // defined(GCCOVER_TOLERATE_SPURIOUS_AV)
// put ESP back to what it was before the call.
SetSP(pContext, dac_cast<PCODE>(dac_cast<PTR_BYTE>(GetSP(pContext)) + sizeof(void*)));
diff --git a/src/vm/method.hpp b/src/vm/method.hpp
index c1316d06c3..fd91631214 100644
--- a/src/vm/method.hpp
+++ b/src/vm/method.hpp
@@ -1876,6 +1876,14 @@ private:
PCODE JitCompileCodeLockedEventWrapper(PrepareCodeConfig* pConfig, JitListLockEntry* pEntry);
PCODE JitCompileCodeLocked(PrepareCodeConfig* pConfig, JitListLockEntry* pLockEntry, ULONG* pSizeOfCode, CORJIT_FLAGS* pFlags);
#endif // DACCESS_COMPILE
+
+#ifdef HAVE_GCCOVER
+private:
+ static CrstStatic m_GCCoverCrst;
+
+public:
+ static void Init();
+#endif
};
#ifndef DACCESS_COMPILE
diff --git a/src/vm/prestub.cpp b/src/vm/prestub.cpp
index 507f8d3d00..532f04ffc1 100644
--- a/src/vm/prestub.cpp
+++ b/src/vm/prestub.cpp
@@ -65,6 +65,16 @@ EXTERN_C void MarkMethodNotPitchingCandidate(MethodDesc* pMD);
EXTERN_C void STDCALL ThePreStubPatch();
+#if defined(HAVE_GCCOVER)
+CrstStatic MethodDesc::m_GCCoverCrst;
+
+void MethodDesc::Init()
+{
+ m_GCCoverCrst.Init(CrstGCCover);
+}
+
+#endif
+
//==========================================================================
PCODE MethodDesc::DoBackpatch(MethodTable * pMT, MethodTable *pDispatchingMT, BOOL fFullBackPatch)
@@ -874,32 +884,49 @@ PCODE MethodDesc::JitCompileCodeLocked(PrepareCodeConfig* pConfig, JitListLockEn
}
_ASSERTE(pCode != NULL);
-
- // Aside from rejit, performing a SetNativeCodeInterlocked at this point
- // generally ensures that there is only one winning version of the native
- // code. This also avoid races with profiler overriding ngened code (see
- // matching SetNativeCodeInterlocked done after
- // JITCachedFunctionSearchStarted)
+
+#ifdef HAVE_GCCOVER
+ // Instrument for coverage before trying to publish this version
+ // of the code as the native code, to avoid other threads seeing
+ // partially instrumented methods.
+ if (GCStress<cfg_instr_jit>::IsEnabled())
{
- if (!pConfig->SetNativeCode(pCode, &pOtherCode))
+ // Do the instrumentation and publish atomically, so that the
+ // instrumentation data always matches the published code.
+ CrstHolder gcCoverLock(&m_GCCoverCrst);
+
+ // Make sure no other thread has stepped in before us.
+ if ((pOtherCode = pConfig->IsJitCancellationRequested()))
{
- // Another thread beat us to publishing its copy of the JITted code.
return pOtherCode;
}
-#if defined(FEATURE_JIT_PITCHING)
- else
+
+ SetupGcCoverage(this, (BYTE*)pCode);
+
+ // This thread should always win the publishing race
+ // since we're under a lock.
+ if (!pConfig->SetNativeCode(pCode, &pOtherCode))
{
- SavePitchingCandidate(this, *pSizeOfCode);
+ _ASSERTE(!"GC Cover native code publish failed");
}
-#endif
}
+ else
+#endif // HAVE_GCCOVER
-#ifdef HAVE_GCCOVER
- if (GCStress<cfg_instr_jit>::IsEnabled())
+ // Aside from rejit, performing a SetNativeCodeInterlocked at this point
+ // generally ensures that there is only one winning version of the native
+ // code. This also avoid races with profiler overriding ngened code (see
+ // matching SetNativeCodeInterlocked done after
+ // JITCachedFunctionSearchStarted)
+ if (!pConfig->SetNativeCode(pCode, &pOtherCode))
{
- SetupGcCoverage(this, (BYTE*)pCode);
+ // Another thread beat us to publishing its copy of the JITted code.
+ return pOtherCode;
}
-#endif // HAVE_GCCOVER
+
+#if defined(FEATURE_JIT_PITCHING)
+ SavePitchingCandidate(this, *pSizeOfCode);
+#endif
// We succeeded in jitting the code, and our jitted code is the one that's going to run now.
pEntry->m_hrResultCode = S_OK;
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index 283c6299ca..71ddb6560b 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -1615,9 +1615,9 @@ Thread::Thread()
#ifdef HAVE_GCCOVER
m_pbDestCode = NULL;
m_pbSrcCode = NULL;
-#ifdef _TARGET_X86_
+#if defined(GCCOVER_TOLERATE_SPURIOUS_AV)
m_pLastAVAddress = NULL;
-#endif // _TARGET_X86_
+#endif // defined(GCCOVER_TOLERATE_SPURIOUS_AV)
#endif // HAVE_GCCOVER
m_fCompletionPortDrained = FALSE;
diff --git a/src/vm/threads.h b/src/vm/threads.h
index 292ab226b3..70fcb522c8 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -4813,9 +4813,9 @@ public:
private:
BYTE* m_pbDestCode;
BYTE* m_pbSrcCode;
-#ifdef _TARGET_X86_
+#if defined(GCCOVER_TOLERATE_SPURIOUS_AV)
LPVOID m_pLastAVAddress;
-#endif // _TARGET_X86_
+#endif // defined(GCCOVER_TOLERATE_SPURIOUS_AV)
public:
void CommitGCStressInstructionUpdate();
@@ -4841,7 +4841,7 @@ public:
m_pbDestCode = NULL;
m_pbSrcCode = NULL;
}
-#ifdef _TARGET_X86_
+#if defined(GCCOVER_TOLERATE_SPURIOUS_AV)
void SetLastAVAddress(LPVOID address)
{
LIMITED_METHOD_CONTRACT;
@@ -4852,7 +4852,7 @@ public:
LIMITED_METHOD_CONTRACT;
return m_pLastAVAddress;
}
-#endif // _TARGET_X86_
+#endif // defined(GCCOVER_TOLERATE_SPURIOUS_AV)
#endif // HAVE_GCCOVER
#if defined(_DEBUG) && defined(FEATURE_STACK_PROBE)