author     Vladimir Sadov <vsadov@microsoft.com>   2019-06-20 19:54:34 -0700
committer  GitHub <noreply@github.com>             2019-06-20 19:54:34 -0700
commit     f77a8bee633f8fae4b3ee85afff619e11226d79b (patch)
tree       76a73904f1fdb385e2c5ba8b9cc51a93a428c284
parent     61dfc51ae30a9a2dbdb637fa200af85ea0435dc5 (diff)
ensure process-wide fence when updating GC write barrier on ARM64 (#25130)
* ensure process-wide fences when updating GC write barrier on ARM64
-rw-r--r--  src/gc/sample/GCSample.cpp   2
-rw-r--r--  src/vm/gcenv.ee.cpp         94
-rw-r--r--  src/vm/gchelpers.cpp        25
-rw-r--r--  src/vm/gchelpers.inl         7
4 files changed, 84 insertions, 44 deletions
diff --git a/src/gc/sample/GCSample.cpp b/src/gc/sample/GCSample.cpp
index 4e228509dc..d7db90d4f9 100644
--- a/src/gc/sample/GCSample.cpp
+++ b/src/gc/sample/GCSample.cpp
@@ -95,7 +95,7 @@ inline void ErectWriteBarrier(Object ** dst, Object * ref)
return;
// volatile is used here to prevent fetch of g_card_table from being reordered
- // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
+ // with g_lowest/highest_address check above. See comments in StompWriteBarrier.
uint8_t* pCardByte = (uint8_t *)*(volatile uint8_t **)(&g_gc_card_table) + card_byte((uint8_t *)dst);
if(*pCardByte != 0xFF)
*pCardByte = 0xFF;
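The pattern this hunk points at — check the heap bounds first, then fetch g_card_table with a load the compiler cannot hoist above the check — can be sketched in portable C++. This is a minimal illustration, not the CoreCLR code: the globals are stand-ins, the card shift of 11 is illustrative, and std::atomic_signal_fence plays the role of the compiler-only ordering that VolatileLoadWithoutBarrier provides in the real sources.

#include <atomic>
#include <cstdint>

// Stand-ins for the runtime globals referenced in the diff.
extern uint8_t* g_lowest_address;
extern uint8_t* g_highest_address;
extern uint8_t* g_card_table;

void ErectWriteBarrierSketch(void** dst)
{
    uint8_t* p = (uint8_t*)dst;
    if (p < g_lowest_address || p >= g_highest_address)
        return; // dst is outside the GC heap; no card to dirty

    // Compiler-only fence: keeps the g_card_table fetch below the bounds
    // check without emitting a hardware barrier.
    std::atomic_signal_fence(std::memory_order_acquire);
    uint8_t* table = g_card_table;

    // One card byte covers 2^11 bytes of heap in this sketch (illustrative).
    uint8_t* pCardByte = table + (((uintptr_t)p) >> 11);
    if (*pCardByte != 0xFF)
        *pCardByte = 0xFF; // dirty the card covering dst
}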
diff --git a/src/vm/gcenv.ee.cpp b/src/vm/gcenv.ee.cpp
index 871efd1849..e4d3153732 100644
--- a/src/vm/gcenv.ee.cpp
+++ b/src/vm/gcenv.ee.cpp
@@ -803,7 +803,7 @@ void GCToEEInterface::DiagWalkBGCSurvivors(void* gcContext)
void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
{
int stompWBCompleteActions = SWB_PASS;
- bool is_runtime_suspended = false;
+ bool is_runtime_suspended = args->is_runtime_suspended;
assert(args != nullptr);
switch (args->operation)
@@ -815,7 +815,13 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
assert(args->lowest_address != nullptr);
assert(args->highest_address != nullptr);
- g_card_table = args->card_table;
+ // We are sensitive to the order of writes here (more comments on this further down in the method).
+ // In particular, g_card_table must be written before writing the heap bounds.
+ // For platforms with weak memory ordering we will issue fences; for x64/x86 we are ok
+ // as long as the compiler does not reorder these writes.
+ // That is unlikely since we have method calls in between.
+ // Just to be robust against possible refactoring/inlining we will do a compiler-fenced store here.
+ VolatileStoreWithoutBarrier(&g_card_table, args->card_table);
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
assert(args->card_bundle_table != nullptr);
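As an aside, the compiler-fenced store introduced above can be modeled in portable C++ roughly as follows. This is a sketch under the assumption that std::atomic_signal_fence is an adequate stand-in for VolatileStoreWithoutBarrier's compiler-only ordering; the globals are stand-ins as well.

#include <atomic>
#include <cstdint>

extern uint8_t* g_card_table;
extern uint8_t* g_lowest_address;
extern uint8_t* g_highest_address;

// Publish the new card table strictly before the new bounds. On x86/x64 the
// hardware keeps stores in order, so a compiler fence is all that is needed;
// weakly ordered platforms additionally get the process-wide fences added
// later in this method.
void PublishTableThenBounds(uint8_t* newTable, uint8_t* newLo, uint8_t* newHi)
{
    g_card_table = newTable;
    std::atomic_signal_fence(std::memory_order_release); // compiler fence only
    g_lowest_address  = newLo;
    g_highest_address = newHi;
}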
@@ -830,53 +836,84 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
}
#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
- stompWBCompleteActions |= ::StompWriteBarrierResize(args->is_runtime_suspended, args->requires_upper_bounds_check);
-
- // We need to make sure that other threads executing checked write barriers
- // will see the g_card_table update before g_lowest/highest_address updates.
- // Otherwise, the checked write barrier may AV accessing the old card table
- // with address that it does not cover.
- //
- // Even x86's total store ordering is insufficient here because threads reading
- // g_card_table do so via the instruction cache, whereas g_lowest/highest_address
- // are read via the data cache.
- //
- // The g_card_table update is covered by section 8.1.3 of the Intel Software
- // Development Manual, Volume 3A (System Programming Guide, Part 1), about
- // "cross-modifying code": We need all _executing_ threads to invalidate
- // their instruction cache, which FlushProcessWriteBuffers achieves by sending
- // an IPI (inter-process interrupt).
+ stompWBCompleteActions |= ::StompWriteBarrierResize(is_runtime_suspended, args->requires_upper_bounds_check);
+ is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || is_runtime_suspended;
if (stompWBCompleteActions & SWB_ICACHE_FLUSH)
{
- // flushing icache on current processor (thread)
+ // flushing/invalidating the write barrier's body for the current process
+ // NOTE: the underlying API may flush more than needed, or nothing at all if the icache is coherent.
::FlushWriteBarrierInstructionCache();
- // asking other processors (threads) to invalidate their icache
+ }
+
+ // IMPORTANT: managed heap segments may surround unmanaged/stack segments. In such cases adding another managed
+ // heap segment may put a stack/unmanaged write inside the new heap range. However, the old card table would
+ // not cover it. Therefore we must ensure that the write barriers see the new table before seeing the new bounds.
+ //
+ // On architectures with strong ordering, we only need to prevent compiler reordering.
+ // Otherwise we put a process-wide fence here (so that we can use ordinary reads in the barrier).
+
+#if defined(_ARM64_) || defined(_ARM_)
+ if (!is_runtime_suspended)
+ {
+ // If runtime is not suspended, force all threads to see the changed table before seeing updated heap boundaries.
+ // See: http://vstfdevdiv:8080/DevDiv2/DevDiv/_workitems/edit/346765
FlushProcessWriteBuffers();
}
+#endif
g_lowest_address = args->lowest_address;
- VolatileStore(&g_highest_address, args->highest_address);
+ g_highest_address = args->highest_address;
#if defined(_ARM64_) || defined(_ARM_)
// Need to re-update for changes to g_highest_address/g_lowest_address
- is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || args->is_runtime_suspended;
stompWBCompleteActions |= ::StompWriteBarrierResize(is_runtime_suspended, args->requires_upper_bounds_check);
#ifdef _ARM_
if (stompWBCompleteActions & SWB_ICACHE_FLUSH)
{
+ // flushing/invalidating the write barrier's body for the current process
+ // NOTE: the underlying API may flush more than needed, or nothing at all if the icache is coherent.
::FlushWriteBarrierInstructionCache();
}
#endif
+#endif
+
+ // At this point either the old or the new set of globals (card_table, bounds, etc.) can be used. Card tables and card bundles allow such use.
+ // When card tables are de-published (at EE suspension) all the info will be merged, so the information will not be lost.
+ // Another point - we should not yet have any managed objects/addresses outside of the former bounds, so either old or new bounds are fine.
+ // That is because bounds can only become wider, and we are not yet done with widening.
+ //
+ // However!!
+ // Once we are done, a new object can (and likely will) be allocated outside of the former bounds.
+ // So, before such an object can be used in a write barrier, we must ensure that the barrier also uses the new bounds.
+ //
+ // This is easy to arrange for architectures with strong memory ordering. We only need to ensure that
+ // - the object is allocated/published _after_ we publish the bounds here, and
+ // - the write barrier reads the bounds after reading the new object locations.
+ //
+ // For architectures with strong memory ordering (x86/x64) both conditions above are naturally guaranteed.
+ // Systems with weak ordering are more interesting. We could either:
+ // a) issue a write fence here and pair it with a read fence in the write barrier, or
+ // b) issue a process-wide full fence here and do ordinary reads in the barrier.
+ //
+ // We will do "b" because executing the write barrier is by far more common than updating the card table.
+ //
+ // I.e. - for weak architectures we have to do a process-wide fence.
+ //
+ // NOTE: suspending/resuming the EE works the same as a process-wide fence for our purposes here.
+ // (we care only about managed threads, and suspend/resume will do full fences - good enough for us).
+ //
- is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || args->is_runtime_suspended;
- if(!is_runtime_suspended)
+#if defined(_ARM64_) || defined(_ARM_)
+ is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || is_runtime_suspended;
+ if (!is_runtime_suspended)
{
- // If runtime is not suspended, force updated state to be visible to all threads
- MemoryBarrier();
+ // If runtime is not suspended, force all threads to see the changed state before observing future allocations.
+ FlushProcessWriteBuffers();
}
#endif
+
if (stompWBCompleteActions & SWB_EE_RESTART)
{
assert(!args->is_runtime_suspended &&
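The a/b trade-off described in the comment above can be shown with plain C++11 fences. Option (a) would look like the sketch below: a release fence on the rare update path, paired with an acquire fence in every barrier execution. The commit picks option (b) instead — FlushProcessWriteBuffers on the updater — precisely so the hot barrier can keep ordinary, fence-free loads. Illustrative code, not from the repository.

#include <atomic>
#include <cstdint>

std::atomic<uint8_t*> g_table;   // stand-in for g_card_table
std::atomic<uint8_t*> g_hi;      // stand-in for g_highest_address

// Rare path: publish the wider table, then the wider bound.
void Update(uint8_t* table, uint8_t* hi)
{
    g_table.store(table, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_release); // option (a): write fence here...
    g_hi.store(hi, std::memory_order_relaxed);
}

// Hot path: if the barrier observes the new bound, the paired fences
// guarantee it also observes the new table.
uint8_t* TableForAddress(uint8_t* addr)
{
    uint8_t* hi = g_hi.load(std::memory_order_relaxed);
    if (addr >= hi)
        return nullptr;                                  // outside the heap
    std::atomic_thread_fence(std::memory_order_acquire); // ...paired with a read fence in every barrier
    return g_table.load(std::memory_order_relaxed);
}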
@@ -885,6 +922,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
}
return; // unlike other branches we have already done cleanup so bailing out here
case WriteBarrierOp::StompEphemeral:
+ assert(args->is_runtime_suspended && "the runtime must be suspended here!");
// StompEphemeral requires a new ephemeral low and a new ephemeral high
assert(args->ephemeral_low != nullptr);
assert(args->ephemeral_high != nullptr);
@@ -893,6 +931,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
stompWBCompleteActions |= ::StompWriteBarrierEphemeral(args->is_runtime_suspended);
break;
case WriteBarrierOp::Initialize:
+ assert(args->is_runtime_suspended && "the runtime must be suspended here!");
// This operation should only be invoked once, upon initialization.
assert(g_card_table == nullptr);
assert(g_lowest_address == nullptr);
@@ -902,7 +941,6 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
assert(args->highest_address != nullptr);
assert(args->ephemeral_low != nullptr);
assert(args->ephemeral_high != nullptr);
- assert(args->is_runtime_suspended && "the runtime must be suspended here!");
assert(!args->requires_upper_bounds_check && "the ephemeral generation must be at the top of the heap!");
g_card_table = args->card_table;
@@ -926,8 +964,8 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
break;
case WriteBarrierOp::SwitchToWriteWatch:
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
- assert(args->write_watch_table != nullptr);
assert(args->is_runtime_suspended && "the runtime must be suspended here!");
+ assert(args->write_watch_table != nullptr);
g_sw_ww_table = args->write_watch_table;
g_sw_ww_enabled_for_gc_heap = true;
stompWBCompleteActions |= ::SwitchToWriteWatchBarrier(true);
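Put together, the resize path in this file now has the following shape on a weakly ordered machine. This is a condensed sketch of the diff above, not the actual source; FlushProcessWriteBuffers is the real PAL entry point that interrupts every thread, everything else is simplified.

#include <cstdint>

extern uint8_t* g_card_table;
extern uint8_t* g_lowest_address;
extern uint8_t* g_highest_address;
extern void FlushProcessWriteBuffers(); // PAL: process-wide fence via IPI

void StompResizeSketch(uint8_t* table, uint8_t* lo, uint8_t* hi, bool suspended)
{
    g_card_table = table;            // 1. publish the wider card table
    if (!suspended)
        FlushProcessWriteBuffers();  // 2. every thread now sees the new table
    g_lowest_address  = lo;          // 3. only then publish the wider bounds
    g_highest_address = hi;
    if (!suspended)
        FlushProcessWriteBuffers();  // 4. every thread sees the new bounds before
                                     //    any object is allocated beyond the old ones
}

If the runtime is suspended, resuming it performs full fences on every managed thread, which is why both flushes can be skipped in that case.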
diff --git a/src/vm/gchelpers.cpp b/src/vm/gchelpers.cpp
index 22a1a5d39b..07429a070a 100644
--- a/src/vm/gchelpers.cpp
+++ b/src/vm/gchelpers.cpp
@@ -1373,7 +1373,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_CheckedWriteBarrier, Object **dst, Object *ref)
// no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
- *dst = ref;
+ VolatileStore(dst, ref);
// if the dst is outside of the heap (unboxed value classes) then we
// simply exit
@@ -1407,8 +1407,8 @@ extern "C" HCIMPL2_RAW(VOID, JIT_CheckedWriteBarrier, Object **dst, Object *ref)
CheckedAfterRefInEphemFilter++;
#endif
// VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
- // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
- BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+ // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+ BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
if(*pCardByte != 0xFF)
{
#ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
@@ -1440,7 +1440,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
#endif
// no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
- *dst = ref;
+ VolatileStore(dst, ref);
// If the store above succeeded, "dst" should be in the heap.
assert(GCHeapUtilities::GetGCHeap()->IsHeapPointer((void*)dst));
@@ -1467,7 +1467,9 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
#ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
UncheckedAfterRefInEphemFilter++;
#endif
- BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+ // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+ // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+ BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
if(*pCardByte != 0xFF)
{
#ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
@@ -1478,7 +1480,6 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
SetCardBundleByte((BYTE*)dst);
#endif
-
}
}
}
@@ -1498,6 +1499,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrierEnsureNonHeapTarget, Object **dst,
// no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
+ // not a release store because the target is not in the GC heap.
*dst = ref;
}
HCIMPLEND_RAW
@@ -1531,8 +1533,8 @@ void ErectWriteBarrier(OBJECTREF *dst, OBJECTREF ref)
if ((BYTE*) OBJECTREFToObject(ref) >= g_ephemeral_low && (BYTE*) OBJECTREFToObject(ref) < g_ephemeral_high)
{
// VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
- // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
- BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+ // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+ BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
if (*pCardByte != 0xFF)
{
*pCardByte = 0xFF;
@@ -1540,7 +1542,6 @@ void ErectWriteBarrier(OBJECTREF *dst, OBJECTREF ref)
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
SetCardBundleByte((BYTE*)dst);
#endif
-
}
}
}
@@ -1571,8 +1572,9 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref)
BYTE *refObject = *(BYTE **)((MethodTable*)ref)->GetLoaderAllocatorObjectHandle();
if((BYTE*) refObject >= g_ephemeral_low && (BYTE*) refObject < g_ephemeral_high)
{
- // See comment above
- BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+ // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+ // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+ BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
if( !((*pCardByte) & card_bit((BYTE *)dst)) )
{
*pCardByte = 0xFF;
@@ -1580,7 +1582,6 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref)
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
SetCardBundleByte((BYTE*)dst);
#endif
-
}
}
}
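The helper-side changes in this file amount to the following shape (a condensed sketch, not the CoreCLR source; the release store models VolatileStore, the signal fence models VolatileLoadWithoutBarrier, and the card shift of 11 is illustrative):

#include <atomic>
#include <cstdint>

extern uint8_t* g_lowest_address;
extern uint8_t* g_highest_address;
extern uint8_t* g_ephemeral_low;
extern uint8_t* g_ephemeral_high;
extern uint8_t* g_card_table;

void CheckedWriteBarrierSketch(std::atomic<void*>* dst, void* ref)
{
    dst->store(ref, std::memory_order_release); // VolatileStore in the diff

    uint8_t* p = (uint8_t*)dst;
    if (p < g_lowest_address || p >= g_highest_address)
        return; // dst outside the heap (unboxed value classes): simply exit

    if ((uint8_t*)ref >= g_ephemeral_low && (uint8_t*)ref < g_ephemeral_high)
    {
        // Compiler-only fence: the table fetch must stay below the checks above.
        std::atomic_signal_fence(std::memory_order_acquire);
        uint8_t* pCardByte = g_card_table + (((uintptr_t)p) >> 11);
        if (*pCardByte != 0xFF)
            *pCardByte = 0xFF; // dirty the card covering dst
    }
}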
diff --git a/src/vm/gchelpers.inl b/src/vm/gchelpers.inl
index 1b14077e72..fed186d5ef 100644
--- a/src/vm/gchelpers.inl
+++ b/src/vm/gchelpers.inl
@@ -66,9 +66,10 @@ FORCEINLINE void InlinedSetCardsAfterBulkCopyHelper(Object **start, size_t len)
// calculate the number of clumps to mark (round_up(end) - start)
size_t clumpCount = endingClump - startingClump;
- // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
- // with g_lowest/highest_address check at the beginning of this function.
- uint8_t* card = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_table)) + startingClump;
+
+ // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+ // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+ uint8_t* card = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_table)) + startingClump;
// Fill the cards. To avoid cache line thrashing we check whether the cards have already been set before
// writing.
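For context, the bulk-copy helper this last hunk touches follows the usual clump-marking pattern, sketched below. The shift value is illustrative, one clump is treated as one card byte for simplicity, and the variable names mirror the diff.

#include <cstddef>
#include <cstdint>

extern uint8_t* g_card_table;
const unsigned card_byte_shift = 11; // illustrative; platform-dependent in CoreCLR

void SetCardsAfterBulkCopySketch(void** start, size_t len)
{
    uintptr_t first = (uintptr_t)start;
    uintptr_t last  = first + len; // len is in bytes

    // Clump range covering [first, last): round the end up, not the start.
    size_t startingClump = first >> card_byte_shift;
    size_t endingClump   = (last + ((size_t)1 << card_byte_shift) - 1) >> card_byte_shift;
    size_t clumpCount    = endingClump - startingClump;

    uint8_t* card = g_card_table + startingClump;
    while (clumpCount-- > 0)
    {
        // Check before writing to avoid cache-line thrashing between threads.
        if (*card != 0xFF)
            *card = 0xFF;
        ++card;
    }
}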