path: root/src/gc
author    Maoni Stephens <Maoni0@users.noreply.github.com>  2019-07-05 12:48:34 -0700
committer GitHub <noreply@github.com>  2019-07-05 12:48:34 -0700
commit    c7d12d2b91f79802a12484d4d56c41ba355b7058 (patch)
tree      823ab1ef9de13e8f5493fc219af7a5deaa118e13 /src/gc
parent    41407b7d4a2eac4cae6a3c4855ba072cdd01f8b1 (diff)
many core (#25350)
Diffstat (limited to 'src/gc')
-rw-r--r--  src/gc/env/gcenv.interlocked.h    |   6
-rw-r--r--  src/gc/env/gcenv.interlocked.inl  |  31
-rw-r--r--  src/gc/env/gcenv.os.h             |  19
-rw-r--r--  src/gc/gc.cpp                     | 940
-rw-r--r--  src/gc/gcpriv.h                   |  35
-rw-r--r--  src/gc/unix/gcenv.unix.cpp        |   8
-rw-r--r--  src/gc/windows/gcenv.windows.cpp  |  83
7 files changed, 970 insertions(+), 152 deletions(-)
diff --git a/src/gc/env/gcenv.interlocked.h b/src/gc/env/gcenv.interlocked.h
index 1da222de75..eb73a36eb7 100644
--- a/src/gc/env/gcenv.interlocked.h
+++ b/src/gc/env/gcenv.interlocked.h
@@ -78,6 +78,12 @@ public:
template<typename T>
static T ExchangeAdd(T volatile *addend, T value);
+ template<typename T>
+ static T ExchangeAdd64(T volatile* addend, T value);
+
+ template<typename T>
+ static T ExchangeAddPtr(T volatile* addend, T value);
+
// Performs an atomic compare-and-exchange operation on the specified values.
// Parameters:
// destination - value to be exchanged
diff --git a/src/gc/env/gcenv.interlocked.inl b/src/gc/env/gcenv.interlocked.inl
index 1df2700d7f..401c4da0db 100644
--- a/src/gc/env/gcenv.interlocked.inl
+++ b/src/gc/env/gcenv.interlocked.inl
@@ -116,6 +116,37 @@ __forceinline T Interlocked::ExchangeAdd(T volatile *addend, T value)
#endif
}
+template <typename T>
+__forceinline T Interlocked::ExchangeAdd64(T volatile* addend, T value)
+{
+#ifdef _MSC_VER
+ static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T");
+ return _InterlockedExchangeAdd64((int64_t*)addend, value);
+#else
+ T result = __sync_fetch_and_add(addend, value);
+ ArmInterlockedOperationBarrier();
+ return result;
+#endif
+}
+
+template <typename T>
+__forceinline T Interlocked::ExchangeAddPtr(T volatile* addend, T value)
+{
+#ifdef _MSC_VER
+#ifdef BIT64
+ static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T");
+ return _InterlockedExchangeAdd64((int64_t*)addend, value);
+#else
+ static_assert(sizeof(long) == sizeof(T), "Size of long must be the same as size of T");
+ return _InterlockedExchangeAdd((long*)addend, value);
+#endif
+#else
+ T result = __sync_fetch_and_add(addend, value);
+ ArmInterlockedOperationBarrier();
+ return result;
+#endif
+}
+
// Perform an atomic AND operation on the specified values
// Parameters:
// destination - the first operand and the destination
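The two new helpers mirror the existing ExchangeAdd but pick the operand width explicitly: ExchangeAdd64 is always a 64-bit add, while ExchangeAddPtr follows the pointer size. A minimal caller sketch; the counters here are hypothetical, not part of this change:

    // Illustrative only; these counters are hypothetical, not from this commit.
    static size_t  g_committed_bytes = 0;   // pointer-sized counter
    static int64_t g_allocated_bytes = 0;   // 64-bit even on 32-bit hosts

    void record_alloc (size_t committed, int64_t allocated)
    {
        // On 64-bit MSVC both calls map to _InterlockedExchangeAdd64; on 32-bit MSVC
        // the Ptr variant maps to _InterlockedExchangeAdd; elsewhere __sync_fetch_and_add.
        Interlocked::ExchangeAddPtr (&g_committed_bytes, committed);
        Interlocked::ExchangeAdd64 (&g_allocated_bytes, allocated);
    }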
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h
index 393bd1fe18..e56321cace 100644
--- a/src/gc/env/gcenv.os.h
+++ b/src/gc/env/gcenv.os.h
@@ -144,8 +144,10 @@ typedef void (*GCThreadFunction)(void* param);
// Right now we support a maximum of 1024 procs - meaning that we will create at most
// that many GC threads and GC heaps.
#define MAX_SUPPORTED_CPUS 1024
+#define MAX_SUPPORTED_NODES 64
#else
#define MAX_SUPPORTED_CPUS 64
+#define MAX_SUPPORTED_NODES 16
#endif // BIT64
// Set of processor indices used to store affinity.
@@ -253,6 +255,7 @@ public:
// size - size of the virtual memory range
// alignment - requested memory alignment
// flags - flags to control special settings like write watching
+ // node - the NUMA node to reserve memory on
// Return:
// Starting virtual address of the reserved range
// Notes:
@@ -264,7 +267,7 @@ public:
//
// Windows guarantees that the returned mapping will be aligned to the allocation
// granularity.
- static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags);
+ static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node = NUMA_NODE_UNDEFINED);
// Release virtual memory range previously reserved using VirtualReserve
// Parameters:
@@ -360,6 +363,8 @@ public:
// true if it has succeeded, false if it has failed
static bool SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t dstProcNo);
+ static bool GetCurrentThreadIdealProc(uint16_t* procNo);
+
// Get numeric id of the current thread if possible on the
// current platform. It is intended for logging purposes only.
// Return:
@@ -484,6 +489,15 @@ public:
// Is NUMA support available
static bool CanEnableGCNumaAware();
+ // TODO: add Linux implementation.
+ // Returns false when NUMA is not available.
+ static bool GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node);
+
+ // Is CPU Group enabled
+ // This only applies on Windows and is only used by instrumentation, but it is on the
+ // interface due to LocalGC.
+ static bool CanEnableGCCPUGroups();
+
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
@@ -493,6 +507,9 @@ public:
// true if it succeeded
static bool GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_no, uint16_t* node_no);
+ // Returns false when CPU groups are not enabled.
+ static bool GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group);
+
// Parse the config string describing affinitization ranges and update the passed-in affinitySet accordingly
// Parameters:
// config_string - string describing the affinitization range, platform specific
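Both new queries are best-effort and return false when the information is unavailable, so callers fall back to a single-node / no-group view, which is what the instrumentation setup in gc.cpp below does. A hedged sketch of the call pattern, with illustrative variable names:

    uint16_t numa_nodes = 0;
    uint32_t procs_per_node = 0;
    if (!GCToOSInterface::GetNumaInfo (&numa_nodes, &procs_per_node))
    {
        // No NUMA information - treat the machine as one node.
        numa_nodes = 1;
    }

    uint16_t cpu_groups = 0;
    uint32_t procs_per_group = 0;
    bool groups_p = GCToOSInterface::CanEnableGCCPUGroups () &&
                    GCToOSInterface::GetCPUGroupInfo (&cpu_groups, &procs_per_group);
    // When groups_p is true, proc numbers carry the group in their upper bits
    // (see GroupProcNo in vm\gcenv.os.cpp).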
diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp
index 0917527ab4..f4d877fe88 100644
--- a/src/gc/gc.cpp
+++ b/src/gc/gc.cpp
@@ -56,6 +56,8 @@ BOOL bgc_heap_walk_for_etw_p = FALSE;
#define MAX_PTR ((uint8_t*)(~(ptrdiff_t)0))
#define commit_min_th (16*OS_PAGE_SIZE)
+static size_t smoothed_desired_per_heap = 0;
+
#ifdef SERVER_GC
#define partial_size_th 100
#define num_partial_refs 64
@@ -205,6 +207,12 @@ size_t GetHighPrecisionTimeStamp()
return (size_t)(ts / (qpf / 1000));
}
+
+uint64_t RawGetHighPrecisionTimeStamp()
+{
+ return (uint64_t)GCToOSInterface::QueryPerformanceCounter();
+}
+
#endif
#ifdef GC_STATS
@@ -436,6 +444,7 @@ void c_write (uint32_t& place, uint32_t value)
}
#ifndef DACCESS_COMPILE
+
// If every heap's gen2 or gen3 size is less than this threshold we will do a blocking GC.
const size_t bgc_min_per_heap = 4*1024*1024;
@@ -682,6 +691,7 @@ process_sync_log_stats()
#ifdef MULTIPLE_HEAPS
#ifndef DACCESS_COMPILE
+uint32_t g_num_active_processors = 0;
enum gc_join_stage
{
@@ -2442,6 +2452,7 @@ sorted_table* gc_heap::seg_table;
#ifdef MULTIPLE_HEAPS
GCEvent gc_heap::ee_suspend_event;
+size_t gc_heap::min_gen0_balance_delta = 0;
size_t gc_heap::min_balance_threshold = 0;
#endif //MULTIPLE_HEAPS
@@ -2537,6 +2548,7 @@ uint64_t gc_heap::entry_available_physical_mem = 0;
size_t gc_heap::heap_hard_limit = 0;
+bool affinity_config_specified_p = false;
#ifdef BACKGROUND_GC
GCEvent gc_heap::bgc_start_event;
@@ -2576,6 +2588,7 @@ int gc_heap::spinlock_info_index = 0;
spinlock_info gc_heap::last_spinlock_info[max_saved_spinlock_info + 8];
#endif //SPINLOCK_HISTORY
+uint32_t gc_heap::fgn_maxgen_percent = 0;
size_t gc_heap::fgn_last_alloc = 0;
int gc_heap::generation_skip_ratio = 100;
@@ -2834,6 +2847,9 @@ size_t gc_heap::eph_gen_starts_size = 0;
heap_segment* gc_heap::segment_standby_list;
bool gc_heap::use_large_pages_p = 0;
size_t gc_heap::last_gc_index = 0;
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+size_t gc_heap::last_gc_end_time_ms = 0;
+#endif //HEAP_BALANCE_INSTRUMENTATION
#ifdef SEG_MAPPING_TABLE
size_t gc_heap::min_segment_size = 0;
size_t gc_heap::min_segment_size_shr = 0;
@@ -2859,8 +2875,6 @@ GCEvent gc_heap::full_gc_approach_event;
GCEvent gc_heap::full_gc_end_event;
-uint32_t gc_heap::fgn_maxgen_percent = 0;
-
uint32_t gc_heap::fgn_loh_percent = 0;
#ifdef BACKGROUND_GC
@@ -4283,10 +4297,10 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
// should only be called once
assert (memory_details.initial_memory == 0);
- memory_details.initial_memory = new (nothrow) imemory_data[num_heaps*2];
+ memory_details.initial_memory = new (nothrow) imemory_data[num_heaps * 2];
if (memory_details.initial_memory == 0)
{
- dprintf (2, ("failed to reserve %Id bytes for imemory_data", num_heaps*2*sizeof(imemory_data)));
+ dprintf (2, ("failed to reserve %Id bytes for imemory_data", num_heaps * 2 * sizeof (imemory_data)));
return FALSE;
}
@@ -4320,15 +4334,15 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
uint8_t* allatonce_block = (uint8_t*)virtual_alloc (requestedMemory, use_large_pages_p);
if (allatonce_block)
{
- g_gc_lowest_address = allatonce_block;
+ g_gc_lowest_address = allatonce_block;
g_gc_highest_address = allatonce_block + requestedMemory;
memory_details.allocation_pattern = initial_memory_details::ALLATONCE;
- for(size_t i = 0; i < memory_details.block_count; i++)
+ for (size_t i = 0; i < memory_details.block_count; i++)
{
- memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i*normal_size);
+ memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i * normal_size);
memory_details.initial_large_heap[i].memory_base = allatonce_block +
- (memory_details.block_count*normal_size) + (i*large_size);
+ (memory_details.block_count * normal_size) + (i * large_size);
reserve_success = TRUE;
}
}
@@ -4344,13 +4358,13 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
if (b2)
{
memory_details.allocation_pattern = initial_memory_details::TWO_STAGE;
- g_gc_lowest_address = min(b1,b2);
- g_gc_highest_address = max(b1 + memory_details.block_count*normal_size,
- b2 + memory_details.block_count*large_size);
- for(size_t i = 0; i < memory_details.block_count; i++)
+ g_gc_lowest_address = min (b1, b2);
+ g_gc_highest_address = max (b1 + memory_details.block_count * normal_size,
+ b2 + memory_details.block_count * large_size);
+ for (size_t i = 0; i < memory_details.block_count; i++)
{
- memory_details.initial_normal_heap[i].memory_base = b1 + (i*normal_size);
- memory_details.initial_large_heap[i].memory_base = b2 + (i*large_size);
+ memory_details.initial_normal_heap[i].memory_base = b1 + (i * normal_size);
+ memory_details.initial_large_heap[i].memory_base = b2 + (i * large_size);
reserve_success = TRUE;
}
}
@@ -4362,28 +4376,28 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
}
}
- if ((b2==NULL) && ( memory_details.block_count > 1))
+ if ((b2 == NULL) && (memory_details.block_count > 1))
{
memory_details.allocation_pattern = initial_memory_details::EACH_BLOCK;
- imemory_data *current_block = memory_details.initial_memory;
- for(size_t i = 0; i < (memory_details.block_count*2); i++, current_block++)
+ imemory_data* current_block = memory_details.initial_memory;
+ for (size_t i = 0; i < (memory_details.block_count * 2); i++, current_block++)
{
size_t block_size = ((i < memory_details.block_count) ?
- memory_details.block_size_normal :
- memory_details.block_size_large);
+ memory_details.block_size_normal :
+ memory_details.block_size_large);
current_block->memory_base =
(uint8_t*)virtual_alloc (block_size, use_large_pages_p);
if (current_block->memory_base == 0)
{
// Free the blocks that we've allocated so far
current_block = memory_details.initial_memory;
- for(size_t j = 0; j < i; j++, current_block++){
- if (current_block->memory_base != 0){
+ for (size_t j = 0; j < i; j++, current_block++) {
+ if (current_block->memory_base != 0) {
block_size = ((j < memory_details.block_count) ?
- memory_details.block_size_normal :
- memory_details.block_size_large);
- virtual_free (current_block->memory_base , block_size);
+ memory_details.block_size_normal :
+ memory_details.block_size_large);
+ virtual_free (current_block->memory_base, block_size);
}
}
reserve_success = FALSE;
@@ -4392,8 +4406,8 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
else
{
if (current_block->memory_base < g_gc_lowest_address)
- g_gc_lowest_address = current_block->memory_base;
- if (((uint8_t *) current_block->memory_base + block_size) > g_gc_highest_address)
+ g_gc_lowest_address = current_block->memory_base;
+ if (((uint8_t*)current_block->memory_base + block_size) > g_gc_highest_address)
g_gc_highest_address = (current_block->memory_base + block_size);
}
reserve_success = TRUE;
@@ -4507,7 +4521,9 @@ void* virtual_alloc (size_t size, bool use_large_pages_p)
}
#endif // !FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
- void* prgmem = use_large_pages_p ? GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) : GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags);
+ void* prgmem = use_large_pages_p ?
+ GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) :
+ GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags);
void *aligned_mem = prgmem;
// We don't want (prgmem + size) to be right at the end of the address space
@@ -5053,9 +5069,18 @@ extern "C" uint64_t __rdtsc();
}
#endif //_TARGET_X86_
+// We may not be on contiguous NUMA nodes, so we need to store
+// the node index as well.
+struct node_heap_count
+{
+ int node_no;
+ int heap_count;
+};
+
class heap_select
{
heap_select() {}
+public:
static uint8_t* sniff_buffer;
static unsigned n_sniff_buffers;
static unsigned cur_sniff_index;
@@ -5063,7 +5088,12 @@ class heap_select
static uint16_t proc_no_to_heap_no[MAX_SUPPORTED_CPUS];
static uint16_t heap_no_to_proc_no[MAX_SUPPORTED_CPUS];
static uint16_t heap_no_to_numa_node[MAX_SUPPORTED_CPUS];
+ static uint16_t proc_no_to_numa_node[MAX_SUPPORTED_CPUS];
static uint16_t numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4];
+ // Note this is the total number of NUMA nodes the GC heaps are on. There might be
+ // more on the machine if the GC threads aren't using all of them.
+ static uint16_t total_numa_nodes;
+ static node_heap_count heaps_on_node[MAX_SUPPORTED_NODES];
static int access_time(uint8_t *sniff_buffer, int heap_number, unsigned sniff_index, unsigned n_sniff_buffers)
{
@@ -5104,14 +5134,11 @@ public:
return TRUE;
}
- static void init_cpu_mapping(gc_heap * /*heap*/, int heap_number)
+ static void init_cpu_mapping(int heap_number)
{
if (GCToOSInterface::CanGetCurrentProcessorNumber())
{
- uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps;
- // We can safely cast heap_number to a uint16_t 'cause GetCurrentProcessCpuCount
- // only returns up to MAX_SUPPORTED_CPUS procs right now. We only ever create at most
- // MAX_SUPPORTED_CPUS GC threads.
+ uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber();
proc_no_to_heap_no[proc_no] = (uint16_t)heap_number;
}
}
@@ -5125,12 +5152,15 @@ public:
sniff_buffer[(1 + heap_number*n_sniff_buffers + sniff_index)*HS_CACHE_LINE_SIZE] &= 1;
}
- static int select_heap(alloc_context* acontext, int /*hint*/)
+ static int select_heap(alloc_context* acontext)
{
UNREFERENCED_PARAMETER(acontext); // only referenced by dprintf
if (GCToOSInterface::CanGetCurrentProcessorNumber())
- return proc_no_to_heap_no[GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps];
+ {
+ uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber();
+ return proc_no_to_heap_no[proc_no];
+ }
unsigned sniff_index = Interlocked::Increment(&cur_sniff_index);
sniff_index %= n_sniff_buffers;
@@ -5175,6 +5205,11 @@ public:
return GCToOSInterface::CanGetCurrentProcessorNumber();
}
+ static uint16_t find_heap_no_from_proc_no(uint16_t proc_no)
+ {
+ return proc_no_to_heap_no[proc_no];
+ }
+
static uint16_t find_proc_no_from_heap_no(int heap_number)
{
return heap_no_to_proc_no[heap_number];
@@ -5190,33 +5225,102 @@ public:
return heap_no_to_numa_node[heap_number];
}
- static void set_numa_node_for_heap(int heap_number, uint16_t numa_node)
+ static uint16_t find_numa_node_from_proc_no (uint16_t proc_no)
+ {
+ return proc_no_to_numa_node[proc_no];
+ }
+
+ static void set_numa_node_for_heap_and_proc(int heap_number, uint16_t proc_no, uint16_t numa_node)
{
heap_no_to_numa_node[heap_number] = numa_node;
+ proc_no_to_numa_node[proc_no] = numa_node;
}
static void init_numa_node_to_heap_map(int nheaps)
- { // Called right after GCHeap::Init() for each heap
+ {
+ // Called right after GCHeap::Init() for each heap
// For each NUMA node used by the heaps, the
// numa_node_to_heap_map[numa_node] is set to the first heap number on that node and
// numa_node_to_heap_map[numa_node + 1] is set to the first heap number not on that node
-
// Set the start of the heap number range for the first NUMA node
numa_node_to_heap_map[heap_no_to_numa_node[0]] = 0;
+ total_numa_nodes = 0;
+ memset (heaps_on_node, 0, sizeof (heaps_on_node));
+ heaps_on_node[0].node_no = heap_no_to_numa_node[0];
+ heaps_on_node[0].heap_count = 1;
for (int i=1; i < nheaps; i++)
{
if (heap_no_to_numa_node[i] != heap_no_to_numa_node[i-1])
{
+ total_numa_nodes++;
+ heaps_on_node[total_numa_nodes].node_no = heap_no_to_numa_node[i];
+
// Set the end of the heap number range for the previous NUMA node
numa_node_to_heap_map[heap_no_to_numa_node[i-1] + 1] =
// Set the start of the heap number range for the current NUMA node
numa_node_to_heap_map[heap_no_to_numa_node[i]] = (uint16_t)i;
}
+ (heaps_on_node[total_numa_nodes].heap_count)++;
}
// Set the end of the heap range for the last NUMA node
numa_node_to_heap_map[heap_no_to_numa_node[nheaps-1] + 1] = (uint16_t)nheaps; //mark the end with nheaps
+ total_numa_nodes++;
+ }
+
+ // TODO: currently this doesn't work with GCHeapAffinitizeMask/GCHeapAffinitizeRanges
+ // because the heaps may not be on contiguous active procs.
+ //
+ // This is for scenarios where GCHeapCount is specified as something like
+ // (g_num_active_processors - 2) to allow less randomization to the Server GC threads.
+ // In this case we want to assign the right heaps to those procs, i.e., if they share
+ // the same numa node we want to assign local heaps to those procs. Otherwise we
+ // let the heap balancing mechanism take over for now.
+ static void distribute_other_procs()
+ {
+ if (affinity_config_specified_p)
+ return;
+
+ uint16_t proc_no = 0;
+ uint16_t node_no = 0;
+ bool res = false;
+ int start_heap = -1;
+ int end_heap = -1;
+ int current_node_no = -1;
+ int current_heap_on_node = -1;
+
+ for (int i = gc_heap::n_heaps; i < (int)g_num_active_processors; i++)
+ {
+ if (!GCToOSInterface::GetProcessorForHeap (i, &proc_no, &node_no))
+ break;
+
+ int start_heap = (int)numa_node_to_heap_map[node_no];
+ int end_heap = (int)(numa_node_to_heap_map[node_no + 1]);
+
+ if ((end_heap - start_heap) > 0)
+ {
+ if (node_no == current_node_no)
+ {
+ // We already iterated through all heaps on this node, don't add more procs to these
+ // heaps.
+ if (current_heap_on_node >= end_heap)
+ {
+ continue;
+ }
+ }
+ else
+ {
+ current_node_no = node_no;
+ current_heap_on_node = start_heap;
+ }
+
+ proc_no_to_heap_no[proc_no] = current_heap_on_node;
+ proc_no_to_numa_node[proc_no] = node_no;
+
+ current_heap_on_node++;
+ }
+ }
}
static void get_heap_range_for_heap(int hn, int* start, int* end)
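To make the map semantics above concrete, here is a worked example with made-up numbers; none of these values come from the commit itself:

    // 8 heaps: heaps 0-3 on NUMA node 0, heaps 4-7 on NUMA node 1.
    // After init_numa_node_to_heap_map (8):
    //   numa_node_to_heap_map[0] = 0   // first heap on node 0
    //   numa_node_to_heap_map[1] = 4   // one past node 0's last heap == first heap on node 1
    //   numa_node_to_heap_map[2] = 8   // one past node 1's last heap (== n_heaps)
    //   total_numa_nodes = 2
    //   heaps_on_node[0] = { node_no 0, heap_count 4 }
    //   heaps_on_node[1] = { node_no 1, heap_count 4 }
    // get_heap_range_for_heap (5, &start, &end) then yields start = 4, end = 8.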
@@ -5225,6 +5329,43 @@ public:
*start = (int)numa_node_to_heap_map[numa_node];
*end = (int)(numa_node_to_heap_map[numa_node+1]);
}
+
+ // This gets the next valid numa node index starting at current_index+1.
+ // It assumes that current_index is a valid node index.
+ // If current_index+1 is at the end this will start at the beginning. So this will
+ // always return a valid node index, along with that node's start/end heaps.
+ static uint16_t get_next_numa_node (uint16_t current_index, int* start, int* end)
+ {
+ int start_index = current_index + 1;
+ int nheaps = gc_heap::n_heaps;
+
+ bool found_node_with_heaps_p = false;
+ do
+ {
+ int start_heap = (int)numa_node_to_heap_map[start_index];
+ int end_heap = (int)numa_node_to_heap_map[start_index + 1];
+ if (start_heap == nheaps)
+ {
+ // This is the last node.
+ start_index = 0;
+ continue;
+ }
+
+ if ((end_heap - start_heap) == 0)
+ {
+ // This node has no heaps.
+ start_index++;
+ }
+ else
+ {
+ found_node_with_heaps_p = true;
+ *start = start_heap;
+ *end = end_heap;
+ }
+ } while (!found_node_with_heaps_p);
+
+ return start_index;
+ }
};
uint8_t* heap_select::sniff_buffer;
unsigned heap_select::n_sniff_buffers;
@@ -5232,7 +5373,318 @@ unsigned heap_select::cur_sniff_index;
uint16_t heap_select::proc_no_to_heap_no[MAX_SUPPORTED_CPUS];
uint16_t heap_select::heap_no_to_proc_no[MAX_SUPPORTED_CPUS];
uint16_t heap_select::heap_no_to_numa_node[MAX_SUPPORTED_CPUS];
+uint16_t heap_select::proc_no_to_numa_node[MAX_SUPPORTED_CPUS];
uint16_t heap_select::numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4];
+uint16_t heap_select::total_numa_nodes;
+node_heap_count heap_select::heaps_on_node[MAX_SUPPORTED_NODES];
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+// This records info we use to look at the effect of different strategies
+// for heap balancing.
+struct heap_balance_info
+{
+ uint64_t timestamp;
+ // This also encodes when we detect the thread running on a
+ // different proc during a balance attempt. Sometimes
+ // I observe this happening multiple times during one attempt!
+ // If this happens, I just record the last proc we observe
+ // and set the MSB.
+ int tid;
+ // This records the final alloc_heap for the thread.
+ //
+ // This also encodes the reason why we needed to set_home_heap
+ // in balance_heaps.
+ // If we set it because the home heap is not the same as the proc,
+ // we set MSB.
+ //
+ // If we set ideal proc, we set the 2nd MSB.
+ int alloc_heap;
+ int ideal_proc_no;
+};
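The MSB encoding described above is what add_to_hb_numa writes and hb_log_balance_activities later strips back out; a standalone sketch, using the same shifts as the shipped code:

    // Pack the flags the way add_to_hb_numa does.
    inline int pack_tid (int tid, bool multiple_procs_p)
    {
        if (multiple_procs_p)
            tid |= (1 << (sizeof (tid) * 8 - 1));               // MSB: thread seen on >1 proc
        return tid;
    }

    inline int pack_alloc_heap (int alloc_heap, bool alloc_count_p, bool set_ideal_p)
    {
        if (!alloc_count_p)
            alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 1)); // MSB: home heap != proc heap
        if (set_ideal_p)
            alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 2)); // 2nd MSB: ideal proc was set
        return alloc_heap;
    }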
+
+// This means that in between GCs we can log at most this many entries per proc.
+// This is usually enough. Most of the time we only need to log something every 128k
+// of allocations in balance_heaps and the gen0 budget is <= 200mb.
+#define default_max_hb_heap_balance_info 4096
+
+struct heap_balance_info_proc
+{
+ int count;
+ int index;
+ heap_balance_info hb_info[default_max_hb_heap_balance_info];
+};
+
+struct heap_balance_info_numa
+{
+ heap_balance_info_proc* hb_info_procs;
+};
+
+uint64_t start_raw_ts = 0;
+bool cpu_group_enabled_p = false;
+uint32_t procs_per_numa_node = 0;
+uint16_t total_numa_nodes_on_machine = 0;
+uint32_t procs_per_cpu_group = 0;
+uint16_t total_cpu_groups_on_machine = 0;
+// Note this is still on one of the numa nodes, so we'll incur a remote access
+// no matter what.
+heap_balance_info_numa* hb_info_numa_nodes = NULL;
+
+// TODO: This doesn't work for multiple nodes per CPU group yet.
+int get_proc_index_numa (int proc_no, int* numa_no)
+{
+ if (total_numa_nodes_on_machine == 1)
+ {
+ *numa_no = 0;
+ return proc_no;
+ }
+ else
+ {
+ if (cpu_group_enabled_p)
+ {
+ // see vm\gcenv.os.cpp GroupProcNo implementation.
+ *numa_no = proc_no >> 6;
+ return (proc_no % 64);
+ }
+ else
+ {
+ *numa_no = proc_no / procs_per_numa_node;
+ return (proc_no % procs_per_numa_node);
+ }
+ }
+}
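A quick walk-through of the mapping with made-up numbers:

    // cpu_group_enabled_p == true, proc_no == 70 (group 1, proc 6 under GroupProcNo encoding):
    //     *numa_no = 70 >> 6 = 1, return value = 70 % 64 = 6
    // cpu_group_enabled_p == false, procs_per_numa_node == 24, proc_no == 50:
    //     *numa_no = 50 / 24 = 2, return value = 50 % 24 = 2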
+
+// We could consider optimizing it so we don't need to get the tid
+// every time, but it's not very expensive to get.
+void add_to_hb_numa (
+ int proc_no,
+ int ideal_proc_no,
+ int alloc_heap,
+ bool multiple_procs_p,
+ bool alloc_count_p,
+ bool set_ideal_p)
+{
+ int tid = (int)GCToOSInterface::GetCurrentThreadIdForLogging ();
+ uint64_t timestamp = RawGetHighPrecisionTimeStamp ();
+
+ int saved_proc_no = proc_no;
+ int numa_no = -1;
+ proc_no = get_proc_index_numa (proc_no, &numa_no);
+
+ heap_balance_info_numa* hb_info_numa_node = &hb_info_numa_nodes[numa_no];
+
+ heap_balance_info_proc* hb_info_proc = &(hb_info_numa_node->hb_info_procs[proc_no]);
+ int index = hb_info_proc->index;
+ int count = hb_info_proc->count;
+
+ if (index == count)
+ {
+ // Too much info in between GCs. This can happen if the thread is scheduled on a different
+ // processor very often, which causes us to log many entries. You could
+ // increase default_max_hb_heap_balance_info but this usually indicates a problem that
+ // should be investigated.
+ dprintf (HEAP_BALANCE_LOG, ("too much info between GCs, already logged %d entries", index));
+ GCToOSInterface::DebugBreak ();
+ }
+ heap_balance_info* hb_info = &(hb_info_proc->hb_info[index]);
+
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d->%3d(i:%3d), N%d] #%4d: %I64d, tid %d, ah: %d, m: %d, p: %d, i: %d",
+ saved_proc_no, proc_no, ideal_proc_no, numa_no, index,
+ (timestamp - start_raw_ts), tid, alloc_heap, (int)multiple_procs_p, (int)(!alloc_count_p), (int)set_ideal_p));
+
+ if (multiple_procs_p)
+ {
+ tid |= (1 << (sizeof (tid) * 8 - 1));
+ }
+
+ if (!alloc_count_p)
+ {
+ alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 1));
+ }
+
+ if (set_ideal_p)
+ {
+ alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 2));
+ }
+
+ hb_info->timestamp = timestamp;
+ hb_info->tid = tid;
+ hb_info->alloc_heap = alloc_heap;
+ hb_info->ideal_proc_no = ideal_proc_no;
+ (hb_info_proc->index)++;
+}
+
+const int hb_log_buffer_size = 1024;
+static char hb_log_buffer[hb_log_buffer_size];
+int last_hb_recorded_gc_index = -1;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
+// This logs what we recorded in balance_heaps
+// The format for this is
+//
+// [ms since last GC end]
+// [cpu index]
+// all elements we stored before this GC for this CPU in the format
+// timestamp, tid, alloc_heap_no
+// repeat this for each CPU
+//
+// the timestamp here is just the result of calling QPC,
+// it's not converted to ms. The conversion will be done when we process
+// the log.
+void gc_heap::hb_log_balance_activities()
+{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ char* log_buffer = hb_log_buffer;
+
+ size_t now = GetHighPrecisionTimeStamp ();
+ size_t time_since_last_gc_ms = now - last_gc_end_time_ms;
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP%Id - %Id = %Id", now, last_gc_end_time_ms, time_since_last_gc_ms));
+
+ // We want to get the min and the max timestamp for all procs because it helps with our post processing
+ // to know how big an array to allocate to display the history in between the GCs.
+ uint64_t min_timestamp = 0xffffffffffffffff;
+ uint64_t max_timestamp = 0;
+
+ for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+ {
+ heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+ for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+ {
+ heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+ int total_entries_on_proc = hb_info_proc->index;
+
+ if (total_entries_on_proc > 0)
+ {
+ min_timestamp = min (min_timestamp, hb_info_proc->hb_info[0].timestamp);
+ max_timestamp = max (max_timestamp, hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp);
+ }
+ }
+ }
+
+ dprintf (HEAP_BALANCE_LOG, ("[GCA#%Id %Id-%I64d-%I64d]",
+ settings.gc_index, time_since_last_gc_ms, (min_timestamp - start_raw_ts), (max_timestamp - start_raw_ts)));
+
+ if (last_hb_recorded_gc_index == (int)settings.gc_index)
+ {
+ GCToOSInterface::DebugBreak ();
+ }
+
+ last_hb_recorded_gc_index = (int)settings.gc_index;
+
+ // When we print out the proc index we need to convert it to the actual proc index (this is contiguous).
+ // It helps with post processing.
+ for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+ {
+ heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+ for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+ {
+ heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+ int total_entries_on_proc = hb_info_proc->index;
+ if (total_entries_on_proc > 0)
+ {
+ int total_exec_time_ms = (int)((hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp - hb_info_proc->hb_info[0].timestamp) / (qpf / 1000));
+ dprintf (HEAP_BALANCE_LOG, ("[p%d]-%d-%dms", (proc_index + numa_node_index * procs_per_numa_node), total_entries_on_proc, total_exec_time_ms));
+ }
+
+ for (int i = 0; i < hb_info_proc->index; i++)
+ {
+ heap_balance_info* hb_info = &hb_info_proc->hb_info[i];
+ bool multiple_procs_p = false;
+ bool alloc_count_p = true;
+ bool set_ideal_p = false;
+ int tid = hb_info->tid;
+ int alloc_heap = hb_info->alloc_heap;
+
+ if (tid & (1 << (sizeof (tid) * 8 - 1)))
+ {
+ multiple_procs_p = true;
+ tid &= ~(1 << (sizeof (tid) * 8 - 1));
+ }
+
+ if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 1)))
+ {
+ alloc_count_p = false;
+ alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 1));
+ }
+
+ if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 2)))
+ {
+ set_ideal_p = true;
+ alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 2));
+ }
+
+ // TODO - This assumes ideal proc is in the same cpu group which is not true
+ // when we don't have CPU groups.
+ int ideal_proc_no = hb_info->ideal_proc_no;
+ int ideal_node_no = -1;
+ ideal_proc_no = get_proc_index_numa (ideal_proc_no, &ideal_node_no);
+ ideal_proc_no = ideal_proc_no + ideal_node_no * procs_per_numa_node;
+
+ dprintf (HEAP_BALANCE_LOG, ("%I64d,%d,%d,%d%s%s%s",
+ (hb_info->timestamp - start_raw_ts),
+ tid,
+ ideal_proc_no,
+ (int)alloc_heap,
+ (multiple_procs_p ? "|m" : ""), (!alloc_count_p ? "|p" : ""), (set_ideal_p ? "|i" : "")));
+ }
+ }
+ }
+
+ for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+ {
+ heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+ for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+ {
+ heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+ hb_info_proc->index = 0;
+ }
+ }
+#endif //HEAP_BALANCE_INSTRUMENTATION
+}
+
+// The format for this is
+//
+// [GC_alloc_mb]
+// h0_new_alloc, h1_new_alloc, ...
+//
+void gc_heap::hb_log_new_allocation()
+{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ char* log_buffer = hb_log_buffer;
+
+ int desired_alloc_mb = (int)(dd_desired_allocation (g_heaps[0]->dynamic_data_of (0)) / 1024 / 1024);
+
+ int buffer_pos = sprintf_s (hb_log_buffer, hb_log_buffer_size, "[GC_alloc_mb]\n");
+ for (int numa_node_index = 0; numa_node_index < heap_select::total_numa_nodes; numa_node_index++)
+ {
+ int node_allocated_mb = 0;
+
+ // I'm printing out the budget here instead of the numa node index so we know how much
+ // of the budget we consumed.
+ buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "[N#%3d]",
+ //numa_node_index);
+ desired_alloc_mb);
+
+ int heaps_on_node = heap_select::heaps_on_node[numa_node_index].heap_count;
+
+ for (int heap_index = 0; heap_index < heaps_on_node; heap_index++)
+ {
+ int actual_heap_index = heap_index + numa_node_index * heaps_on_node;
+ gc_heap* hp = g_heaps[actual_heap_index];
+ dynamic_data* dd0 = hp->dynamic_data_of (0);
+ int allocated_mb = (int)((dd_desired_allocation (dd0) - dd_new_allocation (dd0)) / 1024 / 1024);
+ node_allocated_mb += allocated_mb;
+ buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "%d,",
+ allocated_mb);
+ }
+
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPN#%d a %dmb(%dmb)", numa_node_index, node_allocated_mb, desired_alloc_mb));
+
+ buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "\n");
+ }
+
+ dprintf (HEAP_BALANCE_LOG, ("%s", hb_log_buffer));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+}
BOOL gc_heap::create_thread_support (unsigned number_of_heaps)
{
@@ -5274,23 +5726,30 @@ void gc_heap::destroy_thread_support ()
}
}
-void set_thread_affinity_for_heap(int heap_number)
+bool get_proc_and_numa_for_heap (int heap_number)
{
uint16_t proc_no;
uint16_t node_no;
- if (GCToOSInterface::GetProcessorForHeap(heap_number, &proc_no, &node_no))
+ bool res = GCToOSInterface::GetProcessorForHeap (heap_number, &proc_no, &node_no);
+ if (res)
{
- heap_select::set_proc_no_for_heap(heap_number, proc_no);
+ heap_select::set_proc_no_for_heap (heap_number, proc_no);
if (node_no != NUMA_NODE_UNDEFINED)
{
- heap_select::set_numa_node_for_heap(heap_number, node_no);
- }
- if (!GCToOSInterface::SetThreadAffinity(proc_no))
- {
- dprintf(1, ("Failed to set thread affinity for server GC thread"));
+ heap_select::set_numa_node_for_heap_and_proc (heap_number, proc_no, node_no);
}
}
+
+ return res;
+}
+
+void set_thread_affinity_for_heap (int heap_number, uint16_t proc_no)
+{
+ if (!GCToOSInterface::SetThreadAffinity (proc_no))
+ {
+ dprintf (1, ("Failed to set thread affinity for GC thread %d on proc #%d", heap_number, proc_no));
+ }
}
bool gc_heap::create_gc_thread ()
@@ -5308,7 +5767,7 @@ void gc_heap::gc_thread_function ()
assert (gc_start_event.IsValid());
dprintf (3, ("gc thread started"));
- heap_select::init_cpu_mapping(this, heap_number);
+ heap_select::init_cpu_mapping(heap_number);
while (1)
{
@@ -5430,7 +5889,7 @@ bool gc_heap::virtual_alloc_commit_for_heap (void* addr, size_t size, int h_numb
if (GCToOSInterface::CanEnableGCNumaAware())
{
uint16_t numa_node = heap_select::find_numa_node_from_heap_no(h_number);
- if (GCToOSInterface::VirtualCommit(addr, size, numa_node))
+ if (GCToOSInterface::VirtualCommit (addr, size, numa_node))
return true;
}
}
@@ -9029,6 +9488,7 @@ retry:
inline size_t my_get_size (Object* ob)
{
MethodTable* mT = header(ob)->GetMethodTable();
+
return (mT->GetBaseSize() +
(mT->HasComponentSize() ?
((size_t)((CObjectHeader*)ob)->GetNumComponents() * mT->RawGetComponentSize()) : 0));
@@ -9317,8 +9777,8 @@ void gc_heap::decommit_heap_segment_pages (heap_segment* seg,
size -= max (extra_space, 32*OS_PAGE_SIZE);
virtual_decommit (page_start, size, heap_number);
- dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)",
- (size_t)page_start,
+ dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)",
+ (size_t)page_start,
(size_t)(page_start + size),
size));
heap_segment_committed (seg) = page_start;
@@ -10025,7 +10485,7 @@ HRESULT gc_heap::initialize_gc (size_t segment_size,
check_commit_cs.Initialize();
}
- if (!reserve_initial_memory(segment_size,heap_size,block_count,use_large_pages_p))
+ if (!reserve_initial_memory (segment_size,heap_size,block_count,use_large_pages_p))
return E_OUTOFMEMORY;
#ifdef CARD_BUNDLE
@@ -10182,7 +10642,6 @@ gc_heap::init_semi_shared()
goto cleanup;
}
- fgn_maxgen_percent = 0;
fgn_loh_percent = 0;
full_gc_approach_event_set = false;
@@ -10315,7 +10774,7 @@ gc_heap::wait_for_gc_done(int32_t timeOut)
while (gc_heap::gc_started)
{
#ifdef MULTIPLE_HEAPS
- wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL, 0))->pGenGCHeap;
+ wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL))->pGenGCHeap;
dprintf(2, ("waiting for the gc_done_event on heap %d", wait_heap->heap_number));
#endif // MULTIPLE_HEAPS
@@ -10680,6 +11139,7 @@ gc_heap::init_gc_heap (int h_number)
allocation_running_amount = dd_min_size (dynamic_data_of (0));
#endif //!MULTIPLE_HEAPS
+ fgn_maxgen_percent = 0;
fgn_last_alloc = dd_min_size (dynamic_data_of (0));
mark* arr = new (nothrow) (mark [MARK_STACK_INITIAL_LENGTH]);
@@ -10721,8 +11181,7 @@ gc_heap::init_gc_heap (int h_number)
#endif //MARK_ARRAY
#ifdef MULTIPLE_HEAPS
- //register the heap in the heaps array
-
+ get_proc_and_numa_for_heap (heap_number);
if (!create_gc_thread ())
return 0;
@@ -11654,12 +12113,13 @@ size_t gc_heap::new_allocation_limit (size_t size, size_t physical_limit, int ge
dynamic_data* dd = dynamic_data_of (gen_number);
ptrdiff_t new_alloc = dd_new_allocation (dd);
assert (new_alloc == (ptrdiff_t)Align (new_alloc,
- get_alignment_constant (!(gen_number == (max_generation+1)))));
+ get_alignment_constant (!(gen_number == (max_generation + 1)))));
ptrdiff_t logical_limit = max (new_alloc, (ptrdiff_t)size);
size_t limit = min (logical_limit, (ptrdiff_t)physical_limit);
assert (limit == Align (limit, get_alignment_constant (!(gen_number == (max_generation+1)))));
dd_new_allocation (dd) = (new_alloc - limit);
+
return limit;
}
@@ -11917,7 +12377,13 @@ void gc_heap::send_full_gc_notification (int gen_num, BOOL due_to_alloc_p)
wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms)
{
- if (fgn_maxgen_percent == 0)
+ uint32_t maxgen_percent = 0;
+#ifdef MULTIPLE_HEAPS
+ maxgen_percent = g_heaps[0]->fgn_maxgen_percent;
+#else
+ maxgen_percent = fgn_maxgen_percent;
+#endif //MULTIPLE_HEAPS
+ if (maxgen_percent == 0)
{
return wait_full_gc_na;
}
@@ -11926,7 +12392,7 @@ wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms)
if ((wait_result == WAIT_OBJECT_0) || (wait_result == WAIT_TIMEOUT))
{
- if (fgn_maxgen_percent == 0)
+ if (maxgen_percent == 0)
{
return wait_full_gc_cancelled;
}
@@ -12351,10 +12817,9 @@ found_fit:
VolatileStore(((void**)allocated - 1), (void*)0); //clear the sync block
#endif //VERIFY_HEAP && _DEBUG
- dprintf (3, ("found fit at end of seg: %Ix", old_alloc));
-
uint8_t* old_alloc;
old_alloc = allocated;
+ dprintf (3, ("found fit at end of seg: %Ix", old_alloc));
#ifdef BACKGROUND_GC
if (cookie != -1)
@@ -12471,7 +12936,7 @@ BOOL gc_heap::trigger_ephemeral_gc (gc_reason gr)
BOOL did_full_compact_gc = FALSE;
- dprintf (2, ("triggering a gen1 GC"));
+ dprintf (1, ("h%d triggering a gen1 GC", heap_number));
size_t last_full_compact_gc_count = get_full_compact_gc_count();
vm_heap->GarbageCollectGeneration(max_generation - 1, gr);
@@ -13443,7 +13908,7 @@ allocation_state gc_heap::try_allocate_more_space (alloc_context* acontext, size
#ifdef SYNCHRONIZATION_STATS
bad_suspension++;
#endif //SYNCHRONIZATION_STATS
- dprintf (/*100*/ 2, ("running out of budget on gen%d, gc", gen_number));
+ dprintf (2, ("h%d running out of budget on gen%d, gc", heap_number, gen_number));
if (!settings.concurrent || (gen_number == 0))
{
@@ -13497,70 +13962,138 @@ void gc_heap::balance_heaps (alloc_context* acontext)
{
if (acontext->alloc_count == 0)
{
- acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, 0) ));
- gc_heap* hp = acontext->get_home_heap()->pGenGCHeap;
- dprintf (3, ("First allocation for context %Ix on heap %d\n", (size_t)acontext, (size_t)hp->heap_number));
- acontext->set_alloc_heap(acontext->get_home_heap());
+ int home_hp_num = heap_select::select_heap (acontext);
+ acontext->set_home_heap (GCHeap::GetHeap (home_hp_num));
+ gc_heap* hp = acontext->get_home_heap ()->pGenGCHeap;
+ acontext->set_alloc_heap (acontext->get_home_heap ());
hp->alloc_context_count++;
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ uint16_t ideal_proc_no = 0;
+ GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no);
+
+ uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+
+ add_to_hb_numa (proc_no, ideal_proc_no,
+ home_hp_num, false, true, false);
+
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPafter GC: 1st alloc on p%3d, h%d, ip: %d",
+ proc_no, home_hp_num, ideal_proc_no));
+#endif //HEAP_BALANCE_INSTRUMENTATION
}
}
else
{
BOOL set_home_heap = FALSE;
- int hint = 0;
+ gc_heap* home_hp = NULL;
+ int proc_hp_num = 0;
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ bool alloc_count_p = true;
+ bool multiple_procs_p = false;
+ bool set_ideal_p = false;
+ uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+ uint32_t last_proc_no = proc_no;
+#endif //HEAP_BALANCE_INSTRUMENTATION
- if (heap_select::can_find_heap_fast())
+ if (heap_select::can_find_heap_fast ())
{
- if (acontext->get_home_heap() != NULL)
- hint = acontext->get_home_heap()->pGenGCHeap->heap_number;
- if (acontext->get_home_heap() != GCHeap::GetHeap(hint = heap_select::select_heap(acontext, hint)) || ((acontext->alloc_count & 15) == 0))
+ assert (acontext->get_home_heap () != NULL);
+ home_hp = acontext->get_home_heap ()->pGenGCHeap;
+ proc_hp_num = heap_select::select_heap (acontext);
+
+ if (acontext->get_home_heap () != GCHeap::GetHeap (proc_hp_num))
{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ alloc_count_p = false;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+ set_home_heap = TRUE;
+ }
+ else if ((acontext->alloc_count & 15) == 0)
set_home_heap = TRUE;
+
+ if (set_home_heap)
+ {
}
}
else
{
- // can't use gdt
if ((acontext->alloc_count & 3) == 0)
set_home_heap = TRUE;
}
if (set_home_heap)
{
-/*
- // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this.
- if (n_heaps > MAX_SUPPORTED_CPUS)
- {
- // on machines with many processors cache affinity is really king, so don't even try
- // to balance on these.
- acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext, hint) );
- acontext->alloc_heap = acontext->home_heap;
- }
- else
-*/
+ /*
+ // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this.
+ if (n_heaps > MAX_SUPPORTED_CPUS)
+ {
+ // on machines with many processors cache affinity is really king, so don't even try
+ // to balance on these.
+ acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext));
+ acontext->alloc_heap = acontext->home_heap;
+ }
+ else
+ */
{
- gc_heap* org_hp = acontext->get_alloc_heap()->pGenGCHeap;
+ gc_heap* org_hp = acontext->get_alloc_heap ()->pGenGCHeap;
+ int org_hp_num = org_hp->heap_number;
+ int final_alloc_hp_num = org_hp_num;
dynamic_data* dd = org_hp->dynamic_data_of (0);
ptrdiff_t org_size = dd_new_allocation (dd);
+ ptrdiff_t total_size = (ptrdiff_t)dd_desired_allocation (dd);
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d] ph h%3d, hh: %3d, ah: %3d (%dmb-%dmb), ac: %5d(%s)",
+ proc_no, proc_hp_num, home_hp->heap_number,
+ org_hp_num, (total_size / 1024 / 1024), (org_size / 1024 / 1024),
+ acontext->alloc_count,
+ ((proc_hp_num == home_hp->heap_number) ? "AC" : "H")));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
int org_alloc_context_count;
int max_alloc_context_count;
gc_heap* max_hp;
+ int max_hp_num = 0;
ptrdiff_t max_size;
- size_t delta = dd_min_size (dd)/4;
+ size_t local_delta = max (((size_t)org_size >> 6), min_gen0_balance_delta);
+ size_t delta = local_delta;
+
+ if (((size_t)org_size + 2 * delta) >= (size_t)total_size)
+ {
+ acontext->alloc_count++;
+ return;
+ }
int start, end, finish;
- heap_select::get_heap_range_for_heap(org_hp->heap_number, &start, &end);
+ heap_select::get_heap_range_for_heap (org_hp->heap_number, &start, &end);
finish = start + n_heaps;
try_again:
+ gc_heap* new_home_hp = 0;
+
do
{
max_hp = org_hp;
+ max_hp_num = org_hp_num;
max_size = org_size + delta;
- acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, hint) ));
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+ if (proc_no != last_proc_no)
+ {
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSP: %d->%d", last_proc_no, proc_no));
+ multiple_procs_p = true;
+ last_proc_no = proc_no;
+ }
- if (org_hp == acontext->get_home_heap()->pGenGCHeap)
+ int current_hp_num = heap_select::proc_no_to_heap_no[proc_no];
+ acontext->set_home_heap (GCHeap::GetHeap (current_hp_num));
+#else
+ acontext->set_home_heap (GCHeap::GetHeap (heap_select::select_heap (acontext)));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+ new_home_hp = acontext->get_home_heap ()->pGenGCHeap;
+ if (org_hp == new_home_hp)
max_size = max_size + delta;
org_alloc_context_count = org_hp->alloc_context_count;
@@ -13568,62 +14101,98 @@ try_again:
if (max_alloc_context_count > 1)
max_size /= max_alloc_context_count;
+ int actual_start = start;
+ int actual_end = (end - 1);
+
for (int i = start; i < end; i++)
{
- gc_heap* hp = GCHeap::GetHeap(i%n_heaps)->pGenGCHeap;
+ gc_heap* hp = GCHeap::GetHeap (i % n_heaps)->pGenGCHeap;
dd = hp->dynamic_data_of (0);
ptrdiff_t size = dd_new_allocation (dd);
- if (hp == acontext->get_home_heap()->pGenGCHeap)
+
+ if (hp == new_home_hp)
+ {
size = size + delta;
+ }
int hp_alloc_context_count = hp->alloc_context_count;
+
if (hp_alloc_context_count > 0)
+ {
size /= (hp_alloc_context_count + 1);
+ }
if (size > max_size)
{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPorg h%d(%dmb), m h%d(%dmb)",
+ org_hp_num, (max_size / 1024 / 1024),
+ hp->heap_number, (size / 1024 / 1024)));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
max_hp = hp;
max_size = size;
+ max_hp_num = max_hp->heap_number;
max_alloc_context_count = hp_alloc_context_count;
}
}
- }
+ }
while (org_alloc_context_count != org_hp->alloc_context_count ||
- max_alloc_context_count != max_hp->alloc_context_count);
+ max_alloc_context_count != max_hp->alloc_context_count);
if ((max_hp == org_hp) && (end < finish))
- {
- start = end; end = finish;
- delta = dd_min_size(dd)/2; // Make it twice as hard to balance to remote nodes on NUMA.
+ {
+ start = end; end = finish;
+ delta = local_delta * 2; // Make it twice as hard to balance to remote nodes on NUMA.
goto try_again;
}
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ uint16_t ideal_proc_no_before_set_ideal = 0;
+ GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no_before_set_ideal);
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
if (max_hp != org_hp)
{
+ final_alloc_hp_num = max_hp->heap_number;
+
org_hp->alloc_context_count--;
max_hp->alloc_context_count++;
- acontext->set_alloc_heap(GCHeap::GetHeap(max_hp->heap_number));
+
+ acontext->set_alloc_heap (GCHeap::GetHeap (final_alloc_hp_num));
if (!gc_thread_no_affinitize_p)
{
- uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no(org_hp->heap_number);
- uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no(max_hp->heap_number);
+ uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no (org_hp->heap_number);
+ uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no (max_hp->heap_number);
+
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSW! h%d(p%d)->h%d(p%d)",
+ org_hp_num, src_proc_no, final_alloc_hp_num, dst_proc_no));
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ int current_proc_no_before_set_ideal = GCToOSInterface::GetCurrentProcessorNumber ();
+ if (current_proc_no_before_set_ideal != last_proc_no)
+ {
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSPa: %d->%d", last_proc_no, current_proc_no_before_set_ideal));
+ multiple_procs_p = true;
+ }
+#endif //HEAP_BALANCE_INSTRUMENTATION
- if (!GCToOSInterface::SetCurrentThreadIdealAffinity(src_proc_no, dst_proc_no))
+ if (!GCToOSInterface::SetCurrentThreadIdealAffinity (src_proc_no, dst_proc_no))
+ {
+ dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPFailed to set the ideal processor for heap %d %d->%d",
+ org_hp->heap_number, (int)src_proc_no, (int)dst_proc_no));
+ }
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ else
{
- dprintf (3, ("Failed to set the ideal processor for heap %d.",
- org_hp->heap_number));
+ set_ideal_p = true;
}
+#endif //HEAP_BALANCE_INSTRUMENTATION
}
- dprintf (3, ("Switching context %p (home heap %d) ",
- acontext,
- acontext->get_home_heap()->pGenGCHeap->heap_number));
- dprintf (3, (" from heap %d (%Id free bytes, %d contexts) ",
- org_hp->heap_number,
- org_size,
- org_alloc_context_count));
- dprintf (3, (" to heap %d (%Id free bytes, %d contexts)\n",
- max_hp->heap_number,
- dd_new_allocation(max_hp->dynamic_data_of(0)),
- max_alloc_context_count));
}
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ add_to_hb_numa (proc_no, ideal_proc_no_before_set_ideal,
+ final_alloc_hp_num, multiple_procs_p, alloc_count_p, set_ideal_p);
+#endif //HEAP_BALANCE_INSTRUMENTATION
}
}
}
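The new threshold makes the cost of switching heaps proportional to the remaining gen0 budget instead of a fixed fraction of the minimum gen0 size, and skips balancing entirely while the budget is still nearly untouched. A worked example with made-up numbers:

    // desired gen0 budget total_size = 64MB, remaining budget org_size = 63MB,
    // min_gen0_balance_delta = 0.5MB (1/8 of a hypothetical 4MB gen0 min size):
    //   local_delta = max (63MB >> 6, 0.5MB) ~= 0.98MB
    //   63MB + 2 * 0.98MB >= 64MB, so we return early - the thread has barely
    //   consumed any budget and moving it would only cost locality.
    // Later, with org_size = 40MB remaining:
    //   local_delta ~= 0.63MB and 40MB + 1.25MB < 64MB, so we do scan for a better
    //   heap; a candidate must beat org_size by at least local_delta, and by twice
    //   that on the second (remote-node) pass.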
@@ -13650,8 +14219,8 @@ ptrdiff_t gc_heap::get_balance_heaps_loh_effective_budget ()
gc_heap* gc_heap::balance_heaps_loh (alloc_context* acontext, size_t alloc_size)
{
- const int home_hp_num = heap_select::select_heap(acontext, 0);
- dprintf (3, ("[h%d] LA: %Id", home_heap, alloc_size));
+ const int home_hp_num = heap_select::select_heap(acontext);
+ dprintf (3, ("[h%d] LA: %Id", home_hp_num, alloc_size));
gc_heap* home_hp = GCHeap::GetHeap(home_hp_num)->pGenGCHeap;
dynamic_data* dd = home_hp->dynamic_data_of (max_generation + 1);
const ptrdiff_t home_hp_size = home_hp->get_balance_heaps_loh_effective_budget ();
@@ -13705,7 +14274,7 @@ try_again:
gc_heap* gc_heap::balance_heaps_loh_hard_limit_retry (alloc_context* acontext, size_t alloc_size)
{
assert (heap_hard_limit);
- const int home_heap = heap_select::select_heap(acontext, 0);
+ const int home_heap = heap_select::select_heap(acontext);
dprintf (3, ("[h%d] balance_heaps_loh_hard_limit_retry alloc_size: %d", home_heap, alloc_size));
int start, end;
heap_select::get_heap_range_for_heap (home_heap, &start, &end);
@@ -15056,13 +15625,13 @@ int gc_heap::generation_to_condemn (int n_initial,
generation_free_obj_space (large_object_generation);
//save new_allocation
- for (i = 0; i <= max_generation+1; i++)
+ for (i = 0; i <= max_generation + 1; i++)
{
dynamic_data* dd = dynamic_data_of (i);
- dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)",
- heap_number, i,
- dd_new_allocation (dd),
- dd_desired_allocation (dd)));
+ dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)",
+ heap_number, i,
+ dd_new_allocation (dd),
+ dd_desired_allocation (dd)));
dd_gc_new_allocation (dd) = dd_new_allocation (dd);
}
@@ -15788,6 +16357,14 @@ void gc_heap::gc1()
dynamic_data* dd = dynamic_data_of (n);
dd_gc_elapsed_time (dd) = end_gc_time - dd_time_clock (dd);
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ if (heap_number == 0)
+ {
+ last_gc_end_time_ms = end_gc_time;
+ dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-BGC]", settings.gc_index, dd_gc_elapsed_time (dd)));
+ }
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
free_list_info (max_generation, "after computing new dynamic data");
gc_history_per_heap* current_gc_data_per_heap = get_gc_data_per_heap();
@@ -15829,10 +16406,16 @@ void gc_heap::gc1()
if (heap_number == 0)
{
- dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms",
+ size_t gc_elapsed_time = dd_gc_elapsed_time (dynamic_data_of (0));
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ last_gc_end_time_ms = end_gc_time;
+ dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-%Id]", settings.gc_index, gc_elapsed_time, dd_time_clock (dynamic_data_of (0))));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
+ dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms",
dd_collection_count (dynamic_data_of (0)),
settings.condemned_generation,
- dd_gc_elapsed_time (dynamic_data_of (0))));
+ gc_elapsed_time));
}
for (int gen_number = 0; gen_number <= (max_generation + 1); gen_number++)
@@ -16095,12 +16678,9 @@ void gc_heap::gc1()
#if 1 //subsumed by the linear allocation model
// to avoid spikes in mem usage due to short terms fluctuations in survivorship,
// apply some smoothing.
- static size_t smoothed_desired_per_heap = 0;
size_t smoothing = 3; // exponential smoothing factor
- if (smoothing > VolatileLoad(&settings.gc_index))
- smoothing = VolatileLoad(&settings.gc_index);
smoothed_desired_per_heap = desired_per_heap / smoothing + ((smoothed_desired_per_heap / smoothing) * (smoothing-1));
- dprintf (1, ("sn = %Id n = %Id", smoothed_desired_per_heap, desired_per_heap));
+ dprintf (HEAP_BALANCE_LOG, ("TEMPsn = %Id n = %Id", smoothed_desired_per_heap, desired_per_heap));
desired_per_heap = Align(smoothed_desired_per_heap, get_alignment_constant (true));
#endif //0
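smoothed_desired_per_heap is now a file-scope static seeded with the gen0 min size in init_dynamic_data (further down in this diff), which is why the old clamp against settings.gc_index could go away. The update is an exponential moving average with factor 3; a standalone sketch of how it converges, with made-up sizes:

    #include <cstddef>
    #include <cstdio>

    int main ()
    {
        const size_t smoothing = 3;             // same factor as gc1()
        size_t smoothed = 50 * 1024 * 1024;     // seeded, per init_dynamic_data
        size_t desired  = 200 * 1024 * 1024;    // hypothetical new desired_per_heap

        for (int i = 0; i < 6; i++)
        {
            smoothed = desired / smoothing + (smoothed / smoothing) * (smoothing - 1);
            printf ("GC %d: smoothed = %zu MB\n", i, smoothed / 1024 / 1024);
        }
        // Prints roughly 100, 133, 155, 170, 180, 186 MB: each GC only moves the
        // budget about a third of the way toward the new desired value, which damps
        // short-term survivorship spikes.
        return 0;
    }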
@@ -17162,6 +17742,11 @@ void gc_heap::garbage_collect (int n)
settings.gc_index = (uint32_t)dd_collection_count (dynamic_data_of (0)) + 1;
+#ifdef MULTIPLE_HEAPS
+ hb_log_balance_activities();
+ hb_log_new_allocation();
+#endif //MULTIPLE_HEAPS
+
// Call the EE for start of GC work
// just one thread for MP GC
GCToEEInterface::GcStartWork (settings.condemned_generation,
@@ -25445,7 +26030,7 @@ void gc_heap::gc_thread_stub (void* arg)
// We are about to set affinity for GC threads. It is a good place to set up NUMA and
// CPU groups because the process mask, processor number, and group number are all
// readily available.
- set_thread_affinity_for_heap(heap->heap_number);
+ set_thread_affinity_for_heap (heap->heap_number, heap_select::find_proc_no_from_heap_no (heap->heap_number));
}
// server GC threads run at a higher priority than normal.
@@ -27492,9 +28077,6 @@ void gc_heap::bgc_thread_function()
dprintf (3, ("no concurrent GC needed, exiting"));
break;
}
-#ifdef TRACE_GC
- //trace_gc = TRUE;
-#endif //TRACE_GC
recursive_gc_sync::begin_background();
dprintf (2, ("beginning of bgc: gen2 FL: %d, FO: %d, frag: %d",
generation_free_list_space (generation_of (max_generation)),
@@ -27505,10 +28087,6 @@ void gc_heap::bgc_thread_function()
current_bgc_state = bgc_not_in_process;
-#ifdef TRACE_GC
- //trace_gc = FALSE;
-#endif //TRACE_GC
-
enable_preemptive ();
#ifdef MULTIPLE_HEAPS
bgc_t_join.join(this, gc_join_done);
@@ -30060,7 +30638,6 @@ void gc_heap::init_static_data()
}
gen0_max_size = Align (gen0_max_size);
-
gen0_min_size = min (gen0_min_size, gen0_max_size);
// TODO: gen0_max_size has a 200mb cap; gen1_max_size should also have a cap.
@@ -30086,12 +30663,23 @@ void gc_heap::init_static_data()
bool gc_heap::init_dynamic_data()
{
- qpf = GCToOSInterface::QueryPerformanceFrequency();
-
- uint32_t now = (uint32_t)GetHighPrecisionTimeStamp();
+ uint64_t now_raw_ts = RawGetHighPrecisionTimeStamp ();
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ start_raw_ts = now_raw_ts;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+ uint32_t now = (uint32_t)(now_raw_ts / (qpf / 1000));
set_static_data();
+ if (heap_number == 0)
+ {
+ smoothed_desired_per_heap = dynamic_data_of (0)->min_size;
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ last_gc_end_time_ms = now;
+ dprintf (HEAP_BALANCE_LOG, ("qpf=%I64d, start: %I64d(%d)", qpf, start_raw_ts, now));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+ }
+
for (int i = 0; i <= max_generation+1; i++)
{
dynamic_data* dd = dynamic_data_of (i);
@@ -34129,6 +34717,8 @@ HRESULT GCHeap::Initialize()
{
HRESULT hr = S_OK;
+ qpf = GCToOSInterface::QueryPerformanceFrequency();
+
g_gc_pFreeObjectMethodTable = GCToEEInterface::GetFreeObjectMethodTable();
g_num_processors = GCToOSInterface::GetTotalProcessorCount();
assert(g_num_processors != 0);
@@ -34191,18 +34781,23 @@ HRESULT GCHeap::Initialize()
return CLR_E_GC_BAD_AFFINITY_CONFIG;
}
+ if ((cpu_index_ranges_holder.Get() != nullptr) || (config_affinity_mask != 0))
+ {
+ affinity_config_specified_p = true;
+ }
+
nhp_from_config = static_cast<uint32_t>(GCConfig::GetHeapCount());
- uint32_t nhp_from_process = GCToOSInterface::GetCurrentProcessCpuCount();
+ g_num_active_processors = GCToOSInterface::GetCurrentProcessCpuCount();
if (nhp_from_config)
{
// Even when the user specifies a heap count, it should not be more
// than the number of procs this process can use.
- nhp_from_config = min (nhp_from_config, nhp_from_process);
+ nhp_from_config = min (nhp_from_config, g_num_active_processors);
}
- nhp = ((nhp_from_config == 0) ? nhp_from_process : nhp_from_config);
+ nhp = ((nhp_from_config == 0) ? g_num_active_processors : nhp_from_config);
nhp = min (nhp, MAX_SUPPORTED_CPUS);
#ifndef FEATURE_REDHAWK
@@ -34350,8 +34945,62 @@ HRESULT GCHeap::Initialize()
return hr;
}
}
- // initialize numa node to heap map
- heap_select::init_numa_node_to_heap_map(nhp);
+
+ heap_select::init_numa_node_to_heap_map (nhp);
+
+ // If we have more active processors than heaps we still want to initialize some of the
+ // mapping for the rest of the active processors because user threads can still run on
+ // them which means it's important to know their numa nodes and map them to a reasonable
+ // heap, i.e., we wouldn't want to have all such procs go to heap 0.
+ if (g_num_active_processors > nhp)
+ heap_select::distribute_other_procs();
+
+ gc_heap* hp = gc_heap::g_heaps[0];
+
+ dynamic_data* gen0_dd = hp->dynamic_data_of (0);
+ gc_heap::min_gen0_balance_delta = (dd_min_size (gen0_dd) >> 3);
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ cpu_group_enabled_p = GCToOSInterface::CanEnableGCCPUGroups();
+
+ if (!GCToOSInterface::GetNumaInfo (&total_numa_nodes_on_machine, &procs_per_numa_node))
+ {
+ total_numa_nodes_on_machine = 1;
+
+ // Note that if we are in cpu groups we need to take the way proc index is calculated
+ // into consideration. It would mean we have more than 64 procs on one numa node -
+ // this is mostly for testing (if we want to simulate no numa on a numa system).
+ // see vm\gcenv.os.cpp GroupProcNo implementation.
+ if (GCToOSInterface::GetCPUGroupInfo (&total_cpu_groups_on_machine, &procs_per_cpu_group))
+ procs_per_numa_node = procs_per_cpu_group + ((total_cpu_groups_on_machine - 1) << 6);
+ else
+ procs_per_numa_node = g_num_processors;
+ }
+ hb_info_numa_nodes = new (nothrow) heap_balance_info_numa[total_numa_nodes_on_machine];
+ dprintf (HEAP_BALANCE_LOG, ("total: %d, numa: %d", g_num_processors, total_numa_nodes_on_machine));
+
+ int hb_info_size_per_proc = sizeof (heap_balance_info_proc);
+
+ for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+ {
+ int hb_info_size_per_node = hb_info_size_per_proc * procs_per_numa_node;
+ uint8_t* numa_mem = (uint8_t*)GCToOSInterface::VirtualReserve (hb_info_size_per_node, 0, 0, numa_node_index);
+ if (!numa_mem)
+ return E_FAIL;
+ if (!GCToOSInterface::VirtualCommit (numa_mem, hb_info_size_per_node, numa_node_index))
+ return E_FAIL;
+
+ heap_balance_info_proc* hb_info_procs = (heap_balance_info_proc*)numa_mem;
+ hb_info_numa_nodes[numa_node_index].hb_info_procs = hb_info_procs;
+
+ for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+ {
+ heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+ hb_info_proc->count = default_max_hb_heap_balance_info;
+ hb_info_proc->index = 0;
+ }
+ }
+#endif //HEAP_BALANCE_INSTRUMENTATION
#else
hr = Init (0);
#endif //MULTIPLE_HEAPS
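For the instrumentation block above: when CPU groups are enabled, procs_per_numa_node is padded to procs_per_cpu_group + (total_cpu_groups_on_machine - 1) * 64, so a group-encoded proc index always lands inside one node's array. A rough, self-contained sketch of that per-node layout follows; the *_sketch types and plain new[] are stand-ins for the real heap_balance_info structures and the node-bound VirtualReserve/VirtualCommit pair:

    struct heap_balance_info_proc_sketch
    {
        int count;   // capacity of this proc's sample buffer
        int index;   // next slot to write
    };

    struct heap_balance_info_numa_sketch
    {
        heap_balance_info_proc_sketch* hb_info_procs;
    };

    heap_balance_info_numa_sketch* init_hb_info_sketch(int total_numa_nodes,
                                                       int procs_per_numa_node,
                                                       int default_capacity)
    {
        heap_balance_info_numa_sketch* nodes = new heap_balance_info_numa_sketch[total_numa_nodes];
        for (int n = 0; n < total_numa_nodes; n++)
        {
            // One contiguous block per node; the real code reserves and commits this block
            // on that NUMA node so the samples stay local to the procs they describe.
            nodes[n].hb_info_procs = new heap_balance_info_proc_sketch[procs_per_numa_node];
            for (int p = 0; p < procs_per_numa_node; p++)
            {
                nodes[n].hb_info_procs[p].count = default_capacity;
                nodes[n].hb_info_procs[p].index = 0;
            }
        }
        return nodes;
    }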
@@ -36106,13 +36755,14 @@ size_t GCHeap::ApproxTotalBytesInUse(BOOL small_heap_only)
void GCHeap::AssignHeap (alloc_context* acontext)
{
// Assign heap based on processor
- acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext, 0)));
+ acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext)));
acontext->set_home_heap(acontext->get_alloc_heap());
}
+
GCHeap* GCHeap::GetHeap (int n)
{
assert (n < gc_heap::n_heaps);
- return gc_heap::g_heaps [n]->vm_heap;
+ return gc_heap::g_heaps[n]->vm_heap;
}
#endif //MULTIPLE_HEAPS
@@ -36244,6 +36894,7 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage,
{
gc_heap* hp = gc_heap::g_heaps [hn];
hp->fgn_last_alloc = dd_new_allocation (hp->dynamic_data_of (0));
+ hp->fgn_maxgen_percent = gen2Percentage;
}
#else //MULTIPLE_HEAPS
pGenGCHeap->fgn_last_alloc = dd_new_allocation (pGenGCHeap->dynamic_data_of (0));
@@ -36253,7 +36904,6 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage,
pGenGCHeap->full_gc_end_event.Reset();
pGenGCHeap->full_gc_approach_event_set = false;
- pGenGCHeap->fgn_maxgen_percent = gen2Percentage;
pGenGCHeap->fgn_loh_percent = lohPercentage;
return TRUE;
diff --git a/src/gc/gcpriv.h b/src/gc/gcpriv.h
index 7c8286babf..c194c547b6 100644
--- a/src/gc/gcpriv.h
+++ b/src/gc/gcpriv.h
@@ -40,6 +40,13 @@ inline void FATAL_GC_ERROR()
GCToEEInterface::HandleFatalError((unsigned int)COR_E_EXECUTIONENGINE);
}
+#ifdef MULTIPLE_HEAPS
+// This turns on instrumentation that collects info for heap balancing.
+// Define it and make sure that *only* HEAP_BALANCE_LOG/HEAP_BALANCE_TEMP_LOG
+// level logging is enabled.
+//#define HEAP_BALANCE_INSTRUMENTATION
+#endif //MULTIPLE_HEAPS
+
#ifdef _MSC_VER
#pragma inline_depth(20)
#endif
@@ -255,6 +262,15 @@ const int policy_expand = 2;
#define SPINLOCK_LOG (DT_LOG_0 + 5)
#define SNOOP_LOG (DT_LOG_0 + 6)
+// NOTE! This is for HEAP_BALANCE_INSTRUMENTATION
+// This particular one is special and needs to be well formatted because we
+// do post processing on it with tools\GCLogParser. If you need to add some
+// detail to help with investigation that's not processed by the tool,
+// prefix it with TEMP so that line will be written to the result as is.
+// I have some already logged with HEAP_BALANCE_TEMP_LOG.
+#define HEAP_BALANCE_LOG (DT_LOG_0 + 7)
+#define HEAP_BALANCE_TEMP_LOG (DT_LOG_0 + 8)
+
#ifndef DACCESS_COMPILE
#ifdef SIMPLE_DPRINTF
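If HEAP_BALANCE_INSTRUMENTATION is defined, the commented-out HEAP_BALANCE_LOG variant of dprintf in the next hunk is the one to enable. A sketch that also admits HEAP_BALANCE_TEMP_LOG (an assumption based on the instruction above to enable only these two levels) would be:

    // Sketch only: route just the heap-balance levels to GCLog so the output stays
    // parseable by tools\GCLogParser.
    #define dprintf(l,x) {if ((l == HEAP_BALANCE_LOG) || (l == HEAP_BALANCE_TEMP_LOG)) {GCLog x;}}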
@@ -267,6 +283,7 @@ void GCLog (const char *fmt, ... );
//#define dprintf(l,x) {if (trace_gc && ((l <= 2) || (l == BGC_LOG) || (l==GTC_LOG))) {GCLog x;}}
//#define dprintf(l,x) {if ((l == 1) || (l == 2222)) {GCLog x;}}
#define dprintf(l,x) {if ((l <= 1) || (l == GTC_LOG)) {GCLog x;}}
+//#define dprintf(l,x) {if (l == HEAP_BALANCE_LOG) {GCLog x;}}
//#define dprintf(l,x) {if ((l==GTC_LOG) || (l <= 1)) {GCLog x;}}
//#define dprintf(l,x) {if (trace_gc && ((l <= print_level) || (l==GTC_LOG))) {GCLog x;}}
//#define dprintf(l,x) {if (l==GTC_LOG) {printf ("\n");printf x ; fflush(stdout);}}
@@ -1222,6 +1239,12 @@ public:
uint32_t flags);
#ifdef MULTIPLE_HEAPS
+ PER_HEAP_ISOLATED
+ void hb_log_new_allocation();
+
+ PER_HEAP_ISOLATED
+ void hb_log_balance_activities();
+
static
void balance_heaps (alloc_context* acontext);
PER_HEAP
@@ -2965,7 +2988,7 @@ public:
GCEvent full_gc_end_event;
// Full GC Notification percentages.
- PER_HEAP_ISOLATED
+ PER_HEAP
uint32_t fgn_maxgen_percent;
PER_HEAP_ISOLATED
@@ -3026,6 +3049,9 @@ public:
PER_HEAP
heap_segment* new_heap_segment;
+ PER_HEAP_ISOLATED
+ size_t min_gen0_balance_delta;
+
#define alloc_quantum_balance_units (16)
PER_HEAP_ISOLATED
@@ -3091,7 +3117,7 @@ public:
PER_HEAP_ISOLATED
uint64_t total_physical_mem;
-
+
PER_HEAP_ISOLATED
uint64_t entry_available_physical_mem;
@@ -3175,6 +3201,11 @@ public:
PER_HEAP_ISOLATED
size_t last_gc_index;
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+ PER_HEAP_ISOLATED
+ size_t last_gc_end_time_ms;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
#ifdef SEG_MAPPING_TABLE
PER_HEAP_ISOLATED
size_t min_segment_size;
diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp
index 3d7442ada9..4a48a4a5a9 100644
--- a/src/gc/unix/gcenv.unix.cpp
+++ b/src/gc/unix/gcenv.unix.cpp
@@ -527,9 +527,10 @@ static void* VirtualReserveInner(size_t size, size_t alignment, uint32_t flags,
// size - size of the virtual memory range
// alignment - requested memory alignment, 0 means no specific alignment requested
// flags - flags to control special settings like write watching
+// node - the NUMA node to reserve memory on
// Return:
// Starting virtual address of the reserved range
-void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
+void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node)
{
return VirtualReserveInner(size, alignment, flags);
}
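Note that the Unix implementation above accepts the new node parameter but still ignores it (VirtualReserveInner takes no node). Purely for illustration, and not what this patch does: a NUMA-preferred reservation on Linux could bind the range after mmap, assuming libnuma's <numaif.h> is available:

    #include <sys/mman.h>
    #include <numaif.h>
    #include <cstdint>
    #include <cstddef>

    // Illustrative sketch only; the patch's VirtualReserve does not do this.
    void* reserve_on_node_sketch(size_t size, uint16_t node)
    {
        void* mem = mmap(nullptr, size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (mem == MAP_FAILED)
            return nullptr;

        unsigned long nodemask = 1ul << node;   // assumes node < 64
        // MPOL_PREFERRED: prefer the given node, but allow fallback under memory pressure.
        if (mbind(mem, size, MPOL_PREFERRED, &nodemask, sizeof(nodemask) * 8, 0) != 0)
        {
            munmap(mem, size);
            return nullptr;
        }
        return mem;
    }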
@@ -910,6 +911,11 @@ bool GCToOSInterface::CanEnableGCNumaAware()
return g_numaAvailable;
}
+bool GCToOSInterface::CanEnableGCCPUGroups()
+{
+ return false;
+}
+
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp
index e5c4d7a8db..e8f0035505 100644
--- a/src/gc/windows/gcenv.windows.cpp
+++ b/src/gc/windows/gcenv.windows.cpp
@@ -37,6 +37,7 @@ typedef BOOL (WINAPI *PQUERY_INFORMATION_JOB_OBJECT)(HANDLE jobHandle, JOBOBJECT
namespace {
static bool g_fEnableGCNumaAware;
+static uint32_t g_nNodes;
class GroupProcNo
{
@@ -91,6 +92,7 @@ void InitNumaNodeInfo()
if (!GetNumaHighestNodeNumber(&highest) || (highest == 0))
return;
+ g_nNodes = highest + 1;
g_fEnableGCNumaAware = true;
return;
}
@@ -659,6 +661,21 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
return success;
}
+bool GCToOSInterface::GetCurrentThreadIdealProc(uint16_t* procNo)
+{
+ PROCESSOR_NUMBER proc;
+
+ bool success = GetThreadIdealProcessorEx (GetCurrentThread (), &proc);
+
+ if (success)
+ {
+ GroupProcNo groupProcNo(proc.Group, proc.Number);
+ *procNo = groupProcNo.GetCombinedValue();
+ }
+
+ return success;
+}
+
// Get the number of the current processor
uint32_t GCToOSInterface::GetCurrentProcessorNumber()
{
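GetCurrentThreadIdealProc above folds the PROCESSOR_NUMBER's group and in-group number into a single value via GroupProcNo. A sketch of that packing, assuming 6 bits for the in-group index (consistent with the "<< 6" used when padding procs_per_numa_node in gc.cpp); the authoritative packing is GroupProcNo itself:

    #include <cstdint>

    // e.g. group 1, proc 3 -> 67
    inline uint16_t combine_group_proc_sketch(uint16_t group, uint16_t procIndex)
    {
        return (uint16_t)((group << 6) | procIndex);
    }

    inline void split_group_proc_sketch(uint16_t combined, uint16_t* group, uint16_t* procIndex)
    {
        *group = (uint16_t)(combined >> 6);
        *procIndex = (uint16_t)(combined & 0x3f);
    }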
@@ -713,17 +730,26 @@ void GCToOSInterface::YieldThread(uint32_t switchCount)
// size - size of the virtual memory range
// alignment - requested memory alignment, 0 means no specific alignment requested
// flags - flags to control special settings like write watching
+// node - the NUMA node to reserve memory on
// Return:
// Starting virtual address of the reserved range
-void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
+void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node)
{
// Windows already ensures 64kb alignment on VirtualAlloc. The current CLR
// implementation ignores it on Windows, other than making some sanity checks on it.
UNREFERENCED_PARAMETER(alignment);
assert((alignment & (alignment - 1)) == 0);
assert(alignment <= 0x10000);
- DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
- return ::VirtualAlloc(nullptr, size, memFlags, PAGE_READWRITE);
+
+ if (node == NUMA_NODE_UNDEFINED)
+ {
+ DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
+ return ::VirtualAlloc (nullptr, size, memFlags, PAGE_READWRITE);
+ }
+ else
+ {
+ return ::VirtualAllocExNuma (::GetCurrentProcess (), NULL, size, MEM_RESERVE, PAGE_READWRITE, node);
+ }
}
// Release virtual memory range previously reserved using VirtualReserve
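A short usage fragment mirroring how GCHeap::Initialize consumes the new parameter (reserve on a node, then commit on the same node); buffer_size and numa_node_index are illustrative values, and the gcenv headers are assumed:

    size_t buffer_size = 64 * 1024;
    uint16_t numa_node_index = 0;

    uint8_t* numa_mem = (uint8_t*)GCToOSInterface::VirtualReserve (buffer_size, 0, 0, numa_node_index);
    if (numa_mem != nullptr)
    {
        if (!GCToOSInterface::VirtualCommit (numa_mem, buffer_size, numa_node_index))
        {
            // Initialize treats this as fatal and returns E_FAIL.
        }
    }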
@@ -1286,6 +1312,57 @@ bool GCToOSInterface::CanEnableGCNumaAware()
return g_fEnableGCNumaAware;
}
+bool GCToOSInterface::GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node)
+{
+ if (g_fEnableGCNumaAware)
+ {
+ DWORD currentProcsOnNode = 0;
+ for (uint32_t i = 0; i < g_nNodes; i++)
+ {
+ GROUP_AFFINITY processorMask;
+ if (GetNumaNodeProcessorMaskEx(i, &processorMask))
+ {
+ DWORD procsOnNode = 0;
+ uintptr_t mask = (uintptr_t)processorMask.Mask;
+ while (mask)
+ {
+ procsOnNode++;
+ mask &= mask - 1;
+ }
+
+ currentProcsOnNode = max(currentProcsOnNode, procsOnNode);
+ }
+ *max_procs_per_node = currentProcsOnNode;
+ *total_nodes = g_nNodes;
+ }
+ return true;
+ }
+
+ return false;
+}
+
+bool GCToOSInterface::CanEnableGCCPUGroups()
+{
+ return g_fEnableGCCPUGroups;
+}
+
+bool GCToOSInterface::GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group)
+{
+ if (g_fEnableGCCPUGroups)
+ {
+ *total_groups = (uint16_t)g_nGroups;
+ DWORD currentProcsInGroup = 0;
+ for (WORD i = 0; i < g_nGroups; i++)
+ {
+ currentProcsInGroup = max(currentProcsInGroup, g_CPUGroupInfoArray[i].nr_active);
+ }
+ *max_procs_per_group = currentProcsInGroup;
+ return true;
+ }
+
+ return false;
+}
+
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
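The proc-counting loop inside GetNumaInfo, in isolation: clearing the lowest set bit each iteration (mask &= mask - 1) counts the processors present in a node's affinity mask.

    #include <cstdint>

    static uint32_t count_procs_in_mask(uintptr_t mask)
    {
        uint32_t procs = 0;
        while (mask)
        {
            procs++;
            mask &= mask - 1;   // clear the lowest set bit
        }
        return procs;
    }

    // e.g. a node whose mask covers procs 0-3 and 8: count_procs_in_mask(0x10F) == 5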