many core (#25350)

author: Maoni Stephens <Maoni0@users.noreply.github.com> 2019-07-05 12:48:34 -0700
committer: GitHub <noreply@github.com> 2019-07-05 12:48:34 -0700
commit: c7d12d2b91f79802a12484d4d56c41ba355b7058 (patch)
tree: 823ab1ef9de13e8f5493fc219af7a5deaa118e13
parent: 41407b7d4a2eac4cae6a3c4855ba072cdd01f8b1 (diff)
download: coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.tar.gz
coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.tar.bz2
coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.zip
15 files changed, 2346 insertions, 168 deletions
diff --git a/src/gc/env/gcenv.interlocked.h b/src/gc/env/gcenv.interlocked.h
index 1da222de75..eb73a36eb7 100644
--- a/src/gc/env/gcenv.interlocked.h
+++ b/src/gc/env/gcenv.interlocked.h
@@ -78,6 +78,12 @@ public:
     template<typename T>
     static T ExchangeAdd(T volatile *addend, T value);
 
+    template<typename T>
+    static T ExchangeAdd64(T volatile* addend, T value);
+
+    template<typename T>
+    static T ExchangeAddPtr(T volatile* addend, T value);
+
     // Performs an atomic compare-and-exchange operation on the specified values. 
     // Parameters:
     //  destination - value to be exchanged
diff --git a/src/gc/env/gcenv.interlocked.inl b/src/gc/env/gcenv.interlocked.inl
index 1df2700d7f..401c4da0db 100644
--- a/src/gc/env/gcenv.interlocked.inl
+++ b/src/gc/env/gcenv.interlocked.inl
@@ -116,6 +116,37 @@ __forceinline T Interlocked::ExchangeAdd(T volatile *addend, T value)
 #endif
 }
 
+template <typename T>
+__forceinline T Interlocked::ExchangeAdd64(T volatile* addend, T value)
+{
+#ifdef _MSC_VER
+    static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T");
+    return _InterlockedExchangeAdd64((int64_t*)addend, value);
+#else
+    T result = __sync_fetch_and_add(addend, value);
+    ArmInterlockedOperationBarrier();
+    return result;
+#endif
+}
+
+template <typename T>
+__forceinline T Interlocked::ExchangeAddPtr(T volatile* addend, T value)
+{
+#ifdef _MSC_VER
+#ifdef BIT64
+    static_assert(sizeof(int64_t) == sizeof(T), "Size of LONGLONG must be the same as size of T");
+    return _InterlockedExchangeAdd64((int64_t*)addend, value);
+#else
+    static_assert(sizeof(long) == sizeof(T), "Size of long must be the same as size of T");
+    return _InterlockedExchangeAdd((long*)addend, value);
+#endif
+#else
+    T result = __sync_fetch_and_add(addend, value);
+    ArmInterlockedOperationBarrier();
+    return result;
+#endif
+}
+
 // Perform an atomic AND operation on the specified values values
 // Parameters:
 //  destination - the first operand and the destination
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h
index 393bd1fe18..e56321cace 100644
--- a/src/gc/env/gcenv.os.h
+++ b/src/gc/env/gcenv.os.h
@@ -144,8 +144,10 @@ typedef void (*GCThreadFunction)(void* param);
 // Right now we support maximum 1024 procs - meaning that we will create at most
 // that many GC threads and GC heaps.
 #define MAX_SUPPORTED_CPUS 1024
+#define MAX_SUPPORTED_NODES 64
 #else
 #define MAX_SUPPORTED_CPUS 64
+#define MAX_SUPPORTED_NODES 16
 #endif // BIT64
 
 // Add of processor indices used to store affinity.
@@ -253,6 +255,7 @@ public:
     //  size      - size of the virtual memory range
     //  alignment - requested memory alignment
     //  flags     - flags to control special settings like write watching
+    //  node      - the NUMA node to reserve memory on
     // Return:
     //  Starting virtual address of the reserved range
     // Notes:
@@ -264,7 +267,7 @@ public:
     //
     //  Windows guarantees that the returned mapping will be aligned to the allocation
     //  granularity.
-    static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags);
+    static void* VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node = NUMA_NODE_UNDEFINED);
 
     // Release virtual memory range previously reserved using VirtualReserve
     // Parameters:
@@ -360,6 +363,8 @@ public:
     //  true if it has succeeded, false if it has failed
     static bool SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t dstProcNo);
 
+    static bool GetCurrentThreadIdealProc(uint16_t* procNo);
+
     // Get numeric id of the current thread if possible on the
     // current platform. It is indended for logging purposes only.
     // Return:
@@ -484,6 +489,15 @@ public:
     // Is NUMA support available
     static bool CanEnableGCNumaAware();
 
+    // TODO: add Linux implementation.
+    // For no NUMA this returns false.
+    static bool GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node);
+
+    // Is CPU Group enabled
+    // This only applies on Windows and only used by instrumentation but is on the
+    // interface due to LocalGC.
+    static bool CanEnableGCCPUGroups();
+
     // Get processor number and optionally its NUMA node number for the specified heap number
     // Parameters:
     //  heap_number - heap number to get the result for
@@ -493,6 +507,9 @@ public:
     //  true if it succeeded
     static bool GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_no, uint16_t* node_no);
 
+    // For no CPU groups this returns false.
+    static bool GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group);
+
     // Parse the confing string describing affinitization ranges and update the passed in affinitySet accordingly
     // Parameters:
     //  config_string - string describing the affinitization range, platform specific
diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp
index 0917527ab4..f4d877fe88 100644
--- a/src/gc/gc.cpp
+++ b/src/gc/gc.cpp
@@ -56,6 +56,8 @@ BOOL bgc_heap_walk_for_etw_p = FALSE;
 #define MAX_PTR ((uint8_t*)(~(ptrdiff_t)0))
 #define commit_min_th (16*OS_PAGE_SIZE)
 
+static size_t smoothed_desired_per_heap = 0;
+
 #ifdef SERVER_GC
 #define partial_size_th 100
 #define num_partial_refs 64
@@ -205,6 +207,12 @@ size_t GetHighPrecisionTimeStamp()
     
     return (size_t)(ts / (qpf / 1000));    
 }
+
+uint64_t RawGetHighPrecisionTimeStamp()
+{
+    return (uint64_t)GCToOSInterface::QueryPerformanceCounter();
+}
+
 #endif
 
 #ifdef GC_STATS
@@ -436,6 +444,7 @@ void c_write (uint32_t& place, uint32_t value)
 }
 
 #ifndef DACCESS_COMPILE
+
 // If every heap's gen2 or gen3 size is less than this threshold we will do a blocking GC.
 const size_t bgc_min_per_heap = 4*1024*1024;
 
@@ -682,6 +691,7 @@ process_sync_log_stats()
 
 #ifdef MULTIPLE_HEAPS
 #ifndef DACCESS_COMPILE
+uint32_t g_num_active_processors = 0;
 
 enum gc_join_stage
 {
@@ -2442,6 +2452,7 @@ sorted_table* gc_heap::seg_table;
 
 #ifdef MULTIPLE_HEAPS
 GCEvent     gc_heap::ee_suspend_event;
+size_t      gc_heap::min_gen0_balance_delta = 0;
 size_t      gc_heap::min_balance_threshold = 0;
 #endif //MULTIPLE_HEAPS
 
@@ -2537,6 +2548,7 @@ uint64_t    gc_heap::entry_available_physical_mem = 0;
 
 size_t      gc_heap::heap_hard_limit = 0;
 
+bool        affinity_config_specified_p = false;
 #ifdef BACKGROUND_GC
 GCEvent     gc_heap::bgc_start_event;
 
@@ -2576,6 +2588,7 @@ int         gc_heap::spinlock_info_index = 0;
 spinlock_info gc_heap::last_spinlock_info[max_saved_spinlock_info + 8];
 #endif //SPINLOCK_HISTORY
 
+uint32_t    gc_heap::fgn_maxgen_percent = 0;
 size_t      gc_heap::fgn_last_alloc = 0;
 
 int         gc_heap::generation_skip_ratio = 100;
@@ -2834,6 +2847,9 @@ size_t gc_heap::eph_gen_starts_size = 0;
 heap_segment* gc_heap::segment_standby_list;
 bool          gc_heap::use_large_pages_p = 0;
 size_t        gc_heap::last_gc_index = 0;
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+size_t        gc_heap::last_gc_end_time_ms = 0;
+#endif //HEAP_BALANCE_INSTRUMENTATION
 #ifdef SEG_MAPPING_TABLE
 size_t        gc_heap::min_segment_size = 0;
 size_t        gc_heap::min_segment_size_shr = 0;
@@ -2859,8 +2875,6 @@ GCEvent gc_heap::full_gc_approach_event;
 
 GCEvent gc_heap::full_gc_end_event;
 
-uint32_t gc_heap::fgn_maxgen_percent = 0;
-
 uint32_t gc_heap::fgn_loh_percent = 0;
 
 #ifdef BACKGROUND_GC
@@ -4283,10 +4297,10 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
     // should only be called once
     assert (memory_details.initial_memory == 0);
 
-    memory_details.initial_memory = new (nothrow) imemory_data[num_heaps*2];
+    memory_details.initial_memory = new (nothrow) imemory_data[num_heaps * 2];
     if (memory_details.initial_memory == 0)
     {
-        dprintf (2, ("failed to reserve %Id bytes for imemory_data", num_heaps*2*sizeof(imemory_data)));
+        dprintf (2, ("failed to reserve %Id bytes for imemory_data", num_heaps * 2 * sizeof (imemory_data)));
         return FALSE;
     }
 
@@ -4320,15 +4334,15 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
     uint8_t* allatonce_block = (uint8_t*)virtual_alloc (requestedMemory, use_large_pages_p);
     if (allatonce_block)
     {
-        g_gc_lowest_address =  allatonce_block;
+        g_gc_lowest_address = allatonce_block;
         g_gc_highest_address = allatonce_block + requestedMemory;
         memory_details.allocation_pattern = initial_memory_details::ALLATONCE;
 
-        for(size_t i = 0; i < memory_details.block_count; i++)
+        for (size_t i = 0; i < memory_details.block_count; i++)
         {
-            memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i*normal_size);
+            memory_details.initial_normal_heap[i].memory_base = allatonce_block + (i * normal_size);
             memory_details.initial_large_heap[i].memory_base = allatonce_block +
-                            (memory_details.block_count*normal_size) + (i*large_size);
+                (memory_details.block_count * normal_size) + (i * large_size);
             reserve_success = TRUE;
         }
     }
@@ -4344,13 +4358,13 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
             if (b2)
             {
                 memory_details.allocation_pattern = initial_memory_details::TWO_STAGE;
-                g_gc_lowest_address = min(b1,b2);
-                g_gc_highest_address = max(b1 + memory_details.block_count*normal_size,
-                                        b2 + memory_details.block_count*large_size);
-                for(size_t i = 0; i < memory_details.block_count; i++)
+                g_gc_lowest_address = min (b1, b2);
+                g_gc_highest_address = max (b1 + memory_details.block_count * normal_size,
+                    b2 + memory_details.block_count * large_size);
+                for (size_t i = 0; i < memory_details.block_count; i++)
                 {
-                    memory_details.initial_normal_heap[i].memory_base = b1 + (i*normal_size);
-                    memory_details.initial_large_heap[i].memory_base = b2 + (i*large_size);
+                    memory_details.initial_normal_heap[i].memory_base = b1 + (i * normal_size);
+                    memory_details.initial_large_heap[i].memory_base = b2 + (i * large_size);
                     reserve_success = TRUE;
                 }
             }
@@ -4362,28 +4376,28 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
             }
         }
 
-        if ((b2==NULL) && ( memory_details.block_count > 1))
+        if ((b2 == NULL) && (memory_details.block_count > 1))
         {
             memory_details.allocation_pattern = initial_memory_details::EACH_BLOCK;
 
-            imemory_data *current_block = memory_details.initial_memory;
-            for(size_t i = 0; i < (memory_details.block_count*2); i++, current_block++)
+            imemory_data* current_block = memory_details.initial_memory;
+            for (size_t i = 0; i < (memory_details.block_count * 2); i++, current_block++)
             {
                 size_t block_size = ((i < memory_details.block_count) ?
-                                     memory_details.block_size_normal :
-                                     memory_details.block_size_large);
+                    memory_details.block_size_normal :
+                    memory_details.block_size_large);
                 current_block->memory_base =
                     (uint8_t*)virtual_alloc (block_size, use_large_pages_p);
                 if (current_block->memory_base == 0)
                 {
                     // Free the blocks that we've allocated so far
                     current_block = memory_details.initial_memory;
-                    for(size_t j = 0; j < i; j++, current_block++){
-                        if (current_block->memory_base != 0){
+                    for (size_t j = 0; j < i; j++, current_block++) {
+                        if (current_block->memory_base != 0) {
                             block_size = ((j < memory_details.block_count) ?
-                                     memory_details.block_size_normal :
-                                     memory_details.block_size_large);
-                             virtual_free (current_block->memory_base , block_size);
+                                memory_details.block_size_normal :
+                                memory_details.block_size_large);
+                            virtual_free (current_block->memory_base, block_size);
                         }
                     }
                     reserve_success = FALSE;
@@ -4392,8 +4406,8 @@ BOOL reserve_initial_memory (size_t normal_size, size_t large_size, size_t num_h
                 else
                 {
                     if (current_block->memory_base < g_gc_lowest_address)
-                        g_gc_lowest_address =  current_block->memory_base;
-                    if (((uint8_t *) current_block->memory_base + block_size) > g_gc_highest_address)
+                        g_gc_lowest_address = current_block->memory_base;
+                    if (((uint8_t*)current_block->memory_base + block_size) > g_gc_highest_address)
                         g_gc_highest_address = (current_block->memory_base + block_size);
                 }
                 reserve_success = TRUE;
@@ -4507,7 +4521,9 @@ void* virtual_alloc (size_t size, bool use_large_pages_p)
     }
 #endif // !FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
 
-    void* prgmem = use_large_pages_p ? GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) : GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags);
+    void* prgmem = use_large_pages_p ? 
+        GCToOSInterface::VirtualReserveAndCommitLargePages(requested_size) : 
+        GCToOSInterface::VirtualReserve(requested_size, card_size * card_word_width, flags);
     void *aligned_mem = prgmem;
 
     // We don't want (prgmem + size) to be right at the end of the address space 
@@ -5053,9 +5069,18 @@ extern "C" uint64_t __rdtsc();
     }
 #endif //_TARGET_X86_
 
+// We may not be on contiguous numa nodes so need to store 
+// the node index as well.
+struct node_heap_count
+{
+    int node_no;
+    int heap_count;
+};
+
 class heap_select
 {
     heap_select() {}
+public:
     static uint8_t* sniff_buffer;
     static unsigned n_sniff_buffers;
     static unsigned cur_sniff_index;
@@ -5063,7 +5088,12 @@ class heap_select
     static uint16_t proc_no_to_heap_no[MAX_SUPPORTED_CPUS];
     static uint16_t heap_no_to_proc_no[MAX_SUPPORTED_CPUS];
     static uint16_t heap_no_to_numa_node[MAX_SUPPORTED_CPUS];
+    static uint16_t proc_no_to_numa_node[MAX_SUPPORTED_CPUS];
     static uint16_t numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4];
+    // Note this is the total numa nodes GC heaps are on. There might be
+    // more on the machine if GC threads aren't using all of them. 
+    static uint16_t total_numa_nodes;
+    static node_heap_count heaps_on_node[MAX_SUPPORTED_NODES];
 
     static int access_time(uint8_t *sniff_buffer, int heap_number, unsigned sniff_index, unsigned n_sniff_buffers)
     {
@@ -5104,14 +5134,11 @@ public:
         return TRUE;
     }
 
-    static void init_cpu_mapping(gc_heap * /*heap*/, int heap_number)
+    static void init_cpu_mapping(int heap_number)
     {
         if (GCToOSInterface::CanGetCurrentProcessorNumber())
         {
-            uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps;
-            // We can safely cast heap_number to a uint16_t 'cause GetCurrentProcessCpuCount
-            // only returns up to MAX_SUPPORTED_CPUS procs right now. We only ever create at most
-            // MAX_SUPPORTED_CPUS GC threads.
+            uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber();
             proc_no_to_heap_no[proc_no] = (uint16_t)heap_number;
         }
     }
@@ -5125,12 +5152,15 @@ public:
             sniff_buffer[(1 + heap_number*n_sniff_buffers + sniff_index)*HS_CACHE_LINE_SIZE] &= 1;
     }
 
-    static int select_heap(alloc_context* acontext, int /*hint*/)
+    static int select_heap(alloc_context* acontext)
     {
         UNREFERENCED_PARAMETER(acontext); // only referenced by dprintf
 
         if (GCToOSInterface::CanGetCurrentProcessorNumber())
-            return proc_no_to_heap_no[GCToOSInterface::GetCurrentProcessorNumber() % gc_heap::n_heaps];
+        {
+            uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber();
+            return proc_no_to_heap_no[proc_no];
+        }
 
         unsigned sniff_index = Interlocked::Increment(&cur_sniff_index);
         sniff_index %= n_sniff_buffers;
@@ -5175,6 +5205,11 @@ public:
         return GCToOSInterface::CanGetCurrentProcessorNumber();
     }
 
+    static uint16_t find_heap_no_from_proc_no(uint16_t proc_no)
+    {
+        return proc_no_to_heap_no[proc_no];
+    }
+
     static uint16_t find_proc_no_from_heap_no(int heap_number)
     {
         return heap_no_to_proc_no[heap_number];
@@ -5190,33 +5225,102 @@ public:
         return heap_no_to_numa_node[heap_number];
     }
 
-    static void set_numa_node_for_heap(int heap_number, uint16_t numa_node)
+    static uint16_t find_numa_node_from_proc_no (uint16_t proc_no)
+    {
+        return proc_no_to_numa_node[proc_no];
+    }
+
+    static void set_numa_node_for_heap_and_proc(int heap_number, uint16_t proc_no, uint16_t numa_node)
     {
         heap_no_to_numa_node[heap_number] = numa_node;
+        proc_no_to_numa_node[proc_no] = numa_node;
     }
 
     static void init_numa_node_to_heap_map(int nheaps)
-    {   // Called right after GCHeap::Init() for each heap
+    {
+        // Called right after GCHeap::Init() for each heap
         // For each NUMA node used by the heaps, the
         // numa_node_to_heap_map[numa_node] is set to the first heap number on that node and
         // numa_node_to_heap_map[numa_node + 1] is set to the first heap number not on that node 
-
         // Set the start of the heap number range for the first NUMA node
         numa_node_to_heap_map[heap_no_to_numa_node[0]] = 0;
+        total_numa_nodes = 0;
+        memset (heaps_on_node, 0, sizeof (heaps_on_node));
+        heaps_on_node[0].node_no = heap_no_to_numa_node[0];
+        heaps_on_node[0].heap_count = 1;
 
         for (int i=1; i < nheaps; i++)
         {
             if (heap_no_to_numa_node[i] != heap_no_to_numa_node[i-1])
             {
+                total_numa_nodes++;
+                heaps_on_node[total_numa_nodes].node_no = heap_no_to_numa_node[i];
+
                 // Set the end of the heap number range for the previous NUMA node
                 numa_node_to_heap_map[heap_no_to_numa_node[i-1] + 1] =
                 // Set the start of the heap number range for the current NUMA node
                 numa_node_to_heap_map[heap_no_to_numa_node[i]] = (uint16_t)i;
             }
+            (heaps_on_node[total_numa_nodes].heap_count)++;
         }
 
         // Set the end of the heap range for the last NUMA node
         numa_node_to_heap_map[heap_no_to_numa_node[nheaps-1] + 1] = (uint16_t)nheaps; //mark the end with nheaps
+        total_numa_nodes++;
+    }
+
+    // TODO: curently this doesn't work with GCHeapAffinitizeMask/GCHeapAffinitizeRanges 
+    // because the heaps may not be on contiguous active procs. 
+    //
+    // This is for scenarios where GCHeapCount is specified as something like 
+    // (g_num_active_processors - 2) to allow less randomization to the Server GC threads.
+    // In this case we want to assign the right heaps to those procs, ie if they share 
+    // the same numa node we want to assign local heaps to those procs. Otherwise we 
+    // let the heap balancing mechanism take over for now. 
+    static void distribute_other_procs()
+    {
+        if (affinity_config_specified_p) 
+            return;
+
+        uint16_t proc_no = 0;
+        uint16_t node_no = 0;
+        bool res = false;
+        int start_heap = -1;
+        int end_heap = -1;
+        int current_node_no = -1;
+        int current_heap_on_node = -1;
+
+        for (int i = gc_heap::n_heaps; i < (int)g_num_active_processors; i++)
+        {
+            if (!GCToOSInterface::GetProcessorForHeap (i, &proc_no, &node_no))
+                break;
+            
+            int start_heap = (int)numa_node_to_heap_map[node_no];
+            int end_heap = (int)(numa_node_to_heap_map[node_no + 1]);
+
+            if ((end_heap - start_heap) > 0)
+            {
+                if (node_no == current_node_no)
+                {
+                    // We already iterated through all heaps on this node, don't add more procs to these
+                    // heaps. 
+                    if (current_heap_on_node >= end_heap)
+                    {
+                        continue;
+                    }
+                }
+                else
+                {
+                    current_node_no = node_no;
+                    current_heap_on_node = start_heap;
+                }
+
+                proc_no_to_heap_no[proc_no] = current_heap_on_node;
+                proc_no_to_numa_node[proc_no] = node_no;
+
+                current_heap_on_node++;
+            }
+        }
     }
 
     static void get_heap_range_for_heap(int hn, int* start, int* end)
@@ -5225,6 +5329,43 @@ public:
         *start = (int)numa_node_to_heap_map[numa_node];
         *end   = (int)(numa_node_to_heap_map[numa_node+1]);
     }
+
+    // This gets the next valid numa node index starting at current_index+1.
+    // It assumes that current_index is a valid node index. 
+    // If current_index+1 is at the end this will start at the beginning. So this will
+    // always return a valid node index, along with that node's start/end heaps.
+    static uint16_t get_next_numa_node (uint16_t current_index, int* start, int* end)
+    {
+        int start_index = current_index + 1;
+        int nheaps = gc_heap::n_heaps;
+
+        bool found_node_with_heaps_p = false;
+        do
+        {
+            int start_heap = (int)numa_node_to_heap_map[start_index];
+            int end_heap = (int)numa_node_to_heap_map[start_index + 1];
+            if (start_heap == nheaps)
+            {
+                // This is the last node.
+                start_index = 0;
+                continue;
+            }
+
+            if ((end_heap - start_heap) == 0)
+            {
+                // This node has no heaps.
+                start_index++;
+            }
+            else
+            {
+                found_node_with_heaps_p = true;
+                *start = start_heap;
+                *end = end_heap;
+            }
+        } while (!found_node_with_heaps_p);
+
+        return start_index;
+    }
 };
 uint8_t* heap_select::sniff_buffer;
 unsigned heap_select::n_sniff_buffers;
@@ -5232,7 +5373,318 @@ unsigned heap_select::cur_sniff_index;
 uint16_t heap_select::proc_no_to_heap_no[MAX_SUPPORTED_CPUS];
 uint16_t heap_select::heap_no_to_proc_no[MAX_SUPPORTED_CPUS];
 uint16_t heap_select::heap_no_to_numa_node[MAX_SUPPORTED_CPUS];
+uint16_t heap_select::proc_no_to_numa_node[MAX_SUPPORTED_CPUS];
 uint16_t heap_select::numa_node_to_heap_map[MAX_SUPPORTED_CPUS+4];
+uint16_t  heap_select::total_numa_nodes;
+node_heap_count heap_select::heaps_on_node[MAX_SUPPORTED_NODES];
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+// This records info we use to look at effect of different strategies 
+// for heap balancing.
+struct heap_balance_info
+{
+    uint64_t timestamp;
+    // This also encodes when we detect the thread runs on 
+    // different proc during a balance attempt. Sometimes 
+    // I observe this happens multiple times during one attempt!
+    // If this happens, I just record the last proc we observe
+    // and set MSB.
+    int tid;
+    // This records the final alloc_heap for the thread.
+    // 
+    // This also encodes the reason why we needed to set_home_heap 
+    // in balance_heaps.
+    // If we set it because the home heap is not the same as the proc, 
+    // we set MSB.
+    //
+    // If we set ideal proc, we set the 2nd MSB.
+    int alloc_heap;
+    int ideal_proc_no;
+};
+
+// This means inbetween each GC we can log at most this many entries per proc.
+// This is usually enough. Most of the time we only need to log something every 128k
+// of allocations in balance_heaps and gen0 budget is <= 200mb. 
+#define default_max_hb_heap_balance_info 4096
+
+struct heap_balance_info_proc
+{
+    int count;
+    int index;
+    heap_balance_info hb_info[default_max_hb_heap_balance_info];
+};
+
+struct heap_balance_info_numa
+{
+    heap_balance_info_proc* hb_info_procs;
+};
+
+uint64_t start_raw_ts = 0;
+bool cpu_group_enabled_p = false;
+uint32_t procs_per_numa_node = 0;
+uint16_t total_numa_nodes_on_machine = 0;
+uint32_t procs_per_cpu_group = 0;
+uint16_t total_cpu_groups_on_machine = 0;
+// Note this is still on one of the numa nodes, so we'll incur a remote access 
+// no matter what.
+heap_balance_info_numa* hb_info_numa_nodes = NULL;
+
+// TODO: This doesn't work for multiple nodes per CPU group yet.
+int get_proc_index_numa (int proc_no, int* numa_no)
+{
+    if (total_numa_nodes_on_machine == 1)
+    {
+        *numa_no = 0;
+        return proc_no;
+    }
+    else
+    {
+        if (cpu_group_enabled_p)
+        {
+            // see vm\gcenv.os.cpp GroupProcNo implementation.
+            *numa_no = proc_no >> 6;
+            return (proc_no % 64);
+        }
+        else
+        {
+            *numa_no = proc_no / procs_per_numa_node;
+            return (proc_no % procs_per_numa_node);
+        }
+    }
+}
+
+// We could consider optimizing it so we don't need to get the tid 
+// everytime but it's not very expensive to get.
+void add_to_hb_numa (
+    int proc_no,
+    int ideal_proc_no,
+    int alloc_heap,
+    bool multiple_procs_p,
+    bool alloc_count_p,
+    bool set_ideal_p)
+{
+    int tid = (int)GCToOSInterface::GetCurrentThreadIdForLogging ();
+    uint64_t timestamp = RawGetHighPrecisionTimeStamp ();
+
+    int saved_proc_no = proc_no;
+    int numa_no = -1;
+    proc_no = get_proc_index_numa (proc_no, &numa_no);
+
+    heap_balance_info_numa* hb_info_numa_node = &hb_info_numa_nodes[numa_no];
+
+    heap_balance_info_proc* hb_info_proc = &(hb_info_numa_node->hb_info_procs[proc_no]);
+    int index = hb_info_proc->index;
+    int count = hb_info_proc->count;
+
+    if (index == count)
+    {
+        // Too much info inbetween GCs. This can happen if the thread is scheduled on a different
+        // processor very often so it caused us to log many entries due to that reason. You could
+        // increase default_max_hb_heap_balance_info but this usually indicates a problem that
+        // should be investigated.
+        dprintf (HEAP_BALANCE_LOG, ("too much info between GCs, already logged %d entries", index));
+        GCToOSInterface::DebugBreak ();
+    }
+    heap_balance_info* hb_info = &(hb_info_proc->hb_info[index]);
+
+    dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d->%3d(i:%3d), N%d] #%4d: %I64d, tid %d, ah: %d, m: %d, p: %d, i: %d",
+        saved_proc_no, proc_no, ideal_proc_no, numa_no, index, 
+        (timestamp - start_raw_ts), tid, alloc_heap, (int)multiple_procs_p, (int)(!alloc_count_p), (int)set_ideal_p));
+
+    if (multiple_procs_p)
+    {
+        tid |= (1 << (sizeof (tid) * 8 - 1));
+    }
+
+    if (!alloc_count_p)
+    {
+        alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 1));
+    }
+
+    if (set_ideal_p)
+    {
+        alloc_heap |= (1 << (sizeof (alloc_heap) * 8 - 2));
+    }
+
+    hb_info->timestamp = timestamp;
+    hb_info->tid = tid;
+    hb_info->alloc_heap = alloc_heap;
+    hb_info->ideal_proc_no = ideal_proc_no;
+    (hb_info_proc->index)++;
+}
+
+const int hb_log_buffer_size = 1024;
+static char hb_log_buffer[hb_log_buffer_size];
+int last_hb_recorded_gc_index = -1;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
+// This logs what we recorded in balance_heaps
+// The format for this is
+//
+// [ms since last GC end]
+// [cpu index]
+// all elements we stored before this GC for this CPU in the format
+// timestamp,tid, alloc_heap_no
+// repeat this for each CPU
+//
+// the timestamp here is just the result of calling QPC,
+// it's not converted to ms. The conversion will be done when we process
+// the log.
+void gc_heap::hb_log_balance_activities()
+{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+    char* log_buffer = hb_log_buffer;
+
+    size_t now = GetHighPrecisionTimeStamp ();
+    size_t time_since_last_gc_ms = now - last_gc_end_time_ms;
+    dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP%Id - %Id = %Id", now, last_gc_end_time_ms, time_since_last_gc_ms));
+
+    // We want to get the min and the max timestamp for all procs because it helps with our post processing
+    // to know how big an array to allocate to display the history inbetween the GCs.
+    uint64_t min_timestamp = 0xffffffffffffffff;
+    uint64_t max_timestamp = 0;
+
+    for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+    {
+        heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+        for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+        {
+            heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+            int total_entries_on_proc = hb_info_proc->index;
+
+            if (total_entries_on_proc > 0)
+            {
+                min_timestamp = min (min_timestamp, hb_info_proc->hb_info[0].timestamp);
+                max_timestamp = max (max_timestamp, hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp);
+            }
+        }
+    }
+
+    dprintf (HEAP_BALANCE_LOG, ("[GCA#%Id %Id-%I64d-%I64d]",
+        settings.gc_index, time_since_last_gc_ms, (min_timestamp - start_raw_ts), (max_timestamp - start_raw_ts)));
+
+    if (last_hb_recorded_gc_index == (int)settings.gc_index)
+    {
+        GCToOSInterface::DebugBreak ();
+    }
+
+    last_hb_recorded_gc_index = (int)settings.gc_index;
+
+    // When we print out the proc index we need to convert it to the actual proc index (this is contiguous).
+    // It helps with post processing.
+    for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+    {
+        heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+        for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+        {
+            heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+            int total_entries_on_proc = hb_info_proc->index;
+            if (total_entries_on_proc > 0)
+            {
+                int total_exec_time_ms = (int)((hb_info_proc->hb_info[total_entries_on_proc - 1].timestamp - hb_info_proc->hb_info[0].timestamp) / (qpf / 1000));
+                dprintf (HEAP_BALANCE_LOG, ("[p%d]-%d-%dms", (proc_index + numa_node_index * procs_per_numa_node), total_entries_on_proc, total_exec_time_ms));
+            }
+
+            for (int i = 0; i < hb_info_proc->index; i++)
+            {
+                heap_balance_info* hb_info = &hb_info_proc->hb_info[i];
+                bool multiple_procs_p = false;
+                bool alloc_count_p = true;
+                bool set_ideal_p = false;
+                int tid = hb_info->tid;
+                int alloc_heap = hb_info->alloc_heap;
+
+                if (tid & (1 << (sizeof (tid) * 8 - 1)))
+                {
+                    multiple_procs_p = true;
+                    tid &= ~(1 << (sizeof (tid) * 8 - 1));
+                }
+
+                if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 1)))
+                {
+                    alloc_count_p = false;
+                    alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 1));
+                }
+
+                if (alloc_heap & (1 << (sizeof (alloc_heap) * 8 - 2)))
+                {
+                    set_ideal_p = true;
+                    alloc_heap &= ~(1 << (sizeof (alloc_heap) * 8 - 2));
+                }
+
+                // TODO - This assumes ideal proc is in the same cpu group which is not true
+                // when we don't have CPU groups.
+                int ideal_proc_no = hb_info->ideal_proc_no;
+                int ideal_node_no = -1;
+                ideal_proc_no = get_proc_index_numa (ideal_proc_no, &ideal_node_no);
+                ideal_proc_no = ideal_proc_no + ideal_node_no * procs_per_numa_node;
+
+                dprintf (HEAP_BALANCE_LOG, ("%I64d,%d,%d,%d%s%s%s",
+                    (hb_info->timestamp - start_raw_ts),
+                    tid,
+                    ideal_proc_no,
+                    (int)alloc_heap,
+                    (multiple_procs_p ? "|m" : ""), (!alloc_count_p ? "|p" : ""), (set_ideal_p ? "|i" : "")));
+            }
+        }
+    }
+
+    for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+    {
+        heap_balance_info_proc* hb_info_procs = hb_info_numa_nodes[numa_node_index].hb_info_procs;
+        for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+        {
+            heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+            hb_info_proc->index = 0;
+        }
+    }
+#endif //HEAP_BALANCE_INSTRUMENTATION
+}
+
+// The format for this is
+//
+// [GC_alloc_mb] 
+// h0_new_alloc, h1_new_alloc, ...
+//
+void gc_heap::hb_log_new_allocation()
+{
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+    char* log_buffer = hb_log_buffer;
+
+    int desired_alloc_mb = (int)(dd_desired_allocation (g_heaps[0]->dynamic_data_of (0)) / 1024 / 1024);
+
+    int buffer_pos = sprintf_s (hb_log_buffer, hb_log_buffer_size, "[GC_alloc_mb]\n");
+    for (int numa_node_index = 0; numa_node_index < heap_select::total_numa_nodes; numa_node_index++)
+    {
+        int node_allocated_mb = 0;
+
+        // I'm printing out the budget here instead of the numa node index so we know how much
+        // of the budget we consumed.
+        buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "[N#%3d]",
+            //numa_node_index);
+            desired_alloc_mb);
+
+        int heaps_on_node = heap_select::heaps_on_node[numa_node_index].heap_count;
+
+        for (int heap_index = 0; heap_index < heaps_on_node; heap_index++)
+        {
+            int actual_heap_index = heap_index + numa_node_index * heaps_on_node;
+            gc_heap* hp = g_heaps[actual_heap_index];
+            dynamic_data* dd0 = hp->dynamic_data_of (0);
+            int allocated_mb = (int)((dd_desired_allocation (dd0) - dd_new_allocation (dd0)) / 1024 / 1024);
+            node_allocated_mb += allocated_mb;
+            buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "%d,",
+                allocated_mb);
+        }
+
+        dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPN#%d a %dmb(%dmb)", numa_node_index, node_allocated_mb, desired_alloc_mb));
+
+        buffer_pos += sprintf_s (hb_log_buffer + buffer_pos, hb_log_buffer_size - buffer_pos, "\n");
+    }
+
+    dprintf (HEAP_BALANCE_LOG, ("%s", hb_log_buffer));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+}
 
 BOOL gc_heap::create_thread_support (unsigned number_of_heaps)
 {
@@ -5274,23 +5726,30 @@ void gc_heap::destroy_thread_support ()
     }
 }
 
-void set_thread_affinity_for_heap(int heap_number)
+bool get_proc_and_numa_for_heap (int heap_number)
 {
     uint16_t proc_no;
     uint16_t node_no;
 
-    if (GCToOSInterface::GetProcessorForHeap(heap_number, &proc_no, &node_no))
+    bool res = GCToOSInterface::GetProcessorForHeap (heap_number, &proc_no, &node_no);
+    if (res)
     {
-        heap_select::set_proc_no_for_heap(heap_number, proc_no);
+        heap_select::set_proc_no_for_heap (heap_number, proc_no);
         if (node_no != NUMA_NODE_UNDEFINED)
         {
-            heap_select::set_numa_node_for_heap(heap_number, node_no);
-        }
-        if (!GCToOSInterface::SetThreadAffinity(proc_no))
-        {
-            dprintf(1, ("Failed to set thread affinity for server GC thread"));
+            heap_select::set_numa_node_for_heap_and_proc (heap_number, proc_no, node_no);
         }
     }
+
+    return res;
+}
+
+void set_thread_affinity_for_heap (int heap_number, uint16_t proc_no)
+{
+    if (!GCToOSInterface::SetThreadAffinity (proc_no))
+    {
+        dprintf (1, ("Failed to set thread affinity for GC thread %d on proc #%d", heap_number, proc_no));
+    }
 }
 
 bool gc_heap::create_gc_thread ()
@@ -5308,7 +5767,7 @@ void gc_heap::gc_thread_function ()
     assert (gc_start_event.IsValid());
     dprintf (3, ("gc thread started"));
 
-    heap_select::init_cpu_mapping(this, heap_number);
+    heap_select::init_cpu_mapping(heap_number);
 
     while (1)
     {
@@ -5430,7 +5889,7 @@ bool gc_heap::virtual_alloc_commit_for_heap (void* addr, size_t size, int h_numb
         if (GCToOSInterface::CanEnableGCNumaAware())
         {
             uint16_t numa_node = heap_select::find_numa_node_from_heap_no(h_number);
-            if (GCToOSInterface::VirtualCommit(addr, size, numa_node))
+            if (GCToOSInterface::VirtualCommit (addr, size, numa_node))
                 return true;
         }
     }
@@ -9029,6 +9488,7 @@ retry:
 inline size_t my_get_size (Object* ob)
 {
     MethodTable* mT = header(ob)->GetMethodTable();
+
     return (mT->GetBaseSize() +
             (mT->HasComponentSize() ?
              ((size_t)((CObjectHeader*)ob)->GetNumComponents() * mT->RawGetComponentSize()) : 0));
@@ -9317,8 +9777,8 @@ void gc_heap::decommit_heap_segment_pages (heap_segment* seg,
         size -= max (extra_space, 32*OS_PAGE_SIZE);
 
         virtual_decommit (page_start, size, heap_number);
-        dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)", 
-            (size_t)page_start, 
+        dprintf (3, ("Decommitting heap segment [%Ix, %Ix[(%d)",
+            (size_t)page_start,
             (size_t)(page_start + size),
             size));
         heap_segment_committed (seg) = page_start;
@@ -10025,7 +10485,7 @@ HRESULT gc_heap::initialize_gc (size_t segment_size,
         check_commit_cs.Initialize();
     }
 
-    if (!reserve_initial_memory(segment_size,heap_size,block_count,use_large_pages_p))
+    if (!reserve_initial_memory (segment_size,heap_size,block_count,use_large_pages_p))
         return E_OUTOFMEMORY;
 
 #ifdef CARD_BUNDLE
@@ -10182,7 +10642,6 @@ gc_heap::init_semi_shared()
         goto cleanup;
     }
 
-    fgn_maxgen_percent = 0;
     fgn_loh_percent = 0;
     full_gc_approach_event_set = false;
 
@@ -10315,7 +10774,7 @@ gc_heap::wait_for_gc_done(int32_t timeOut)
     while (gc_heap::gc_started)
     {       
 #ifdef MULTIPLE_HEAPS
-        wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL, 0))->pGenGCHeap;
+        wait_heap = GCHeap::GetHeap(heap_select::select_heap(NULL))->pGenGCHeap;
         dprintf(2, ("waiting for the gc_done_event on heap %d", wait_heap->heap_number));
 #endif // MULTIPLE_HEAPS
 
@@ -10680,6 +11139,7 @@ gc_heap::init_gc_heap (int  h_number)
     allocation_running_amount = dd_min_size (dynamic_data_of (0));
 #endif //!MULTIPLE_HEAPS
 
+    fgn_maxgen_percent = 0;
     fgn_last_alloc = dd_min_size (dynamic_data_of (0));
 
     mark* arr = new (nothrow) (mark [MARK_STACK_INITIAL_LENGTH]);
@@ -10721,8 +11181,7 @@ gc_heap::init_gc_heap (int  h_number)
 #endif //MARK_ARRAY
 
 #ifdef MULTIPLE_HEAPS
-    //register the heap in the heaps array
-
+    get_proc_and_numa_for_heap (heap_number);
     if (!create_gc_thread ())
         return 0;
 
@@ -11654,12 +12113,13 @@ size_t gc_heap::new_allocation_limit (size_t size, size_t physical_limit, int ge
     dynamic_data* dd = dynamic_data_of (gen_number);
     ptrdiff_t new_alloc = dd_new_allocation (dd);
     assert (new_alloc == (ptrdiff_t)Align (new_alloc,
-                                           get_alignment_constant (!(gen_number == (max_generation+1)))));
+        get_alignment_constant (!(gen_number == (max_generation + 1)))));
 
     ptrdiff_t logical_limit = max (new_alloc, (ptrdiff_t)size);
     size_t limit = min (logical_limit, (ptrdiff_t)physical_limit);
     assert (limit == Align (limit, get_alignment_constant (!(gen_number == (max_generation+1)))));
     dd_new_allocation (dd) = (new_alloc - limit);
+
     return limit;
 }
 
@@ -11917,7 +12377,13 @@ void gc_heap::send_full_gc_notification (int gen_num, BOOL due_to_alloc_p)
 
 wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms)
 {
-    if (fgn_maxgen_percent == 0)
+    uint32_t maxgen_percent = 0;
+#ifdef MULTIPLE_HEAPS
+    maxgen_percent = g_heaps[0]->fgn_maxgen_percent;
+#else
+    maxgen_percent = fgn_maxgen_percent;
+#endif //MULTIPLE_HEAPS
+    if (maxgen_percent == 0)
     {
         return wait_full_gc_na;
     }
@@ -11926,7 +12392,7 @@ wait_full_gc_status gc_heap::full_gc_wait (GCEvent *event, int time_out_ms)
 
     if ((wait_result == WAIT_OBJECT_0) || (wait_result == WAIT_TIMEOUT))
     {
-        if (fgn_maxgen_percent == 0)
+        if (maxgen_percent == 0)
         {
             return wait_full_gc_cancelled;
         }
@@ -12351,10 +12817,9 @@ found_fit:
     VolatileStore(((void**)allocated - 1), (void*)0);     //clear the sync block	
 #endif //VERIFY_HEAP && _DEBUG
 
-    dprintf (3, ("found fit at end of seg: %Ix", old_alloc));
-
     uint8_t* old_alloc;
     old_alloc = allocated;
+    dprintf (3, ("found fit at end of seg: %Ix", old_alloc));
 
 #ifdef BACKGROUND_GC
     if (cookie != -1)
@@ -12471,7 +12936,7 @@ BOOL gc_heap::trigger_ephemeral_gc (gc_reason gr)
 
     BOOL did_full_compact_gc = FALSE;
 
-    dprintf (2, ("triggering a gen1 GC"));
+    dprintf (1, ("h%d triggering a gen1 GC", heap_number));
     size_t last_full_compact_gc_count = get_full_compact_gc_count();
     vm_heap->GarbageCollectGeneration(max_generation - 1, gr);
 
@@ -13443,7 +13908,7 @@ allocation_state gc_heap::try_allocate_more_space (alloc_context* acontext, size
 #ifdef SYNCHRONIZATION_STATS
         bad_suspension++;
 #endif //SYNCHRONIZATION_STATS
-        dprintf (/*100*/ 2, ("running out of budget on gen%d, gc", gen_number));
+        dprintf (2, ("h%d running out of budget on gen%d, gc", heap_number, gen_number));
 
         if (!settings.concurrent || (gen_number == 0))
         {
@@ -13497,70 +13962,138 @@ void gc_heap::balance_heaps (alloc_context* acontext)
     {
         if (acontext->alloc_count == 0)
         {
-            acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, 0) ));
-            gc_heap* hp = acontext->get_home_heap()->pGenGCHeap;
-            dprintf (3, ("First allocation for context %Ix on heap %d\n", (size_t)acontext, (size_t)hp->heap_number));
-            acontext->set_alloc_heap(acontext->get_home_heap());
+            int home_hp_num = heap_select::select_heap (acontext);
+            acontext->set_home_heap (GCHeap::GetHeap (home_hp_num));
+            gc_heap* hp = acontext->get_home_heap ()->pGenGCHeap;
+            acontext->set_alloc_heap (acontext->get_home_heap ());
             hp->alloc_context_count++;
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+            uint16_t ideal_proc_no = 0;
+            GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no);
+
+            uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+
+            add_to_hb_numa (proc_no, ideal_proc_no,
+                home_hp_num, false, true, false);
+
+            dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPafter GC: 1st alloc on p%3d, h%d, ip: %d",
+                proc_no, home_hp_num, ideal_proc_no));
+#endif //HEAP_BALANCE_INSTRUMENTATION
         }
     }
     else
     {
         BOOL set_home_heap = FALSE;
-        int hint = 0;
+        gc_heap* home_hp = NULL;
+        int proc_hp_num = 0;
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+        bool alloc_count_p = true;
+        bool multiple_procs_p = false;
+        bool set_ideal_p = false;
+        uint32_t proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+        uint32_t last_proc_no = proc_no;
+#endif //HEAP_BALANCE_INSTRUMENTATION
 
-        if (heap_select::can_find_heap_fast())
+        if (heap_select::can_find_heap_fast ())
         {
-            if (acontext->get_home_heap() != NULL)
-                hint = acontext->get_home_heap()->pGenGCHeap->heap_number;
-            if (acontext->get_home_heap() != GCHeap::GetHeap(hint = heap_select::select_heap(acontext, hint)) || ((acontext->alloc_count & 15) == 0))
+            assert (acontext->get_home_heap () != NULL);
+            home_hp = acontext->get_home_heap ()->pGenGCHeap;
+            proc_hp_num = heap_select::select_heap (acontext);
+
+            if (acontext->get_home_heap () != GCHeap::GetHeap (proc_hp_num))
             {
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                alloc_count_p = false;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+                set_home_heap = TRUE;
+            }
+            else if ((acontext->alloc_count & 15) == 0)
                 set_home_heap = TRUE;
+
+            if (set_home_heap)
+            {
             }
         }
         else
         {
-            // can't use gdt
             if ((acontext->alloc_count & 3) == 0)
                 set_home_heap = TRUE;
         }
 
         if (set_home_heap)
         {
-/*
-            // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this.
-            if (n_heaps > MAX_SUPPORTED_CPUS)
-            {
-                // on machines with many processors cache affinity is really king, so don't even try
-                // to balance on these.
-                acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext, hint) );
-                acontext->alloc_heap = acontext->home_heap;
-            }
-            else
-*/
+            /*
+                        // Since we are balancing up to MAX_SUPPORTED_CPUS, no need for this.
+                        if (n_heaps > MAX_SUPPORTED_CPUS)
+                        {
+                            // on machines with many processors cache affinity is really king, so don't even try
+                            // to balance on these.
+                            acontext->home_heap = GCHeap::GetHeap( heap_select::select_heap(acontext));
+                            acontext->alloc_heap = acontext->home_heap;
+                        }
+                        else
+            */
             {
-                gc_heap* org_hp = acontext->get_alloc_heap()->pGenGCHeap;
+                gc_heap* org_hp = acontext->get_alloc_heap ()->pGenGCHeap;
+                int org_hp_num = org_hp->heap_number;
+                int final_alloc_hp_num = org_hp_num;
 
                 dynamic_data* dd = org_hp->dynamic_data_of (0);
                 ptrdiff_t org_size = dd_new_allocation (dd);
+                ptrdiff_t total_size = (ptrdiff_t)dd_desired_allocation (dd);
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMP[p%3d] ph h%3d, hh: %3d, ah: %3d (%dmb-%dmb), ac: %5d(%s)",
+                    proc_no, proc_hp_num, home_hp->heap_number,
+                    org_hp_num, (total_size / 1024 / 1024), (org_size / 1024 / 1024),
+                    acontext->alloc_count,
+                    ((proc_hp_num == home_hp->heap_number) ? "AC" : "H")));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
                 int org_alloc_context_count;
                 int max_alloc_context_count;
                 gc_heap* max_hp;
+                int max_hp_num = 0;
                 ptrdiff_t max_size;
-                size_t delta = dd_min_size (dd)/4;
+                size_t local_delta = max (((size_t)org_size >> 6), min_gen0_balance_delta);
+                size_t delta = local_delta;
+
+                if (((size_t)org_size + 2 * delta) >= (size_t)total_size)
+                {
+                    acontext->alloc_count++;
+                    return;
+                }
 
                 int start, end, finish;
-                heap_select::get_heap_range_for_heap(org_hp->heap_number, &start, &end);
+                heap_select::get_heap_range_for_heap (org_hp->heap_number, &start, &end);
                 finish = start + n_heaps;
 
 try_again:
+                gc_heap* new_home_hp = 0;
+
                 do
                 {
                     max_hp = org_hp;
+                    max_hp_num = org_hp_num;
                     max_size = org_size + delta;
-                    acontext->set_home_heap(GCHeap::GetHeap( heap_select::select_heap(acontext, hint) ));
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                    proc_no = GCToOSInterface::GetCurrentProcessorNumber ();
+                    if (proc_no != last_proc_no)
+                    {
+                        dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSP: %d->%d", last_proc_no, proc_no));
+                        multiple_procs_p = true;
+                        last_proc_no = proc_no;
+                    }
 
-                    if (org_hp == acontext->get_home_heap()->pGenGCHeap)
+                    int current_hp_num = heap_select::proc_no_to_heap_no[proc_no];
+                    acontext->set_home_heap (GCHeap::GetHeap (current_hp_num));
+#else
+                    acontext->set_home_heap (GCHeap::GetHeap (heap_select::select_heap (acontext)));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+                    new_home_hp = acontext->get_home_heap ()->pGenGCHeap;
+                    if (org_hp == new_home_hp)
                         max_size = max_size + delta;
 
                     org_alloc_context_count = org_hp->alloc_context_count;
@@ -13568,62 +14101,98 @@ try_again:
                     if (max_alloc_context_count > 1)
                         max_size /= max_alloc_context_count;
 
+                    int actual_start = start;
+                    int actual_end = (end - 1);
+
                     for (int i = start; i < end; i++)
                     {
-                        gc_heap* hp = GCHeap::GetHeap(i%n_heaps)->pGenGCHeap;
+                        gc_heap* hp = GCHeap::GetHeap (i % n_heaps)->pGenGCHeap;
                         dd = hp->dynamic_data_of (0);
                         ptrdiff_t size = dd_new_allocation (dd);
-                        if (hp == acontext->get_home_heap()->pGenGCHeap)
+
+                        if (hp == new_home_hp)
+                        {
                             size = size + delta;
+                        }
                         int hp_alloc_context_count = hp->alloc_context_count;
+
                         if (hp_alloc_context_count > 0)
+                        {
                             size /= (hp_alloc_context_count + 1);
+                        }
                         if (size > max_size)
                         {
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                            dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPorg h%d(%dmb), m h%d(%dmb)",
+                                org_hp_num, (max_size / 1024 / 1024),
+                                hp->heap_number, (size / 1024 / 1024)));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
                             max_hp = hp;
                             max_size = size;
+                            max_hp_num = max_hp->heap_number;
                             max_alloc_context_count = hp_alloc_context_count;
                         }
                     }
-                }
+                } 
                 while (org_alloc_context_count != org_hp->alloc_context_count ||
-                       max_alloc_context_count != max_hp->alloc_context_count);
+                    max_alloc_context_count != max_hp->alloc_context_count);
 
                 if ((max_hp == org_hp) && (end < finish))
-                {   
-                    start = end; end = finish; 
-                    delta = dd_min_size(dd)/2; // Make it twice as hard to balance to remote nodes on NUMA.
+                {
+                    start = end; end = finish;
+                    delta = local_delta * 2; // Make it twice as hard to balance to remote nodes on NUMA.
                     goto try_again;
                 }
 
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                uint16_t ideal_proc_no_before_set_ideal = 0;
+                GCToOSInterface::GetCurrentThreadIdealProc (&ideal_proc_no_before_set_ideal);
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
                 if (max_hp != org_hp)
                 {
+                    final_alloc_hp_num = max_hp->heap_number;
+
                     org_hp->alloc_context_count--;
                     max_hp->alloc_context_count++;
-                    acontext->set_alloc_heap(GCHeap::GetHeap(max_hp->heap_number));
+
+                    acontext->set_alloc_heap (GCHeap::GetHeap (final_alloc_hp_num));
                     if (!gc_thread_no_affinitize_p)
                     {
-                        uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no(org_hp->heap_number);
-                        uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no(max_hp->heap_number);
+                        uint16_t src_proc_no = heap_select::find_proc_no_from_heap_no (org_hp->heap_number);
+                        uint16_t dst_proc_no = heap_select::find_proc_no_from_heap_no (max_hp->heap_number);
+
+                        dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSW! h%d(p%d)->h%d(p%d)",
+                            org_hp_num, src_proc_no, final_alloc_hp_num, dst_proc_no));
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                        int current_proc_no_before_set_ideal = GCToOSInterface::GetCurrentProcessorNumber ();
+                        if (current_proc_no_before_set_ideal != last_proc_no)
+                        {
+                            dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPSPa: %d->%d", last_proc_no, current_proc_no_before_set_ideal));
+                            multiple_procs_p = true;
+                        }
+#endif //HEAP_BALANCE_INSTRUMENTATION
 
-                        if (!GCToOSInterface::SetCurrentThreadIdealAffinity(src_proc_no, dst_proc_no))
+                        if (!GCToOSInterface::SetCurrentThreadIdealAffinity (src_proc_no, dst_proc_no))
+                        {
+                            dprintf (HEAP_BALANCE_TEMP_LOG, ("TEMPFailed to set the ideal processor for heap %d %d->%d",
+                                org_hp->heap_number, (int)src_proc_no, (int)dst_proc_no));
+                        }
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                        else
                         {
-                            dprintf (3, ("Failed to set the ideal processor for heap %d.",
-                                        org_hp->heap_number));
+                            set_ideal_p = true;
                         }
+#endif //HEAP_BALANCE_INSTRUMENTATION
                     }
-                    dprintf (3, ("Switching context %p (home heap %d) ", 
-                                 acontext,
-                                 acontext->get_home_heap()->pGenGCHeap->heap_number));
-                    dprintf (3, (" from heap %d (%Id free bytes, %d contexts) ", 
-                                 org_hp->heap_number,
-                                 org_size,
-                                 org_alloc_context_count));
-                    dprintf (3, (" to heap %d (%Id free bytes, %d contexts)\n", 
-                                 max_hp->heap_number,
-                                 dd_new_allocation(max_hp->dynamic_data_of(0)),
-                                                   max_alloc_context_count));
                 }
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+                add_to_hb_numa (proc_no, ideal_proc_no_before_set_ideal,
+                    final_alloc_hp_num, multiple_procs_p, alloc_count_p, set_ideal_p);
+#endif //HEAP_BALANCE_INSTRUMENTATION
             }
         }
     }
@@ -13650,8 +14219,8 @@ ptrdiff_t gc_heap::get_balance_heaps_loh_effective_budget ()
 
 gc_heap* gc_heap::balance_heaps_loh (alloc_context* acontext, size_t alloc_size)
 {
-    const int home_hp_num = heap_select::select_heap(acontext, 0);
-    dprintf (3, ("[h%d] LA: %Id", home_heap, alloc_size));
+    const int home_hp_num = heap_select::select_heap(acontext);
+    dprintf (3, ("[h%d] LA: %Id", home_hp_num, alloc_size));
     gc_heap* home_hp = GCHeap::GetHeap(home_hp_num)->pGenGCHeap;
     dynamic_data* dd = home_hp->dynamic_data_of (max_generation + 1);
     const ptrdiff_t home_hp_size = home_hp->get_balance_heaps_loh_effective_budget ();
@@ -13705,7 +14274,7 @@ try_again:
 gc_heap* gc_heap::balance_heaps_loh_hard_limit_retry (alloc_context* acontext, size_t alloc_size)
 {
     assert (heap_hard_limit);
-    const int home_heap = heap_select::select_heap(acontext, 0);
+    const int home_heap = heap_select::select_heap(acontext);
     dprintf (3, ("[h%d] balance_heaps_loh_hard_limit_retry alloc_size: %d", home_heap, alloc_size));
     int start, end;
     heap_select::get_heap_range_for_heap (home_heap, &start, &end);
@@ -15056,13 +15625,13 @@ int gc_heap::generation_to_condemn (int n_initial,
             generation_free_obj_space (large_object_generation);
 
         //save new_allocation
-        for (i = 0; i <= max_generation+1; i++)
+        for (i = 0; i <= max_generation + 1; i++)
         {
             dynamic_data* dd = dynamic_data_of (i);
-            dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)", 
-                            heap_number, i,
-                            dd_new_allocation (dd),
-                            dd_desired_allocation (dd)));
+            dprintf (GTC_LOG, ("h%d: g%d: l: %Id (%Id)",
+                heap_number, i,
+                dd_new_allocation (dd),
+                dd_desired_allocation (dd)));
             dd_gc_new_allocation (dd) = dd_new_allocation (dd);
         }
 
@@ -15788,6 +16357,14 @@ void gc_heap::gc1()
         dynamic_data* dd = dynamic_data_of (n);
         dd_gc_elapsed_time (dd) = end_gc_time - dd_time_clock (dd);
 
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+        if (heap_number == 0)
+        {
+            last_gc_end_time_ms = end_gc_time;
+            dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-BGC]", settings.gc_index, dd_gc_elapsed_time (dd)));
+        }
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
         free_list_info (max_generation, "after computing new dynamic data");
 
         gc_history_per_heap* current_gc_data_per_heap = get_gc_data_per_heap();
@@ -15829,10 +16406,16 @@ void gc_heap::gc1()
         
         if (heap_number == 0)
         {
-            dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms", 
+            size_t gc_elapsed_time = dd_gc_elapsed_time (dynamic_data_of (0));
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+            last_gc_end_time_ms = end_gc_time;
+            dprintf (HEAP_BALANCE_LOG, ("[GC#%Id-%Id-%Id]", settings.gc_index, gc_elapsed_time, dd_time_clock (dynamic_data_of (0))));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
+            dprintf (GTC_LOG, ("GC#%d(gen%d) took %Idms",
                 dd_collection_count (dynamic_data_of (0)), 
                 settings.condemned_generation,
-                dd_gc_elapsed_time (dynamic_data_of (0))));
+                gc_elapsed_time));
         }
 
         for (int gen_number = 0; gen_number <= (max_generation + 1); gen_number++)
@@ -16095,12 +16678,9 @@ void gc_heap::gc1()
 #if 1 //subsumed by the linear allocation model 
                     // to avoid spikes in mem usage due to short terms fluctuations in survivorship,
                     // apply some smoothing.
-                    static size_t smoothed_desired_per_heap = 0;
                     size_t smoothing = 3; // exponential smoothing factor
-                    if (smoothing  > VolatileLoad(&settings.gc_index))
-                        smoothing  = VolatileLoad(&settings.gc_index);
                     smoothed_desired_per_heap = desired_per_heap / smoothing + ((smoothed_desired_per_heap / smoothing) * (smoothing-1));
-                    dprintf (1, ("sn = %Id  n = %Id", smoothed_desired_per_heap, desired_per_heap));
+                    dprintf (HEAP_BALANCE_LOG, ("TEMPsn = %Id  n = %Id", smoothed_desired_per_heap, desired_per_heap));
                     desired_per_heap = Align(smoothed_desired_per_heap, get_alignment_constant (true));
 #endif //0
 
@@ -17162,6 +17742,11 @@ void gc_heap::garbage_collect (int n)
 
         settings.gc_index = (uint32_t)dd_collection_count (dynamic_data_of (0)) + 1;
 
+#ifdef MULTIPLE_HEAPS
+        hb_log_balance_activities();
+        hb_log_new_allocation();
+#endif //MULTIPLE_HEAPS
+
         // Call the EE for start of GC work
         // just one thread for MP GC
         GCToEEInterface::GcStartWork (settings.condemned_generation,
@@ -25445,7 +26030,7 @@ void gc_heap::gc_thread_stub (void* arg)
         // We are about to set affinity for GC threads. It is a good place to set up NUMA and
         // CPU groups because the process mask, processor number, and group number are all
         // readily available.
-        set_thread_affinity_for_heap(heap->heap_number);
+        set_thread_affinity_for_heap (heap->heap_number, heap_select::find_proc_no_from_heap_no (heap->heap_number));
     }
 
     // server GC threads run at a higher priority than normal.
@@ -27492,9 +28077,6 @@ void gc_heap::bgc_thread_function()
             dprintf (3, ("no concurrent GC needed, exiting"));
             break;
         }
-#ifdef TRACE_GC
-        //trace_gc = TRUE;
-#endif //TRACE_GC
         recursive_gc_sync::begin_background();
         dprintf (2, ("beginning of bgc: gen2 FL: %d, FO: %d, frag: %d", 
             generation_free_list_space (generation_of (max_generation)),
@@ -27505,10 +28087,6 @@ void gc_heap::bgc_thread_function()
 
         current_bgc_state = bgc_not_in_process;
 
-#ifdef TRACE_GC
-        //trace_gc = FALSE;
-#endif //TRACE_GC
-
         enable_preemptive ();
 #ifdef MULTIPLE_HEAPS
         bgc_t_join.join(this, gc_join_done);
@@ -30060,7 +30638,6 @@ void gc_heap::init_static_data()
     }
 
     gen0_max_size = Align (gen0_max_size);
-
     gen0_min_size = min (gen0_min_size, gen0_max_size);
 
     // TODO: gen0_max_size has a 200mb cap; gen1_max_size should also have a cap.
@@ -30086,12 +30663,23 @@ void gc_heap::init_static_data()
 
 bool gc_heap::init_dynamic_data()
 {
-    qpf = GCToOSInterface::QueryPerformanceFrequency();
-
-    uint32_t now = (uint32_t)GetHighPrecisionTimeStamp();
+    uint64_t now_raw_ts = RawGetHighPrecisionTimeStamp ();
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+    start_raw_ts = now_raw_ts;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+    uint32_t now = (uint32_t)(now_raw_ts / (qpf / 1000));
 
     set_static_data();
 
+    if (heap_number == 0)
+    {
+        smoothed_desired_per_heap = dynamic_data_of (0)->min_size;
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+        last_gc_end_time_ms = now;
+        dprintf (HEAP_BALANCE_LOG, ("qpf=%I64d, start: %I64d(%d)", qpf, start_raw_ts, now));
+#endif //HEAP_BALANCE_INSTRUMENTATION
+    }
+
     for (int i = 0; i <= max_generation+1; i++)
     {
         dynamic_data* dd = dynamic_data_of (i);
@@ -34129,6 +34717,8 @@ HRESULT GCHeap::Initialize()
 {
     HRESULT hr = S_OK;
 
+    qpf = GCToOSInterface::QueryPerformanceFrequency();
+
     g_gc_pFreeObjectMethodTable = GCToEEInterface::GetFreeObjectMethodTable();
     g_num_processors = GCToOSInterface::GetTotalProcessorCount();
     assert(g_num_processors != 0);
@@ -34191,18 +34781,23 @@ HRESULT GCHeap::Initialize()
         return CLR_E_GC_BAD_AFFINITY_CONFIG;
     }
 
+    if ((cpu_index_ranges_holder.Get() != nullptr) || (config_affinity_mask != 0))
+    {
+        affinity_config_specified_p = true;
+    }
+
     nhp_from_config = static_cast<uint32_t>(GCConfig::GetHeapCount());
     
-    uint32_t nhp_from_process = GCToOSInterface::GetCurrentProcessCpuCount();
+    g_num_active_processors = GCToOSInterface::GetCurrentProcessCpuCount();
 
     if (nhp_from_config)
     {
         // Even when the user specifies a heap count, it should not be more
         // than the number of procs this process can use.
-        nhp_from_config = min (nhp_from_config, nhp_from_process);
+        nhp_from_config = min (nhp_from_config, g_num_active_processors);
     }
 
-    nhp = ((nhp_from_config == 0) ? nhp_from_process : nhp_from_config);
+    nhp = ((nhp_from_config == 0) ? g_num_active_processors : nhp_from_config);
 
     nhp = min (nhp, MAX_SUPPORTED_CPUS);
 #ifndef FEATURE_REDHAWK
@@ -34350,8 +34945,62 @@ HRESULT GCHeap::Initialize()
             return hr;
         }
     }
-    // initialize numa node to heap map
-    heap_select::init_numa_node_to_heap_map(nhp);
+
+    heap_select::init_numa_node_to_heap_map (nhp);
+
+    // If we have more active processors than heaps we still want to initialize some of the 
+    // mapping for the rest of the active processors because user threads can still run on
+    // them which means it's important to know their numa nodes and map them to a reasonable
+    // heap, ie, we wouldn't want to have all such procs go to heap 0.
+    if (g_num_active_processors > nhp)
+        heap_select::distribute_other_procs();
+
+    gc_heap* hp = gc_heap::g_heaps[0];
+
+    dynamic_data* gen0_dd = hp->dynamic_data_of (0);
+    gc_heap::min_gen0_balance_delta = (dd_min_size (gen0_dd) >> 3);
+
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+    cpu_group_enabled_p = GCToOSInterface::CanEnableGCCPUGroups();
+
+    if (!GCToOSInterface::GetNumaInfo (&total_numa_nodes_on_machine, &procs_per_numa_node))
+    {
+        total_numa_nodes_on_machine = 1;
+        
+        // Note that if we are in cpu groups we need to take the way proc index is calculated
+        // into consideration. It would mean we have more than 64 procs on one numa node - 
+        // this is mostly for testing (if we want to simulate no numa on a numa system).
+        // see vm\gcenv.os.cpp GroupProcNo implementation.
+        if (GCToOSInterface::GetCPUGroupInfo (&total_cpu_groups_on_machine, &procs_per_cpu_group))
+            procs_per_numa_node = procs_per_cpu_group + ((total_cpu_groups_on_machine - 1) << 6);
+        else
+            procs_per_numa_node = g_num_processors;
+    }
+    hb_info_numa_nodes = new (nothrow) heap_balance_info_numa[total_numa_nodes_on_machine];
+    dprintf (HEAP_BALANCE_LOG, ("total: %d, numa: %d", g_num_processors, total_numa_nodes_on_machine));
+
+    int hb_info_size_per_proc = sizeof (heap_balance_info_proc);
+
+    for (int numa_node_index = 0; numa_node_index < total_numa_nodes_on_machine; numa_node_index++)
+    {
+        int hb_info_size_per_node = hb_info_size_per_proc * procs_per_numa_node;
+        uint8_t* numa_mem = (uint8_t*)GCToOSInterface::VirtualReserve (hb_info_size_per_node, 0, 0, numa_node_index);
+        if (!numa_mem)
+            return E_FAIL;
+        if (!GCToOSInterface::VirtualCommit (numa_mem, hb_info_size_per_node, numa_node_index))
+            return E_FAIL;
+
+        heap_balance_info_proc* hb_info_procs = (heap_balance_info_proc*)numa_mem;
+        hb_info_numa_nodes[numa_node_index].hb_info_procs = hb_info_procs;
+
+        for (int proc_index = 0; proc_index < (int)procs_per_numa_node; proc_index++)
+        {
+            heap_balance_info_proc* hb_info_proc = &hb_info_procs[proc_index];
+            hb_info_proc->count = default_max_hb_heap_balance_info;
+            hb_info_proc->index = 0;
+        }
+    }
+#endif //HEAP_BALANCE_INSTRUMENTATION
 #else
     hr = Init (0);
 #endif //MULTIPLE_HEAPS
@@ -36106,13 +36755,14 @@ size_t GCHeap::ApproxTotalBytesInUse(BOOL small_heap_only)
 void GCHeap::AssignHeap (alloc_context* acontext)
 {
     // Assign heap based on processor
-    acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext, 0)));
+    acontext->set_alloc_heap(GetHeap(heap_select::select_heap(acontext)));
     acontext->set_home_heap(acontext->get_alloc_heap());
 }
+
 GCHeap* GCHeap::GetHeap (int n)
 {
     assert (n < gc_heap::n_heaps);
-    return gc_heap::g_heaps [n]->vm_heap;
+    return gc_heap::g_heaps[n]->vm_heap;
 }
 #endif //MULTIPLE_HEAPS
 
@@ -36244,6 +36894,7 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage,
     {
         gc_heap* hp = gc_heap::g_heaps [hn];
         hp->fgn_last_alloc = dd_new_allocation (hp->dynamic_data_of (0));
+        hp->fgn_maxgen_percent = gen2Percentage;
     }
 #else //MULTIPLE_HEAPS
     pGenGCHeap->fgn_last_alloc = dd_new_allocation (pGenGCHeap->dynamic_data_of (0));
@@ -36253,7 +36904,6 @@ bool GCHeap::RegisterForFullGCNotification(uint32_t gen2Percentage,
     pGenGCHeap->full_gc_end_event.Reset();
     pGenGCHeap->full_gc_approach_event_set = false;
 
-    pGenGCHeap->fgn_maxgen_percent = gen2Percentage;
     pGenGCHeap->fgn_loh_percent = lohPercentage;
 
     return TRUE;
diff --git a/src/gc/gcpriv.h b/src/gc/gcpriv.h
index 7c8286babf..c194c547b6 100644
--- a/src/gc/gcpriv.h
+++ b/src/gc/gcpriv.h
@@ -40,6 +40,13 @@ inline void FATAL_GC_ERROR()
     GCToEEInterface::HandleFatalError((unsigned int)COR_E_EXECUTIONENGINE);
 }
 
+#ifdef MULTIPLE_HEAPS
+// This turns on instrumentation that collects info for heap balancing.
+// Define it and make sure you have HEAP_BALANCE_LOG/HEAP_BALANCE_TEMP_LOG 
+// level logging enabled *only*.
+//#define HEAP_BALANCE_INSTRUMENTATION
+#endif //MULTIPLE_HEAPS
+
 #ifdef _MSC_VER
 #pragma inline_depth(20)
 #endif
@@ -255,6 +262,15 @@ const int policy_expand  = 2;
 #define SPINLOCK_LOG (DT_LOG_0 + 5)
 #define SNOOP_LOG (DT_LOG_0 + 6)
 
+// NOTE! This is for HEAP_BALANCE_INSTRUMENTATION
+// This particular one is special and needs to be well formatted because we 
+// do post processing on it with tools\GCLogParser. If you need to add some 
+// detail to help with investigation that's not 't processed by tooling 
+// prefix it with TEMP so that line will be written to the results as is in
+// the result. I have some already logged with HEAP_BALANCE_TEMP_LOG.
+#define HEAP_BALANCE_LOG (DT_LOG_0 + 7)
+#define HEAP_BALANCE_TEMP_LOG (DT_LOG_0 + 8)
+
 #ifndef DACCESS_COMPILE
 
 #ifdef SIMPLE_DPRINTF
@@ -267,6 +283,7 @@ void GCLog (const char *fmt, ... );
 //#define dprintf(l,x) {if (trace_gc && ((l <= 2) || (l == BGC_LOG) || (l==GTC_LOG))) {GCLog x;}}
 //#define dprintf(l,x) {if ((l == 1) || (l == 2222)) {GCLog x;}}
 #define dprintf(l,x) {if ((l <= 1) || (l == GTC_LOG)) {GCLog x;}}
+//#define dprintf(l,x) {if (l == HEAP_BALANCE_LOG) {GCLog x;}}
 //#define dprintf(l,x) {if ((l==GTC_LOG) || (l <= 1)) {GCLog x;}}
 //#define dprintf(l,x) {if (trace_gc && ((l <= print_level) || (l==GTC_LOG))) {GCLog x;}}
 //#define dprintf(l,x) {if (l==GTC_LOG) {printf ("\n");printf x ; fflush(stdout);}}
@@ -1222,6 +1239,12 @@ public:
                              uint32_t flags);
 
 #ifdef MULTIPLE_HEAPS
+    PER_HEAP_ISOLATED
+    void hb_log_new_allocation();
+
+    PER_HEAP_ISOLATED
+    void hb_log_balance_activities();
+
     static
     void balance_heaps (alloc_context* acontext);
     PER_HEAP
@@ -2965,7 +2988,7 @@ public:
     GCEvent full_gc_end_event;
 
     // Full GC Notification percentages.
-    PER_HEAP_ISOLATED
+    PER_HEAP
     uint32_t fgn_maxgen_percent;
 
     PER_HEAP_ISOLATED
@@ -3026,6 +3049,9 @@ public:
     PER_HEAP
     heap_segment* new_heap_segment;
 
+    PER_HEAP_ISOLATED
+    size_t min_gen0_balance_delta;
+
 #define alloc_quantum_balance_units (16)
 
     PER_HEAP_ISOLATED
@@ -3091,7 +3117,7 @@ public:
 
     PER_HEAP_ISOLATED
     uint64_t total_physical_mem;
-
+    
     PER_HEAP_ISOLATED
     uint64_t entry_available_physical_mem;
 
@@ -3175,6 +3201,11 @@ public:
     PER_HEAP_ISOLATED
     size_t last_gc_index;
 
+#ifdef HEAP_BALANCE_INSTRUMENTATION
+    PER_HEAP_ISOLATED
+    size_t last_gc_end_time_ms;
+#endif //HEAP_BALANCE_INSTRUMENTATION
+
 #ifdef SEG_MAPPING_TABLE
     PER_HEAP_ISOLATED
     size_t min_segment_size;
diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp
index 3d7442ada9..4a48a4a5a9 100644
--- a/src/gc/unix/gcenv.unix.cpp
+++ b/src/gc/unix/gcenv.unix.cpp
@@ -527,9 +527,10 @@ static void* VirtualReserveInner(size_t size, size_t alignment, uint32_t flags,
 //  size      - size of the virtual memory range
 //  alignment - requested memory alignment, 0 means no specific alignment requested
 //  flags     - flags to control special settings like write watching
+//  node      - the NUMA node to reserve memory on
 // Return:
 //  Starting virtual address of the reserved range
-void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
+void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node)
 {
     return VirtualReserveInner(size, alignment, flags);
 }
@@ -910,6 +911,11 @@ bool GCToOSInterface::CanEnableGCNumaAware()
     return g_numaAvailable;
 }
 
+bool GCToOSInterface::CanEnableGCCPUGroups()
+{
+    return false;
+}
+
 // Get processor number and optionally its NUMA node number for the specified heap number
 // Parameters:
 //  heap_number - heap number to get the result for
diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp
index e5c4d7a8db..e8f0035505 100644
--- a/src/gc/windows/gcenv.windows.cpp
+++ b/src/gc/windows/gcenv.windows.cpp
@@ -37,6 +37,7 @@ typedef BOOL (WINAPI *PQUERY_INFORMATION_JOB_OBJECT)(HANDLE jobHandle, JOBOBJECT
 namespace {
 
 static bool g_fEnableGCNumaAware;
+static uint32_t g_nNodes;
 
 class GroupProcNo
 {
@@ -91,6 +92,7 @@ void InitNumaNodeInfo()
     if (!GetNumaHighestNodeNumber(&highest) || (highest == 0))
         return;
 
+    g_nNodes = highest + 1;
     g_fEnableGCNumaAware = true;
     return;
 }
@@ -659,6 +661,21 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
     return success;
 }
 
+bool GCToOSInterface::GetCurrentThreadIdealProc(uint16_t* procNo)
+{
+    PROCESSOR_NUMBER proc;
+
+    bool success = GetThreadIdealProcessorEx (GetCurrentThread (), &proc);
+
+    if (success)
+    {
+        GroupProcNo groupProcNo(proc.Group, proc.Number);
+        *procNo = groupProcNo.GetCombinedValue();
+    }
+
+    return success;
+}
+
 // Get the number of the current processor
 uint32_t GCToOSInterface::GetCurrentProcessorNumber()
 {
@@ -713,17 +730,26 @@ void GCToOSInterface::YieldThread(uint32_t switchCount)
 //  size      - size of the virtual memory range
 //  alignment - requested memory alignment, 0 means no specific alignment requested
 //  flags     - flags to control special settings like write watching
+//  node      - the NUMA node to reserve memory on
 // Return:
 //  Starting virtual address of the reserved range
-void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
+void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node)
 {
     // Windows already ensures 64kb alignment on VirtualAlloc. The current CLR
     // implementation ignores it on Windows, other than making some sanity checks on it.
     UNREFERENCED_PARAMETER(alignment);
     assert((alignment & (alignment - 1)) == 0);
     assert(alignment <= 0x10000);
-    DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
-    return ::VirtualAlloc(nullptr, size, memFlags, PAGE_READWRITE);
+
+    if (node == NUMA_NODE_UNDEFINED)
+    {
+        DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
+        return ::VirtualAlloc (nullptr, size, memFlags, PAGE_READWRITE);
+    }
+    else
+    {
+        return ::VirtualAllocExNuma (::GetCurrentProcess (), NULL, size, MEM_RESERVE, PAGE_READWRITE, node);
+    }
 }
 
 // Release virtual memory range previously reserved using VirtualReserve
@@ -1286,6 +1312,57 @@ bool GCToOSInterface::CanEnableGCNumaAware()
     return g_fEnableGCNumaAware;
 }
 
+bool GCToOSInterface::GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node)
+{
+    if (g_fEnableGCNumaAware)
+    {
+        DWORD currentProcsOnNode = 0;
+        for (uint32_t i = 0; i < g_nNodes; i++)
+        {
+            GROUP_AFFINITY processorMask;
+            if (GetNumaNodeProcessorMaskEx(i, &processorMask))
+            {
+                DWORD procsOnNode = 0;
+                uintptr_t mask = (uintptr_t)processorMask.Mask;
+                while (mask)
+                {
+                    procsOnNode++;
+                    mask &= mask - 1;
+                }
+
+                currentProcsOnNode = max(currentProcsOnNode, procsOnNode);
+            }
+            *max_procs_per_node = currentProcsOnNode;
+            *total_nodes = g_nNodes;
+        }
+        return true;
+    }
+
+    return false;
+}
+
+bool GCToOSInterface::CanEnableGCCPUGroups()
+{
+    return g_fEnableGCCPUGroups;
+}
+
+bool GCToOSInterface::GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group)
+{
+    if (g_fEnableGCCPUGroups)
+    {
+        *total_groups = (uint16_t)g_nGroups;
+        DWORD currentProcsInGroup = 0;
+        for (WORD i = 0; i < g_nGroups; i++)
+        {
+            currentProcsInGroup = max(currentProcsInGroup, g_CPUGroupInfoArray[i].nr_active);
+        }
+        *max_procs_per_group = currentProcsInGroup;
+        return true;
+    }
+
+    return false;
+}
+
 // Get processor number and optionally its NUMA node number for the specified heap number
 // Parameters:
 //  heap_number - heap number to get the result for
diff --git a/src/inc/utilcode.h b/src/inc/utilcode.h
index 1021a1d37e..5b8107c4a4 100644
--- a/src/inc/utilcode.h
+++ b/src/inc/utilcode.h
@@ -1306,6 +1306,7 @@ class NumaNodeInfo
 {
 private:
     static BOOL m_enableGCNumaAware;
+    static uint16_t m_nNodes;
     static BOOL InitNumaNodeInfoAPI();
 
 public:
@@ -1319,6 +1320,7 @@ public: 	// functions
                                      DWORD allocType, DWORD prot, DWORD node);
 #ifndef FEATURE_PAL
     static BOOL GetNumaProcessorNodeEx(PPROCESSOR_NUMBER proc_no, PUSHORT node_no);
+    static bool GetNumaInfo(PUSHORT total_nodes, DWORD* max_procs_per_node);
 #else // !FEATURE_PAL
     static BOOL GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no);
 #endif // !FEATURE_PAL
@@ -1363,6 +1365,7 @@ public:
     static void GetGroupForProcessor(WORD processor_number, 
 		    WORD *group_number, WORD *group_processor_number);
     static DWORD CalculateCurrentProcessorNumber();
+    static bool GetCPUGroupInfo(PUSHORT total_groups, DWORD* max_procs_per_group);
     //static void PopulateCPUUsageArray(void * infoBuffer, ULONG infoSize);
 
 #if !defined(FEATURE_REDHAWK)
diff --git a/src/tools/GCLogParser/App.config b/src/tools/GCLogParser/App.config
new file mode 100644
index 0000000000..56efbc7b5f
--- /dev/null
+++ b/src/tools/GCLogParser/App.config
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<configuration>
+    <startup> 
+        <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7.2" />
+    </startup>
+</configuration>
+\ No newline at end of file
diff --git a/src/tools/GCLogParser/Properties/AssemblyInfo.cs b/src/tools/GCLogParser/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000000..047ddd688e
--- /dev/null
+++ b/src/tools/GCLogParser/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("parse-hb-log")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("parse-hb-log")]
+[assembly: AssemblyCopyright("Copyright ©  2019")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.  If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("6ae26cd6-c971-48d6-8c03-2ffa272b942c")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
diff --git a/src/tools/GCLogParser/parse-hb-log.cs b/src/tools/GCLogParser/parse-hb-log.cs
new file mode 100644
index 0000000000..44ac0671bf
--- /dev/null
+++ b/src/tools/GCLogParser/parse-hb-log.cs
@@ -0,0 +1,1099 @@
+// parse-hb-log.exe -ia 1 -l gclog.pid.log
+// where gclog.pid.log is the log you get from GC with HEAP_BALANCE_LOG. See
+// comments for it in gcpriv.h
+//
+// -ia 1 means to include all timestamp instead of only timestamps where we
+// observed samples. This is usually what you want as it makes it really
+// obvious when a thread is not active enough.
+// 
+// This prints out the following in the console:
+// Samples observed on each thread and thread mapping (from id to index)
+// and it tells you the BFR (Budget Fill Ratio) like this:
+// 
+//Printing out proc [0,[28
+//Total 1074 GCs, avg 0ms, node 0: BFR %90.83
+//Printing out proc [28,[56
+//Total 1074 GCs, avg 0ms, node 1: BFR %89.43
+//
+// and generates 2 logs per numa node - 
+// pass-zero-pass1-nX-ia1-alloc.txt
+// pass-zero-pass1-nX-ia1-thread.txt
+// 
+// thread tells you the thread index running on each proc at each timestamp.
+// 4240| 63₉ | 65₉ | 62₉ | 56₁₀| 87₁₀|109₁₀| 59₉ | 70₁₀| 78₉ | 64₉ | 71₁₀|107₁₀|
+// 
+// 4240 is the 4240th ms since we started recording.
+// the numbers following are the thread indices and the subscript is the # of samples
+// observed during that ms. The tool can do a time unit that's larger than 1ms.
+//
+// alloc tells you which alloc heap the each proc, for the same timestamp 
+// 4240| 56  | 57  | 58ⁱ | 59  | 60  | 61  | 62  | 63  | 64  | 65  | 66  | 67  |
+// 56 means heap 56. The subscript i means we did a SetIdealProcessor during this
+// ms. You may also see 
+// ᵖ meaning we went through the balancing logic due to the proc for the thread changed
+// from the home heap.
+// ᵐ meaning while we were going through balancing logic the proc switched.
+
+using System;
+using System.Collections.Generic;
+using System.Text;
+using System.IO;
+using System.Linq;
+
+namespace parse_hb_log
+{
+    enum HeapBalanceFlagMask
+    {
+        MutipleProcs = 0xf,
+        EnterDueToProc = 0xf0,
+        SetIdeal = 0xf00,
+    };
+    struct SampleInfo
+    {
+        // -1 means uninit.
+        public int tid;
+        public int allocHeap;
+        public int countSamples;
+        // This is to present m/p/i - each takes 4 bits, mask for them
+        // is defined in HeapBalanceFlagMask. Most of the time the flags are
+        // 0s.
+        public int flags;
+        public int idealProcNo;
+
+        public SampleInfo(int _tid, int _allocHeap, int _countSamples, int _flags, int _idealProcNo)
+        {
+            tid = _tid;
+            allocHeap = _allocHeap;
+            countSamples = _countSamples;
+            flags = _flags;
+            idealProcNo = _idealProcNo;
+        }
+    };
+
+    enum PassOneViewType
+    {
+        Thread = 0,
+        AllocHeap = 1,
+        MaxType = 2
+    };
+    class Program
+    {
+        static bool fLogging = false;
+
+        static string ParseString(string str, string strStart, string strEnd, out string strRemaining)
+        {
+            int startIndex = 0;
+            if (strStart != null)
+            {
+                startIndex = str.IndexOf(strStart);
+                if (startIndex == -1)
+                {
+                    Console.WriteLine("couldn't find {0} in \"{1}\"!!!", strStart, str);
+                    strRemaining = str;
+                    return null;
+                }
+
+                startIndex += strStart.Length;
+            }
+
+            //string strRest = str.Substring(startIndex + strStart.Length);
+            string strRest = str.Substring(startIndex);
+            string strRet = null;
+            //Console.WriteLine(strRest);
+            if (strEnd == null)
+            {
+                strRet = strRest;
+                strRemaining = null;
+            }
+            else
+            {
+                int endIndex = strRest.IndexOf(strEnd);
+                strRet = strRest.Substring(0, endIndex);
+                strRemaining = strRest.Substring(endIndex);
+            }
+
+            //Console.WriteLine("string is --{0}--, remaining --{1}--", strRet, strRemaining);
+            return strRet;
+        }
+
+        static StreamWriter swPassZero = null;
+        static string strPassZeroLog = "pass-zero.txt";
+        static int totalProcs = 0;
+        static int totalNodes = 0;
+        static int procsPerNode = 0;
+        // this is qpf / 1000 so we calculate ms instead of s.
+        static UInt64 qpfAdjusted = 0;
+        // we log the current qpc so substract by this.
+        static UInt64 qpcStart = 0;
+        static Int32 totalAllocThreads = 0;
+        static Dictionary<string, int> threadMapping = new Dictionary<string, int>(112);
+        // We do compress the samples, this stores the aggregated sample counts for all procs.
+        static int[] aggregatedSampleCount = null;
+
+        static int timeUnitMS = 1;
+        // We wanna compress the log by compressing the samples in the same unit of time.
+        // We get samples by procs, so on the same proc if we observe the same thread 
+        // allocating on the same alloc heap in the same time unit we simply count the total
+        // samples. However we do want to remember if any of the p/m/i is set. 
+        static string strLastTID = null;
+        static string strTID = null;
+        static string strLastThreadAllocHeap = null;
+        static string strAllocHeap = null;
+        static string strIdealProc = null;
+        static int lastTimeUnit = 0;
+        static int lastThreadSampleCount = 0;
+        static int lastThreadPCount = 0;
+        static int lastThreadMCount = 0;
+        static int lastThreadICount = 0;
+        static int lastProcIndex = -1;
+        static int largestTimeInBetweenGCs = 0;
+
+        static void InitSampleInfoPassZero()
+        {
+            strLastTID = null;
+            strTID = null;
+            strLastThreadAllocHeap = null;
+            strAllocHeap = null;
+            lastTimeUnit = 0;
+            lastThreadSampleCount = 0;
+            lastThreadPCount = 0;
+            lastThreadMCount = 0;
+            lastThreadICount = 0;
+        }
+
+        static void LogPassZeroAggregatedSample(int _lastTimeUnit, int _tid, int _sampleCount, string _strAllocHeap)
+        {
+            swPassZero.WriteLine("{0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}({7})",
+                _lastTimeUnit,
+                _tid,
+                _sampleCount,
+                _strAllocHeap,
+                lastThreadMCount, lastThreadPCount, lastThreadICount,
+                strIdealProc);
+
+            (aggregatedSampleCount[lastProcIndex])++;
+        }
+
+        // Aside from writing out a new log this also prints out some stats for further processing:
+        // How many samples are observed on each proc. Because we could use this for many proc analysis,
+        // it's impractical to have a wide enough window to display all procs.
+        static void PassZero(string strLog)
+        {
+            swPassZero = new StreamWriter(strPassZeroLog);
+            string strTemp, strRemaining;
+            using (StreamReader sr = new StreamReader(strLog))
+            {
+                string s;
+
+                while ((s = sr.ReadLine()) != null)
+                {
+                    if (s.StartsWith("["))
+                    {
+                        string strAfterTID = ParseString(s, "]", null, out strRemaining);
+                        //Console.WriteLine(s);
+
+                        if (strAfterTID.StartsWith("qpf"))
+                        {
+                            // [15900]qpf=10000000, start: 17262240813778(1726224081)
+                            strTemp = ParseString(s, "qpf=", ",", out strRemaining);
+                            qpfAdjusted = UInt64.Parse(strTemp) / 1000;
+                            strTemp = ParseString(strRemaining, "start:", "(", out strRemaining);
+                            qpcStart = UInt64.Parse(strTemp);
+                            Console.WriteLine("QPF adjusted: {0}, init QPC: {1}", qpfAdjusted, qpcStart);
+                        }
+                        else if (strAfterTID.StartsWith("total: "))
+                        {
+                            // [15900]total: 112, numa: 2
+                            strTemp = ParseString(strAfterTID, "total: ", ",", out strRemaining);
+                            totalProcs = Int32.Parse(strTemp);
+                            strTemp = ParseString(strRemaining, "numa: ", null, out strRemaining);
+                            totalNodes = Int32.Parse(strTemp);
+                            Console.WriteLine("total procs: {0}, nodes: {1}", totalProcs, totalNodes);
+                            procsPerNode = totalProcs / totalNodes;
+                            swPassZero.WriteLine("P: {0}, N: {1}", totalProcs, totalNodes);
+                            aggregatedSampleCount = new int[totalProcs];
+                        }
+                        else if (strAfterTID == "[GC_alloc_mb]")
+                        {
+                            if (strTID != null)
+                            {
+                                LogPassZeroAggregatedSample(
+                                //swPassZero.WriteLine("last sample before GC: {0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}",
+                                    lastTimeUnit,
+                                    threadMapping[strTID],
+                                    lastThreadSampleCount,
+                                    strAllocHeap);
+                                InitSampleInfoPassZero();
+                            }
+
+                            swPassZero.WriteLine("[GC_alloc_mb]");
+                            //[36812][GC_alloc_mb]
+                            //[N# 24]0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+                            //[N# 24]0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+                            int totalNodesRead = 0;
+                            while ((s = sr.ReadLine()) != null)
+                            {
+                                if (s.StartsWith("[N"))
+                                {
+                                    //Console.WriteLine(s);
+                                    swPassZero.WriteLine(s);
+                                    totalNodesRead++;
+                                    if (totalNodesRead == totalNodes)
+                                        break;
+                                }
+                            }
+                        }
+                        else if (strAfterTID.StartsWith("[p"))
+                        {
+                            if (strTID != null)
+                            {
+                                //swPassZero.WriteLine("last sample before p: {0}ms,{1}({2}),{3},m:{4},p:{5},i:{6}",
+                                LogPassZeroAggregatedSample(
+                                    lastTimeUnit,
+                                    threadMapping[strTID],
+                                    lastThreadSampleCount,
+                                    strAllocHeap);
+                                InitSampleInfoPassZero();
+                            }
+
+                            // [36812][p22]-192-254ms
+                            //Console.WriteLine(strAfterTID);
+                            swPassZero.WriteLine(strAfterTID);
+                            strTemp = ParseString(strAfterTID, "p", "]", out strRemaining);
+                            lastProcIndex = Int32.Parse(strTemp);
+                        }
+                        else if (strAfterTID.StartsWith("[GCA#"))
+                        {
+                            // 270 is the time elapsed in ms since the last GC end.
+                            // 18280955362126/18280957621306 are the min/max timestamps observed 
+                            // on all procs for this period.
+                            // [39652][GCA#1 270-18280955362126-18280957621306]
+                            //Console.WriteLine(strAfterTID);
+                            // convert the raw ts to relative ms.
+                            strTemp = ParseString(strAfterTID, "-", "-", out strRemaining);
+                            //Console.WriteLine("min ts is {0}", strTemp);
+                            UInt64 minTimestampMS = UInt64.Parse(strTemp);
+                            minTimestampMS /= qpfAdjusted;
+                            strTemp = ParseString(strRemaining, "-", "]", out strRemaining);
+                            //Console.WriteLine("max ts is {0}", strTemp);
+                            UInt64 maxTimestampMS = UInt64.Parse(strTemp);
+                            maxTimestampMS /= qpfAdjusted;
+                            strTemp = ParseString(strAfterTID, null, "-", out strRemaining);
+                            swPassZero.WriteLine("{0}-{1}-{2}", strTemp, minTimestampMS, maxTimestampMS);
+
+                            largestTimeInBetweenGCs = Math.Max(largestTimeInBetweenGCs, (int)(maxTimestampMS - minTimestampMS));
+                        }
+                        else if (strAfterTID.StartsWith("[GC#"))
+                        {
+                            // 12 is GC duration in ms
+                            // [15628][GC#1-12]
+                            //Console.WriteLine(strAfterTID);
+                            swPassZero.WriteLine(strAfterTID);
+                        }
+                        else if (strAfterTID.StartsWith("TEMP"))
+                        {
+                            // If I want to only log something temporarily I prefix it with TEMP so we know to just 
+                            // write it out as is.
+                            swPassZero.WriteLine(strAfterTID);
+                        }
+                        else
+                        {
+                            // This is the majority of the log -
+                            // qpc,tid,ideal_proc,alloc_heap
+                            // |m means we observed the thread changed the proc it was running on
+                            // during the iterations we do in balance_heaps since we check at various
+                            // points for the current proc.
+                            // |p means we detected proc != home_heap proc
+                            // |i means we did SetIdeal
+                            //[38328]12791000,80,24540|m,101|p
+                            //[38328]12792000,80,24540,101
+                            //[38328]12801000,90,33356,111|p
+                            //[38328]12802000,90,33356,111
+                            //
+                            // this will be converted to 
+                            //1279ms,2(2),80,101,m:1,p:1,i:0
+                            //1280ms,3(2),90,111,m:0,p:1,i:0
+                            //
+                            // the format is:
+                            // relative time from start in ms, thread index(sample count during the unit of time),
+                            // count for m/p/i observed during this unit of time.
+                            //
+                            strTemp = ParseString(s, "]", ",", out strRemaining);
+                            //UInt64 currentQPC = UInt64.Parse(strTemp);
+                            UInt64 currentQPC = 0;
+                            if (!UInt64.TryParse(strTemp, out currentQPC))
+                            {
+                                continue;
+                            }
+                            if (fLogging)
+                            {
+                                //Console.WriteLine("ts: {0}-{1}={2}", currentQPC, qpcStart, (currentQPC - qpcStart));
+                                Console.WriteLine();
+                                Console.WriteLine(strAfterTID);
+                            }
+                            //double timestamp = (double)(currentQPC - qpcStart) / qpfAdjusted;
+                            double timestamp = (double)currentQPC / qpfAdjusted;
+
+                            strTID = ParseString(strRemaining, ",", ",", out strRemaining);
+                            if (strTID == "15580|m")
+                            {
+                                Console.WriteLine(s);
+                            }
+                            if (!threadMapping.ContainsKey(strTID))
+                            {
+                                threadMapping.Add(strTID, totalAllocThreads);
+                                if (fLogging)
+                                {
+                                    Console.WriteLine("Adding {0} as T#{1}", strTID, totalAllocThreads);
+                                }
+                                totalAllocThreads++;
+                            }
+
+                            strIdealProc = ParseString(strRemaining, ",", ",", out strRemaining);
+
+                            bool multiple_procs_p = false;
+                            bool alloc_count_p = true;
+                            bool set_ideal_p = false;
+
+                            if (strRemaining.Contains("|"))
+                            {
+                                strAllocHeap = ParseString(strRemaining, ",", "|", out strRemaining);
+                                if (strRemaining.Contains("m"))
+                                    multiple_procs_p = true;
+                                if (strRemaining.Contains("p"))
+                                    alloc_count_p = false;
+                                if (strRemaining.Contains("i"))
+                                    set_ideal_p = true;
+                            }
+                            else
+                            {
+                                strAllocHeap = ParseString(strRemaining, ",", null, out strRemaining);
+                            }
+
+                            if (fLogging)
+                            {
+                                Console.WriteLine("{0:0.00}ms, tid: {1}/{2}, alloc heap: {3}, m: {4}, p: {5}, i: {6}, ideal {7}",
+                                    timestamp, strTID, threadMapping[strTID],
+                                    strAllocHeap, multiple_procs_p, !alloc_count_p, set_ideal_p,
+                                    strIdealProc);
+                            }
+                            //swPassZero.WriteLine("{0:0.00}ms,{1},{2},m:{3},p:{4},i:{5}",
+                            //    timestamp, 
+                            //    threadMapping[strTID],
+                            //    strAllocHeap, 
+                            //    (multiple_procs_p ? "T" : ""), 
+                            //    (!alloc_count_p ? "T" : ""), 
+                            //    (set_ideal_p? "T" : ""));
+
+                            if ((strTID == strLastTID) && (strLastThreadAllocHeap == strAllocHeap))
+                            {
+                                if (((int)timestamp - lastTimeUnit) >= timeUnitMS)
+                                {
+                                    if (fLogging)
+                                    {
+                                        Console.WriteLine("time is now {7}>{0}+{8},{1}({2}),{3},m:{4},p:{5},i:{6}",
+                                            lastTimeUnit,
+                                            threadMapping[strTID],
+                                            lastThreadSampleCount,
+                                            strAllocHeap,
+                                            lastThreadMCount, lastThreadPCount, lastThreadICount,
+                                            (int)timestamp, timeUnitMS);
+                                    }
+                                    LogPassZeroAggregatedSample(
+                                        lastTimeUnit,
+                                        threadMapping[strTID],
+                                        lastThreadSampleCount,
+                                        strAllocHeap);
+                                    if (fLogging)
+                                    {
+                                        Console.WriteLine("Set last time to {0} for thread {1}, alloc {2}",
+                                            (int)timestamp, strLastTID, strLastThreadAllocHeap);
+                                    }
+                                    lastTimeUnit = (int)timestamp;
+                                    lastThreadSampleCount = 0;
+                                    lastThreadMCount = lastThreadPCount = lastThreadICount = 0;
+                                }
+                            }
+                            else
+                            {
+                                if (strLastTID != null)
+                                {
+                                    // If we detect the thread or alloc heap changed, we always want to 
+                                    // log.
+                                    if (fLogging)
+                                    {
+                                        Console.WriteLine("{0},{1}({2}),{3},m:{4},p:{5},i:{6} -> {7}/{8}",
+                                            lastTimeUnit,
+                                            threadMapping[strLastTID],
+                                            lastThreadSampleCount,
+                                            strLastThreadAllocHeap,
+                                            lastThreadMCount, lastThreadPCount, lastThreadICount,
+                                            strTID, strAllocHeap);
+                                    }
+                                    LogPassZeroAggregatedSample(
+                                        lastTimeUnit,
+                                        threadMapping[strLastTID],
+                                        lastThreadSampleCount,
+                                        strLastThreadAllocHeap);
+                                }
+
+                                if (fLogging)
+                                {
+                                    Console.WriteLine("last tid {0}, last alloc heap {1}, diff, set last time to {2}",
+                                        strLastTID, strLastThreadAllocHeap, (int)timestamp);
+                                }
+                                strLastTID = strTID;
+                                strLastThreadAllocHeap = strAllocHeap;
+                                lastTimeUnit = (int)timestamp;
+                                lastThreadSampleCount = 0;
+                                lastThreadPCount = 0;
+                                lastThreadMCount = 0;
+                                lastThreadICount = 0;
+                            }
+
+                            lastThreadSampleCount++;
+                            if (multiple_procs_p) lastThreadMCount++;
+                            if (!alloc_count_p) lastThreadPCount++;
+                            if (set_ideal_p) lastThreadICount++;
+
+                            //if ((lastThreadMCount > 1) || (lastThreadPCount > 1) || (lastThreadICount > 1))
+                            //{
+                            //    swPassZero.WriteLine("DETECTED: m: {0}, p: {1}, i: {2}",
+                            //        lastThreadMCount, lastThreadPCount, lastThreadICount);
+                            //}
+
+                            if (fLogging)
+                            {
+                                Console.WriteLine("timeunit {0}, sample->{1}, MCount->{2}, PCount->{3}, ICount->{4}",
+                                    lastTimeUnit, lastThreadSampleCount,
+                                    lastThreadMCount, lastThreadPCount, lastThreadICount);
+                            }
+                        }
+                    }
+                }
+            }
+
+            Console.WriteLine("{0,-3} | {1,-10}", "p", "samples");
+            for (int procIndex = 0; procIndex < totalProcs; procIndex++)
+            {
+                if (aggregatedSampleCount[procIndex] != 0)
+                {
+                    Console.WriteLine("{0,-3} | {1,-10} ", procIndex, aggregatedSampleCount[procIndex]);
+                }
+            }
+
+            Console.WriteLine("---Threading mapping---");
+            foreach (var t in threadMapping)
+            {
+                Console.WriteLine("t#{0,-3} - {1}", t.Key, t.Value);
+            }
+
+            Console.WriteLine("Largest time span inbetween GCs: {0}ms", largestTimeInBetweenGCs);
+            swPassZero.Close();
+        }
+
+        //=========================================================================
+        // Pass one. TODO: should separate this from pass zero.
+        //=========================================================================
+        static StreamWriter[] swPassOneFiles = null;
+        // It's difficult to print all of the procs on one line so we only print per node. 
+        static int nodeIndexToPrint = -1;
+        static bool fIncludeAllTime = false;
+        static bool fPrintThreadInfoPerTimeUnit = false;
+
+        // This represents the samples for all the procs inbetween GCs so it doesn't grow very large.
+        // Cleared every GC.
+        static SampleInfo[][] samples;
+        // This is the min/max time for inbetween each GC. 
+        static int startTimeMS = 0;
+        static int endTimeMS = 0;
+        static int currentProcIndex = -1;
+        static int lastTimeIndex = -1;
+
+        // This respresents the threads we see in samples so we can get info such as when threads
+        // start running and how active they are.
+        // Cleared every GC.
+        static List<int> threadsSeenPerTimeUnit = new List<int>(56);
+        static Dictionary<int, int> threadsSeen = new Dictionary<int, int>(56);
+        static Dictionary<int, int> threadsSeenTotal = new Dictionary<int, int>(56);
+        static List<int> threadsToPrint = new List<int>(8);
+        static string strThreadIndices = null;
+        static int gcIndexToPrintStart = -1;
+        static int gcIndexToPrintEnd = 1000000;
+        static string strGCRange = null;
+        static int totalGCCount = 0;
+        static int totalGCDurationMS = 0;
+        static int totalAllocMB = 0;
+        static int totalBudgetMB = 0;
+
+        // This represents allocated MB on each heap; parse from the lines following [GC_alloc_mb]
+        static int[] AllocMB;
+        static int budgetMB = 0;
+        // These are subscript chars for 0-9.
+        static char[] unicodeChars = { '\x2080', '\x2081', '\x2082', '\x2083', '\x2084', '\x2085', '\x2086', '\x2087', '\x2088', '\x2089' };
+        static string[] passOneFileTypes = { "thread", "alloc" };
+
+        static void PrintToAllPassOneFiles(string strLine)
+        {
+            for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++)
+            {
+                swPassOneFiles[fileIndex].WriteLine(strLine);
+            }
+        }
+
+        static void PrintAllocToAllPassOneFiles()
+        {
+            string strAlloc = string.Format("{0,6}", nodeIndexToPrint);
+            int procStart = procsPerNode * nodeIndexToPrint;
+            int procEnd = procsPerNode * (nodeIndexToPrint + 1);
+
+            int currentGCAllocMBAllHeaps = 0;
+            int currentGCAllocMB = 0;
+            for (int procIndex = procStart; procIndex < procEnd; procIndex++)
+            {
+                currentGCAllocMB += AllocMB[procIndex];
+                //Console.WriteLine("P#{0,2} alloc {1,3}mb, total is now {2,3}mb", 
+                //    procIndex, AllocMB[procIndex],
+                //    currentGCAllocMB);
+            }
+
+            currentGCAllocMBAllHeaps = currentGCAllocMB;
+            currentGCAllocMB /= procsPerNode;
+            totalAllocMB += currentGCAllocMB;
+            totalBudgetMB += budgetMB;
+
+            //Console.WriteLine("N#{0} GC#{1,2} alloc {2,3}mb, budget {3,3}mb, total alloc {4,3}mb, total budget {5,3}mb",
+            //    nodeIndexToPrint, totalGCCount, currentGCAllocMB, budgetMB, totalAllocMB, totalBudgetMB);
+
+            //Console.WriteLine("{0,3}mb - {1,3}mb (%{2:f2} - {3}ms ({4:f2}mb/ms)",
+            //    budgetMB, currentGCAllocMB, ((currentGCAllocMB * 100.0) / budgetMB),
+            //    (endTimeMS - startTimeMS),
+            //    ((double)currentGCAllocMBAllHeaps / (endTimeMS - startTimeMS)));
+
+            int procIndexBase = nodeIndexToPrint * procsPerNode;
+            for (int procIndex = procIndexBase; procIndex < (procIndexBase + procsPerNode); procIndex++)
+            {
+                strAlloc += string.Format("|{0,5}", AllocMB[procIndex]);
+            }
+            strAlloc += string.Format("|");
+            PrintToAllPassOneFiles(strAlloc);
+        }
+        static void CloseAllPassOneFiles()
+        {
+            Console.WriteLine("Total {0} GCs, avg {1}ms, node {2}: BFR %{3:f2}",
+                totalGCCount,
+                (totalGCDurationMS / totalGCCount),
+                nodeIndexToPrint,
+                (totalAllocMB * 100.0 / totalBudgetMB));
+
+            for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++)
+            {
+                swPassOneFiles[fileIndex].Close();
+            }
+        }
+
+        static void PrintHeader()
+        {
+            // TEMP..
+            //procsPerNode = 16;
+            int procStart = procsPerNode * nodeIndexToPrint;
+            int procEnd = procsPerNode * (nodeIndexToPrint + 1);
+
+            Console.WriteLine("Printing out proc [{0},[{1}", procStart, procEnd);
+
+            swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("##########Thread view - indices of threads currently running on each proc in between GCs\n");
+            swPassOneFiles[(int)PassOneViewType.AllocHeap].WriteLine("##########Alloc heap view - alloc heaps for threads currently running on each proc in between GCs\n");
+            string strHeader = string.Format("{0,6}", "ms");
+            for (int procIndex = procStart; procIndex < procEnd; procIndex++)
+            {
+                strHeader += string.Format("|{0,5}", procIndex);
+            }
+            strHeader += string.Format("|");
+            PrintToAllPassOneFiles(strHeader);
+        }
+
+        // Note that I'm only allocating 2 chars for count and we don't expect to have more than 99 of them.
+        static string FormatCount(int count)
+        {
+            if (count > 99)
+            {
+                Console.WriteLine("actually have {0} samples in {1}ms!!!!", count, timeUnitMS);
+                count = 99;
+            }
+
+            string strFormattedCount = null;
+
+            if (count >= 10)
+            {
+                strFormattedCount += unicodeChars[count / 10];
+            }
+
+            strFormattedCount += unicodeChars[count % 10];
+            return strFormattedCount;
+        }
+
+        static string FormatFlags(int flags)
+        {
+            string strFormattedFlags = "";
+            if ((flags & (int)HeapBalanceFlagMask.MutipleProcs) != 0)
+            {
+                strFormattedFlags += "ᵐ";
+            }
+            if ((flags & (int)HeapBalanceFlagMask.EnterDueToProc) != 0)
+            {
+                strFormattedFlags += "ᵖ";
+            }
+            if ((flags & (int)HeapBalanceFlagMask.SetIdeal) != 0)
+            {
+                strFormattedFlags += "ⁱ";
+            }
+
+            return strFormattedFlags;
+        }
+
+        // We display the following -
+        // Each row represents a time unit, each column represents a proc.
+        // Each element represents a piece of info on that proc for that time.
+        // This info could be the thread index, or the alloc heap, along with
+        // things like m/p/i or count of samples for that time period for that
+        // thread and alloc heap.
+        // 
+        // There could be multiple entries for the same time on a proc because
+        // we happen to observe multiple threads running during that time. 
+        // But we only store one piece of info. Here are the rules to decide 
+        // what we store -
+        // We take the last one on the same time unless we see one with interesting
+        // info (ie, one of m/p/i isn't 0); or one with higher sample count.
+        // 
+        static void PrintProcActivityOnNode()
+        {
+            // TEMP..
+            //procsPerNode = 16;
+            int totalTimeUnits = (endTimeMS - startTimeMS) / timeUnitMS + 1;
+            int procStart = procsPerNode * nodeIndexToPrint;
+            int procEnd = procsPerNode * (nodeIndexToPrint + 1);
+
+            bool fCheckThreadIndex = (threadsToPrint.Count != 0);
+
+            threadsSeen.Clear();
+
+            for (int timeIndex = 0; timeIndex < totalTimeUnits; timeIndex++)
+            {
+                bool fPrint = fIncludeAllTime;
+
+                if (!fPrint)
+                {
+                    // Only write something if there's samples observed on at least one proc.
+                    for (int procIndex = procStart; procIndex < procEnd; procIndex++)
+                    {
+                        if (samples[timeIndex][procIndex].tid != -1)
+                        {
+                            fPrint = true;
+                            break;
+                        }
+                    }
+                }
+
+                // see https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
+                // for subscript characters
+                // ᵐ,ᵖ,ⁱ
+                if (fPrint)
+                {
+                    int procsHadSamples = 0;
+                    if (fPrintThreadInfoPerTimeUnit)
+                        threadsSeenPerTimeUnit.Clear();
+
+                    string strCellFormat = "|{0,3}{1,-2}";
+                    string strThread = string.Format("{0,6}", (timeIndex + startTimeMS));
+                    string strAllocHeap = string.Format("{0,6}", (timeIndex + startTimeMS));
+
+                    for (int procIndex = procStart; procIndex < procEnd; procIndex++)
+                    {
+                        SampleInfo currentSample = samples[timeIndex][procIndex];
+                        int tid = currentSample.tid;
+                        bool fIncludeThisSample = true;
+
+                        if (fCheckThreadIndex)
+                        {
+                            if (!threadsToPrint.Contains(tid))
+                            {
+                                fIncludeThisSample = false;
+                            }
+                        }
+
+                        if ((tid == -1) || !fIncludeThisSample)
+                        {
+                            strThread += string.Format(strCellFormat, "", "");
+                            strAllocHeap += string.Format(strCellFormat, "", "");
+                        }
+                        else
+                        {
+                            if (fPrintThreadInfoPerTimeUnit)
+                            {
+                                if (!threadsSeenPerTimeUnit.Contains(tid))
+                                    threadsSeenPerTimeUnit.Add(tid);
+                                procsHadSamples++;
+                            }
+
+                            if (!threadsSeen.ContainsKey(tid))
+                                threadsSeen.Add(tid, currentSample.countSamples);
+                            else
+                                threadsSeen[tid] += currentSample.countSamples;
+
+                            string strCount = ((currentSample.countSamples > 0) ? FormatCount(currentSample.countSamples) : "");
+                            strThread += string.Format(strCellFormat, tid.ToString(), strCount);
+
+                            string strFlags = FormatFlags(currentSample.flags);
+                            strAllocHeap += string.Format(strCellFormat, currentSample.allocHeap.ToString(), strFlags);
+                        }
+                    }
+                    //swPassOne.WriteLine("|");
+                    strThread += "|";
+                    swPassOneFiles[(int)PassOneViewType.Thread].WriteLine(strThread);
+
+                    if (fPrintThreadInfoPerTimeUnit)
+                    {
+                        swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("----{0,3} threads on {1,3} procs----",
+                            threadsSeenPerTimeUnit.Count, procsHadSamples);
+                    }
+
+                    strAllocHeap += "|";
+                    swPassOneFiles[(int)PassOneViewType.AllocHeap].WriteLine(strAllocHeap);
+                }
+            }
+
+            //var threadsSeenOrdered = threadsSeen.OrderByDescending(i => i.Value);
+            //foreach (var item in threadsSeenOrdered)
+            //{
+            //    int k = item.Key;
+            //    int v = item.Value;
+            //    swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("tid: {0,5}-{1,-5}", k, v);
+            //    if (!threadsSeenTotal.ContainsKey(k))
+            //        threadsSeenTotal.Add(k, v);
+            //    else
+            //        threadsSeenTotal[k] += v;
+            //}
+        }
+
+        static int GetAdjustedProcIndex(int pIndex)
+        {
+            return (pIndex % procsPerNode);
+        }
+
+        static void ParseThreadIndices(string _strThreadIndices)
+        {
+            strThreadIndices = _strThreadIndices;
+            string[] fields = strThreadIndices.Split(new Char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
+            for (int i = 0; i < fields.Length; i++)
+            {
+                threadsToPrint.Add(Int32.Parse(fields[i]));
+            }
+
+            int len = threadsToPrint.Count;
+            Console.WriteLine("getting info for {0} threads", len);
+            for (int i = 0; i < len; i++)
+            {
+                Console.WriteLine(threadsToPrint[i]);
+            }
+        }
+
+        static void ParseGCRange(string _strGCRange)
+        {
+            strGCRange = _strGCRange;
+            int dashIndex= strGCRange.IndexOf('-');
+            if (dashIndex== -1)
+            {
+                Console.WriteLine("Invalid GC range {0}", strGCRange);
+                return;
+            }
+
+            string strStart = strGCRange.Substring(0, dashIndex);
+            string strEnd = strGCRange.Substring(dashIndex + 1);
+
+            if (strStart != "start")
+                gcIndexToPrintStart = Int32.Parse(strStart);
+            if (strEnd != "end")
+                gcIndexToPrintEnd = Int32.Parse(strEnd);
+
+            Console.WriteLine("printing GC {0}->{1}", gcIndexToPrintStart, gcIndexToPrintEnd);
+        }
+
+        static void PassOne(string strPassZeroLog)
+        {
+            // This generates 2 files - thread view and alloc heap view. This is just so you could
+            // compare them SxS. 
+            string strLogNameWithoutExtension = Path.GetFileNameWithoutExtension(strPassZeroLog);
+            string strIncludeAll = (fIncludeAllTime ? "-ia1-" : "-ia0-");
+            string strTI = ((strThreadIndices == null) ? "" : strThreadIndices);
+            string strPassOneLog = strLogNameWithoutExtension + "-pass1-n" + nodeIndexToPrint + strIncludeAll + strTI;
+            
+            if (strGCRange != null)
+                strPassOneLog += "-" + strGCRange + "-";
+
+            swPassOneFiles = new StreamWriter[(int)PassOneViewType.MaxType];
+            for (int fileIndex = 0; fileIndex < (int)PassOneViewType.MaxType; fileIndex++)
+            {
+                swPassOneFiles[fileIndex] = new StreamWriter(strPassOneLog + passOneFileTypes[fileIndex] + ".txt");
+            }
+
+            string strTemp, strRemaining;
+            using (StreamReader sr = new StreamReader(strPassZeroLog))
+            {
+                string s;
+                bool fSkip = false;
+
+                while ((s = sr.ReadLine()) != null)
+                {
+                    if (s.StartsWith("P:"))
+                    {
+                        //P: 112, N: 2
+                        strTemp = ParseString(s, "P: ", ",", out strRemaining);
+                        totalProcs = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, "N: ", null, out strRemaining);
+                        totalNodes = Int32.Parse(strTemp);
+                        procsPerNode = totalProcs / totalNodes;
+                        PrintHeader();
+
+                        int timeUnits = largestTimeInBetweenGCs / timeUnitMS + 1;
+                        //Console.WriteLine("allocating {0} columns, {1} rows", totalProcs, timeUnits);
+                        samples = new SampleInfo[timeUnits][];
+                        for (int timeIndex = 0; timeIndex < timeUnits; timeIndex++)
+                        {
+                            samples[timeIndex] = new SampleInfo[totalProcs];
+                            for (int procIndex = 0; procIndex < totalProcs; procIndex++)
+                            {
+                                samples[timeIndex][procIndex].tid = -1;
+                            }
+                        }
+
+                        // Note - I'm allocating totalProcs elements but there are not necessarily that many
+                        // heaps. But there can't be more heaps than procs so we just waste some space here which
+                        // is ok.
+                        AllocMB = new int[totalProcs];
+
+                        totalGCCount = 0;
+                        totalGCDurationMS = 0;
+                        totalAllocMB = 0;
+                        totalBudgetMB = 0;
+                    }
+                    else if (s.StartsWith("TEMP"))
+                    {
+                        // If I want to only log something temporarily I prefix it with TEMP so we know to just 
+                        // write it out as is.
+                        //PrintToAllPassOneFiles(s);
+                    }
+                    else if (s.StartsWith("[GCA#"))
+                    {
+                        // Clear what we recorded for the last GC.
+                        int usedEntryCount = (endTimeMS - startTimeMS) / timeUnitMS + 1;
+                        //Console.WriteLine("Clearing {0} time entries", usedEntryCount);
+                        for (int timeIndex = 0; timeIndex < usedEntryCount; timeIndex++)
+                        {
+                            for (int procIndex = 0; procIndex < totalProcs; procIndex++)
+                            {
+                                samples[timeIndex][procIndex].tid = -1;
+                            }
+                        }
+
+                        // GCA#gc_index time_since_last_gc-min_time-max_time
+                        //[GCA#1 270-42-268
+                        strTemp = ParseString(s, "A#", " ", out strRemaining);
+                        int gcIndex = Int32.Parse(strTemp);
+
+                        fSkip = !((gcIndex >= gcIndexToPrintStart) && (gcIndex <= gcIndexToPrintEnd));
+
+                        if (fSkip)
+                        {
+                            //Console.WriteLine("GC#{0} is skipped", gcIndex);
+                            continue;
+                        }
+                        else
+                        {
+                            //Console.WriteLine("GC#{0} is NOT skipped", gcIndex);
+                        }
+                        //swPassOne.WriteLine("[GC#{0}-GC#{1}]", (gcIndex - 1), gcIndex);
+                        strTemp = ParseString(s, "-", "-", out strRemaining);
+                        startTimeMS = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, "-", null, out strRemaining);
+                        endTimeMS = Int32.Parse(strTemp);
+                        //Console.WriteLine("Before GC#{0} time {1} {2}, {3} entries",
+                        //    gcIndex, startTimeMS, endTimeMS, ((endTimeMS - startTimeMS) / timeUnitMS));
+                    }
+                    else if (s.StartsWith("[p"))
+                    {
+                        if (fSkip)
+                            continue;
+
+                        //[p78]-192-225ms
+                        strTemp = ParseString(s, "[p", "]", out strRemaining);
+                        currentProcIndex = Int32.Parse(strTemp);
+                        lastTimeIndex = -1;
+                        //currentProcIndex = GetAdjustedProcIndex(currentProcIndex);
+                    }
+                    else if (s.StartsWith("[GC_"))
+                    {
+                        if (fSkip)
+                            continue;
+
+                        //[GC_alloc_mb]
+                        //[N# 24]17,16,16,15,16,16,17,16,16,15,17,16,15,16,17,16,17,15,16,16,16,16,17,16,16,16,16,16,16,16,16,16,17,15,16,16,16,16,16,16,16,15,16,16,16,16,16,16,16,16,16,15,16,16,16,16,
+                        //[N# 24]21,22,22,23,22,22,21,22,22,22,23,21,22,22,22,22,22,22,22,21,21,24,4,22,21,22,22,22,22,22,22,22,22,23,21,21,22,23,23,22,21,22,23,21,22,22,22,22,22,22,22,21,21,22,21,22,
+                        int totalNodesRead = 0;
+                        while ((s = sr.ReadLine()) != null)
+                        {
+                            if (s.StartsWith("[N"))
+                            {
+                                //Console.WriteLine(s);
+                                int procIndexBase = totalNodesRead * procsPerNode;
+                                strTemp = ParseString(s, "[N#", "]", out strRemaining);
+                                budgetMB = Int32.Parse(strTemp);
+                                string strAllocLine = s.Substring(7);
+                                //Console.WriteLine("spliting {0}", strAllocLine);
+                                string[] fieldsAlloc = strAllocLine.Split(new Char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
+                                for (int fieldIndex = 0; fieldIndex < procsPerNode; fieldIndex++)
+                                {
+                                    AllocMB[procIndexBase + fieldIndex] = Int32.Parse(fieldsAlloc[fieldIndex]);
+                                }
+                                totalNodesRead++;
+                                if (totalNodesRead == totalNodes)
+                                    break;
+                            }
+                        }
+                    }
+                    else if (s.StartsWith("[GC#"))
+                    {
+                        if (fSkip)
+                            continue;
+
+                        //Console.WriteLine(s);
+                        PrintProcActivityOnNode();
+                        PrintAllocToAllPassOneFiles();
+
+                        // gc index-duration
+                        //[GC#1-15-2911711]
+                        strTemp = ParseString(s, "-", "-", out strRemaining);
+                        totalGCCount++;
+                        totalGCDurationMS += Int32.Parse(strTemp);
+
+                        //Console.WriteLine(s);
+                        //Console.WriteLine("GC#{0} {1}ms ->total {2}ms", totalGCCount, Int32.Parse(strTemp), totalGCDurationMS);
+
+                        PrintToAllPassOneFiles(s);
+                    }
+                    else if (s.StartsWith("[N#"))
+                    {
+                        if (fSkip)
+                            continue;
+                    }
+                    else
+                    {
+                        if (fSkip)
+                            continue;
+
+                        // Majority of the log, (83) is the ideal proc for that thread.
+                        //1051ms,95(1),83,m:0,p:0,i:0(83)
+                        strTemp = ParseString(s, null, "ms", out strRemaining);
+                        int currentTimeIndex = Int32.Parse(strTemp);
+
+                        if (currentTimeIndex < lastTimeIndex)
+                        {
+                            //Console.WriteLine("!!bad time stamp at p{0} -{1}-", currentProcIndex, s);
+                            continue;
+                        }
+
+                        lastTimeIndex = currentTimeIndex;
+
+                        currentTimeIndex -= startTimeMS;
+                        currentTimeIndex /= timeUnitMS;
+                        strTemp = ParseString(strRemaining, ",", "(", out strRemaining);
+                        int tid = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, "(", ")", out strRemaining);
+                        int countSamples = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, ",", ",", out strRemaining);
+                        int allocHeap = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, ",m:", ",", out strRemaining);
+                        int flags = Int32.Parse(strTemp);
+                        strTemp = ParseString(strRemaining, ",p:", ",", out strRemaining);
+                        flags |= Int32.Parse(strTemp) << 4;
+                        strTemp = ParseString(strRemaining, ",i:", "(", out strRemaining);
+                        flags |= Int32.Parse(strTemp) << 8;
+                        strTemp = ParseString(strRemaining, "(", ")", out strRemaining);
+                        int idealProcNo = Int32.Parse(strTemp);
+                        //Console.WriteLine("ADDING time {0}ms, entry {1}, thread #{2}, ah {3}, ideal {4}",
+                        //    currentTimeIndex, currentProcIndex, tid, allocHeap, idealProcNo);
+                        samples[currentTimeIndex][currentProcIndex] = new SampleInfo(tid, allocHeap, countSamples, flags, idealProcNo);
+                    }
+                }
+            }
+
+            var threadsSeenTotalOrdered = threadsSeenTotal.OrderByDescending(i => i.Value);
+            swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("\n-----------Total samples per thread-----------");
+            foreach (var item in threadsSeenTotalOrdered)
+            {
+                int k = item.Key;
+                int v = item.Value;
+                swPassOneFiles[(int)PassOneViewType.Thread].WriteLine("tid: {0,5}-{1,-5}", k, v);
+            }
+
+            CloseAllPassOneFiles();
+        }
+
+        // TODO: in pass zero there's merit in assigning thread indices based on the first CPU they appear on,
+        // instead of just assigning one as we come across them.
+        static void Main(string[] args)
+        {
+            int len = args.Length;
+            string strLog = null;
+
+            for (int i = 0; i < args.Length; ++i)
+            {
+                string currentArg = args[i];
+                string currentArgValue;
+                if (currentArg.Equals("-l") || currentArg.Equals("-Log"))
+                {
+                    strLog = args[++i];
+                }
+                else if (currentArg.Equals("-ia") || currentArg.Equals("-IncludeAll"))
+                {
+                    currentArgValue = args[++i];
+                    fIncludeAllTime = (Int32.Parse(currentArgValue) == 1);
+                }
+                else if (currentArg.Equals("-ti") || currentArg.Equals("-ThreadIndices"))
+                {
+                    // This is a comma separated indices
+                    // -ti 88,110
+                    currentArgValue = args[++i];
+                    ParseThreadIndices(currentArgValue);
+                }
+                else if (currentArg.Equals("-gr") || currentArg.Equals("-GCRange"))
+                {
+                    // This specifies a range of GCs, inclusive 
+                    //10-20
+                    //10-end
+                    //start-10
+                    currentArgValue = args[++i];
+                    ParseGCRange(currentArgValue);
+                }
+                else if (currentArg.Equals("-pti") || currentArg.Equals("-PrintThreadInfo"))
+                {
+                    // prints the # of threads we see for each unit of time we display and how many
+                    // total procs they ran on during that time. This shows us how volatile threads
+                    // are jumping between procs.
+                    currentArgValue = args[++i];
+                    if (Int32.Parse(currentArgValue) == 1)
+                        fPrintThreadInfoPerTimeUnit = true;
+                }
+            }
+
+            Console.WriteLine("Processing {0}, {1}", strLog, (fIncludeAllTime ? "full time view" : "compressed time view"));
+            PassZero(strLog);
+            // Feel free to convert PassOne to print out all node's activity at once.
+            // I used pass in the node # to print so did this the lazy way.
+            for (int i = 0; i < totalNodes; i++)
+            {
+                nodeIndexToPrint = i;
+                PassOne(strPassZeroLog);
+            }
+        }
+    }
+}
diff --git a/src/tools/GCLogParser/parse-hb-log.csproj b/src/tools/GCLogParser/parse-hb-log.csproj
new file mode 100644
index 0000000000..2c8953b984
--- /dev/null
+++ b/src/tools/GCLogParser/parse-hb-log.csproj
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{6AE26CD6-C971-48D6-8C03-2FFA272B942C}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <RootNamespace>parse_hb_log</RootNamespace>
+    <AssemblyName>parse-hb-log</AssemblyName>
+    <TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+    <AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
+    <Deterministic>true</Deterministic>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <PlatformTarget>AnyCPU</PlatformTarget>
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <PlatformTarget>AnyCPU</PlatformTarget>
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
+    <DebugSymbols>true</DebugSymbols>
+    <OutputPath>bin\x64\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <DebugType>full</DebugType>
+    <PlatformTarget>x64</PlatformTarget>
+    <ErrorReport>prompt</ErrorReport>
+    <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
+    <Prefer32Bit>true</Prefer32Bit>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
+    <OutputPath>bin\x64\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <Optimize>true</Optimize>
+    <DebugType>pdbonly</DebugType>
+    <PlatformTarget>x64</PlatformTarget>
+    <ErrorReport>prompt</ErrorReport>
+    <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
+    <Prefer32Bit>true</Prefer32Bit>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Net.Http" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="parse-hb-log.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="App.config" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+</Project>
+\ No newline at end of file
diff --git a/src/tools/GCLogParser/parse-hb-log.sln b/src/tools/GCLogParser/parse-hb-log.sln
new file mode 100644
index 0000000000..5ae5dcb65c
--- /dev/null
+++ b/src/tools/GCLogParser/parse-hb-log.sln
@@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.28803.452
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "parse-hb-log", "parse-hb-log.csproj", "{6AE26CD6-C971-48D6-8C03-2FFA272B942C}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Debug|x64 = Debug|x64
+		Release|Any CPU = Release|Any CPU
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|x64.ActiveCfg = Debug|x64
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Debug|x64.Build.0 = Debug|x64
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|Any CPU.Build.0 = Release|Any CPU
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|x64.ActiveCfg = Release|x64
+		{6AE26CD6-C971-48D6-8C03-2FFA272B942C}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {7A3E16A0-5AC4-4DB6-A016-161E57874740}
+	EndGlobalSection
+EndGlobal
diff --git a/src/utilcode/util.cpp b/src/utilcode/util.cpp
index e7091604af..0e142cf76c 100644
--- a/src/utilcode/util.cpp
+++ b/src/utilcode/util.cpp
@@ -738,6 +738,34 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
 {
     return ::GetNumaProcessorNodeEx(proc_no, node_no);
 }
+/*static*/ bool NumaNodeInfo::GetNumaInfo(PUSHORT total_nodes, DWORD* max_procs_per_node)
+{
+    if (m_enableGCNumaAware)
+    {
+        DWORD currentProcsOnNode = 0;
+        for (int i = 0; i < m_nNodes; i++)
+        {
+            GROUP_AFFINITY processorMask;
+            if (GetNumaNodeProcessorMaskEx(i, &processorMask))
+            {
+                DWORD procsOnNode = 0;
+                uintptr_t mask = (uintptr_t)processorMask.Mask;
+                while (mask)
+                {
+                    procsOnNode++;
+                    mask &= mask - 1;
+                }
+
+                currentProcsOnNode = max(currentProcsOnNode, procsOnNode);
+            }
+            *max_procs_per_node = currentProcsOnNode;
+            *total_nodes = m_nNodes;
+        }
+        return true;
+    }
+
+    return false;
+}
 #else // !FEATURE_PAL
 /*static*/ BOOL NumaNodeInfo::GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no)
 {
@@ -747,6 +775,7 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
 #endif
 
 /*static*/ BOOL NumaNodeInfo::m_enableGCNumaAware = FALSE;
+/*static*/ uint16_t NumaNodeInfo::m_nNodes = 0;
 /*static*/ BOOL NumaNodeInfo::InitNumaNodeInfoAPI()
 {
 #if !defined(FEATURE_REDHAWK)
@@ -760,6 +789,8 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
     if (!::GetNumaHighestNodeNumber(&highest) || (highest == 0))
         return FALSE;
 
+    m_nNodes = (USHORT)(highest + 1);
+
     return TRUE;
 #else
     return FALSE;
@@ -1058,6 +1089,7 @@ retry:
         {
             *group_number = i;
             *group_processor_number = bDiff;
+
             break;
         }
         bDiff = processor_number - bTemp;
@@ -1098,6 +1130,24 @@ retry:
 #endif
 }
 
+// There can be different numbers of procs in groups. We take the max.
+/*static*/ bool CPUGroupInfo::GetCPUGroupInfo(PUSHORT total_groups, DWORD* max_procs_per_group)
+{
+    if (m_enableGCCPUGroups)
+    {
+        *total_groups = m_nGroups;
+        DWORD currentProcsInGroup = 0;
+        for (WORD i = 0; i < m_nGroups; i++)
+        {
+            currentProcsInGroup = max(currentProcsInGroup, m_CPUGroupInfoArray[i].nr_active);
+        }
+        *max_procs_per_group = currentProcsInGroup;
+        return true;
+    }
+
+    return false;
+}
+
 #if !defined(FEATURE_REDHAWK)
 //Lock ThreadStore before calling this function, so that updates of weights/counts are consistent
 /*static*/ void CPUGroupInfo::ChooseCPUGroupAffinity(GROUP_AFFINITY *gf)
diff --git a/src/vm/gcenv.os.cpp b/src/vm/gcenv.os.cpp
index 5658b19cd4..5fb3745493 100644
--- a/src/vm/gcenv.os.cpp
+++ b/src/vm/gcenv.os.cpp
@@ -38,7 +38,7 @@ class GroupProcNo
 
 public:
 
-    static const uint16_t NoGroup = 0x3ff;
+    static const uint16_t NoGroup = 0;
 
     GroupProcNo(uint16_t groupProc) : m_groupProc(groupProc)
     {
@@ -46,7 +46,8 @@ public:
 
     GroupProcNo(uint16_t group, uint16_t procIndex) : m_groupProc((group << 6) | procIndex)
     {
-        assert(group <= 0x3ff);
+        // Making this the same as the # of NUMA node we support.
+        assert(group < 0x40);
         assert(procIndex <= 0x3f);
     }
 
@@ -208,13 +209,13 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
 #else
     PROCESSOR_NUMBER proc;
 
-    if (dstGroupProcNo.GetGroup() != GroupProcNo::NoGroup)
+    if (CPUGroupInfo::CanEnableGCCPUGroups())
     {
         proc.Group = (WORD)dstGroupProcNo.GetGroup();
         proc.Number = (BYTE)dstGroupProcNo.GetProcIndex();
         proc.Reserved = 0;
 
-        success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
+        success = !!SetThreadIdealProcessorEx(GetCurrentThread (), &proc, NULL);
     }
     else
     {
@@ -235,13 +236,38 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
 #endif // !FEATURE_PAL
 }
 
+bool GCToOSInterface::GetCurrentThreadIdealProc(uint16_t* procNo)
+{
+    LIMITED_METHOD_CONTRACT;
+    bool success = false;
+#ifndef FEATURE_PAL
+    PROCESSOR_NUMBER proc;
+    success = !!GetThreadIdealProcessorEx(GetCurrentThread(), &proc);
+    if (success)
+    {
+        GroupProcNo groupProcNo(proc.Group, proc.Number);
+        *procNo = groupProcNo.GetCombinedValue();
+    }
+#endif //FEATURE_PAL
+    return success;
+}
+
 // Get the number of the current processor
 uint32_t GCToOSInterface::GetCurrentProcessorNumber()
 {
     LIMITED_METHOD_CONTRACT;
 
     _ASSERTE(CanGetCurrentProcessorNumber());
+
+#ifndef FEATURE_PAL
+    PROCESSOR_NUMBER proc_no_cpu_group;
+    GetCurrentProcessorNumberEx(&proc_no_cpu_group);
+
+    GroupProcNo groupProcNo(proc_no_cpu_group.Group, proc_no_cpu_group.Number);
+    return groupProcNo.GetCombinedValue();
+#else
     return ::GetCurrentProcessorNumber();
+#endif //!FEATURE_PAL
 }
 
 // Check if the OS supports getting current processor number
@@ -295,25 +321,33 @@ void GCToOSInterface::YieldThread(uint32_t switchCount)
 //  size      - size of the virtual memory range
 //  alignment - requested memory alignment
 //  flags     - flags to control special settings like write watching
+//  node      - the NUMA node to reserve memory on
 // Return:
 //  Starting virtual address of the reserved range
-void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
+void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags, uint16_t node)
 {
     LIMITED_METHOD_CONTRACT;
 
-    DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
-
-    // This is not strictly necessary for a correctness standpoint. Windows already guarantees
-    // allocation granularity alignment when using MEM_RESERVE, so aligning the size here has no effect.
-    // However, ClrVirtualAlloc does expect the size to be aligned to the allocation granularity.
-    size_t aligned_size = (size + g_SystemInfo.dwAllocationGranularity - 1) & ~static_cast<size_t>(g_SystemInfo.dwAllocationGranularity - 1);
-    if (alignment == 0)
+    if (node == NUMA_NODE_UNDEFINED)
     {
-        return ::ClrVirtualAlloc(0, aligned_size, memFlags, PAGE_READWRITE);
+        DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
+
+        // This is not strictly necessary for a correctness standpoint. Windows already guarantees
+        // allocation granularity alignment when using MEM_RESERVE, so aligning the size here has no effect.
+        // However, ClrVirtualAlloc does expect the size to be aligned to the allocation granularity.
+        size_t aligned_size = (size + g_SystemInfo.dwAllocationGranularity - 1) & ~static_cast<size_t>(g_SystemInfo.dwAllocationGranularity - 1);
+        if (alignment == 0)
+        {
+            return ::ClrVirtualAlloc (0, aligned_size, memFlags, PAGE_READWRITE);
+        }
+        else
+        {
+            return ::ClrVirtualAllocAligned (0, aligned_size, memFlags, PAGE_READWRITE, alignment);
+        }
     }
     else
     {
-        return ::ClrVirtualAllocAligned(0, aligned_size, memFlags, PAGE_READWRITE, alignment);
+        return NumaNodeInfo::VirtualAllocExNuma (::GetCurrentProcess (), NULL, size, MEM_RESERVE, PAGE_READWRITE, node);
     }
 }
 
@@ -425,7 +459,7 @@ bool GCToOSInterface::SupportsWriteWatch()
     // check if the OS supports write-watch. 
     // Drawbridge does not support write-watch so we still need to do the runtime detection for them.
     // Otherwise, all currently supported OSes do support write-watch.
-    void* mem = VirtualReserve (g_SystemInfo.dwAllocationGranularity, 0, VirtualReserveFlags::WriteWatch);
+    void* mem = VirtualReserve(g_SystemInfo.dwAllocationGranularity, 0, VirtualReserveFlags::WriteWatch);
     if (mem != NULL)
     {
         VirtualRelease (mem, g_SystemInfo.dwAllocationGranularity);
@@ -493,7 +527,7 @@ bool GCToOSInterface::SetThreadAffinity(uint16_t procNo)
 #ifndef FEATURE_PAL
     GroupProcNo groupProcNo(procNo);
 
-    if (groupProcNo.GetGroup() != GroupProcNo::NoGroup)
+    if (CPUGroupInfo::CanEnableGCCPUGroups())
     {
         GROUP_AFFINITY ga;
         ga.Group = (WORD)groupProcNo.GetGroup();
@@ -937,6 +971,34 @@ bool GCToOSInterface::CanEnableGCNumaAware()
     return NumaNodeInfo::CanEnableGCNumaAware() != FALSE;
 }
 
+bool GCToOSInterface::GetNumaInfo(uint16_t* total_nodes, uint32_t* max_procs_per_node)
+{
+#ifndef FEATURE_PAL
+    return NumaNodeInfo::GetNumaInfo(total_nodes, (DWORD*)max_procs_per_node);
+#else
+    return false;
+#endif //!FEATURE_PAL
+}
+
+bool GCToOSInterface::GetCPUGroupInfo(uint16_t* total_groups, uint32_t* max_procs_per_group)
+{
+#ifndef FEATURE_PAL
+    return CPUGroupInfo::GetCPUGroupInfo(total_groups, (DWORD*)max_procs_per_group);
+#else
+    return false;
+#endif //!FEATURE_PAL
+}
+
+bool GCToOSInterface::CanEnableGCCPUGroups()
+{
+    LIMITED_METHOD_CONTRACT;
+#ifndef FEATURE_PAL
+    return CPUGroupInfo::CanEnableGCCPUGroups() != FALSE;
+#else
+    return false;
+#endif //!FEATURE_PAL
+}
+
 // Get processor number and optionally its NUMA node number for the specified heap number
 // Parameters:
 //  heap_number - heap number to get the result for
author	Maoni Stephens <Maoni0@users.noreply.github.com>	2019-07-05 12:48:34 -0700
committer	GitHub <noreply@github.com>	2019-07-05 12:48:34 -0700
commit	c7d12d2b91f79802a12484d4d56c41ba355b7058 (patch)
tree	823ab1ef9de13e8f5493fc219af7a5deaa118e13
parent	41407b7d4a2eac4cae6a3c4855ba072cdd01f8b1 (diff)
download	coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.tar.gz coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.tar.bz2 coreclr-c7d12d2b91f79802a12484d4d56c41ba355b7058.zip