Perform PhysicalMemoryLimit check for workstation GC, refactor GetLargestOnDieCacheSize into GetCacheSizePerLogicalCpu (#15975)

* refactor: combine GetLargestOnDieCacheSize and GetLogicalCpuCount into GetCacheSizePerLogicalCpu
* Perform PhysicalMemoryLimit check also for workstation GC
author: Tom Deseyn <tom.deseyn@gmail.com> 2018-01-29 21:11:10 +0100
committer: Maoni Stephens <Maoni0@users.noreply.github.com> 2018-01-29 12:11:10 -0800
commit: cb73944d6d159bd02adc29f0588b97f0f76da1a1 (patch)
tree: 9c4f406b9d5bf886c64fceb19213c25176e988cd
parent: 850a5bea5b8c317250f7f7ef85152d92468382b0 (diff)
download: coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.tar.gz
coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.tar.bz2
coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.zip
12 files changed, 50 insertions, 259 deletions
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h
index 1707f0dabe..41e46f8f0f 100644
--- a/src/gc/env/gcenv.os.h
+++ b/src/gc/env/gcenv.os.h
@@ -282,16 +282,13 @@ public:
     // Processor topology
     //
 
-    // Get number of logical processors
-    static uint32_t GetLogicalCpuCount();
-
-    // Get size of the largest cache on the processor die
+    // Get size of the on die cache per logical processor
     // Parameters:
     //  trueSize - true to return true cache size, false to return scaled up size based on
     //             the processor architecture
     // Return:
     //  Size of the cache
-    static size_t GetLargestOnDieCacheSize(bool trueSize = true);
+    static size_t GetCacheSizePerLogicalCpu(bool trueSize = true);
 
     // Get number of processors assigned to the current process
     // Return:
diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp
index 706c805db5..2b31388d38 100644
--- a/src/gc/gc.cpp
+++ b/src/gc/gc.cpp
@@ -15756,7 +15756,7 @@ void gc_heap::gc1()
                     size_t min_gc_size = dd_min_gc_size(dd);
                     // if min GC size larger than true on die cache, then don't bother
                     // limiting the desired size
-                    if ((min_gc_size <= GCToOSInterface::GetLargestOnDieCacheSize(TRUE) / GCToOSInterface::GetLogicalCpuCount()) &&
+                    if ((min_gc_size <= GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)) &&
                         desired_per_heap <= 2*min_gc_size)
                     {
                         desired_per_heap = min_gc_size;
@@ -35523,19 +35523,26 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
 #ifdef SERVER_GC
         // performance data seems to indicate halving the size results
         // in optimal perf.  Ask for adjusted gen0 size.
-        gen0size = max(GCToOSInterface::GetLargestOnDieCacheSize(FALSE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
+        gen0size = max(GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),(256*1024));
 
         // if gen0 size is too large given the available memory, reduce it.
         // Get true cache size, as we don't want to reduce below this.
-        size_t trueSize = max(GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/GCToOSInterface::GetLogicalCpuCount(),(256*1024));
+        size_t trueSize = max(GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE),(256*1024));
         dprintf (2, ("cache: %Id-%Id, cpu: %Id", 
-            GCToOSInterface::GetLargestOnDieCacheSize(FALSE),
-            GCToOSInterface::GetLargestOnDieCacheSize(TRUE),
-            GCToOSInterface::GetLogicalCpuCount()));
+            GCToOSInterface::GetCacheSizePerLogicalCpu(FALSE),
+            GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE)));
+
+        int n_heaps = gc_heap::n_heaps;
+#else //SERVER_GC
+        size_t trueSize = GCToOSInterface::GetCacheSizePerLogicalCpu(TRUE);
+        gen0size = max((4*trueSize/5),(256*1024));
+        trueSize = max(trueSize, (256*1024));
+        int n_heaps = 1;
+#endif //SERVER_GC
 
         // if the total min GC across heaps will exceed 1/6th of available memory,
         // then reduce the min GC size until it either fits or has been reduced to cache size.
-        while ((gen0size * gc_heap::n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
+        while ((gen0size * n_heaps) > GCToOSInterface::GetPhysicalMemoryLimit() / 6)
         {
             gen0size = gen0size / 2;
             if (gen0size <= trueSize)
@@ -35544,9 +35551,6 @@ size_t GCHeap::GetValidGen0MaxSize(size_t seg_size)
                 break;
             }
         }
-#else //SERVER_GC
-        gen0size = max((4*GCToOSInterface::GetLargestOnDieCacheSize(TRUE)/5),(256*1024));
-#endif //SERVER_GC
     }
 
     // Generation 0 must never be more than 1/2 the segment size.
diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp
index 7bc6a37068..737c5efcf0 100644
--- a/src/gc/unix/gcenv.unix.cpp
+++ b/src/gc/unix/gcenv.unix.cpp
@@ -221,12 +221,6 @@ void GCToOSInterface::DebugBreak()
 #endif
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    return g_logicalCpuCount;
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -403,7 +397,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     // TODO(segilles) processor detection
     return 0;
diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp
index 69e5d7273a..e258834abc 100644
--- a/src/gc/windows/gcenv.windows.cpp
+++ b/src/gc/windows/gcenv.windows.cpp
@@ -228,13 +228,6 @@ void GCToOSInterface::DebugBreak()
     ::DebugBreak();
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    // TODO(segilles) processor detection
-    return 1;
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -381,7 +374,7 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     // TODO(segilles) processor detection (see src/vm/util.cpp:1935)
     return 0;
diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt
index 5a30bce6b8..cfa1b2ac43 100644
--- a/src/vm/CMakeLists.txt
+++ b/src/vm/CMakeLists.txt
@@ -477,7 +477,6 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM)
     )
 elseif(CLR_CMAKE_TARGET_ARCH_ARM64)
     set(VM_SOURCES_DAC_AND_WKS_ARCH
-        ${ARCH_SOURCES_DIR}/cgenarm64.cpp
         ${ARCH_SOURCES_DIR}/stubs.cpp
         exceptionhandling.cpp
         gcinfodecoder.cpp
diff --git a/src/vm/amd64/cgenamd64.cpp b/src/vm/amd64/cgenamd64.cpp
index 6075134943..56e3bfa738 100644
--- a/src/vm/amd64/cgenamd64.cpp
+++ b/src/vm/amd64/cgenamd64.cpp
@@ -458,89 +458,6 @@ BOOL GetAnyThunkTarget (CONTEXT *pctx, TADDR *pTarget, TADDR *pTargetMethodDesc)
 // determine the number of logical cpus, or the machine is not populated uniformly with the same
 // type of processors, this function returns 1.
 
-extern "C" DWORD __stdcall getcpuid(DWORD arg, unsigned char result[16]);
-
-// fix this if/when AMD does multicore or SMT
-DWORD GetLogicalCpuCount()
-{
-    // No CONTRACT possible because GetLogicalCpuCount uses SEH
-
-    STATIC_CONTRACT_THROWS;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    static DWORD val = 0;
-
-    // cache value for later re-use
-    if (val)
-    {
-        return val;
-    }   
-
-    struct Param : DefaultCatchFilterParam
-    {
-        DWORD retVal;
-    } param;
-    param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER;
-    param.retVal = 1;    
-
-    PAL_TRY(Param *, pParam, &param)
-    {    
-
-        unsigned char buffer[16];
-        DWORD maxCpuId = getcpuid(0, buffer);
-        DWORD* dwBuffer = (DWORD*)buffer;
-
-        if (maxCpuId < 1)
-            goto qExit;
-
-        if (dwBuffer[1] == 'uneG') {
-            if (dwBuffer[3] == 'Ieni') {
-                if (dwBuffer[2] == 'letn')  {        // get SMT/multicore enumeration for Intel EM64T 
-
-                   
-                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on 
-                    // multi-core processor, but we never call into those two functions since we don't halve the
-                    // gen0size when it's prescott and above processor. We keep the old version here for earlier
-                    // generation system(Northwood based), perf data suggests on those systems, halve gen0 size 
-                    // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) 
-                    // based, we still go ahead and halve gen0 size.  The logic in GetLogicalCpuCountFromOS() 
-                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. 
-                    // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 
-                    // size at all gives us overall better performance. 
-                    // This is going to be fixed with a new version in orcas time frame. 
-
-                    if( (maxCpuId > 3) && (maxCpuId < 0x80000000) )   
-                        goto qExit;
-
-                    val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
-                    if (val )
-                    {
-                        pParam->retVal = val;     // OS API HT enumeration successful, we are Done
-                        goto qExit;
-                    }
-
-                    val = GetLogicalCpuCountFallback();    // Fallback to HT enumeration using CPUID
-                    if( val )
-                        pParam->retVal = val;
-                }
-            }
-        }
-qExit: ;
-    }
-
-    PAL_EXCEPT_FILTER(DefaultCatchFilter)
-    {
-    }
-    PAL_ENDTRY
-
-    if (val == 0)
-    {
-        val = param.retVal;  
-    }
-
-    return param.retVal;
-}
-
 void EncodeLoadAndJumpThunk (LPBYTE pBuffer, LPVOID pv, LPVOID pTarget)
 {
     CONTRACTL
diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp
index 38009167e3..9668d7b4f0 100644
--- a/src/vm/arm/stubs.cpp
+++ b/src/vm/arm/stubs.cpp
@@ -3369,13 +3369,6 @@ void emitCOMStubCall (ComCallMethodDesc *pCOMMethod, PCODE target)
 
 #ifndef CROSSGEN_COMPILE
 
-DWORD GetLogicalCpuCount()
-{
-    // Just use the OS to return this information (the APIs used exist on all versions of Windows which
-    // support ARM).
-    return GetLogicalCpuCountFromOS();
-}
-
 #ifdef FEATURE_READYTORUN
 
 //
diff --git a/src/vm/arm64/cgenarm64.cpp b/src/vm/arm64/cgenarm64.cpp
deleted file mode 100644
index 59905bf098..0000000000
--- a/src/vm/arm64/cgenarm64.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-//
-// Various helper routines for generating AMD64 assembly code.
-//
-
-// Precompiled Header
-
-#include "common.h"
-
-#include "stublink.h"
-#include "cgensys.h"
-#include "siginfo.hpp"
-#include "excep.h"
-#include "ecall.h"
-#include "dllimport.h"
-#include "dllimportcallback.h"
-#include "dbginterface.h"
-#include "fcall.h"
-#include "array.h"
-#include "virtualcallstub.h"
-
-#ifndef DACCESS_COMPILE
-
-// Note: This is only used on server GC on Windows.
-
-DWORD GetLogicalCpuCount()
-{
-    LIMITED_METHOD_CONTRACT;
-
-    // The contact with any callers of this function is that if we're unable to determine
-    // the processor count, or the number of processors is not distributed evenly, then
-    // we should return 1.
-    return 1;
-}
-
-#endif // DACCESS_COMPILE
diff --git a/src/vm/cgensys.h b/src/vm/cgensys.h
index d55d15dd7d..b5158b722c 100644
--- a/src/vm/cgensys.h
+++ b/src/vm/cgensys.h
@@ -34,10 +34,6 @@ int  CallJitEHFilter (CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHC
 void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel);
 #endif // _TARGET_X86_
 
-
-// get number of logical to physical processors.  Returns 1 on failure or non-intel x86 processors.
-DWORD GetLogicalCpuCount();
-
 //These are in util.cpp
 extern size_t GetLogicalProcessorCacheSizeFromOS();
 extern size_t GetIntelDeterministicCacheEnum();
@@ -47,7 +43,7 @@ extern DWORD GetLogicalCpuCountFallback();
 
 
 // Try to determine the largest last-level cache size of the machine - return 0 if unknown or no L2/L3 cache
-size_t GetLargestOnDieCacheSize(BOOL bTrueSize = TRUE);
+size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize = TRUE);
 
 
 #ifdef FEATURE_COMINTEROP
diff --git a/src/vm/gcenv.os.cpp b/src/vm/gcenv.os.cpp
index 78670b0af3..8d8630ec62 100644
--- a/src/vm/gcenv.os.cpp
+++ b/src/vm/gcenv.os.cpp
@@ -145,13 +145,6 @@ void GCToOSInterface::DebugBreak()
     ::DebugBreak();
 }
 
-// Get number of logical processors
-uint32_t GCToOSInterface::GetLogicalCpuCount()
-{
-    LIMITED_METHOD_CONTRACT;
-    return ::GetLogicalCpuCount();
-}
-
 // Causes the calling thread to sleep for the specified number of milliseconds
 // Parameters:
 //  sleepMSec   - time to sleep before switching to another thread
@@ -322,11 +315,11 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
 //             the processor architecture
 // Return:
 //  Size of the cache
-size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
+size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 {
     LIMITED_METHOD_CONTRACT;
 
-    return ::GetLargestOnDieCacheSize(trueSize);
+    return ::GetCacheSizePerLogicalCpu(trueSize);
 }
 
 // Sets the calling thread's affinity to only run on the processor specified
diff --git a/src/vm/i386/cgenx86.cpp b/src/vm/i386/cgenx86.cpp
index a43bc8558a..492cec4668 100644
--- a/src/vm/i386/cgenx86.cpp
+++ b/src/vm/i386/cgenx86.cpp
@@ -1513,89 +1513,6 @@ extern "C" DWORD __stdcall xmmYmmStateSupport()
 
 #endif // !FEATURE_PAL
 
-// This function returns the number of logical processors on a given physical chip.  If it cannot
-// determine the number of logical cpus, or the machine is not populated uniformly with the same
-// type of processors, this function returns 1.
-DWORD GetLogicalCpuCount()
-{
-    // No CONTRACT possible because GetLogicalCpuCount uses SEH
-
-    STATIC_CONTRACT_THROWS;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    static DWORD val = 0;
-
-    // cache value for later re-use
-    if (val)
-    {
-        return val;
-    }
-
-    struct Param : DefaultCatchFilterParam
-    {
-        DWORD retVal;
-    } param;
-    param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER;
-    param.retVal = 1;
-
-    PAL_TRY(Param *, pParam, &param)
-    {
-        unsigned char buffer[16];
-        DWORD* dwBuffer = NULL;
-
-        DWORD maxCpuId = getcpuid(0, buffer);
-
-        if (maxCpuId < 1)
-            goto lDone;
-
-        dwBuffer = (DWORD*)buffer;
-
-        if (dwBuffer[1] == 'uneG') {
-            if (dwBuffer[3] == 'Ieni') {
-                if (dwBuffer[2] == 'letn')  {  // get SMT/multicore enumeration for Intel EM64T 
-
-                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on 
-                    // multi-core processor, but we never call into those two functions since we don't halve the
-                    // gen0size when it's prescott and above processor. We keep the old version here for earlier
-                    // generation system(Northwood based), perf data suggests on those systems, halve gen0 size 
-                    // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) 
-                    // based, we still go ahead and halve gen0 size.  The logic in GetLogicalCpuCountFromOS() 
-                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. 
-                    // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 
-                    // size at all gives us overall better performance. 
-                    // This is going to be fixed with a new version in orcas time frame. 
-
-                    if( (maxCpuId > 3) && (maxCpuId < 0x80000000) ) 
-                        goto lDone;
-
-                    val = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
-                    if (val )
-                    {
-                        pParam->retVal = val;     // OS API HT enumeration successful, we are Done        
-                        goto lDone;
-                    }
-
-                    val = GetLogicalCpuCountFallback();    // OS API failed, Fallback to HT enumeration using CPUID
-                    if( val )
-                        pParam->retVal = val;
-                }
-            }
-        }
-lDone: ;
-    }
-    PAL_EXCEPT_FILTER(DefaultCatchFilter)
-    {
-    }
-    PAL_ENDTRY
-
-    if (val == 0)
-    {
-        val = param.retVal;
-    }
-
-    return param.retVal;
-}
-
 void UMEntryThunkCode::Encode(BYTE* pTargetCode, void* pvSecretParam)
 {
     LIMITED_METHOD_CONTRACT;
diff --git a/src/vm/util.cpp b/src/vm/util.cpp
index 692b72fc39..b9448dadbe 100644
--- a/src/vm/util.cpp
+++ b/src/vm/util.cpp
@@ -1854,9 +1854,10 @@ fDone:
 
 #endif // _TARGET_X86_ || _TARGET_AMD64_
 
-size_t GetLargestOnDieCacheSize(BOOL bTrueSize)
+// fix this if/when AMD does multicore or SMT
+size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize)
 {
-    // No CONTRACT possible because GetLargestOnDieCacheSize uses SEH
+    // No CONTRACT possible because GetCacheSizePerLogicalCpu uses SEH
 
     STATIC_CONTRACT_NOTHROW;
     STATIC_CONTRACT_GC_NOTRIGGER;
@@ -1911,6 +1912,31 @@ size_t GetLargestOnDieCacheSize(BOOL bTrueSize)
                         }   
                     }
 
+                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on 
+                    // multi-core processor, but we never call into those two functions since we don't halve the
+                    // gen0size when it's prescott and above processor. We keep the old version here for earlier
+                    // generation system(Northwood based), perf data suggests on those systems, halve gen0 size 
+                    // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood) 
+                    // based, we still go ahead and halve gen0 size.  The logic in GetLogicalCpuCountFromOS() 
+                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems. 
+                    // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0 
+                    // size at all gives us overall better performance. 
+                    // This is going to be fixed with a new version in orcas time frame.
+                    if (maxCpuId >= 2 && !((maxCpuId > 3) && (maxCpuId < 0x80000000)))
+                    {
+                        DWORD logicalProcessorCount = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
+
+                        if (!logicalProcessorCount)
+                        {
+                            logicalProcessorCount = GetLogicalCpuCountFallback();    // OS API failed, Fallback to HT enumeration using CPUID
+                        }
+
+                        if (logicalProcessorCount)
+                        {
+                            tempSize = tempSize / logicalProcessorCount;
+                        }
+                    }
+
                     // update maxSize once with final value
                     maxTrueSize = tempSize;
 
@@ -2009,7 +2035,7 @@ size_t GetLargestOnDieCacheSize(BOOL bTrueSize)
     maxSize = maxTrueSize * 3;
 #endif
 
-    //    printf("GetLargestOnDieCacheSize returns %d, adjusted size %d\n", maxSize, maxTrueSize);
+    //    printf("GetCacheSizePerLogicalCpu returns %d, adjusted size %d\n", maxSize, maxTrueSize);
     if (bTrueSize)
         return maxTrueSize;
     else
author	Tom Deseyn <tom.deseyn@gmail.com>	2018-01-29 21:11:10 +0100
committer	Maoni Stephens <Maoni0@users.noreply.github.com>	2018-01-29 12:11:10 -0800
commit	cb73944d6d159bd02adc29f0588b97f0f76da1a1 (patch)
tree	9c4f406b9d5bf886c64fceb19213c25176e988cd
parent	850a5bea5b8c317250f7f7ef85152d92468382b0 (diff)
download	coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.tar.gz coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.tar.bz2 coreclr-cb73944d6d159bd02adc29f0588b97f0f76da1a1.zip