-rw-r--r--  src/classlibnative/bcltype/system.cpp      3
-rw-r--r--  src/gc/env/gcenv.os.h                      10
-rw-r--r--  src/gc/unix/gcenv.unix.cpp                 137
-rw-r--r--  src/gc/windows/gcenv.windows.cpp           95
-rw-r--r--  src/inc/utilcode.h                         19
-rw-r--r--  src/pal/inc/pal.h                          140
-rw-r--r--  src/pal/src/include/pal/palinternal.h      3
-rw-r--r--  src/pal/src/misc/sysinfo.cpp               42
-rw-r--r--  src/pal/src/numa/numa.cpp                  799
-rw-r--r--  src/pal/src/numa/numashim.h                16
-rw-r--r--  src/pal/src/thread/thread.cpp              93
-rw-r--r--  src/utilcode/util.cpp                      32
-rw-r--r--  src/vm/ceemain.cpp                         3
-rw-r--r--  src/vm/eeconfig.cpp                        9
-rw-r--r--  src/vm/gcenv.os.cpp                        158
-rw-r--r--  src/vm/threads.cpp                         8
-rw-r--r--  src/vm/threads.h                           3
-rw-r--r--  src/vm/win32threadpool.cpp                 20
-rw-r--r--  src/vm/win32threadpool.h                   14
19 files changed, 502 insertions, 1102 deletions
diff --git a/src/classlibnative/bcltype/system.cpp b/src/classlibnative/bcltype/system.cpp
index 12397a330e..38e5bba9ad 100644
--- a/src/classlibnative/bcltype/system.cpp
+++ b/src/classlibnative/bcltype/system.cpp
@@ -325,13 +325,14 @@ INT32 QCALLTYPE SystemNative::GetProcessorCount()
BEGIN_QCALL;
+#ifndef FEATURE_PAL
CPUGroupInfo::EnsureInitialized();
if(CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
{
processorCount = CPUGroupInfo::GetNumActiveProcessors();
}
-
+#endif // !FEATURE_PAL
// Processor count will be 0 if CPU groups are disabled/not supported
if(processorCount == 0)
{
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h
index 05dccf7a6d..7fa1ba7062 100644
--- a/src/gc/env/gcenv.os.h
+++ b/src/gc/env/gcenv.os.h
@@ -167,11 +167,18 @@ class AffinitySet
public:
+ static const size_t BitsetDataSize = MAX_SUPPORTED_CPUS / BitsPerBitsetEntry;
+
AffinitySet()
{
memset(m_bitset, 0, sizeof(m_bitset));
}
+ uintptr_t* GetBitsetData()
+ {
+ return m_bitset;
+ }
+
// Check if the set contains a processor
bool Contains(size_t cpuIndex) const
{
@@ -477,9 +484,6 @@ public:
// Is NUMA support available
static bool CanEnableGCNumaAware();
- // Gets the NUMA node for the processor
- static bool GetNumaProcessorNode(uint16_t proc_no, uint16_t *node_no);
-
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
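The hunk above exposes the raw bitset behind AffinitySet (BitsetDataSize and GetBitsetData) so the Unix PAL can fill it directly. Below is a minimal, hedged sketch of that layout; the MAX_SUPPORTED_CPUS value is an assumption, and the class name is illustrative rather than the one in gcenv.os.h.

// Sketch only, not part of the patch: how a word-per-entry bitset maps a
// processor index to (entry, bit), mirroring AffinitySet::Add/Contains and
// showing why GetBitsetData() can be handed straight to
// PAL_GetCurrentThreadAffinitySet.
#include <stdint.h>
#include <string.h>

static const size_t MaxCpus = 512;                          // assumed value of MAX_SUPPORTED_CPUS
static const size_t BitsPerBitsetEntry = 8 * sizeof(uintptr_t);

struct BitsetSketch
{
    uintptr_t m_bitset[MaxCpus / BitsPerBitsetEntry];

    BitsetSketch() { memset(m_bitset, 0, sizeof(m_bitset)); }

    void Add(size_t cpu)
    {
        m_bitset[cpu / BitsPerBitsetEntry] |= (uintptr_t)1 << (cpu & (BitsPerBitsetEntry - 1));
    }

    bool Contains(size_t cpu) const
    {
        return (m_bitset[cpu / BitsPerBitsetEntry] >> (cpu & (BitsPerBitsetEntry - 1))) & 1;
    }
};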
diff --git a/src/gc/unix/gcenv.unix.cpp b/src/gc/unix/gcenv.unix.cpp
index c71d211e01..a6d56f2433 100644
--- a/src/gc/unix/gcenv.unix.cpp
+++ b/src/gc/unix/gcenv.unix.cpp
@@ -55,6 +55,33 @@
#include "globals.h"
#include "cgroup.h"
+#if HAVE_NUMA_H
+
+#include <numa.h>
+#include <numaif.h>
+#include <dlfcn.h>
+
+// List of all functions from the numa library that are used
+#define FOR_ALL_NUMA_FUNCTIONS \
+ PER_FUNCTION_BLOCK(mbind) \
+ PER_FUNCTION_BLOCK(numa_available) \
+ PER_FUNCTION_BLOCK(numa_max_node) \
+ PER_FUNCTION_BLOCK(numa_node_of_cpu)
+
+// Declare pointers to all the used numa functions
+#define PER_FUNCTION_BLOCK(fn) extern decltype(fn)* fn##_ptr;
+FOR_ALL_NUMA_FUNCTIONS
+#undef PER_FUNCTION_BLOCK
+
+// Redefine all calls to numa functions as calls through pointers that are set
+// to the functions of libnuma in the initialization.
+#define mbind(...) mbind_ptr(__VA_ARGS__)
+#define numa_available() numa_available_ptr()
+#define numa_max_node() numa_max_node_ptr()
+#define numa_node_of_cpu(...) numa_node_of_cpu_ptr(__VA_ARGS__)
+
+#endif // HAVE_NUMA_H
+
#if defined(_ARM_) || defined(_ARM64_)
#define SYSCONF_GET_NUMPROCS _SC_NPROCESSORS_CONF
#else
@@ -109,6 +136,74 @@ uint32_t g_pageSizeUnixInl = 0;
AffinitySet g_processAffinitySet;
+#if HAVE_CPUSET_T
+typedef cpuset_t cpu_set_t;
+#endif
+
+// The highest NUMA node available
+int g_highestNumaNode = 0;
+// Is numa available
+bool g_numaAvailable = false;
+
+void* g_numaHandle = nullptr;
+
+#if HAVE_NUMA_H
+#define PER_FUNCTION_BLOCK(fn) decltype(fn)* fn##_ptr;
+FOR_ALL_NUMA_FUNCTIONS
+#undef PER_FUNCTION_BLOCK
+#endif // HAVE_NUMA_H
+
+
+// Initialize data structures for getting and setting thread affinities to processors and
+// querying NUMA related processor information.
+// On systems with no NUMA support, it behaves as if there was a single NUMA node with
+// a single group of processors.
+void NUMASupportInitialize()
+{
+#if HAVE_NUMA_H
+ g_numaHandle = dlopen("libnuma.so", RTLD_LAZY);
+ if (g_numaHandle == 0)
+ {
+ g_numaHandle = dlopen("libnuma.so.1", RTLD_LAZY);
+ }
+ if (g_numaHandle != 0)
+ {
+ dlsym(g_numaHandle, "numa_allocate_cpumask");
+#define PER_FUNCTION_BLOCK(fn) \
+ fn##_ptr = (decltype(fn)*)dlsym(g_numaHandle, #fn); \
+ if (fn##_ptr == NULL) { fprintf(stderr, "Cannot get symbol " #fn " from libnuma\n"); abort(); }
+FOR_ALL_NUMA_FUNCTIONS
+#undef PER_FUNCTION_BLOCK
+
+ if (numa_available() == -1)
+ {
+ dlclose(g_numaHandle);
+ }
+ else
+ {
+ g_numaAvailable = true;
+ g_highestNumaNode = numa_max_node();
+ }
+ }
+#endif // HAVE_NUMA_H
+ if (!g_numaAvailable)
+ {
+ // No NUMA
+ g_highestNumaNode = 0;
+ }
+}
+
+// Cleanup of the NUMA support data structures
+void NUMASupportCleanup()
+{
+#if HAVE_NUMA_H
+ if (g_numaAvailable)
+ {
+ dlclose(g_numaHandle);
+ }
+#endif // HAVE_NUMA_H
+}
+
// Initialize the interface implementation
// Return:
// true if it has succeeded, false if it has failed
@@ -221,6 +316,8 @@ bool GCToOSInterface::Initialize()
#endif // HAVE_SCHED_GETAFFINITY
+ NUMASupportInitialize();
+
return true;
}
@@ -235,6 +332,7 @@ void GCToOSInterface::Shutdown()
munmap(g_helperPage, OS_PAGE_SIZE);
CleanupCGroup();
+ NUMASupportCleanup();
}
// Get numeric id of the current thread if possible on the
@@ -468,8 +566,29 @@ void* GCToOSInterface::VirtualReserveAndCommitLargePages(size_t size)
// true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualCommit(void* address, size_t size, uint16_t node)
{
- assert(node == NUMA_NODE_UNDEFINED && "Numa allocation is not ported to local GC on unix yet");
- return mprotect(address, size, PROT_WRITE | PROT_READ) == 0;
+ bool success = mprotect(address, size, PROT_WRITE | PROT_READ) == 0;
+
+#if HAVE_NUMA_H
+ if (success && g_numaAvailable && (node != NUMA_NODE_UNDEFINED))
+ {
+ if ((int)node <= g_highestNumaNode)
+ {
+ int nodeMaskLength = (g_highestNumaNode + 1 + sizeof(unsigned long) - 1) / sizeof(unsigned long);
+ unsigned long *nodeMask = (unsigned long*)alloca(nodeMaskLength * sizeof(unsigned long));
+ memset(nodeMask, 0, nodeMaskLength);
+
+ int index = node / sizeof(unsigned long);
+ int mask = ((unsigned long)1) << (node & (sizeof(unsigned long) - 1));
+ nodeMask[index] = mask;
+
+ int st = mbind(address, size, MPOL_PREFERRED, nodeMask, g_highestNumaNode, 0);
+ assert(st == 0);
+ // If the mbind fails, we still return the allocated memory since the node is just a hint
+ }
+ }
+#endif // HAVE_NUMA_H
+
+ return success;
}
// Decomit virtual memory range.
@@ -775,13 +894,7 @@ uint32_t GCToOSInterface::GetTotalProcessorCount()
bool GCToOSInterface::CanEnableGCNumaAware()
{
- return false;
-}
-
-bool GCToOSInterface::GetNumaProcessorNode(uint16_t proc_no, uint16_t *node_no)
-{
- assert(!"Numa has not been ported to local GC for unix");
- return false;
+ return g_numaAvailable;
}
// Get processor number and optionally its NUMA node number for the specified heap number
@@ -806,10 +919,8 @@ bool GCToOSInterface::GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_n
if (GCToOSInterface::CanEnableGCNumaAware())
{
- if (!GCToOSInterface::GetNumaProcessorNode(procNumber, node_no))
- {
- *node_no = NUMA_NODE_UNDEFINED;
- }
+ int result = numa_node_of_cpu(procNumber);
+ *node_no = (result >= 0) ? (uint16_t)result : NUMA_NODE_UNDEFINED;
}
else
{
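For readers unfamiliar with the PER_FUNCTION_BLOCK pattern used above: libnuma is loaded lazily so the GC never takes a hard link-time dependency on it, and every call site goes through a pointer resolved with dlsym at startup. A stripped-down sketch of the same pattern follows; the library and symbol names match the hunk, while the surrounding scaffolding is illustrative.

// Sketch only, not part of the patch: lazy-binding a single libnuma entry
// point through a function pointer, i.e. the expanded form of one
// PER_FUNCTION_BLOCK line plus the dlopen/numa_available checks.
#include <dlfcn.h>
#include <stdio.h>

typedef int (*numa_available_fn)(void);
static numa_available_fn numa_available_ptr = nullptr;
static void* g_numaHandle = nullptr;

bool LoadNuma()
{
    g_numaHandle = dlopen("libnuma.so.1", RTLD_LAZY);
    if (g_numaHandle == nullptr)
        return false;

    numa_available_ptr = (numa_available_fn)dlsym(g_numaHandle, "numa_available");
    if (numa_available_ptr == nullptr)
    {
        fprintf(stderr, "Cannot get symbol numa_available from libnuma\n");
        dlclose(g_numaHandle);
        return false;
    }

    // numa_available() returns -1 when the NUMA API is unusable on this machine.
    if (numa_available_ptr() == -1)
    {
        dlclose(g_numaHandle);
        return false;
    }

    return true;
}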
diff --git a/src/gc/windows/gcenv.windows.cpp b/src/gc/windows/gcenv.windows.cpp
index 86bd7038c0..d2bcde899a 100644
--- a/src/gc/windows/gcenv.windows.cpp
+++ b/src/gc/windows/gcenv.windows.cpp
@@ -1286,19 +1286,6 @@ bool GCToOSInterface::CanEnableGCNumaAware()
return g_fEnableGCNumaAware;
}
-bool GCToOSInterface::GetNumaProcessorNode(uint16_t proc_no, uint16_t *node_no)
-{
- GroupProcNo groupProcNo(proc_no);
-
- PROCESSOR_NUMBER procNumber;
- procNumber.Group = groupProcNo.GetGroup();
- procNumber.Number = (BYTE)groupProcNo.GetProcIndex();
- procNumber.Reserved = 0;
-
- assert(g_fEnableGCNumaAware);
- return ::GetNumaProcessorNodeEx(&procNumber, node_no) != FALSE;
-}
-
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
@@ -1310,53 +1297,67 @@ bool GCToOSInterface::GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_n
{
bool success = false;
- if (CanEnableGCCPUGroups())
+ // Locate heap_number-th available processor
+ uint16_t procNumber;
+ size_t cnt = heap_number;
+ for (uint16_t i = 0; i < GCToOSInterface::GetTotalProcessorCount(); i++)
{
- uint16_t gn, gpn;
- GetGroupForProcessor((uint16_t)heap_number, &gn, &gpn);
-
- *proc_no = GroupProcNo(gn, gpn).GetCombinedValue();
-
- if (GCToOSInterface::CanEnableGCNumaAware())
+ if (g_processAffinitySet.Contains(i))
{
- if (!GCToOSInterface::GetNumaProcessorNode(*proc_no, node_no))
+ if (cnt == 0)
{
- *node_no = NUMA_NODE_UNDEFINED;
+ procNumber = i;
+ success = true;
+ break;
}
+
+ cnt--;
+ }
+ }
+
+ if (success)
+ {
+ WORD gn, gpn;
+
+ if (CanEnableGCCPUGroups())
+ {
+ GetGroupForProcessor(procNumber, &gn, &gpn);
}
else
- { // no numa setting, each cpu group is treated as a node
- *node_no = gn;
+ {
+ gn = GroupProcNo::NoGroup;
+ gpn = procNumber;
}
- success = true;
- }
- else
- {
- int bit_number = 0;
- uint8_t proc_number = 0;
- for (uintptr_t mask = 1; mask != 0; mask <<= 1)
+ GroupProcNo groupProcNo(gn, gpn);
+ *proc_no = groupProcNo.GetCombinedValue();
+
+ if (GCToOSInterface::CanEnableGCNumaAware())
{
- if (g_processAffinitySet.Contains(proc_number))
+ PROCESSOR_NUMBER procNumber;
+
+ if (CanEnableGCCPUGroups())
{
- if (bit_number == heap_number)
- {
- *proc_no = GroupProcNo(GroupProcNo::NoGroup, proc_number).GetCombinedValue();
+ procNumber.Group = gn;
+ }
+ else
+ {
+ // Get the current processor group
+ PROCESSOR_NUMBER procNumber;
+ GetCurrentProcessorNumberEx(&procNumber);
+ }
- if (GCToOSInterface::CanEnableGCNumaAware())
- {
- if (!GCToOSInterface::GetNumaProcessorNode(proc_number, node_no))
- {
- *node_no = NUMA_NODE_UNDEFINED;
- }
- }
+ procNumber.Number = (BYTE)gpn;
+ procNumber.Reserved = 0;
- success = true;
- break;
- }
- bit_number++;
+ if (GetNumaProcessorNodeEx(&procNumber, node_no))
+ {
+ *node_no = NUMA_NODE_UNDEFINED;
}
- proc_number++;
+ }
+ else
+ { // no numa setting, each cpu group is treated as a node
+ *node_no = groupProcNo.GetGroup();
}
}
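Both the Windows rewrite above and the Unix/vm rewrite later in this commit share the same first step: walk the processor indices and pick the heap_number-th one that is present in the process affinity set. A self-contained sketch of that selection is below; the callback stands in for g_processAffinitySet.Contains and the helper name is hypothetical.

// Sketch only, not part of the patch: locate the heap_number-th available
// processor, i.e. the heap_number-th set bit of the process affinity set.
#include <stdint.h>
#include <stddef.h>

bool FindProcessorForHeap(uint16_t heap_number, uint16_t totalProcs,
                          bool (*isInAffinitySet)(uint16_t), uint16_t* procNumber)
{
    size_t cnt = heap_number;
    for (uint16_t i = 0; i < totalProcs; i++)
    {
        if (isInAffinitySet(i))
        {
            if (cnt == 0)
            {
                *procNumber = i;
                return true;
            }
            cnt--;
        }
    }
    // Heaps beyond the number of affinitized processors get no processor.
    return false;
}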
diff --git a/src/inc/utilcode.h b/src/inc/utilcode.h
index 5b222b1ab9..cf4b8ddf12 100644
--- a/src/inc/utilcode.h
+++ b/src/inc/utilcode.h
@@ -1331,10 +1331,7 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
// Allocate free memory with specific alignment
//
LPVOID ClrVirtualAllocAligned(LPVOID lpAddress, SIZE_T dwSize, DWORD flAllocationType, DWORD flProtect, SIZE_T alignment);
-
-//******************************************************************************
-// Returns the number of processors that a process has been configured to run on
-//******************************************************************************
+
class NumaNodeInfo
{
private:
@@ -1350,10 +1347,16 @@ public: // functions
static LPVOID VirtualAllocExNuma(HANDLE hProc, LPVOID lpAddr, SIZE_T size,
DWORD allocType, DWORD prot, DWORD node);
+#ifndef FEATURE_PAL
static BOOL GetNumaProcessorNodeEx(PPROCESSOR_NUMBER proc_no, PUSHORT node_no);
+#else // !FEATURE_PAL
+ static BOOL GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no);
+#endif // !FEATURE_PAL
#endif
};
+#ifndef FEATURE_PAL
+
struct CPU_Group_Info
{
WORD nr_active; // at most 64
@@ -1413,9 +1416,15 @@ public:
}
};
-int GetCurrentProcessCpuCount();
DWORD_PTR GetCurrentProcessCpuMask();
+#endif // !FEATURE_PAL
+
+//******************************************************************************
+// Returns the number of processors that a process has been configured to run on
+//******************************************************************************
+int GetCurrentProcessCpuCount();
+
uint32_t GetOsPageSize();
diff --git a/src/pal/inc/pal.h b/src/pal/inc/pal.h
index 0c9b5a70b1..79bc677830 100644
--- a/src/pal/inc/pal.h
+++ b/src/pal/inc/pal.h
@@ -3996,88 +3996,6 @@ CreatePipe(
// NUMA related APIs
//
-typedef enum _PROCESSOR_CACHE_TYPE {
- CacheUnified,
- CacheInstruction,
- CacheData,
- CacheTrace
-} PROCESSOR_CACHE_TYPE;
-
-typedef struct _PROCESSOR_NUMBER {
- WORD Group;
- BYTE Number;
- BYTE Reserved;
-} PROCESSOR_NUMBER, *PPROCESSOR_NUMBER;
-
-typedef enum _LOGICAL_PROCESSOR_RELATIONSHIP {
- RelationProcessorCore,
- RelationNumaNode,
- RelationCache,
- RelationProcessorPackage,
- RelationGroup,
- RelationAll = 0xffff
-} LOGICAL_PROCESSOR_RELATIONSHIP;
-
-typedef ULONG_PTR KAFFINITY;
-
-#define ANYSIZE_ARRAY 1
-
-typedef struct _GROUP_AFFINITY {
- KAFFINITY Mask;
- WORD Group;
- WORD Reserved[3];
-} GROUP_AFFINITY, *PGROUP_AFFINITY;
-
-typedef struct _PROCESSOR_GROUP_INFO {
- BYTE MaximumProcessorCount;
- BYTE ActiveProcessorCount;
- BYTE Reserved[38];
- KAFFINITY ActiveProcessorMask;
-} PROCESSOR_GROUP_INFO, *PPROCESSOR_GROUP_INFO;
-
-typedef struct _PROCESSOR_RELATIONSHIP {
- BYTE Flags;
- BYTE EfficiencyClass;
- BYTE Reserved[21];
- WORD GroupCount;
- GROUP_AFFINITY GroupMask[ANYSIZE_ARRAY];
-} PROCESSOR_RELATIONSHIP, *PPROCESSOR_RELATIONSHIP;
-
-typedef struct _GROUP_RELATIONSHIP {
- WORD MaximumGroupCount;
- WORD ActiveGroupCount;
- BYTE Reserved[20];
- PROCESSOR_GROUP_INFO GroupInfo[ANYSIZE_ARRAY];
-} GROUP_RELATIONSHIP, *PGROUP_RELATIONSHIP;
-
-typedef struct _NUMA_NODE_RELATIONSHIP {
- DWORD NodeNumber;
- BYTE Reserved[20];
- GROUP_AFFINITY GroupMask;
-} NUMA_NODE_RELATIONSHIP, *PNUMA_NODE_RELATIONSHIP;
-
-typedef struct _CACHE_RELATIONSHIP {
- BYTE Level;
- BYTE Associativity;
- WORD LineSize;
- DWORD CacheSize;
- PROCESSOR_CACHE_TYPE Type;
- BYTE Reserved[20];
- GROUP_AFFINITY GroupMask;
-} CACHE_RELATIONSHIP, *PCACHE_RELATIONSHIP;
-
-typedef struct _SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX {
- LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
- DWORD Size;
- union {
- PROCESSOR_RELATIONSHIP Processor;
- NUMA_NODE_RELATIONSHIP NumaNode;
- CACHE_RELATIONSHIP Cache;
- GROUP_RELATIONSHIP Group;
- };
-} SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, *PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
-
-
PALIMPORT
BOOL
PALAPI
@@ -4088,10 +4006,7 @@ GetNumaHighestNodeNumber(
PALIMPORT
BOOL
PALAPI
-GetNumaProcessorNodeEx(
- IN PPROCESSOR_NUMBER Processor,
- OUT PUSHORT NodeNumber
-);
+PAL_GetNumaProcessorNode(WORD procNo, WORD* node);
PALIMPORT
LPVOID
@@ -4108,61 +4023,12 @@ VirtualAllocExNuma(
PALIMPORT
BOOL
PALAPI
-GetLogicalProcessorInformationEx(
- IN LOGICAL_PROCESSOR_RELATIONSHIP RelationshipType,
- OUT OPTIONAL PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer,
- IN OUT PDWORD ReturnedLength
-);
-
-PALIMPORT
-DWORD_PTR
-PALAPI
-SetThreadAffinityMask(
- IN HANDLE hThread,
- IN DWORD_PTR dwThreadAffinityMask
-);
-
-PALIMPORT
-BOOL
-PALAPI
-SetThreadGroupAffinity(
- IN HANDLE hThread,
- IN const GROUP_AFFINITY *GroupAffinity,
- OUT OPTIONAL PGROUP_AFFINITY PreviousGroupAffinity
-);
+PAL_SetCurrentThreadAffinity(WORD procNo);
PALIMPORT
BOOL
PALAPI
-GetThreadGroupAffinity(
- IN HANDLE hThread,
- OUT PGROUP_AFFINITY GroupAffinity
-);
-
-PALIMPORT
-VOID
-PALAPI
-GetCurrentProcessorNumberEx(
- OUT PPROCESSOR_NUMBER ProcNumber
-);
-
-PALIMPORT
-BOOL
-PALAPI
-GetProcessAffinityMask(
- IN HANDLE hProcess,
- OUT PDWORD_PTR lpProcessAffinityMask,
- OUT PDWORD_PTR lpSystemAffinityMask
-);
-
-PALIMPORT
-BOOL
-PALAPI
-SetThreadIdealProcessorEx(
- IN HANDLE hThread,
- IN PPROCESSOR_NUMBER lpIdealProcessor,
- OUT PPROCESSOR_NUMBER lpPreviousIdealProcessor
-);
+PAL_GetCurrentThreadAffinitySet(SIZE_T size, UINT_PTR* data);
//
// The types of events that can be logged.
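Taken together, the three new exports replace the removed Win32-style group-affinity emulation with a flat processor-number model. A hedged usage sketch of the declarations above follows; it assumes pal.h is included, the buffer size is an arbitrary assumption, and error handling is minimal.

// Sketch only, not part of the patch: enumerate the current thread's affinity
// set, pin the thread to the first allowed processor, and query its NUMA node,
// using only the PAL_* exports declared above.
#include <stdio.h>

void AffinitizeToFirstAllowedProcessor()
{
    const SIZE_T entries = 8;                 // assumption: room for 512 CPUs on a 64-bit build
    UINT_PTR affinitySet[entries] = {};

    if (!PAL_GetCurrentThreadAffinitySet(entries, affinitySet))
        return;                               // no affinity API on this OS

    const SIZE_T bitsPerEntry = 8 * sizeof(UINT_PTR);
    for (SIZE_T i = 0; i < entries * bitsPerEntry; i++)
    {
        if (affinitySet[i / bitsPerEntry] & ((UINT_PTR)1 << (i % bitsPerEntry)))
        {
            WORD procNo = (WORD)i;
            if (PAL_SetCurrentThreadAffinity(procNo))
            {
                WORD node;
                if (PAL_GetNumaProcessorNode(procNo, &node))
                    printf("Affinitized to CPU %u on NUMA node %u\n", (unsigned)procNo, (unsigned)node);
            }
            break;
        }
    }
}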
diff --git a/src/pal/src/include/pal/palinternal.h b/src/pal/src/include/pal/palinternal.h
index 67236aaa49..6f64208e05 100644
--- a/src/pal/src/include/pal/palinternal.h
+++ b/src/pal/src/include/pal/palinternal.h
@@ -679,6 +679,9 @@ typedef enum _TimeConversionConstants
bool
ReadMemoryValueFromFile(const char* filename, size_t* val);
+DWORD
+GetTotalCpuCount();
+
#ifdef __APPLE__
bool
GetApplicationContainerFolder(PathCharString& buffer, const char *applicationGroupId, int applicationGroupIdLength);
diff --git a/src/pal/src/misc/sysinfo.cpp b/src/pal/src/misc/sysinfo.cpp
index 2c14949b95..419c3f6708 100644
--- a/src/pal/src/misc/sysinfo.cpp
+++ b/src/pal/src/misc/sysinfo.cpp
@@ -95,24 +95,11 @@ SET_DEFAULT_DEBUG_CHANNEL(MISC);
#endif
#endif // __APPLE__
-
-DWORD
-PALAPI
-PAL_GetLogicalCpuCountFromOS()
+DWORD GetTotalCpuCount()
{
int nrcpus = 0;
-#if HAVE_SCHED_GETAFFINITY
-
- cpu_set_t cpuSet;
- int st = sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet);
- if (st != 0)
- {
- ASSERT("sched_getaffinity failed (%d)\n", errno);
- }
-
- nrcpus = CPU_COUNT(&cpuSet);
-#elif HAVE_SYSCONF
+#if HAVE_SYSCONF
#if defined(_ARM_) || defined(_ARM64_)
#define SYSCONF_GET_NUMPROCS _SC_NPROCESSORS_CONF
@@ -139,11 +126,36 @@ PAL_GetLogicalCpuCountFromOS()
{
ASSERT("sysctl failed for HW_NCPU (%d)\n", errno);
}
+#else // HAVE_SYSCONF
+#error "Don't know how to get total CPU count on this platform"
#endif // HAVE_SYSCONF
return nrcpus;
}
+DWORD
+PALAPI
+PAL_GetLogicalCpuCountFromOS()
+{
+ int nrcpus = 0;
+
+#if HAVE_SCHED_GETAFFINITY
+
+ cpu_set_t cpuSet;
+ int st = sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet);
+ if (st != 0)
+ {
+ ASSERT("sched_getaffinity failed (%d)\n", errno);
+ }
+
+ nrcpus = CPU_COUNT(&cpuSet);
+#else // HAVE_SCHED_GETAFFINITY
+ nrcpus = GetTotalCpuCount();
+#endif // HAVE_SCHED_GETAFFINITY
+
+ return nrcpus;
+}
+
/*++
Function:
GetSystemInfo
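The split above separates two different questions: GetTotalCpuCount asks how many logical CPUs the machine has (sysconf), while PAL_GetLogicalCpuCountFromOS asks how many the process may actually run on (sched_getaffinity when available). A minimal Linux sketch of the distinction, using _SC_NPROCESSORS_ONLN rather than the ARM-specific _SC_NPROCESSORS_CONF branch:

// Sketch only, not part of the patch: total CPUs vs. CPUs usable by this
// process. On a 16-CPU box started under `taskset -c 0-3`, the first value
// is 16 and the second is 4.
#define _GNU_SOURCE
#include <unistd.h>
#include <sched.h>
#include <stdio.h>

int main()
{
    long total = sysconf(_SC_NPROCESSORS_ONLN);

    cpu_set_t cpuSet;
    int usable = -1;
    if (sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet) == 0)
    {
        usable = CPU_COUNT(&cpuSet);
    }

    printf("total=%ld usable=%d\n", total, usable);
    return 0;
}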
diff --git a/src/pal/src/numa/numa.cpp b/src/pal/src/numa/numa.cpp
index 9283a044da..0c9d4090a5 100644
--- a/src/pal/src/numa/numa.cpp
+++ b/src/pal/src/numa/numa.cpp
@@ -47,33 +47,6 @@ using namespace CorUnix;
typedef cpuset_t cpu_set_t;
#endif
-// CPU affinity descriptor
-struct CpuAffinity
-{
- // NUMA node
- BYTE Node;
- // CPU number relative to the group the CPU is in
- BYTE Number;
- // CPU group
- WORD Group;
-};
-
-// Array mapping global CPU index to its affinity
-CpuAffinity *g_cpuToAffinity = NULL;
-
-// Array mapping CPU group and index in the group to the global CPU index
-short *g_groupAndIndexToCpu = NULL;
-// Array mapping CPU group to the corresponding affinity mask of the CPUs in the group
-KAFFINITY *g_groupToCpuMask = NULL;
-// Array mapping CPU group to the number of processors in the group
-BYTE *g_groupToCpuCount = NULL;
-
-// Total number of processors in the system
-int g_cpuCount = 0;
-// Total number of possible processors in the system
-int g_possibleCpuCount = 0;
-// Total number of CPU groups
-int g_groupCount = 0;
// The highest NUMA node available
int g_highestNumaNode = 0;
// Is numa available
@@ -87,92 +60,6 @@ FOR_ALL_NUMA_FUNCTIONS
#undef PER_FUNCTION_BLOCK
#endif // HAVE_NUMA_H
-static const int MaxCpusPerGroup = 8 * sizeof(KAFFINITY);
-static const WORD NO_GROUP = 0xffff;
-
-/*++
-Function:
- FreeLookupArrays
-
-Free CPU and group lookup arrays
---*/
-VOID
-FreeLookupArrays()
-{
- free(g_groupAndIndexToCpu);
- free(g_cpuToAffinity);
- free(g_groupToCpuMask);
- free(g_groupToCpuCount);
-
- g_groupAndIndexToCpu = NULL;
- g_cpuToAffinity = NULL;
- g_groupToCpuMask = NULL;
- g_groupToCpuCount = NULL;
-}
-
-/*++
-Function:
- AllocateLookupArrays
-
-Allocate CPU and group lookup arrays
-Return TRUE if the allocation succeeded
---*/
-BOOL
-AllocateLookupArrays()
-{
- g_groupAndIndexToCpu = (short*)malloc(g_groupCount * MaxCpusPerGroup * sizeof(short));
- if (g_groupAndIndexToCpu == NULL)
- {
- goto FAILED;
- }
-
- g_cpuToAffinity = (CpuAffinity*)malloc(g_possibleCpuCount * sizeof(CpuAffinity));
- if (g_cpuToAffinity == NULL)
- {
- goto FAILED;
- }
-
- g_groupToCpuMask = (KAFFINITY*)malloc(g_groupCount * sizeof(KAFFINITY));
- if (g_groupToCpuMask == NULL)
- {
- goto FAILED;
- }
-
- g_groupToCpuCount = (BYTE*)malloc(g_groupCount * sizeof(BYTE));
- if (g_groupToCpuCount == NULL)
- {
- goto FAILED;
- }
-
- memset(g_groupAndIndexToCpu, 0xff, g_groupCount * MaxCpusPerGroup * sizeof(short));
- memset(g_cpuToAffinity, 0xff, g_possibleCpuCount * sizeof(CpuAffinity));
- memset(g_groupToCpuMask, 0, g_groupCount * sizeof(KAFFINITY));
- memset(g_groupToCpuCount, 0, g_groupCount * sizeof(BYTE));
-
- return TRUE;
-
-FAILED:
- FreeLookupArrays();
-
- return FALSE;
-}
-
-/*++
-Function:
- GetFullAffinityMask
-
-Get affinity mask for the specified number of processors with all
-the processors enabled.
---*/
-KAFFINITY GetFullAffinityMask(int cpuCount)
-{
- if ((size_t)cpuCount < sizeof(KAFFINITY) * 8)
- {
- return ((KAFFINITY)1 << (cpuCount)) - 1;
- }
-
- return ~(KAFFINITY)0;
-}
/*++
Function:
@@ -208,73 +95,6 @@ FOR_ALL_NUMA_FUNCTIONS
else
{
g_numaAvailable = true;
-
- struct bitmask *mask = numa_allocate_cpumask();
- int numaNodesCount = numa_max_node() + 1;
-
- g_possibleCpuCount = numa_num_possible_cpus();
- g_cpuCount = 0;
- g_groupCount = 0;
-
- for (int i = 0; i < numaNodesCount; i++)
- {
- int st = numa_node_to_cpus(i, mask);
- // The only failure that can happen is that the mask is not large enough
- // but that cannot happen since the mask was allocated by numa_allocate_cpumask
- _ASSERTE(st == 0);
- unsigned int nodeCpuCount = numa_bitmask_weight(mask);
- g_cpuCount += nodeCpuCount;
- unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup;
- g_groupCount += nodeGroupCount;
- }
-
- if (!AllocateLookupArrays())
- {
- dlclose(numaHandle);
- return FALSE;
- }
-
- WORD currentGroup = 0;
- int currentGroupCpus = 0;
-
- for (int i = 0; i < numaNodesCount; i++)
- {
- int st = numa_node_to_cpus(i, mask);
- // The only failure that can happen is that the mask is not large enough
- // but that cannot happen since the mask was allocated by numa_allocate_cpumask
- _ASSERTE(st == 0);
- unsigned int nodeCpuCount = numa_bitmask_weight(mask);
- unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup;
- for (int j = 0; j < g_possibleCpuCount; j++)
- {
- if (numa_bitmask_isbitset(mask, j))
- {
- if (currentGroupCpus == MaxCpusPerGroup)
- {
- g_groupToCpuCount[currentGroup] = MaxCpusPerGroup;
- g_groupToCpuMask[currentGroup] = GetFullAffinityMask(MaxCpusPerGroup);
- currentGroupCpus = 0;
- currentGroup++;
- }
- g_cpuToAffinity[j].Node = i;
- g_cpuToAffinity[j].Group = currentGroup;
- g_cpuToAffinity[j].Number = currentGroupCpus;
- g_groupAndIndexToCpu[currentGroup * MaxCpusPerGroup + currentGroupCpus] = j;
- currentGroupCpus++;
- }
- }
-
- if (currentGroupCpus != 0)
- {
- g_groupToCpuCount[currentGroup] = currentGroupCpus;
- g_groupToCpuMask[currentGroup] = GetFullAffinityMask(currentGroupCpus);
- currentGroupCpus = 0;
- currentGroup++;
- }
- }
-
- numa_free_cpumask(mask);
-
g_highestNumaNode = numa_max_node();
}
}
@@ -282,21 +102,7 @@ FOR_ALL_NUMA_FUNCTIONS
if (!g_numaAvailable)
{
// No NUMA
- g_possibleCpuCount = PAL_GetLogicalCpuCountFromOS();
- g_cpuCount = PAL_GetLogicalCpuCountFromOS();
- g_groupCount = 1;
g_highestNumaNode = 0;
-
- if (!AllocateLookupArrays())
- {
- return FALSE;
- }
-
- for (int i = 0; i < g_possibleCpuCount; i++)
- {
- g_cpuToAffinity[i].Number = i;
- g_cpuToAffinity[i].Group = 0;
- }
}
return TRUE;
@@ -311,7 +117,6 @@ Cleanup of the NUMA support data structures
VOID
NUMASupportCleanup()
{
- FreeLookupArrays();
#if HAVE_NUMA_H
if (g_numaAvailable)
{
@@ -346,493 +151,35 @@ GetNumaHighestNodeNumber(
/*++
Function:
- GetNumaProcessorNodeEx
-
-See MSDN doc.
---*/
-BOOL
-PALAPI
-GetNumaProcessorNodeEx(
- IN PPROCESSOR_NUMBER Processor,
- OUT PUSHORT NodeNumber
-)
-{
- PERF_ENTRY(GetNumaProcessorNodeEx);
- ENTRY("GetNumaProcessorNodeEx(Processor=%p, NodeNumber=%p)\n", Processor, NodeNumber);
+ PAL_GetNumaProcessorNode
- BOOL success = FALSE;
+Abstract
+ Get NUMA node of a processor
- if ((Processor->Group < g_groupCount) &&
- (Processor->Number < MaxCpusPerGroup) &&
- (Processor->Reserved == 0))
- {
- short cpu = g_groupAndIndexToCpu[Processor->Group * MaxCpusPerGroup + Processor->Number];
- if (cpu != -1)
- {
- *NodeNumber = g_cpuToAffinity[cpu].Node;
- success = TRUE;
- }
- }
-
- if (!success)
- {
- *NodeNumber = 0xffff;
- SetLastError(ERROR_INVALID_PARAMETER);
- }
+Parameters:
+ procNo - number of the processor to get the NUMA node for
+ node - the resulting NUMA node
- LOGEXIT("GetNumaProcessorNodeEx returns BOOL %d\n", success);
- PERF_EXIT(GetNumaProcessorNodeEx);
-
- return success;
-}
-
-/*++
-Function:
- GetLogicalProcessorInformationEx
-
-See MSDN doc.
---*/
-BOOL
-PALAPI
-GetLogicalProcessorInformationEx(
- IN LOGICAL_PROCESSOR_RELATIONSHIP RelationshipType,
- OUT OPTIONAL PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer,
- IN OUT PDWORD ReturnedLength
-)
-{
- PERF_ENTRY(GetLogicalProcessorInformationEx);
- ENTRY("GetLogicalProcessorInformationEx(RelationshipType=%d, Buffer=%p, ReturnedLength=%p)\n", RelationshipType, Buffer, ReturnedLength);
-
- BOOL success = FALSE;
-
- if (RelationshipType == RelationGroup)
- {
- size_t requiredSize = __builtin_offsetof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Group);
- requiredSize += __builtin_offsetof(GROUP_RELATIONSHIP, GroupInfo);
- requiredSize += g_groupCount * sizeof(PROCESSOR_GROUP_INFO);
-
- if (*ReturnedLength >= requiredSize)
- {
- Buffer->Relationship = RelationGroup;
- Buffer->Size = requiredSize;
- Buffer->Group.MaximumGroupCount = g_groupCount;
- Buffer->Group.ActiveGroupCount = g_groupCount;
- for (int i = 0; i < g_groupCount; i++)
- {
- Buffer->Group.GroupInfo[i].MaximumProcessorCount = MaxCpusPerGroup;
- Buffer->Group.GroupInfo[i].ActiveProcessorCount = g_groupToCpuCount[i];
- Buffer->Group.GroupInfo[i].ActiveProcessorMask = g_groupToCpuMask[i];
- }
-
- success = TRUE;
- }
- else
- {
- SetLastError(ERROR_INSUFFICIENT_BUFFER);
- }
-
- *ReturnedLength = requiredSize;
- }
- else
- {
- // We only support the group relationship
- SetLastError(ERROR_INVALID_PARAMETER);
- }
-
- LOGEXIT("GetLogicalProcessorInformationEx returns BOOL %d\n", success);
- PERF_EXIT(GetLogicalProcessorInformationEx);
-
- return success;
-}
-
-/*++
-Function:
- GetThreadGroupAffinityInternal
-
-Get the group affinity for the specified pthread
---*/
-BOOL
-GetThreadGroupAffinityInternal(
- IN pthread_t thread,
- OUT PGROUP_AFFINITY GroupAffinity
-)
-{
- BOOL success = FALSE;
-
-#if HAVE_PTHREAD_GETAFFINITY_NP
- cpu_set_t cpuSet;
-
- int st = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
-
- if (st == 0)
- {
- WORD group = NO_GROUP;
- KAFFINITY mask = 0;
-
- for (int i = 0; i < g_possibleCpuCount; i++)
- {
- if (CPU_ISSET(i, &cpuSet))
- {
- WORD g = g_cpuToAffinity[i].Group;
- // Unless the thread affinity was already set by SetThreadGroupAffinity, it is possible that
- // the current thread has affinity with processors from multiple groups. So we report just the
- // first group we find.
- if (group == NO_GROUP || g == group)
- {
- group = g;
- mask |= ((KAFFINITY)1) << g_cpuToAffinity[i].Number;
- }
- }
- }
-
- GroupAffinity->Group = group;
- GroupAffinity->Mask = mask;
- success = TRUE;
- }
- else
- {
- SetLastError(ERROR_GEN_FAILURE);
- }
-#else // HAVE_PTHREAD_GETAFFINITY_NP
- // There is no API to manage thread affinity, so let's return a group affinity
- // with all the CPUs on the system.
- GroupAffinity->Group = 0;
- GroupAffinity->Mask = GetFullAffinityMask(g_possibleCpuCount);
- success = TRUE;
-#endif // HAVE_PTHREAD_GETAFFINITY_NP
-
- return success;
-}
-
-/*++
-Function:
- GetThreadGroupAffinity
-
-See MSDN doc.
---*/
-BOOL
-PALAPI
-GetThreadGroupAffinity(
- IN HANDLE hThread,
- OUT PGROUP_AFFINITY GroupAffinity
-)
-{
- PERF_ENTRY(GetThreadGroupAffinity);
- ENTRY("GetThreadGroupAffinity(hThread=%p, GroupAffinity=%p)\n", hThread, GroupAffinity);
- CPalThread *pCurrentThread = InternalGetCurrentThread();
- CPalThread *pTargetThread = NULL;
- IPalObject *pTargetThreadObject = NULL;
-
- PAL_ERROR palErr =
- InternalGetThreadDataFromHandle(pCurrentThread, hThread,
- 0, // THREAD_SET_CONTEXT
- &pTargetThread, &pTargetThreadObject);
-
- if (NO_ERROR != palErr)
- {
- ERROR("Unable to obtain thread data for handle %p (error %x)!\n", hThread,
- palErr);
- return FALSE;
- }
-
- BOOL success = GetThreadGroupAffinityInternal(
- pTargetThread->GetPThreadSelf(), GroupAffinity);
- LOGEXIT("GetThreadGroupAffinity returns BOOL %d\n", success);
- PERF_EXIT(GetThreadGroupAffinity);
-
- return success;
-}
-
-
-/*++
-Function:
- SetThreadGroupAffinity
-
-See MSDN doc.
+Return value:
+ TRUE if the function was able to get the NUMA node, FALSE if it has failed.
--*/
BOOL
PALAPI
-SetThreadGroupAffinity(
- IN HANDLE hThread,
- IN const GROUP_AFFINITY *GroupAffinity,
- OUT OPTIONAL PGROUP_AFFINITY PreviousGroupAffinity
-)
+PAL_GetNumaProcessorNode(WORD procNo, WORD* node)
{
- PERF_ENTRY(SetThreadGroupAffinity);
- ENTRY("SetThreadGroupAffinity(hThread=%p, GroupAffinity=%p, PreviousGroupAffinity=%p)\n", hThread, GroupAffinity, PreviousGroupAffinity);
-
- CPalThread *pCurrentThread = InternalGetCurrentThread();
- CPalThread *pTargetThread = NULL;
- IPalObject *pTargetThreadObject = NULL;
-
- PAL_ERROR palErr =
- InternalGetThreadDataFromHandle(pCurrentThread, hThread,
- 0, // THREAD_SET_CONTEXT
- &pTargetThread, &pTargetThreadObject);
-
- if (NO_ERROR != palErr)
- {
- ERROR("Unable to obtain thread data for handle %p (error %x)!\n", hThread,
- palErr);
- return FALSE;
- }
-
- pthread_t thread = pTargetThread->GetPThreadSelf();
-
- if (PreviousGroupAffinity != NULL)
- {
- GetThreadGroupAffinityInternal(thread, PreviousGroupAffinity);
- }
-
-#if HAVE_PTHREAD_GETAFFINITY_NP
- int groupStartIndex = GroupAffinity->Group * MaxCpusPerGroup;
- KAFFINITY mask = 1;
- cpu_set_t cpuSet;
- CPU_ZERO(&cpuSet);
-
- for (int i = 0; i < MaxCpusPerGroup; i++, mask <<= 1)
- {
- if (GroupAffinity->Mask & mask)
- {
- int cpu = g_groupAndIndexToCpu[groupStartIndex + i];
- if (cpu != -1)
- {
- CPU_SET(cpu, &cpuSet);
- }
- }
- }
-
- int st = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
-
- if (st != 0)
- {
- switch (st)
- {
- case EINVAL:
- // There is no processor in the mask that is allowed to execute the process
- SetLastError(ERROR_INVALID_PARAMETER);
- break;
- case ESRCH:
- SetLastError(ERROR_INVALID_HANDLE);
- break;
- default:
- SetLastError(ERROR_GEN_FAILURE);
- break;
- }
- }
-
- BOOL success = (st == 0);
-#else // HAVE_PTHREAD_GETAFFINITY_NP
- // There is no API to manage thread affinity, so let's ignore the request
- BOOL success = TRUE;
-#endif // HAVE_PTHREAD_GETAFFINITY_NP
-
- LOGEXIT("SetThreadGroupAffinity returns BOOL %d\n", success);
- PERF_EXIT(SetThreadGroupAffinity);
-
- return success;
-}
-
-/*++
-Function:
- SetThreadAffinityMask
-
-See MSDN doc.
---*/
-DWORD_PTR
-PALAPI
-SetThreadAffinityMask(
- IN HANDLE hThread,
- IN DWORD_PTR dwThreadAffinityMask
-)
-{
- PERF_ENTRY(SetThreadAffinityMask);
- ENTRY("SetThreadAffinityMask(hThread=%p, dwThreadAffinityMask=%p)\n", hThread, dwThreadAffinityMask);
-
- CPalThread *pCurrentThread = InternalGetCurrentThread();
- CPalThread *pTargetThread = NULL;
- IPalObject *pTargetThreadObject = NULL;
-
- PAL_ERROR palErr =
- InternalGetThreadDataFromHandle(pCurrentThread, hThread,
- 0, // THREAD_SET_CONTEXT
- &pTargetThread, &pTargetThreadObject);
-
- if (NO_ERROR != palErr)
- {
- ERROR("Unable to obtain thread data for handle %p (error %x)!\n", hThread,
- palErr);
- return 0;
- }
-
- pthread_t thread = pTargetThread->GetPThreadSelf();
-
-#if HAVE_PTHREAD_GETAFFINITY_NP
- cpu_set_t prevCpuSet;
- CPU_ZERO(&prevCpuSet);
- KAFFINITY prevMask = 0;
-
- int st = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &prevCpuSet);
-
- if (st == 0)
- {
- for (int i = 0; i < std::min(8 * (int)sizeof(KAFFINITY), g_possibleCpuCount); i++)
- {
- if (CPU_ISSET(i, &prevCpuSet))
- {
- prevMask |= ((KAFFINITY)1) << i;
- }
- }
- }
-
- cpu_set_t cpuSet;
- CPU_ZERO(&cpuSet);
-
- int cpu = 0;
- while (dwThreadAffinityMask)
- {
- if (dwThreadAffinityMask & 1)
- {
- CPU_SET(cpu, &cpuSet);
- }
- cpu++;
- dwThreadAffinityMask >>= 1;
- }
-
- st = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
-
- if (st != 0)
- {
- switch (st)
- {
- case EINVAL:
- // There is no processor in the mask that is allowed to execute the
- // process
- SetLastError(ERROR_INVALID_PARAMETER);
- break;
- case ESRCH:
- SetLastError(ERROR_INVALID_HANDLE);
- break;
- default:
- SetLastError(ERROR_GEN_FAILURE);
- break;
- }
- }
-
- DWORD_PTR ret = (st == 0) ? prevMask : 0;
-#else // HAVE_PTHREAD_GETAFFINITY_NP
- // There is no API to manage thread affinity, so let's ignore the request
- DWORD_PTR ret = 0;
-#endif // HAVE_PTHREAD_GETAFFINITY_NP
- LOGEXIT("SetThreadAffinityMask returns %lu\n", ret);
- PERF_EXIT(SetThreadAffinityMask);
-
- return ret;
-}
-
-/*++
-Function:
- GetCurrentProcessorNumberEx
-
-See MSDN doc.
---*/
-VOID
-PALAPI
-GetCurrentProcessorNumberEx(
- OUT PPROCESSOR_NUMBER ProcNumber
-)
-{
- PERF_ENTRY(GetCurrentProcessorNumberEx);
- ENTRY("GetCurrentProcessorNumberEx(ProcNumber=%p\n", ProcNumber);
-
- DWORD cpu = GetCurrentProcessorNumber();
- _ASSERTE((int)cpu < g_possibleCpuCount);
- ProcNumber->Group = g_cpuToAffinity[cpu].Group;
- ProcNumber->Number = g_cpuToAffinity[cpu].Number;
-
- LOGEXIT("GetCurrentProcessorNumberEx\n");
- PERF_EXIT(GetCurrentProcessorNumberEx);
-}
-
-/*++
-Function:
- GetProcessAffinityMask
-
-See MSDN doc.
---*/
-BOOL
-PALAPI
-GetProcessAffinityMask(
- IN HANDLE hProcess,
- OUT PDWORD_PTR lpProcessAffinityMask,
- OUT PDWORD_PTR lpSystemAffinityMask
-)
-{
- PERF_ENTRY(GetProcessAffinityMask);
- ENTRY("GetProcessAffinityMask(hProcess=%p, lpProcessAffinityMask=%p, lpSystemAffinityMask=%p\n", hProcess, lpProcessAffinityMask, lpSystemAffinityMask);
-
- BOOL success = FALSE;
-
- if (hProcess == GetCurrentProcess())
+#if HAVE_NUMA_H
+ if (g_numaAvailable)
{
- int cpuCountInMask = (g_cpuCount > 64) ? 64 : g_cpuCount;
-
- DWORD_PTR systemMask = GetFullAffinityMask(cpuCountInMask);
-
-#if HAVE_SCHED_GETAFFINITY
- int pid = getpid();
- cpu_set_t cpuSet;
- int st = sched_getaffinity(pid, sizeof(cpu_set_t), &cpuSet);
- if (st == 0)
+ int result = numa_node_of_cpu(procNo);
+ if (result >= 0)
{
- DWORD_PTR processMask = 0;
-
- for (int i = 0; i < cpuCountInMask; i++)
- {
- if (CPU_ISSET(i, &cpuSet))
- {
- processMask |= ((DWORD_PTR)1) << i;
- }
- }
-
- success = TRUE;
-
- *lpProcessAffinityMask = processMask;
- *lpSystemAffinityMask = systemMask;
- }
- else if (errno == EINVAL)
- {
- // There are more processors than can fit in a cpu_set_t
- // return all bits set for all processors (upto 64) for both masks.
- *lpProcessAffinityMask = systemMask;
- *lpSystemAffinityMask = systemMask;
- success = TRUE;
- }
- else
- {
- // We should not get any of the errors that the sched_getaffinity can return since none
- // of them applies for the current thread, so this is an unexpected kind of failure.
- SetLastError(ERROR_GEN_FAILURE);
+ *node = (WORD)result;
+ return TRUE;
}
-#else // HAVE_SCHED_GETAFFINITY
- // There is no API to manage thread affinity, so let's return both affinity masks
- // with all the CPUs on the system set.
- *lpSystemAffinityMask = systemMask;
- *lpProcessAffinityMask = systemMask;
-
- success = TRUE;
-#endif // HAVE_SCHED_GETAFFINITY
- }
- else
- {
- // PAL supports getting affinity mask for the current process only
- SetLastError(ERROR_INVALID_PARAMETER);
}
+#endif // HAVE_NUMA_H
- LOGEXIT("GetProcessAffinityMask returns BOOL %d\n", success);
- PERF_EXIT(GetProcessAffinityMask);
-
- return success;
+ return FALSE;
}
/*++
@@ -898,115 +245,3 @@ VirtualAllocExNuma(
return result;
}
-
-/*++
-Function:
- SetThreadIdealProcessorEx
-
-See MSDN doc.
---*/
-BOOL
-PALAPI
-SetThreadIdealProcessorEx(
- IN HANDLE hThread,
- IN PPROCESSOR_NUMBER lpIdealProcessor,
- OUT PPROCESSOR_NUMBER lpPreviousIdealProcessor)
-{
- PERF_ENTRY(SetThreadIdealProcessorEx);
- ENTRY("SetThreadIdealProcessorEx(hThread=%p, lpIdealProcessor=%p)\n", hThread, lpIdealProcessor);
-
- CPalThread *pCurrentThread = InternalGetCurrentThread();
- CPalThread *pTargetThread = NULL;
- IPalObject *pTargetThreadObject = NULL;
-
- PAL_ERROR palErr =
- InternalGetThreadDataFromHandle(pCurrentThread, hThread,
- 0, // THREAD_SET_CONTEXT
- &pTargetThread, &pTargetThreadObject);
-
- if (NO_ERROR != palErr)
- {
- ERROR("Unable to obtain thread data for handle %p (error %x)!\n", hThread,
- palErr);
- return 0;
- }
-
- pthread_t thread = pTargetThread->GetPThreadSelf();
-
-#if HAVE_PTHREAD_GETAFFINITY_NP
- int cpu = -1;
- if ((lpIdealProcessor->Group < g_groupCount) &&
- (lpIdealProcessor->Number < MaxCpusPerGroup) &&
- (lpIdealProcessor->Reserved == 0))
- {
- cpu = g_groupAndIndexToCpu[lpIdealProcessor->Group * MaxCpusPerGroup + lpIdealProcessor->Number];
- }
-
- if (cpu == -1)
- {
- SetLastError(ERROR_INVALID_PARAMETER);
- return FALSE;
- }
-
- if (lpPreviousIdealProcessor != NULL)
- {
- cpu_set_t prevCpuSet;
- CPU_ZERO(&prevCpuSet);
- DWORD prevCpu = GetCurrentProcessorNumber();
-
- int st = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &prevCpuSet);
-
- if (st == 0)
- {
- for (int i = 0; i < g_possibleCpuCount; i++)
- {
- if (CPU_ISSET(i, &prevCpuSet))
- {
- prevCpu = i;
- break;
- }
- }
- }
-
- _ASSERTE((int)prevCpu < g_possibleCpuCount);
- lpPreviousIdealProcessor->Group = g_cpuToAffinity[prevCpu].Group;
- lpPreviousIdealProcessor->Number = g_cpuToAffinity[prevCpu].Number;
- lpPreviousIdealProcessor->Reserved = 0;
- }
-
- cpu_set_t cpuSet;
- CPU_ZERO(&cpuSet);
- CPU_SET(cpu, &cpuSet);
-
- int st = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
-
- if (st != 0)
- {
- switch (st)
- {
- case EINVAL:
- // There is no processor in the mask that is allowed to execute the
- // process
- SetLastError(ERROR_INVALID_PARAMETER);
- break;
- case ESRCH:
- SetLastError(ERROR_INVALID_HANDLE);
- break;
- default:
- SetLastError(ERROR_GEN_FAILURE);
- break;
- }
- }
-
- BOOL success = (st == 0);
-
-#else // HAVE_PTHREAD_GETAFFINITY_NP
- // There is no API to manage thread affinity, so let's ignore the request
- BOOL success = FALSE;
-#endif // HAVE_PTHREAD_GETAFFINITY_NP
-
- LOGEXIT("SetThreadIdealProcessorEx returns BOOL %d\n", success);
- PERF_EXIT(SetThreadIdealProcessorEx);
-
- return success;
-}
diff --git a/src/pal/src/numa/numashim.h b/src/pal/src/numa/numashim.h
index dd7f58d6de..e56cfab9d1 100644
--- a/src/pal/src/numa/numashim.h
+++ b/src/pal/src/numa/numashim.h
@@ -13,19 +13,12 @@
#include <numa.h>
#include <numaif.h>
-#define numa_free_cpumask numa_bitmask_free
-
// List of all functions from the numa library that are used
#define FOR_ALL_NUMA_FUNCTIONS \
PER_FUNCTION_BLOCK(numa_available) \
PER_FUNCTION_BLOCK(mbind) \
- PER_FUNCTION_BLOCK(numa_num_possible_cpus) \
PER_FUNCTION_BLOCK(numa_max_node) \
- PER_FUNCTION_BLOCK(numa_allocate_cpumask) \
- PER_FUNCTION_BLOCK(numa_node_to_cpus) \
- PER_FUNCTION_BLOCK(numa_bitmask_weight) \
- PER_FUNCTION_BLOCK(numa_bitmask_isbitset) \
- PER_FUNCTION_BLOCK(numa_bitmask_free)
+ PER_FUNCTION_BLOCK(numa_node_of_cpu)
// Declare pointers to all the used numa functions
#define PER_FUNCTION_BLOCK(fn) extern decltype(fn)* fn##_ptr;
@@ -36,13 +29,8 @@ FOR_ALL_NUMA_FUNCTIONS
// to the functions of libnuma in the initialization.
#define numa_available() numa_available_ptr()
#define mbind(...) mbind_ptr(__VA_ARGS__)
-#define numa_num_possible_cpus() numa_num_possible_cpus_ptr()
#define numa_max_node() numa_max_node_ptr()
-#define numa_allocate_cpumask() numa_allocate_cpumask_ptr()
-#define numa_node_to_cpus(...) numa_node_to_cpus_ptr(__VA_ARGS__)
-#define numa_bitmask_weight(...) numa_bitmask_weight_ptr(__VA_ARGS__)
-#define numa_bitmask_isbitset(...) numa_bitmask_isbitset_ptr(__VA_ARGS__)
-#define numa_bitmask_free(...) numa_bitmask_free_ptr(__VA_ARGS__)
+#define numa_node_of_cpu(...) numa_node_of_cpu_ptr(__VA_ARGS__)
#endif // HAVE_NUMA_H
diff --git a/src/pal/src/thread/thread.cpp b/src/pal/src/thread/thread.cpp
index 86a08639c7..122e86014c 100644
--- a/src/pal/src/thread/thread.cpp
+++ b/src/pal/src/thread/thread.cpp
@@ -64,6 +64,7 @@ SET_DEFAULT_DEBUG_CHANNEL(THREAD); // some headers have code with asserts, so do
#include "pal/fakepoll.h"
#endif // HAVE_POLL
#include <limits.h>
+#include <algorithm>
#if HAVE_SYS_LWP_H
#include <sys/lwp.h>
@@ -2921,3 +2922,95 @@ int CorUnix::CThreadMachExceptionHandlers::GetIndexOfHandler(exception_mask_t bm
}
#endif // HAVE_MACH_EXCEPTIONS
+
+/*++
+Function:
+ PAL_SetCurrentThreadAffinity
+
+Abstract
+ Set affinity of the current thread to the specified processor.
+
+Parameters:
+ procNo - number of the processor to affinitize the current thread to
+
+Return value:
+ TRUE if the function was able to set the affinity, FALSE if it has failed.
+--*/
+BOOL
+PALAPI
+PAL_SetCurrentThreadAffinity(WORD procNo)
+{
+#if HAVE_PTHREAD_GETAFFINITY_NP
+ cpu_set_t cpuSet;
+ CPU_ZERO(&cpuSet);
+
+ int st = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuSet);
+
+ if (st == 0)
+ {
+ CPU_SET(procNo, &cpuSet);
+ st = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuSet);
+ }
+
+ return st == 0;
+#else // HAVE_PTHREAD_GETAFFINITY_NP
+ // There is no API to manage thread affinity, so let's ignore the request
+ return FALSE;
+#endif // HAVE_PTHREAD_GETAFFINITY_NP
+}
+
+/*++
+Function:
+ PAL_GetCurrentThreadAffinitySet
+
+Abstract
+ Get affinity set of the current thread. The set is represented by an array of "size" entries of UINT_PTR type.
+
+Parameters:
+ size - number of entries in the "data" array
+ data - pointer to the data of the resulting set, the LSB of the first entry in the array represents processor 0
+
+Return value:
+ TRUE if the function was able to get the affinity set, FALSE if it has failed.
+--*/
+BOOL
+PALAPI
+PAL_GetCurrentThreadAffinitySet(SIZE_T size, UINT_PTR* data)
+{
+ cpu_set_t cpuSet;
+ CPU_ZERO(&cpuSet);
+
+#if HAVE_PTHREAD_GETAFFINITY_NP
+ int st = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuSet);
+
+ if (st == 0)
+ {
+ const SIZE_T BitsPerBitsetEntry = 8 * sizeof(UINT_PTR);
+ int nrcpus = GetTotalCpuCount();
+
+ // Get info for as many processors as can fit into the resulting set
+ SIZE_T remainingCount = std::min(size * BitsPerBitsetEntry, (SIZE_T)nrcpus);
+ SIZE_T i = 0;
+ while (remainingCount != 0)
+ {
+ UINT_PTR entry = 0;
+ SIZE_T bitsToCopy = std::min(remainingCount, BitsPerBitsetEntry);
+ SIZE_T cpuSetOffset = i * BitsPerBitsetEntry;
+ for (SIZE_T j = 0; j < bitsToCopy; j++)
+ {
+ if (CPU_ISSET(cpuSetOffset + j, &cpuSet))
+ {
+ entry |= (UINT_PTR)1 << j;
+ }
+ }
+ remainingCount -= bitsToCopy;
+ data[i++] = entry;
+ }
+ }
+
+ return st == 0;
+#else // HAVE_PTHREAD_GETAFFINITY_NP
+ // There is no API to manage thread affinity, so let's ignore the request
+ return FALSE;
+#endif // HAVE_PTHREAD_GETAFFINITY_NP
+}
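PAL_GetCurrentThreadAffinitySet packs the cpu_set_t into an array of UINT_PTR words, with the LSB of the first entry representing processor 0. A sketch of the inverse operation a caller would perform follows; the word-size constant mirrors the one in the function above and the helper name is hypothetical.

// Sketch only, not part of the patch: walk a bitset produced by
// PAL_GetCurrentThreadAffinitySet and invoke a callback for every processor
// the current thread is allowed to run on.
#include <stddef.h>
#include <stdint.h>

void ForEachAllowedProcessor(size_t size, const uintptr_t* data,
                             void (*callback)(size_t procNo))
{
    const size_t bitsPerEntry = 8 * sizeof(uintptr_t);

    for (size_t i = 0; i < size; i++)
    {
        uintptr_t entry = data[i];
        for (size_t j = 0; entry != 0; j++, entry >>= 1)
        {
            if (entry & 1)
            {
                callback(i * bitsPerEntry + j);
            }
        }
    }
}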
diff --git a/src/utilcode/util.cpp b/src/utilcode/util.cpp
index 61f41d7a22..e7091604af 100644
--- a/src/utilcode/util.cpp
+++ b/src/utilcode/util.cpp
@@ -733,10 +733,17 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
return ::VirtualAllocExNuma(hProc, lpAddr, dwSize, allocType, prot, node);
}
+#ifndef FEATURE_PAL
/*static*/ BOOL NumaNodeInfo::GetNumaProcessorNodeEx(PPROCESSOR_NUMBER proc_no, PUSHORT node_no)
{
return ::GetNumaProcessorNodeEx(proc_no, node_no);
}
+#else // !FEATURE_PAL
+/*static*/ BOOL NumaNodeInfo::GetNumaProcessorNodeEx(USHORT proc_no, PUSHORT node_no)
+{
+ return PAL_GetNumaProcessorNode(proc_no, node_no);
+}
+#endif // !FEATURE_PAL
#endif
/*static*/ BOOL NumaNodeInfo::m_enableGCNumaAware = FALSE;
@@ -749,15 +756,6 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
if (CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_GCNumaAware) == 0)
return FALSE;
-#ifndef FEATURE_PAL
- // check if required APIs are supported
- HMODULE hMod = GetModuleHandleW(WINDOWS_KERNEL32_DLLNAME_W);
-#else
- HMODULE hMod = GetCLRModule();
-#endif
- if (hMod == NULL)
- return FALSE;
-
// fail to get the highest numa node number
if (!::GetNumaHighestNodeNumber(&highest) || (highest == 0))
return FALSE;
@@ -778,8 +776,10 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
m_enableGCNumaAware = InitNumaNodeInfoAPI();
}
+#ifndef FEATURE_PAL
+
//******************************************************************************
-// NumaNodeInfo
+// CPUGroupInfo
//******************************************************************************
#if !defined(FEATURE_REDHAWK)
/*static*/ //CPUGroupInfo::PNTQSIEx CPUGroupInfo::m_pNtQuerySystemInformationEx = NULL;
@@ -1187,6 +1187,7 @@ BOOL CPUGroupInfo::GetCPUGroupRange(WORD group_number, WORD* group_begin, WORD*
LIMITED_METHOD_CONTRACT;
return m_threadUseAllCpuGroups;
}
+#endif // !FEATURE_PAL
//******************************************************************************
// Returns the number of processors that a process has been configured to run on
@@ -1206,6 +1207,8 @@ int GetCurrentProcessCpuCount()
return cCPUs;
unsigned int count = 0;
+
+#ifndef FEATURE_PAL
DWORD_PTR pmask, smask;
if (!GetProcessAffinityMask(GetCurrentProcess(), &pmask, &smask))
@@ -1233,18 +1236,20 @@ int GetCurrentProcessCpuCount()
count = 64;
}
-#ifdef FEATURE_PAL
- uint32_t cpuLimit;
+#else // !FEATURE_PAL
+ count = PAL_GetLogicalCpuCountFromOS();
+ uint32_t cpuLimit;
if (PAL_GetCpuLimit(&cpuLimit) && cpuLimit < count)
count = cpuLimit;
-#endif
+#endif // !FEATURE_PAL
cCPUs = count;
return count;
}
+#ifndef FEATURE_PAL
DWORD_PTR GetCurrentProcessCpuMask()
{
CONTRACTL
@@ -1266,6 +1271,7 @@ DWORD_PTR GetCurrentProcessCpuMask()
return 0;
#endif
}
+#endif // !FEATURE_PAL
uint32_t GetOsPageSizeUncached()
{
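On the PAL build the affinity-mask path in GetCurrentProcessCpuCount is gone entirely; the count is whatever the OS reports, optionally capped by the container CPU limit. The hunk above condensed into a hedged sketch (both PAL functions are the ones named in the diff; pal.h is assumed to be included):

// Sketch only, not part of the patch: the FEATURE_PAL branch of
// GetCurrentProcessCpuCount in one piece - OS-visible CPU count, capped by a
// container/cgroup CPU limit when one is configured.
#include <stdint.h>

unsigned int PalProcessCpuCount()
{
    unsigned int count = PAL_GetLogicalCpuCountFromOS();

    uint32_t cpuLimit;
    if (PAL_GetCpuLimit(&cpuLimit) && cpuLimit < count)
    {
        count = cpuLimit;
    }

    return count;
}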
diff --git a/src/vm/ceemain.cpp b/src/vm/ceemain.cpp
index 1b85649d67..9ac0cc6a71 100644
--- a/src/vm/ceemain.cpp
+++ b/src/vm/ceemain.cpp
@@ -654,8 +654,9 @@ void EEStartupHelper(COINITIEE fFlags)
// Need to do this as early as possible. Used by creating object handle
// table inside Ref_Initialization() before GC is initialized.
NumaNodeInfo::InitNumaNodeInfo();
+#ifndef FEATURE_PAL
CPUGroupInfo::EnsureInitialized();
-
+#endif // !FEATURE_PAL
// Initialize global configuration settings based on startup flags
// This needs to be done before the EE has started
diff --git a/src/vm/eeconfig.cpp b/src/vm/eeconfig.cpp
index e59a85e1e5..6bd0eddf2b 100644
--- a/src/vm/eeconfig.cpp
+++ b/src/vm/eeconfig.cpp
@@ -1226,7 +1226,14 @@ HRESULT EEConfig::sync()
tieredCompilation_StartupTier_CallCountingDelayMs =
CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TC_StartupTier_CallCountingDelayMs);
- if (CPUGroupInfo::HadSingleProcessorAtStartup())
+
+#ifndef FEATURE_PAL
+ bool hadSingleProcessorAtStartup = g_SystemInfo.dwNumberOfProcessors == 1;//CPUGroupInfo::HadSingleProcessorAtStartup();
+#else // !FEATURE_PAL
+ bool hadSingleProcessorAtStartup = g_SystemInfo.dwNumberOfProcessors == 1;
+#endif // !FEATURE_PAL
+
+ if (hadSingleProcessorAtStartup)
{
DWORD delayMultiplier =
CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TC_StartupTier_DelaySingleProcMultiplier);
diff --git a/src/vm/gcenv.os.cpp b/src/vm/gcenv.os.cpp
index a56215a207..8f9e1ba1bb 100644
--- a/src/vm/gcenv.os.cpp
+++ b/src/vm/gcenv.os.cpp
@@ -32,6 +32,10 @@ uint32_t g_pageSizeUnixInl = 0;
static AffinitySet g_processAffinitySet;
+#ifdef FEATURE_PAL
+static uint32_t g_currentProcessCpuCount;
+#endif // FEATURE_PAL
+
class GroupProcNo
{
uint16_t m_groupProc;
@@ -106,8 +110,21 @@ bool GCToOSInterface::Initialize()
#ifdef FEATURE_PAL
g_pageSizeUnixInl = GetOsPageSize();
-#endif
+ g_currentProcessCpuCount = PAL_GetLogicalCpuCountFromOS();
+ if (PAL_GetCurrentThreadAffinitySet(AffinitySet::BitsetDataSize, g_processAffinitySet.GetBitsetData()))
+ {
+ assert(g_currentProcessCpuCount == g_processAffinitySet.Count());
+ }
+ else
+ {
+ // There is no way to get affinity on the current OS, set the affinity set to reflect all processors
+ for (size_t i = 0; i < g_currentProcessCpuCount; i++)
+ {
+ g_processAffinitySet.Add(i);
+ }
+ }
+#else // FEATURE_PAL
if (CPUGroupInfo::CanEnableGCCPUGroups())
{
// When CPU groups are enabled, then the process is not bound by the process affinity set at process launch.
@@ -135,6 +152,7 @@ bool GCToOSInterface::Initialize()
}
}
}
+#endif // FEATURE_PAL
return true;
}
@@ -175,7 +193,7 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
LIMITED_METHOD_CONTRACT;
bool success = true;
-
+#ifndef FEATURE_PAL
GroupProcNo srcGroupProcNo(srcProcNo);
GroupProcNo dstGroupProcNo(dstProcNo);
@@ -202,7 +220,6 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
}
-#if !defined(FEATURE_PAL)
else
{
if (GetThreadIdealProcessorEx(GetCurrentThread(), &proc))
@@ -211,10 +228,13 @@ bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t
success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, &proc);
}
}
-#endif // !defined(FEATURE_PAL)
-#endif
-
+#endif // !FEATURE_CORESYSTEM
return success;
+
+#else // !FEATURE_PAL
+ return GCToOSInterface::SetThreadAffinity(dstProcNo);
+
+#endif // !FEATURE_PAL
}
// Get the number of the current processor
@@ -472,7 +492,7 @@ size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
bool GCToOSInterface::SetThreadAffinity(uint16_t procNo)
{
LIMITED_METHOD_CONTRACT;
-
+#ifndef FEATURE_PAL
GroupProcNo groupProcNo(procNo);
if (groupProcNo.GetGroup() != GroupProcNo::NoGroup)
@@ -489,6 +509,9 @@ bool GCToOSInterface::SetThreadAffinity(uint16_t procNo)
{
return !!SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)1 << groupProcNo.GetProcIndex());
}
+#else // FEATURE_PAL
+ return PAL_SetCurrentThreadAffinity(procNo);
+#endif // FEATURE_PAL
}
// Boosts the calling thread's thread priority to a level higher than the default
@@ -510,7 +533,9 @@ bool GCToOSInterface::BoostThreadPriority()
// set of enabled processors
const AffinitySet* GCToOSInterface::SetGCThreadsAffinitySet(uintptr_t configAffinityMask, const AffinitySet* configAffinitySet)
{
+#ifndef FEATURE_PAL
if (CPUGroupInfo::CanEnableGCCPUGroups())
+#endif // !FEATURE_PAL
{
if (!configAffinitySet->IsEmpty())
{
@@ -524,6 +549,7 @@ const AffinitySet* GCToOSInterface::SetGCThreadsAffinitySet(uintptr_t configAffi
}
}
}
+#ifndef FEATURE_PAL
else
{
if (configAffinityMask != 0)
@@ -538,6 +564,7 @@ const AffinitySet* GCToOSInterface::SetGCThreadsAffinitySet(uintptr_t configAffi
}
}
}
+#endif // !FEATURE_PAL
return &g_processAffinitySet;
}
@@ -549,10 +576,14 @@ uint32_t GCToOSInterface::GetCurrentProcessCpuCount()
{
LIMITED_METHOD_CONTRACT;
+#ifndef FEATURE_PAL
// GetCurrentProcessCpuCount only returns up to 64 procs.
return CPUGroupInfo::CanEnableGCCPUGroups() ?
GCToOSInterface::GetTotalProcessorCount():
::GetCurrentProcessCpuCount();
+#else // !FEATURE_PAL
+ return g_currentProcessCpuCount;
+#endif // !FEATURE_PAL
}
// Return the size of the user-mode portion of the virtual address space of this process.
@@ -887,6 +918,7 @@ uint32_t GCToOSInterface::GetTotalProcessorCount()
{
LIMITED_METHOD_CONTRACT;
+#ifndef FEATURE_PAL
if (CPUGroupInfo::CanEnableGCCPUGroups())
{
return CPUGroupInfo::GetNumActiveProcessors();
@@ -895,6 +927,9 @@ uint32_t GCToOSInterface::GetTotalProcessorCount()
{
return g_SystemInfo.dwNumberOfProcessors;
}
+#else // !FEATURE_PAL
+ return g_currentProcessCpuCount;
+#endif // !FEATURE_PAL
}
bool GCToOSInterface::CanEnableGCNumaAware()
@@ -904,20 +939,6 @@ bool GCToOSInterface::CanEnableGCNumaAware()
return NumaNodeInfo::CanEnableGCNumaAware() != FALSE;
}
-bool GCToOSInterface::GetNumaProcessorNode(uint16_t proc_no, uint16_t *node_no)
-{
- LIMITED_METHOD_CONTRACT;
-
- GroupProcNo groupProcNo(proc_no);
-
- PROCESSOR_NUMBER procNumber;
- procNumber.Group = groupProcNo.GetGroup();
- procNumber.Number = (BYTE)groupProcNo.GetProcIndex();
- procNumber.Reserved = 0;
-
- return NumaNodeInfo::GetNumaProcessorNodeEx(&procNumber, node_no) != FALSE;
-}
-
// Get processor number and optionally its NUMA node number for the specified heap number
// Parameters:
// heap_number - heap number to get the result for
@@ -929,53 +950,76 @@ bool GCToOSInterface::GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_n
{
bool success = false;
- if (CPUGroupInfo::CanEnableGCCPUGroups())
+ // Locate heap_number-th available processor
+ uint16_t procNumber;
+ size_t cnt = heap_number;
+ for (uint16_t i = 0; i < GCToOSInterface::GetTotalProcessorCount(); i++)
{
- uint16_t gn, gpn;
- CPUGroupInfo::GetGroupForProcessor((uint16_t)heap_number, &gn, &gpn);
+ if (g_processAffinitySet.Contains(i))
+ {
+ if (cnt == 0)
+ {
+ procNumber = i;
+ success = true;
+ break;
+ }
+
+ cnt--;
+ }
+ }
+
+ if (success)
+ {
+#ifndef FEATURE_PAL
+ WORD gn, gpn;
+
+ if (CPUGroupInfo::CanEnableGCCPUGroups())
+ {
+ CPUGroupInfo::GetGroupForProcessor(procNumber, &gn, &gpn);
+ }
+ else
+ {
+ gn = GroupProcNo::NoGroup;
+ gpn = procNumber;
+ }
+
+ GroupProcNo groupProcNo(gn, gpn);
+ *proc_no = groupProcNo.GetCombinedValue();
- *proc_no = GroupProcNo(gn, gpn).GetCombinedValue();
if (GCToOSInterface::CanEnableGCNumaAware())
{
- if (!GCToOSInterface::GetNumaProcessorNode(*proc_no, node_no))
+ PROCESSOR_NUMBER procNumber;
+
+ if (CPUGroupInfo::CanEnableGCCPUGroups())
+ {
+ procNumber.Group = gn;
+ }
+ else
+ {
+ // Get the current processor group
+ PROCESSOR_NUMBER procNumber;
+ GetCurrentProcessorNumberEx(&procNumber);
+ }
+
+ procNumber.Number = (BYTE)gpn;
+ procNumber.Reserved = 0;
+
+ if (NumaNodeInfo::GetNumaProcessorNodeEx(&procNumber, node_no))
{
*node_no = NUMA_NODE_UNDEFINED;
}
}
else
{ // no numa setting, each cpu group is treated as a node
- *node_no = gn;
+ *node_no = groupProcNo.GetGroup();
}
-
- success = true;
- }
- else
- {
- int bit_number = 0;
- uint8_t proc_number = 0;
- for (uintptr_t mask = 1; mask != 0; mask <<= 1)
+#else // !FEATURE_PAL
+ *proc_no = procNumber;
+ if (!GCToOSInterface::CanEnableGCNumaAware() || !NumaNodeInfo::GetNumaProcessorNodeEx(procNumber, (WORD*)node_no))
{
- if (g_processAffinitySet.Contains(proc_number))
- {
- if (bit_number == heap_number)
- {
- *proc_no = GroupProcNo(GroupProcNo::NoGroup, proc_number).GetCombinedValue();
-
- if (GCToOSInterface::CanEnableGCNumaAware())
- {
- if (!GCToOSInterface::GetNumaProcessorNode(proc_number, node_no))
- {
- *node_no = NUMA_NODE_UNDEFINED;
- }
- }
-
- success = true;
- break;
- }
- bit_number++;
- }
- proc_number++;
+ *node_no = NUMA_NODE_UNDEFINED;
}
+#endif // !FEATURE_PAL
}
return success;
@@ -993,6 +1037,7 @@ bool GCToOSInterface::ParseGCHeapAffinitizeRangesEntry(const char** config_strin
size_t index_offset = 0;
char* number_end;
+#ifndef FEATURE_PAL
size_t group_number = strtoul(*config_string, &number_end, 10);
if ((number_end == *config_string) || (*number_end != ':'))
@@ -1011,6 +1056,7 @@ bool GCToOSInterface::ParseGCHeapAffinitizeRangesEntry(const char** config_strin
index_offset = group_begin;
*config_string = number_end + 1;
+#endif // !FEATURE_PAL
size_t start, end;
if (!ParseIndexOrRange(config_string, &start, &end))
@@ -1018,11 +1064,13 @@ bool GCToOSInterface::ParseGCHeapAffinitizeRangesEntry(const char** config_strin
return false;
}
+#ifndef FEATURE_PAL
if ((start >= group_size) || (end >= group_size))
{
// Invalid CPU index values or range
return false;
}
+#endif // !FEATURE_PAL
*start_index = index_offset + start;
*end_index = index_offset + end;
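The #ifndef FEATURE_PAL guards above mean a GCHeapAffinitizeRanges entry keeps its "<group>:<range>" prefix only on Windows; on the PAL build the entry is just a processor index or range. A hedged, self-contained sketch of the two shapes follows; the helper and its name are hypothetical, and ParseIndexOrRange itself is not reproduced here.

// Sketch only, not part of the patch: parse one GCHeapAffinitizeRanges entry,
// with or without the Windows-only "<group>:" prefix.
#include <cstdlib>

bool ParseEntry(const char* s, bool hasGroups, size_t* start, size_t* end)
{
    char* p;

    if (hasGroups)
    {
        (void)strtoul(s, &p, 10);        // group number; range validation happens elsewhere
        if (p == s || *p != ':')
            return false;
        s = p + 1;
    }

    *start = strtoul(s, &p, 10);
    if (p == s)
        return false;

    *end = (*p == '-') ? strtoul(p + 1, &p, 10) : *start;
    return true;
}

// e.g. ParseEntry("0:1-4", true, &s, &e)  -> s=1, e=4 on Windows
//      ParseEntry("1-4",   false, &s, &e) -> s=1, e=4 on the PAL build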
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index 6890290caa..43976a91ff 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -451,7 +451,7 @@ void Thread::ChooseThreadCPUGroupAffinity()
GC_TRIGGERS;
}
CONTRACTL_END;
-
+#ifndef FEATURE_PAL
if (!CPUGroupInfo::CanEnableGCCPUGroups() || !CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
return;
@@ -471,6 +471,7 @@ void Thread::ChooseThreadCPUGroupAffinity()
CPUGroupInfo::SetThreadGroupAffinity(GetThreadHandle(), &groupAffinity, NULL);
m_wCPUGroup = groupAffinity.Group;
m_pAffinityMask = groupAffinity.Mask;
+#endif // !FEATURE_PAL
}
void Thread::ClearThreadCPUGroupAffinity()
@@ -481,7 +482,7 @@ void Thread::ClearThreadCPUGroupAffinity()
GC_NOTRIGGER;
}
CONTRACTL_END;
-
+#ifndef FEATURE_PAL
if (!CPUGroupInfo::CanEnableGCCPUGroups() || !CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
return;
@@ -499,6 +500,7 @@ void Thread::ClearThreadCPUGroupAffinity()
m_wCPUGroup = 0;
m_pAffinityMask = 0;
+#endif // !FEATURE_PAL
}
DWORD Thread::StartThread()
@@ -1561,8 +1563,10 @@ Thread::Thread()
m_fGCSpecial = FALSE;
+#ifndef FEATURE_PAL
m_wCPUGroup = 0;
m_pAffinityMask = 0;
+#endif // !FEATURE_PAL
m_pAllLoggedTypes = NULL;
diff --git a/src/vm/threads.h b/src/vm/threads.h
index 94ce275604..e5307d9a5d 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -4824,9 +4824,10 @@ public:
void SetGCSpecial(bool fGCSpecial);
private:
+#ifndef FEATURE_PAL
WORD m_wCPUGroup;
DWORD_PTR m_pAffinityMask;
-
+#endif // !FEATURE_PAL
public:
void ChooseThreadCPUGroupAffinity();
void ClearThreadCPUGroupAffinity();
diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp
index 29c1d21c99..09a3a07745 100644
--- a/src/vm/win32threadpool.cpp
+++ b/src/vm/win32threadpool.cpp
@@ -345,12 +345,16 @@ BOOL ThreadpoolMgr::Initialize()
UnManagedPerAppDomainTPCount* pADTPCount;
pADTPCount = PerAppDomainTPCountList::GetUnmanagedTPCount();
+#ifndef FEATURE_PAL
//ThreadPool_CPUGroup
CPUGroupInfo::EnsureInitialized();
if (CPUGroupInfo::CanEnableGCCPUGroups() && CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
NumberOfProcessors = CPUGroupInfo::GetNumActiveProcessors();
else
NumberOfProcessors = GetCurrentProcessCpuCount();
+#else // !FEATURE_PAL
+ NumberOfProcessors = GetCurrentProcessCpuCount();
+#endif // !FEATURE_PAL
InitPlatformVariables();
EX_TRY
@@ -380,20 +384,15 @@ BOOL ThreadpoolMgr::Initialize()
RetiredWorkerSemaphore = new CLRLifoSemaphore();
RetiredWorkerSemaphore->Create(0, ThreadCounter::MaxPossibleCount);
+#ifndef FEATURE_PAL
//ThreadPool_CPUGroup
if (CPUGroupInfo::CanEnableGCCPUGroups() && CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
RecycledLists.Initialize( CPUGroupInfo::GetNumActiveProcessors() );
else
RecycledLists.Initialize( g_SystemInfo.dwNumberOfProcessors );
- /*
- {
- SYSTEM_INFO sysInfo;
-
- ::GetSystemInfo( &sysInfo );
-
- RecycledLists.Initialize( sysInfo.dwNumberOfProcessors );
- }
- */
+#else // !FEATURE_PAL
+ RecycledLists.Initialize( g_SystemInfo.dwNumberOfProcessors );
+#endif // !FEATURE_PAL
}
EX_CATCH
{
@@ -4095,9 +4094,10 @@ DWORD WINAPI ThreadpoolMgr::GateThreadStart(LPVOID lpArgs)
return 0;
}
+#ifndef FEATURE_PAL
//GateThread can start before EESetup, so ensure CPU group information is initialized;
CPUGroupInfo::EnsureInitialized();
-
+#endif // !FEATURE_PAL
// initialize CPU usage information structure;
prevCPUInfo.idleTime.QuadPart = 0;
prevCPUInfo.kernelTime.QuadPart = 0;
diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h
index bb6ebc0613..55f321c37f 100644
--- a/src/vm/win32threadpool.h
+++ b/src/vm/win32threadpool.h
@@ -735,12 +735,22 @@ public:
{
LIMITED_METHOD_CONTRACT;
+ DWORD processorNumber = 0;
+
+#ifndef FEATURE_PAL
if (CPUGroupInfo::CanEnableGCCPUGroups() && CPUGroupInfo::CanEnableThreadUseAllCpuGroups())
- return pRecycledListPerProcessor[CPUGroupInfo::CalculateCurrentProcessorNumber()][memType];
+ processorNumber = CPUGroupInfo::CalculateCurrentProcessorNumber();
else
// Turns out GetCurrentProcessorNumber can return a value greater than the number of processors reported by
// GetSystemInfo, if we're running in WOW64 on a machine with >32 processors.
- return pRecycledListPerProcessor[GetCurrentProcessorNumber()%NumberOfProcessors][memType];
+ processorNumber = GetCurrentProcessorNumber()%NumberOfProcessors;
+#else // !FEATURE_PAL
+ if (PAL_HasGetCurrentProcessorNumber())
+ {
+ processorNumber = GetCurrentProcessorNumber();
+ }
+#endif // !FEATURE_PAL
+ return pRecycledListPerProcessor[processorNumber][memType];
}
};