1 files changed, 692 insertions, 0 deletions
diff --git a/src/pal/src/numa/numa.cpp b/src/pal/src/numa/numa.cpp
new file mode 100644
index 0000000000..549c10a71f
--- /dev/null
+++ b/src/pal/src/numa/numa.cpp
@@ -0,0 +1,692 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*++
+
+
+
+Module Name:
+
+    numa.cpp
+
+Abstract:
+
+    Implementation of NUMA related APIs
+
+--*/
+
+#include "pal/dbgmsg.h"
+SET_DEFAULT_DEBUG_CHANNEL(NUMA);
+
+#include "pal/palinternal.h"
+#include "pal/dbgmsg.h"
+#include "pal/numa.h"
+#include "pal/corunix.hpp"
+#include "pal/thread.hpp"
+
+#if HAVE_NUMA_H
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#if HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+
+#include <pthread.h>
+
+using namespace CorUnix;
+
+#if HAVE_CPUSET_T
+typedef cpuset_t cpu_set_t;
+#endif
+
+int GetNumberOfProcessors();
+
+// CPU affinity descriptor
+struct CpuAffinity
+{
+    // NUMA node
+    BYTE Node;
+    // CPU number relative to the group the CPU is in
+    BYTE Number;
+    // CPU group
+    WORD Group;
+};
+
+// Array mapping global CPU index to its affinity
+CpuAffinity *g_cpuToAffinity = NULL;
+
+// Array mapping CPU group and index in the group to the global CPU index
+short *g_groupAndIndexToCpu = NULL;
+// Array mapping CPU group to the corresponding affinity mask of the CPUs in the group
+KAFFINITY *g_groupToCpuMask = NULL;
+// Array mapping CPU group to the number of processors in the group
+BYTE *g_groupToCpuCount = NULL;
+
+// Total number of processors in the system
+int g_cpuCount = 0;
+// Total number of CPU groups
+int g_groupCount = 0;
+// The highest NUMA node available
+int g_highestNumaNode = 0;
+
+static const int MaxCpusPerGroup = 8 * sizeof(KAFFINITY);
+static const WORD NO_GROUP = 0xffff;
+
+/*++
+Function:
+  AllocateLookupArrays
+
+Allocate CPU and group lookup arrays
+--*/
+VOID
+AllocateLookupArrays()
+{
+    g_groupAndIndexToCpu = (short*)malloc(g_groupCount * MaxCpusPerGroup * sizeof(short));
+    g_cpuToAffinity = (CpuAffinity*)malloc(g_cpuCount * sizeof(CpuAffinity));
+    g_groupToCpuMask = (KAFFINITY*)malloc(g_groupCount * sizeof(KAFFINITY));
+    g_groupToCpuCount = (BYTE*)malloc(g_groupCount * sizeof(BYTE));
+
+    memset(g_groupAndIndexToCpu, 0xff, g_groupCount * MaxCpusPerGroup * sizeof(short));
+    memset(g_cpuToAffinity, 0xff, g_cpuCount * sizeof(CpuAffinity));
+    memset(g_groupToCpuMask, 0, g_groupCount * sizeof(KAFFINITY));
+    memset(g_groupToCpuCount, 0, g_groupCount * sizeof(BYTE));
+}
+
+/*++
+Function:
+  FreeLookupArrays
+
+Free CPU and group lookup arrays
+--*/
+VOID
+FreeLookupArrays()
+{
+    free(g_groupAndIndexToCpu);
+    free(g_cpuToAffinity);
+    free(g_groupToCpuMask);
+    free(g_groupToCpuCount);
+
+    g_groupAndIndexToCpu = NULL;
+    g_cpuToAffinity = NULL;
+    g_groupToCpuMask = NULL;
+    g_groupToCpuCount = NULL;
+}
+
+/*++
+Function:
+  GetFullAffinityMask
+
+Get affinity mask for the specified number of processors with all
+the processors enabled.
+--*/
+KAFFINITY GetFullAffinityMask(int cpuCount)
+{
+    return ((KAFFINITY)1 << (cpuCount)) - 1;
+}
+
+/*++
+Function:
+  NUMASupportInitialize
+
+Initialize data structures for getting and setting thread affinities to processors and
+querying NUMA related processor information.
+On systems with no NUMA support, it behaves as if there was a single NUMA node with
+a single group of processors.
+--*/
+BOOL
+NUMASupportInitialize()
+{
+#if HAVE_NUMA_H
+    if (numa_available() != -1)
+    {
+        struct bitmask *mask = numa_allocate_cpumask();
+        int numaNodesCount = numa_max_node() + 1;
+
+        g_cpuCount = numa_num_possible_cpus();
+        g_groupCount = 0;
+
+        for (int i = 0; i < numaNodesCount; i++)
+        {
+            int st = numa_node_to_cpus(i, mask);
+            // The only failure that can happen is that the mask is not large enough
+            // but that cannot happen since the mask was allocated by numa_allocate_cpumask
+            _ASSERTE(st == 0);
+            unsigned int nodeCpuCount = numa_bitmask_weight(mask);
+            unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup;
+            g_groupCount += nodeGroupCount;
+        }
+
+        AllocateLookupArrays();
+
+        WORD currentGroup = 0;
+        int currentGroupCpus = 0;
+
+        for (int i = 0; i < numaNodesCount; i++)
+        {
+            int st = numa_node_to_cpus(i, mask);
+            // The only failure that can happen is that the mask is not large enough
+            // but that cannot happen since the mask was allocated by numa_allocate_cpumask
+            _ASSERTE(st == 0);
+            unsigned int nodeCpuCount = numa_bitmask_weight(mask);
+            unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup;
+            for (int j = 0; j < g_cpuCount; j++)
+            {
+                if (numa_bitmask_isbitset(mask, j))
+                {
+                    if (currentGroupCpus == MaxCpusPerGroup)
+                    {
+                        g_groupToCpuCount[currentGroup] = MaxCpusPerGroup;
+                        g_groupToCpuMask[currentGroup] = GetFullAffinityMask(MaxCpusPerGroup);
+                        currentGroupCpus = 0;
+                        currentGroup++;
+                    }
+                    g_cpuToAffinity[j].Node = i;
+                    g_cpuToAffinity[j].Group = currentGroup;
+                    g_cpuToAffinity[j].Number = currentGroupCpus;
+                    g_groupAndIndexToCpu[currentGroup * MaxCpusPerGroup + currentGroupCpus] = j;
+                    currentGroupCpus++;
+                }
+            }
+
+            if (currentGroupCpus != 0)
+            {
+                g_groupToCpuCount[currentGroup] = currentGroupCpus;
+                g_groupToCpuMask[currentGroup] = GetFullAffinityMask(currentGroupCpus);
+                currentGroupCpus = 0;
+                currentGroup++;
+            }
+        }
+
+        numa_free_cpumask(mask);
+
+        g_highestNumaNode = numa_max_node();
+    }
+    else
+#endif // HAVE_NUMA_H
+    {
+        // No NUMA
+        g_cpuCount = GetNumberOfProcessors();
+        g_groupCount = 1;
+        g_highestNumaNode = 0;
+
+        AllocateLookupArrays();
+    }
+
+    return TRUE;
+}
+
+/*++
+Function:
+  NUMASupportCleanup
+
+Cleanup of the NUMA support data structures
+--*/
+VOID
+NUMASupportCleanup()
+{
+    FreeLookupArrays();
+}
+
+/*++
+Function:
+  GetNumaHighestNodeNumber
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+GetNumaHighestNodeNumber(
+  OUT PULONG HighestNodeNumber
+)
+{
+    PERF_ENTRY(GetNumaHighestNodeNumber);
+    ENTRY("GetNumaHighestNodeNumber(HighestNodeNumber=%p)\n", HighestNodeNumber);
+    *HighestNodeNumber = (ULONG)g_highestNumaNode;
+
+    BOOL success = TRUE;
+
+    LOGEXIT("GetNumaHighestNodeNumber returns BOOL %d\n", success);
+    PERF_EXIT(GetNumaHighestNodeNumber);
+
+    return success;
+}
+
+/*++
+Function:
+  GetNumaProcessorNodeEx
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+GetNumaProcessorNodeEx(
+  IN  PPROCESSOR_NUMBER Processor,
+  OUT PUSHORT NodeNumber
+)
+{
+    PERF_ENTRY(GetNumaProcessorNodeEx);
+    ENTRY("GetNumaProcessorNodeEx(Processor=%p, NodeNumber=%p)\n", Processor, NodeNumber);
+
+    BOOL success = FALSE;
+
+    if ((Processor->Group < g_groupCount) && 
+        (Processor->Number < MaxCpusPerGroup) && 
+        (Processor->Reserved == 0))
+    {  
+        short cpu = g_groupAndIndexToCpu[Processor->Group * MaxCpusPerGroup + Processor->Number];
+        if (cpu != -1)
+        {
+            *NodeNumber = g_cpuToAffinity[cpu].Node;
+            success = TRUE;
+        }
+    }
+
+    if (!success)
+    {
+        *NodeNumber = 0xffff;
+        SetLastError(ERROR_INVALID_PARAMETER);  
+    }
+
+    LOGEXIT("GetNumaProcessorNodeEx returns BOOL %d\n", success);
+    PERF_EXIT(GetNumaProcessorNodeEx);
+
+    return success;
+}
+
+/*++
+Function:
+  GetLogicalProcessorInformationEx
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+GetLogicalProcessorInformationEx(
+  IN LOGICAL_PROCESSOR_RELATIONSHIP RelationshipType,
+  OUT OPTIONAL PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer,
+  IN OUT PDWORD ReturnedLength
+)
+{
+    PERF_ENTRY(GetLogicalProcessorInformationEx);
+    ENTRY("GetLogicalProcessorInformationEx(RelationshipType=%d, Buffer=%p, ReturnedLength=%p)\n", RelationshipType, Buffer, ReturnedLength);
+
+    BOOL success = FALSE;
+
+    if (RelationshipType == RelationGroup)
+    {
+        size_t requiredSize = __builtin_offsetof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Group);
+        requiredSize += __builtin_offsetof(GROUP_RELATIONSHIP, GroupInfo);
+        requiredSize += g_groupCount * sizeof(PROCESSOR_GROUP_INFO);
+
+        if (*ReturnedLength >= requiredSize)
+        {
+            Buffer->Relationship = RelationGroup;
+            Buffer->Size = requiredSize;
+            Buffer->Group.MaximumGroupCount = g_groupCount;
+            Buffer->Group.ActiveGroupCount = g_groupCount;
+            for (int i = 0; i < g_groupCount; i++)
+            {
+                Buffer->Group.GroupInfo[i].MaximumProcessorCount = MaxCpusPerGroup;
+                Buffer->Group.GroupInfo[i].ActiveProcessorCount = g_groupToCpuCount[i];
+                Buffer->Group.GroupInfo[i].ActiveProcessorMask = g_groupToCpuMask[i];
+            }
+
+            success = TRUE;
+        }
+        else
+        {
+            SetLastError(ERROR_INSUFFICIENT_BUFFER);
+        }
+
+        *ReturnedLength = requiredSize;
+    }
+    else
+    {
+        // We only support the group relationship
+        SetLastError(ERROR_INVALID_PARAMETER);  
+    }
+
+    LOGEXIT("GetLogicalProcessorInformationEx returns BOOL %d\n", success);
+    PERF_EXIT(GetLogicalProcessorInformationEx);
+
+    return success;
+}
+
+/*++
+Function:
+  GetThreadGroupAffinityInternal
+
+Get the group affinity for the specified pthread
+--*/
+BOOL
+GetThreadGroupAffinityInternal(
+  IN pthread_t thread,
+  OUT PGROUP_AFFINITY GroupAffinity
+)
+{
+    BOOL success = FALSE;
+
+#if HAVE_PTHREAD_GETAFFINITY_NP
+    cpu_set_t cpuSet;
+
+    int st = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
+
+    if (st == 0)
+    {
+        WORD group = NO_GROUP;
+        KAFFINITY mask = 0;
+
+        for (int i = 0; i < g_cpuCount; i++)
+        {
+            if (CPU_ISSET(i, &cpuSet))
+            {
+                WORD g = g_cpuToAffinity[i].Group;
+                // Unless the thread affinity was already set by SetThreadGroupAffinity, it is possible that
+                // the current thread has affinity with processors from multiple groups. So we report just the
+                // first group we find.
+                if (group == NO_GROUP || g == group)
+                {
+                    group = g;
+                    mask |= ((KAFFINITY)1) << g_cpuToAffinity[i].Number;
+                }
+            }
+        }
+
+        GroupAffinity->Group = group;
+        GroupAffinity->Mask = mask;
+        success = TRUE;
+    }
+    else
+    {
+        SetLastError(ERROR_GEN_FAILURE);
+    }
+#else // HAVE_PTHREAD_GETAFFINITY_NP
+    // There is no API to manage thread affinity, so let's return a group affinity
+    // with all the CPUs on the system.
+    GroupAffinity->Group = 0;
+    GroupAffinity->Mask = GetFullAffinityMask(g_cpuCount);
+    success = TRUE;
+#endif // HAVE_PTHREAD_GETAFFINITY_NP
+
+    return success;
+}
+
+/*++
+Function:
+  GetThreadGroupAffinity
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+GetThreadGroupAffinity(
+  IN HANDLE hThread,
+  OUT PGROUP_AFFINITY GroupAffinity
+)
+{
+    PERF_ENTRY(GetThreadGroupAffinity);
+    ENTRY("GetThreadGroupAffinity(hThread=%p, GroupAffinity=%p)\n", hThread, GroupAffinity);
+
+    CPalThread *palThread = InternalGetCurrentThread();
+
+    BOOL success = GetThreadGroupAffinityInternal(palThread->GetPThreadSelf(), GroupAffinity);
+
+    LOGEXIT("GetThreadGroupAffinity returns BOOL %d\n", success);
+    PERF_EXIT(GetThreadGroupAffinity);
+
+    return success;
+}
+
+
+/*++
+Function:
+  SetThreadGroupAffinity
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+SetThreadGroupAffinity(
+  IN HANDLE hThread,
+  IN const GROUP_AFFINITY *GroupAffinity,
+  OUT OPTIONAL PGROUP_AFFINITY PreviousGroupAffinity
+)
+{
+    PERF_ENTRY(SetThreadGroupAffinity);
+    ENTRY("SetThreadGroupAffinity(hThread=%p, GroupAffinity=%p, PreviousGroupAffinity=%p)\n", hThread, GroupAffinity, PreviousGroupAffinity);
+
+    CPalThread *palThread = InternalGetCurrentThread();
+
+    pthread_t thread = palThread->GetPThreadSelf();
+
+    if (PreviousGroupAffinity != NULL)
+    {
+        GetThreadGroupAffinityInternal(thread, PreviousGroupAffinity);
+    }
+
+#if HAVE_PTHREAD_GETAFFINITY_NP
+    int groupStartIndex = GroupAffinity->Group * MaxCpusPerGroup;
+    KAFFINITY mask = 1;
+    cpu_set_t cpuSet;
+    CPU_ZERO(&cpuSet);
+
+    for (int i = 0; i < MaxCpusPerGroup; i++, mask <<= 1)
+    {
+        if (GroupAffinity->Mask & mask)
+        {
+            int cpu = g_groupAndIndexToCpu[groupStartIndex + i];
+            if (cpu != -1)
+            {
+                CPU_SET(cpu, &cpuSet);
+            }
+        }
+    }
+
+    int st = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuSet);
+
+    if (st == -1)
+    {
+        switch (errno)
+        {
+            case EINVAL:
+                // There is no processor in the mask that is allowed to execute the process
+                SetLastError(ERROR_INVALID_PARAMETER);
+                break;
+            case EPERM:
+                SetLastError(ERROR_ACCESS_DENIED);
+                break;
+            default:
+                SetLastError(ERROR_GEN_FAILURE);
+                break;
+        }
+    }
+
+    BOOL success = (st == 0);
+#else // HAVE_PTHREAD_GETAFFINITY_NP
+    // There is no API to manage thread affinity, so let's ignore the request
+    BOOL success = TRUE;
+#endif // HAVE_PTHREAD_GETAFFINITY_NP
+
+    LOGEXIT("SetThreadGroupAffinity returns BOOL %d\n", success);
+    PERF_EXIT(SetThreadGroupAffinity);
+
+    return success;
+}
+
+/*++
+Function:
+  GetCurrentProcessorNumberEx
+
+See MSDN doc.
+--*/
+VOID
+PALAPI
+GetCurrentProcessorNumberEx(
+  OUT PPROCESSOR_NUMBER ProcNumber
+)
+{
+    PERF_ENTRY(GetCurrentProcessorNumberEx);
+    ENTRY("GetCurrentProcessorNumberEx(ProcNumber=%p\n", ProcNumber);
+
+    DWORD cpu = GetCurrentProcessorNumber();
+    _ASSERTE(cpu < g_cpuCount);
+    ProcNumber->Group = g_cpuToAffinity[cpu].Group;
+    ProcNumber->Number = g_cpuToAffinity[cpu].Number;
+
+    LOGEXIT("GetCurrentProcessorNumberEx\n");
+    PERF_EXIT(GetCurrentProcessorNumberEx);
+}
+
+/*++
+Function:
+  GetProcessAffinityMask
+
+See MSDN doc.
+--*/
+BOOL
+PALAPI
+GetProcessAffinityMask(
+  IN HANDLE hProcess,
+  OUT PDWORD_PTR lpProcessAffinityMask,
+  OUT PDWORD_PTR lpSystemAffinityMask
+)
+{
+    PERF_ENTRY(GetProcessAffinityMask);
+    ENTRY("GetProcessAffinityMask(hProcess=%p, lpProcessAffinityMask=%p, lpSystemAffinityMask=%p\n", hProcess, lpProcessAffinityMask, lpSystemAffinityMask);
+
+    BOOL success = FALSE;
+
+    if (hProcess == GetCurrentProcess())
+    {
+        DWORD_PTR systemMask = GetFullAffinityMask(g_cpuCount);
+
+#if HAVE_SCHED_GETAFFINITY
+        int pid = getpid();
+        cpu_set_t cpuSet;
+        int st = sched_getaffinity(pid, sizeof(cpu_set_t), &cpuSet);
+        if (st == 0)
+        {
+            WORD group = NO_GROUP;
+            DWORD_PTR processMask = 0;
+
+            for (int i = 0; i < g_cpuCount; i++)
+            {
+                if (CPU_ISSET(i, &cpuSet))
+                {
+                    WORD g = g_cpuToAffinity[i].Group;
+                    if (group == NO_GROUP || g == group)
+                    {
+                        group = g;
+                        processMask |= ((DWORD_PTR)1) << g_cpuToAffinity[i].Number;
+                    }
+                    else
+                    {
+                        // The process has affinity in more than one group, in such case
+                        // the function needs to return zero in both masks.
+                        processMask = 0;
+                        systemMask = 0;
+                        group = NO_GROUP;
+                        break;
+                    }
+                }
+            }
+
+            success = TRUE;
+
+            *lpProcessAffinityMask = processMask;
+            *lpSystemAffinityMask = systemMask;
+        }
+        else
+        {
+            // We should not get any of the errors that the sched_getaffinity can return since none
+            // of them applies for the current thread, so this is an unexpected kind of failure.
+            SetLastError(ERROR_GEN_FAILURE);
+        }
+#else // HAVE_SCHED_GETAFFINITY
+        // There is no API to manage thread affinity, so let's return both affinity masks
+        // with all the CPUs on the system set.
+        *lpSystemAffinityMask = systemMask;
+        *lpProcessAffinityMask = systemMask;
+
+        success = TRUE;
+#endif // HAVE_SCHED_GETAFFINITY
+    }
+    else
+    {
+        // PAL supports getting affinity mask for the current process only
+        SetLastError(ERROR_INVALID_PARAMETER);
+    }
+
+    LOGEXIT("GetProcessAffinityMask returns BOOL %d\n", success);
+    PERF_EXIT(GetProcessAffinityMask);
+
+    return success;
+}
+
+/*++
+Function:
+  VirtualAllocExNuma
+
+See MSDN doc.
+--*/
+LPVOID
+PALAPI
+VirtualAllocExNuma(
+  IN HANDLE hProcess,
+  IN OPTIONAL LPVOID lpAddress,
+  IN SIZE_T dwSize,
+  IN DWORD flAllocationType,
+  IN DWORD flProtect,
+  IN DWORD nndPreferred
+)
+{
+    PERF_ENTRY(VirtualAllocExNuma);
+    ENTRY("VirtualAllocExNuma(hProcess=%p, lpAddress=%p, dwSize=%u, flAllocationType=%#x, flProtect=%#x, nndPreferred=%d\n", 
+        hProcess, lpAddress, dwSize, flAllocationType, flProtect, nndPreferred);
+
+    LPVOID result = NULL;
+
+    if (hProcess == GetCurrentProcess())
+    {
+        if (nndPreferred <= g_highestNumaNode)
+        {
+            result = VirtualAlloc(lpAddress, dwSize, flAllocationType, flProtect);
+#if HAVE_NUMA_H
+            if (result != NULL)
+            {
+                int nodeMaskLength = (g_highestNumaNode + 1 + sizeof(unsigned long) - 1) / sizeof(unsigned long);
+                unsigned long *nodeMask = new unsigned long[nodeMaskLength];
+
+                memset(nodeMask, 0, nodeMaskLength);
+
+                int index = nndPreferred / sizeof(unsigned long);
+                int mask = ((unsigned long)1) << (nndPreferred & (sizeof(unsigned long) - 1));
+                nodeMask[index] = mask;
+
+                int st = mbind(result, dwSize, MPOL_PREFERRED, nodeMask, g_highestNumaNode, 0);
+                free(nodeMask);
+                _ASSERTE(st == 0);
+                // If the mbind fails, we still return the allocated memory since the nndPreferred is just a hint
+            }
+#endif // HAVE_NUMA_H
+        }
+        else
+        {
+            // The specified node number is larger than the maximum available one
+            SetLastError(ERROR_INVALID_PARAMETER);
+        }
+    }    
+    else
+    {
+        // PAL supports allocating from the current process virtual space only
+        SetLastError(ERROR_INVALID_PARAMETER);
+    }
+
+    LOGEXIT("VirtualAllocExNuma returns %p\n", result);
+    PERF_EXIT(VirtualAllocExNuma);
+
+    return result;
+}