// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. /*++ Module Name: numa.cpp Abstract: Implementation of NUMA related APIs --*/ #include "pal/dbgmsg.h" SET_DEFAULT_DEBUG_CHANNEL(NUMA); #include "pal/palinternal.h" #include "pal/dbgmsg.h" #include "pal/numa.h" #include "pal/corunix.hpp" #include "pal/thread.hpp" #if HAVE_NUMA_H #include #include #endif #if HAVE_PTHREAD_NP_H #include #endif #include using namespace CorUnix; #if HAVE_CPUSET_T typedef cpuset_t cpu_set_t; #endif int GetNumberOfProcessors(); // CPU affinity descriptor struct CpuAffinity { // NUMA node BYTE Node; // CPU number relative to the group the CPU is in BYTE Number; // CPU group WORD Group; }; // Array mapping global CPU index to its affinity CpuAffinity *g_cpuToAffinity = NULL; // Array mapping CPU group and index in the group to the global CPU index short *g_groupAndIndexToCpu = NULL; // Array mapping CPU group to the corresponding affinity mask of the CPUs in the group KAFFINITY *g_groupToCpuMask = NULL; // Array mapping CPU group to the number of processors in the group BYTE *g_groupToCpuCount = NULL; // Total number of processors in the system int g_cpuCount = 0; // Total number of CPU groups int g_groupCount = 0; // The highest NUMA node available int g_highestNumaNode = 0; static const int MaxCpusPerGroup = 8 * sizeof(KAFFINITY); static const WORD NO_GROUP = 0xffff; /*++ Function: AllocateLookupArrays Allocate CPU and group lookup arrays --*/ VOID AllocateLookupArrays() { g_groupAndIndexToCpu = (short*)malloc(g_groupCount * MaxCpusPerGroup * sizeof(short)); g_cpuToAffinity = (CpuAffinity*)malloc(g_cpuCount * sizeof(CpuAffinity)); g_groupToCpuMask = (KAFFINITY*)malloc(g_groupCount * sizeof(KAFFINITY)); g_groupToCpuCount = (BYTE*)malloc(g_groupCount * sizeof(BYTE)); memset(g_groupAndIndexToCpu, 0xff, g_groupCount * MaxCpusPerGroup * sizeof(short)); memset(g_cpuToAffinity, 0xff, g_cpuCount * sizeof(CpuAffinity)); memset(g_groupToCpuMask, 0, g_groupCount * sizeof(KAFFINITY)); memset(g_groupToCpuCount, 0, g_groupCount * sizeof(BYTE)); } /*++ Function: FreeLookupArrays Free CPU and group lookup arrays --*/ VOID FreeLookupArrays() { free(g_groupAndIndexToCpu); free(g_cpuToAffinity); free(g_groupToCpuMask); free(g_groupToCpuCount); g_groupAndIndexToCpu = NULL; g_cpuToAffinity = NULL; g_groupToCpuMask = NULL; g_groupToCpuCount = NULL; } /*++ Function: GetFullAffinityMask Get affinity mask for the specified number of processors with all the processors enabled. --*/ KAFFINITY GetFullAffinityMask(int cpuCount) { return ((KAFFINITY)1 << (cpuCount)) - 1; } /*++ Function: NUMASupportInitialize Initialize data structures for getting and setting thread affinities to processors and querying NUMA related processor information. On systems with no NUMA support, it behaves as if there was a single NUMA node with a single group of processors. --*/ BOOL NUMASupportInitialize() { #if HAVE_NUMA_H if (numa_available() != -1) { struct bitmask *mask = numa_allocate_cpumask(); int numaNodesCount = numa_max_node() + 1; g_cpuCount = numa_num_possible_cpus(); g_groupCount = 0; for (int i = 0; i < numaNodesCount; i++) { int st = numa_node_to_cpus(i, mask); // The only failure that can happen is that the mask is not large enough // but that cannot happen since the mask was allocated by numa_allocate_cpumask _ASSERTE(st == 0); unsigned int nodeCpuCount = numa_bitmask_weight(mask); unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup; g_groupCount += nodeGroupCount; } AllocateLookupArrays(); WORD currentGroup = 0; int currentGroupCpus = 0; for (int i = 0; i < numaNodesCount; i++) { int st = numa_node_to_cpus(i, mask); // The only failure that can happen is that the mask is not large enough // but that cannot happen since the mask was allocated by numa_allocate_cpumask _ASSERTE(st == 0); unsigned int nodeCpuCount = numa_bitmask_weight(mask); unsigned int nodeGroupCount = (nodeCpuCount + MaxCpusPerGroup - 1) / MaxCpusPerGroup; for (int j = 0; j < g_cpuCount; j++) { if (numa_bitmask_isbitset(mask, j)) { if (currentGroupCpus == MaxCpusPerGroup) { g_groupToCpuCount[currentGroup] = MaxCpusPerGroup; g_groupToCpuMask[currentGroup] = GetFullAffinityMask(MaxCpusPerGroup); currentGroupCpus = 0; currentGroup++; } g_cpuToAffinity[j].Node = i; g_cpuToAffinity[j].Group = currentGroup; g_cpuToAffinity[j].Number = currentGroupCpus; g_groupAndIndexToCpu[currentGroup * MaxCpusPerGroup + currentGroupCpus] = j; currentGroupCpus++; } } if (currentGroupCpus != 0) { g_groupToCpuCount[currentGroup] = currentGroupCpus; g_groupToCpuMask[currentGroup] = GetFullAffinityMask(currentGroupCpus); currentGroupCpus = 0; currentGroup++; } } numa_free_cpumask(mask); g_highestNumaNode = numa_max_node(); } else #endif // HAVE_NUMA_H { // No NUMA g_cpuCount = GetNumberOfProcessors(); g_groupCount = 1; g_highestNumaNode = 0; AllocateLookupArrays(); } return TRUE; } /*++ Function: NUMASupportCleanup Cleanup of the NUMA support data structures --*/ VOID NUMASupportCleanup() { FreeLookupArrays(); } /*++ Function: GetNumaHighestNodeNumber See MSDN doc. --*/ BOOL PALAPI GetNumaHighestNodeNumber( OUT PULONG HighestNodeNumber ) { PERF_ENTRY(GetNumaHighestNodeNumber); ENTRY("GetNumaHighestNodeNumber(HighestNodeNumber=%p)\n", HighestNodeNumber); *HighestNodeNumber = (ULONG)g_highestNumaNode; BOOL success = TRUE; LOGEXIT("GetNumaHighestNodeNumber returns BOOL %d\n", success); PERF_EXIT(GetNumaHighestNodeNumber); return success; } /*++ Function: GetNumaProcessorNodeEx See MSDN doc. --*/ BOOL PALAPI GetNumaProcessorNodeEx( IN PPROCESSOR_NUMBER Processor, OUT PUSHORT NodeNumber ) { PERF_ENTRY(GetNumaProcessorNodeEx); ENTRY("GetNumaProcessorNodeEx(Processor=%p, NodeNumber=%p)\n", Processor, NodeNumber); BOOL success = FALSE; if ((Processor->Group < g_groupCount) && (Processor->Number < MaxCpusPerGroup) && (Processor->Reserved == 0)) { short cpu = g_groupAndIndexToCpu[Processor->Group * MaxCpusPerGroup + Processor->Number]; if (cpu != -1) { *NodeNumber = g_cpuToAffinity[cpu].Node; success = TRUE; } } if (!success) { *NodeNumber = 0xffff; SetLastError(ERROR_INVALID_PARAMETER); } LOGEXIT("GetNumaProcessorNodeEx returns BOOL %d\n", success); PERF_EXIT(GetNumaProcessorNodeEx); return success; } /*++ Function: GetLogicalProcessorInformationEx See MSDN doc. --*/ BOOL PALAPI GetLogicalProcessorInformationEx( IN LOGICAL_PROCESSOR_RELATIONSHIP RelationshipType, OUT OPTIONAL PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX Buffer, IN OUT PDWORD ReturnedLength ) { PERF_ENTRY(GetLogicalProcessorInformationEx); ENTRY("GetLogicalProcessorInformationEx(RelationshipType=%d, Buffer=%p, ReturnedLength=%p)\n", RelationshipType, Buffer, ReturnedLength); BOOL success = FALSE; if (RelationshipType == RelationGroup) { size_t requiredSize = __builtin_offsetof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Group); requiredSize += __builtin_offsetof(GROUP_RELATIONSHIP, GroupInfo); requiredSize += g_groupCount * sizeof(PROCESSOR_GROUP_INFO); if (*ReturnedLength >= requiredSize) { Buffer->Relationship = RelationGroup; Buffer->Size = requiredSize; Buffer->Group.MaximumGroupCount = g_groupCount; Buffer->Group.ActiveGroupCount = g_groupCount; for (int i = 0; i < g_groupCount; i++) { Buffer->Group.GroupInfo[i].MaximumProcessorCount = MaxCpusPerGroup; Buffer->Group.GroupInfo[i].ActiveProcessorCount = g_groupToCpuCount[i]; Buffer->Group.GroupInfo[i].ActiveProcessorMask = g_groupToCpuMask[i]; } success = TRUE; } else { SetLastError(ERROR_INSUFFICIENT_BUFFER); } *ReturnedLength = requiredSize; } else { // We only support the group relationship SetLastError(ERROR_INVALID_PARAMETER); } LOGEXIT("GetLogicalProcessorInformationEx returns BOOL %d\n", success); PERF_EXIT(GetLogicalProcessorInformationEx); return success; } /*++ Function: GetThreadGroupAffinityInternal Get the group affinity for the specified pthread --*/ BOOL GetThreadGroupAffinityInternal( IN pthread_t thread, OUT PGROUP_AFFINITY GroupAffinity ) { BOOL success = FALSE; #if HAVE_PTHREAD_GETAFFINITY_NP cpu_set_t cpuSet; int st = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuSet); if (st == 0) { WORD group = NO_GROUP; KAFFINITY mask = 0; for (int i = 0; i < g_cpuCount; i++) { if (CPU_ISSET(i, &cpuSet)) { WORD g = g_cpuToAffinity[i].Group; // Unless the thread affinity was already set by SetThreadGroupAffinity, it is possible that // the current thread has affinity with processors from multiple groups. So we report just the // first group we find. if (group == NO_GROUP || g == group) { group = g; mask |= ((KAFFINITY)1) << g_cpuToAffinity[i].Number; } } } GroupAffinity->Group = group; GroupAffinity->Mask = mask; success = TRUE; } else { SetLastError(ERROR_GEN_FAILURE); } #else // HAVE_PTHREAD_GETAFFINITY_NP // There is no API to manage thread affinity, so let's return a group affinity // with all the CPUs on the system. GroupAffinity->Group = 0; GroupAffinity->Mask = GetFullAffinityMask(g_cpuCount); success = TRUE; #endif // HAVE_PTHREAD_GETAFFINITY_NP return success; } /*++ Function: GetThreadGroupAffinity See MSDN doc. --*/ BOOL PALAPI GetThreadGroupAffinity( IN HANDLE hThread, OUT PGROUP_AFFINITY GroupAffinity ) { PERF_ENTRY(GetThreadGroupAffinity); ENTRY("GetThreadGroupAffinity(hThread=%p, GroupAffinity=%p)\n", hThread, GroupAffinity); CPalThread *palThread = InternalGetCurrentThread(); BOOL success = GetThreadGroupAffinityInternal(palThread->GetPThreadSelf(), GroupAffinity); LOGEXIT("GetThreadGroupAffinity returns BOOL %d\n", success); PERF_EXIT(GetThreadGroupAffinity); return success; } /*++ Function: SetThreadGroupAffinity See MSDN doc. --*/ BOOL PALAPI SetThreadGroupAffinity( IN HANDLE hThread, IN const GROUP_AFFINITY *GroupAffinity, OUT OPTIONAL PGROUP_AFFINITY PreviousGroupAffinity ) { PERF_ENTRY(SetThreadGroupAffinity); ENTRY("SetThreadGroupAffinity(hThread=%p, GroupAffinity=%p, PreviousGroupAffinity=%p)\n", hThread, GroupAffinity, PreviousGroupAffinity); CPalThread *palThread = InternalGetCurrentThread(); pthread_t thread = palThread->GetPThreadSelf(); if (PreviousGroupAffinity != NULL) { GetThreadGroupAffinityInternal(thread, PreviousGroupAffinity); } #if HAVE_PTHREAD_GETAFFINITY_NP int groupStartIndex = GroupAffinity->Group * MaxCpusPerGroup; KAFFINITY mask = 1; cpu_set_t cpuSet; CPU_ZERO(&cpuSet); for (int i = 0; i < MaxCpusPerGroup; i++, mask <<= 1) { if (GroupAffinity->Mask & mask) { int cpu = g_groupAndIndexToCpu[groupStartIndex + i]; if (cpu != -1) { CPU_SET(cpu, &cpuSet); } } } int st = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuSet); if (st == -1) { switch (errno) { case EINVAL: // There is no processor in the mask that is allowed to execute the process SetLastError(ERROR_INVALID_PARAMETER); break; case EPERM: SetLastError(ERROR_ACCESS_DENIED); break; default: SetLastError(ERROR_GEN_FAILURE); break; } } BOOL success = (st == 0); #else // HAVE_PTHREAD_GETAFFINITY_NP // There is no API to manage thread affinity, so let's ignore the request BOOL success = TRUE; #endif // HAVE_PTHREAD_GETAFFINITY_NP LOGEXIT("SetThreadGroupAffinity returns BOOL %d\n", success); PERF_EXIT(SetThreadGroupAffinity); return success; } /*++ Function: GetCurrentProcessorNumberEx See MSDN doc. --*/ VOID PALAPI GetCurrentProcessorNumberEx( OUT PPROCESSOR_NUMBER ProcNumber ) { PERF_ENTRY(GetCurrentProcessorNumberEx); ENTRY("GetCurrentProcessorNumberEx(ProcNumber=%p\n", ProcNumber); DWORD cpu = GetCurrentProcessorNumber(); _ASSERTE(cpu < g_cpuCount); ProcNumber->Group = g_cpuToAffinity[cpu].Group; ProcNumber->Number = g_cpuToAffinity[cpu].Number; LOGEXIT("GetCurrentProcessorNumberEx\n"); PERF_EXIT(GetCurrentProcessorNumberEx); } /*++ Function: GetProcessAffinityMask See MSDN doc. --*/ BOOL PALAPI GetProcessAffinityMask( IN HANDLE hProcess, OUT PDWORD_PTR lpProcessAffinityMask, OUT PDWORD_PTR lpSystemAffinityMask ) { PERF_ENTRY(GetProcessAffinityMask); ENTRY("GetProcessAffinityMask(hProcess=%p, lpProcessAffinityMask=%p, lpSystemAffinityMask=%p\n", hProcess, lpProcessAffinityMask, lpSystemAffinityMask); BOOL success = FALSE; if (hProcess == GetCurrentProcess()) { DWORD_PTR systemMask = GetFullAffinityMask(g_cpuCount); #if HAVE_SCHED_GETAFFINITY int pid = getpid(); cpu_set_t cpuSet; int st = sched_getaffinity(pid, sizeof(cpu_set_t), &cpuSet); if (st == 0) { WORD group = NO_GROUP; DWORD_PTR processMask = 0; for (int i = 0; i < g_cpuCount; i++) { if (CPU_ISSET(i, &cpuSet)) { WORD g = g_cpuToAffinity[i].Group; if (group == NO_GROUP || g == group) { group = g; processMask |= ((DWORD_PTR)1) << g_cpuToAffinity[i].Number; } else { // The process has affinity in more than one group, in such case // the function needs to return zero in both masks. processMask = 0; systemMask = 0; group = NO_GROUP; break; } } } success = TRUE; *lpProcessAffinityMask = processMask; *lpSystemAffinityMask = systemMask; } else { // We should not get any of the errors that the sched_getaffinity can return since none // of them applies for the current thread, so this is an unexpected kind of failure. SetLastError(ERROR_GEN_FAILURE); } #else // HAVE_SCHED_GETAFFINITY // There is no API to manage thread affinity, so let's return both affinity masks // with all the CPUs on the system set. *lpSystemAffinityMask = systemMask; *lpProcessAffinityMask = systemMask; success = TRUE; #endif // HAVE_SCHED_GETAFFINITY } else { // PAL supports getting affinity mask for the current process only SetLastError(ERROR_INVALID_PARAMETER); } LOGEXIT("GetProcessAffinityMask returns BOOL %d\n", success); PERF_EXIT(GetProcessAffinityMask); return success; } /*++ Function: VirtualAllocExNuma See MSDN doc. --*/ LPVOID PALAPI VirtualAllocExNuma( IN HANDLE hProcess, IN OPTIONAL LPVOID lpAddress, IN SIZE_T dwSize, IN DWORD flAllocationType, IN DWORD flProtect, IN DWORD nndPreferred ) { PERF_ENTRY(VirtualAllocExNuma); ENTRY("VirtualAllocExNuma(hProcess=%p, lpAddress=%p, dwSize=%u, flAllocationType=%#x, flProtect=%#x, nndPreferred=%d\n", hProcess, lpAddress, dwSize, flAllocationType, flProtect, nndPreferred); LPVOID result = NULL; if (hProcess == GetCurrentProcess()) { if (nndPreferred <= g_highestNumaNode) { result = VirtualAlloc(lpAddress, dwSize, flAllocationType, flProtect); #if HAVE_NUMA_H if (result != NULL) { int nodeMaskLength = (g_highestNumaNode + 1 + sizeof(unsigned long) - 1) / sizeof(unsigned long); unsigned long *nodeMask = new unsigned long[nodeMaskLength]; memset(nodeMask, 0, nodeMaskLength); int index = nndPreferred / sizeof(unsigned long); int mask = ((unsigned long)1) << (nndPreferred & (sizeof(unsigned long) - 1)); nodeMask[index] = mask; int st = mbind(result, dwSize, MPOL_PREFERRED, nodeMask, g_highestNumaNode, 0); free(nodeMask); _ASSERTE(st == 0); // If the mbind fails, we still return the allocated memory since the nndPreferred is just a hint } #endif // HAVE_NUMA_H } else { // The specified node number is larger than the maximum available one SetLastError(ERROR_INVALID_PARAMETER); } } else { // PAL supports allocating from the current process virtual space only SetLastError(ERROR_INVALID_PARAMETER); } LOGEXIT("VirtualAllocExNuma returns %p\n", result); PERF_EXIT(VirtualAllocExNuma); return result; }