// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using Microsoft.Win32; using System.Runtime.CompilerServices; using System.Threading; namespace System.Buffers { /// /// Provides an ArrayPool implementation meant to be used as the singleton returned from ArrayPool.Shared. /// /// /// The implementation uses a tiered caching scheme, with a small per-thread cache for each array size, followed /// by a cache per array size shared by all threads, split into per-core stacks meant to be used by threads /// running on that core. Locks are used to protect each per-core stack, because a thread can migrate after /// checking its processor number, because multiple threads could interleave on the same core, and because /// a thread is allowed to check other core's buckets if its core's bucket is empty/full. /// internal sealed partial class TlsOverPerCoreLockedStacksArrayPool : ArrayPool { // TODO: #7747: "Investigate optimizing ArrayPool heuristics" // - Explore caching in TLS more than one array per size per thread, and moving stale buffers to the global queue. // - Explore dumping stale buffers from the global queue, similar to PinnableBufferCache (maybe merging them). // - Explore changing the size of each per-core bucket, potentially dynamically or based on other factors like array size. // - Explore changing number of buckets and what sizes of arrays are cached. // - Investigate whether false sharing is causing any issues, in particular on LockedStack's count and the contents of its array. // ... /// The number of buckets (array sizes) in the pool, one for each array length, starting from length 16. private const int NumBuckets = 17; // Utilities.SelectBucketIndex(2*1024*1024) /// Maximum number of per-core stacks to use per array size. private const int MaxPerCorePerArraySizeStacks = 64; // selected to avoid needing to worry about processor groups /// The maximum number of buffers to store in a bucket's global queue. private const int MaxBuffersPerArraySizePerCore = 8; /// The length of arrays stored in the corresponding indices in and . private readonly int[] _bucketArraySizes; /// /// An array of per-core array stacks. The slots are lazily initialized to avoid creating /// lots of overhead for unused array sizes. /// private readonly PerCoreLockedStacks[] _buckets = new PerCoreLockedStacks[NumBuckets]; /// A per-thread array of arrays, to cache one array per array size per thread. [ThreadStatic] private static T[][] t_tlsBuckets; /// /// Cached processor number used as a hint for which per-core stack to access. /// [ThreadStatic] private static int? t_cachedProcessorNumber; /// Initialize the pool. public TlsOverPerCoreLockedStacksArrayPool() { var sizes = new int[NumBuckets]; for (int i = 0; i < sizes.Length; i++) { sizes[i] = Utilities.GetMaxSizeForBucket(i); } _bucketArraySizes = sizes; } /// Allocate a new PerCoreLockedStacks and try to store it into the array. private PerCoreLockedStacks CreatePerCoreLockedStacks(int bucketIndex) { var inst = new PerCoreLockedStacks(); return Interlocked.CompareExchange(ref _buckets[bucketIndex], inst, null) ?? inst; } /// Gets an ID for the pool to use with events. private int Id => GetHashCode(); /// Gets the processor number associated with the current thread. /// Uses a cached value if one exists on the current thread. private static int CurrentProcessorNumber { [MethodImpl(MethodImplOptions.AggressiveInlining)] get { int? num = t_cachedProcessorNumber; if (!num.HasValue) { t_cachedProcessorNumber = num = Environment.CurrentProcessorNumber; } return num.GetValueOrDefault(); } } public override T[] Rent(int minimumLength) { // Arrays can't be smaller than zero. We allow requesting zero-length arrays (even though // pooling such an array isn't valuable) as it's a valid length array, and we want the pool // to be usable in general instead of using `new`, even for computed lengths. if (minimumLength < 0) { throw new ArgumentOutOfRangeException(nameof(minimumLength)); } else if (minimumLength == 0) { // No need to log the empty array. Our pool is effectively infinite // and we'll never allocate for rents and never store for returns. return EmptyArray.Value; } ArrayPoolEventSource log = ArrayPoolEventSource.Log; T[] buffer; // Get the bucket number for the array length int bucketIndex = Utilities.SelectBucketIndex(minimumLength); // If the array could come from a bucket... if (bucketIndex < _buckets.Length) { // First try to get it from TLS if possible. T[][] tlsBuckets = t_tlsBuckets; if (tlsBuckets != null) { buffer = tlsBuckets[bucketIndex]; if (buffer != null) { tlsBuckets[bucketIndex] = null; if (log.IsEnabled()) { log.BufferRented(buffer.GetHashCode(), buffer.Length, Id, bucketIndex); } return buffer; } } // We couldn't get a buffer from TLS, so try the global stack. PerCoreLockedStacks b = _buckets[bucketIndex]; if (b != null) { buffer = b.TryPop(); if (buffer != null) { if (log.IsEnabled()) { log.BufferRented(buffer.GetHashCode(), buffer.Length, Id, bucketIndex); } return buffer; } } // No buffer available. Allocate a new buffer with a size corresponding to the appropriate bucket. buffer = new T[_bucketArraySizes[bucketIndex]]; } else { // The request was for a size too large for the pool. Allocate an array of exactly the requested length. // When it's returned to the pool, we'll simply throw it away. buffer = new T[minimumLength]; } if (log.IsEnabled()) { int bufferId = buffer.GetHashCode(), bucketId = -1; // no bucket for an on-demand allocated buffer log.BufferRented(bufferId, buffer.Length, Id, bucketId); log.BufferAllocated(bufferId, buffer.Length, Id, bucketId, bucketIndex >= _buckets.Length ? ArrayPoolEventSource.BufferAllocatedReason.OverMaximumSize : ArrayPoolEventSource.BufferAllocatedReason.PoolExhausted); } return buffer; } public override void Return(T[] array, bool clearArray = false) { if (array == null) { throw new ArgumentNullException(nameof(array)); } // Determine with what bucket this array length is associated int bucketIndex = Utilities.SelectBucketIndex(array.Length); // If we can tell that the buffer was allocated (or empty), drop it. Otherwise, check if we have space in the pool. if (bucketIndex < _buckets.Length) { // Clear the array if the user requests. if (clearArray) { Array.Clear(array, 0, array.Length); } // Check to see if the buffer is the correct size for this bucket if (array.Length != _bucketArraySizes[bucketIndex]) { throw new ArgumentException(SR.ArgumentException_BufferNotFromPool, nameof(array)); } // Write through the TLS bucket. If there weren't any buckets, create them // and store this array into it. If there were, store this into it, and // if there was a previous one there, push that to the global stack. This // helps to keep LIFO access such that the most recently pushed stack will // be in TLS and the first to be popped next. T[][] tlsBuckets = t_tlsBuckets; if (tlsBuckets == null) { t_tlsBuckets = tlsBuckets = new T[NumBuckets][]; tlsBuckets[bucketIndex] = array; } else { T[] prev = tlsBuckets[bucketIndex]; tlsBuckets[bucketIndex] = array; if (prev != null) { PerCoreLockedStacks bucket = _buckets[bucketIndex] ?? CreatePerCoreLockedStacks(bucketIndex); bucket.TryPush(prev); } } } // Log that the buffer was returned ArrayPoolEventSource log = ArrayPoolEventSource.Log; if (log.IsEnabled()) { log.BufferReturned(array.GetHashCode(), array.Length, Id); } } /// /// Stores a set of stacks of arrays, with one stack per core. /// private sealed class PerCoreLockedStacks { /// The stacks. private readonly LockedStack[] _perCoreStacks; /// Initializes the stacks. public PerCoreLockedStacks() { // Create the stacks. We create as many as there are processors, limited by our max. var stacks = new LockedStack[Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks)]; for (int i = 0; i < stacks.Length; i++) { stacks[i] = new LockedStack(); } _perCoreStacks = stacks; } /// Try to push the array into the stacks. If each is full when it's tested, the array will be dropped. [MethodImpl(MethodImplOptions.AggressiveInlining)] public void TryPush(T[] array) { // Try to push on to the associated stack first. If that fails, // round-robin through the other stacks. LockedStack[] stacks = _perCoreStacks; int index = ExecutionId % stacks.Length; for (int i = 0; i < stacks.Length; i++) { if (stacks[index].TryPush(array)) return; if (++index == stacks.Length) index = 0; } } /// Try to get an array from the stacks. If each is empty when it's tested, null will be returned. [MethodImpl(MethodImplOptions.AggressiveInlining)] public T[] TryPop() { // Try to pop from the associated stack first. If that fails, // round-robin through the other stacks. T[] arr; LockedStack[] stacks = _perCoreStacks; int index = ExecutionId % stacks.Length; for (int i = 0; i < stacks.Length; i++) { if ((arr = stacks[index].TryPop()) != null) return arr; if (++index == stacks.Length) index = 0; } return null; } } /// Provides a simple stack of arrays, protected by a lock. private sealed class LockedStack { private readonly T[][] _arrays = new T[MaxBuffersPerArraySizePerCore][]; private int _count; [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool TryPush(T[] array) { bool enqueued = false; MonitorEnterWithProcNumberFlush(this); if (_count < MaxBuffersPerArraySizePerCore) { _arrays[_count++] = array; enqueued = true; } Monitor.Exit(this); return enqueued; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public T[] TryPop() { T[] arr = null; MonitorEnterWithProcNumberFlush(this); if (_count > 0) { arr = _arrays[--_count]; _arrays[_count] = null; } Monitor.Exit(this); return arr; } /// /// Enters the monitor on the object. If there is any contention while trying /// to acquire the monitor, it flushes the cached processor number so that subsequent /// attempts to access the per-core stacks will use an updated processor number. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void MonitorEnterWithProcNumberFlush(object obj) { if (!Monitor.TryEnter(obj)) { t_cachedProcessorNumber = null; Monitor.Enter(obj); } } } } }