src/inc/yieldprocessornormalized.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#pragma once

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
// Marker macro defined together with System_YieldProcessor(). NOTE(review): presumably lets other code detect that the
// wrapper is available - confirm against the users of this macro.
#define HAS_SYSTEM_YIELDPROCESSOR
// Forwards to whatever YieldProcessor() means at this point in the translation unit. This wrapper has to be defined before
// YieldProcessor is redefined below; after that point the name YieldProcessor expands to Dont_Use_YieldProcessor.
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
// Remove any existing macro definition of YieldProcessor so that it can be redefined without a redefinition conflict.
#ifdef YieldProcessor
#undef YieldProcessor
#endif
// From here on, every use of YieldProcessor is replaced with Dont_Use_YieldProcessor. NOTE(review): presumably that
// identifier is intentionally left undeclared so that direct uses of YieldProcessor() fail to compile - confirm.
#define YieldProcessor Dont_Use_YieldProcessor

// Per its name, the minimum duration in nanoseconds of one normalized yield. In this header it is used as the upper bound
// assumed for yieldsPerNormalizedYield when guarding multiplications against overflow, and in the bound asserted on
// optimalMaxNormalizedYieldsPerSpinIteration in YieldProcessorWithBackOffNormalized().
const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
// Per its name, the duration in nanoseconds of the longest spin iteration considered optimal. In this header it is only used
// in the bound asserted on optimalMaxNormalizedYieldsPerSpinIteration in YieldProcessorWithBackOffNormalized().
const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake

// Number of System_YieldProcessor() calls that make up one normalized yield. Defined outside this header; copied into each
// YieldProcessorNormalizationInfo when it is constructed.
extern unsigned int g_yieldsPerNormalizedYield;
// Limit, in normalized yields, at which YieldProcessorWithBackOffNormalized() stops growing the per-iteration delay. Defined
// outside this header; copied into each YieldProcessorNormalizationInfo when it is constructed.
extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;

// Declared here, defined outside this header. NOTE(review): presumably these set up the lock used by, and lazily perform, the
// measurement that populates the two globals above - confirm against the implementation file.
void InitializeYieldProcessorNormalizedCrst();
void EnsureYieldProcessorNormalizedInitialized();

// Snapshot of the global normalization values, taken once at construction. The documented usages construct one instance
// before entering a spin-wait loop and pass it to each yield call. The fields are private and are read only by the yield
// functions befriended here.
class YieldProcessorNormalizationInfo
{
    // The yield functions that consume the snapshot
    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);

public:
    // Copies the current global values and precomputes their product. The product is computed from the already-initialized
    // members, which relies on the declaration order of the fields below.
    YieldProcessorNormalizationInfo()
        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield)
        , optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration)
        , optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
    {
    }

private:
    // Keep these three fields in this order, the last one is initialized from the first two

    // Processor yields issued per normalized yield
    unsigned int yieldsPerNormalizedYield;

    // Largest per-iteration delay used by the back-off variant, in normalized yields
    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;

    // The same limit expressed in processor yields (product of the two fields above)
    unsigned int optimalMaxYieldsPerSpinIteration;
};

// See YieldProcessorNormalized() for preliminary info. Typical usage:
//     if (!condition)
//     {
//         YieldProcessorNormalizationInfo normalizationInfo;
//         do
//         {
//             YieldProcessorNormalized(normalizationInfo);
//         } while (!condition);
//     }
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
{
    // Issue one normalized yield, which consists of yieldsPerNormalizedYield processor yields
    unsigned int remainingYields = normalizationInfo.yieldsPerNormalizedYield;
    _ASSERTE(remainingYields != 0);

    // The processor yield is issued before the counter is decremented and tested, so at least one yield always occurs
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU
//     usage and decrease scalability of the operation.
//         while(!condition)
//         {
//             YieldProcessorNormalized();
//         }
//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
//     condition, otherwise it may unnecessarily increase latency of the operation
//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
//     issue above on later iterations.
//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
FORCEINLINE void YieldProcessorNormalized()
{
    // Capture the current normalization values and issue a single normalized yield with them
    YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalized(normalizationInfo);
}

// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
//     if (!moreExpensiveCondition)
//     {
//         YieldProcessorNormalizationInfo normalizationInfo;
//         do
//         {
//             YieldProcessorNormalized(normalizationInfo, 2);
//         } while (!moreExpensiveCondition);
//     }
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
{
    _ASSERTE(count != 0);

    // When SIZE_T is no wider than unsigned int, the product computed below could overflow. Clamp the count first;
    // normalizationInfo.yieldsPerNormalizedYield is limited to MinNsPerNormalizedYield by
    // InitializeYieldProcessorNormalized(), so the clamped product fits.
    if (sizeof(SIZE_T) <= sizeof(unsigned int))
    {
        const unsigned int CountLimit = UINT_MAX / MinNsPerNormalizedYield;
        count = count > CountLimit ? CountLimit : count;
    }

    // Total processor yields = requested normalized yields * processor yields per normalized yield, computed in SIZE_T
    SIZE_T remainingYields = count;
    remainingYields *= normalizationInfo.yieldsPerNormalizedYield;
    _ASSERTE(remainingYields != 0);

    // The processor yield is issued before the counter is decremented and tested, so at least one yield always occurs
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
//     while(!moreExpensiveCondition)
//     {
//         YieldProcessorNormalized(2);
//     }
FORCEINLINE void YieldProcessorNormalized(unsigned int count)
{
    // Capture the current normalization values and issue 'count' normalized yields with them
    YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalized(normalizationInfo, count);
}

// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
// info. Typical usage:
//     if (!condition)
//     {
//         YieldProcessorNormalizationInfo normalizationInfo;
//         do
//         {
//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
//         } while (!condition);
//     }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
    const YieldProcessorNormalizationInfo &normalizationInfo,
    unsigned int preSkylakeCount)
{
    _ASSERTE(preSkylakeCount != 0);

    // When SIZE_T is no wider than unsigned int, the product computed below could overflow. Clamp the count first;
    // normalizationInfo.yieldsPerNormalizedYield is limited to MinNsPerNormalizedYield by
    // InitializeYieldProcessorNormalized(), so the clamped product fits.
    if (sizeof(SIZE_T) <= sizeof(unsigned int))
    {
        const unsigned int CountLimit = UINT_MAX / MinNsPerNormalizedYield;
        preSkylakeCount = preSkylakeCount > CountLimit ? CountLimit : preSkylakeCount;
    }

    // Scale the caller's count to processor yields: multiply first, then divide, both in SIZE_T
    const unsigned int ScaleDownDivisor = 8;
    SIZE_T remainingYields = preSkylakeCount;
    remainingYields *= normalizationInfo.yieldsPerNormalizedYield;
    remainingYields /= ScaleDownDivisor;

    // The division can round down to zero for small counts; always issue at least one processor yield
    if (remainingYields == 0)
    {
        remainingYields = 1;
    }

    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}

// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
//     while(!condition)
//     {
//         YieldProcessorNormalizedForPreSkylakeCount(100);
//     }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
{
    // Capture the current normalization values and delegate to the overload that takes them explicitly
    YieldProcessorNormalizationInfo normalizationInfo;
    YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, preSkylakeCount);
}

// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
// iteration exponentially up to a limit. Typical usage:
//     if (!conditionThatMayNotBeSatisfiedSoon)
//     {
//         YieldProcessorNormalizationInfo normalizationInfo;
//         unsigned int spinIteration = 0;
//         do
//         {
//             YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration); // maybe Sleep(0) occasionally
//             ++spinIteration;
//         } while (!conditionThatMayNotBeSatisfiedSoon);
//     }
FORCEINLINE void YieldProcessorWithBackOffNormalized(
    const YieldProcessorNormalizationInfo &normalizationInfo,
    unsigned int spinIteration)
{
    // Upper bound on normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration, based on calculations done in
    // InitializeYieldProcessorNormalized()
    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);

    // Largest shift used for the exponential phase. If the static assertion below fails, this value needs to be adjusted.
    const UINT8 MaxShift = 3;
    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);

    // Start from the capped delay, and use the exponential delay (2^spinIteration normalized yields) only while it is still
    // below the cap. The shift is evaluated only after spinIteration has been checked against MaxShift.
    unsigned int remainingYields = normalizationInfo.optimalMaxYieldsPerSpinIteration;
    if (spinIteration <= MaxShift)
    {
        const unsigned int normalizedYields = (unsigned int)1 << spinIteration;
        if (normalizedYields < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
        {
            remainingYields = normalizedYields * normalizationInfo.yieldsPerNormalizedYield;
        }
    }
    _ASSERTE(remainingYields != 0);

    // The processor yield is issued before the counter is decremented and tested, so at least one yield always occurs
    for (;;)
    {
        System_YieldProcessor();
        if (--remainingYields == 0)
        {
            break;
        }
    }
}