src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers.Binary;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using Internal.Runtime.CompilerServices;

namespace System.Text.Unicode
{
    internal static partial class Utf8Utility
    {
        /// <summary>
        /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the
        /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns>The decoded 16-bit code unit (zzzzyyyy yyxxxxxx), zero-extended to a DWORD.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractCharFromFirstThreeByteSequence(uint value)
        {
            Debug.Assert(UInt32BeginsWithUtf8ThreeByteMask(value));

            // The three payload groups are pulled out separately and then merged into
            // [ zzzz yyyyyy xxxxxx ]. The fourth input byte never contributes.
            uint topFourBits;
            uint middleSixBits;
            uint bottomSixBits;

            if (BitConverter.IsLittleEndian)
            {
                // value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ]
                topFourBits = (value & 0x0000_000Fu) << 12;
                middleSixBits = (value & 0x0000_3F00u) >> 2;
                bottomSixBits = (value & 0x003F_0000u) >> 16;
            }
            else
            {
                // value = [ 1110zzzz 10yyyyyy 10xxxxxx | ######## ]
                topFourBits = (value & 0x0F00_0000u) >> 12;
                middleSixBits = (value & 0x003F_0000u) >> 10;
                bottomSixBits = (value & 0x0000_3F00u) >> 8;
            }

            return topFourBits | middleSixBits | bottomSixBits;
        }

        /// <summary>
        /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the
        /// first two bytes as a two-byte UTF-8 subsequence and returns the UTF-16 representation.
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns>The decoded code unit (00000yyy yyxxxxxx), zero-extended to a DWORD.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractCharFromFirstTwoByteSequence(uint value)
        {
            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value) && !UInt32BeginsWithOverlongUtf8TwoByteSequence(value));

            if (!BitConverter.IsLittleEndian)
            {
                // value = [ 110yyyyy 10xxxxxx | ######## ######## ]
                uint upperFiveBits = (value & 0x1F00_0000u) >> 18;
                uint lowerSixBits = (value & 0x003F_0000u) >> 16;
                return (char)(upperFiveBits | lowerSixBits);
            }

            // value = [ ######## ######## | 10xxxxxx 110yyyyy ]
            // Both bytes are added with their header bits still attached; the headers
            // (0xC0 on the shifted lead byte, 0x80 on the trail byte) are subtracted afterwards.
            uint shiftedLeadByte = (uint)(byte)value << 6;
            uint trailByte = (uint)(byte)(value >> 8);
            return trailByte + shiftedLeadByte - (0xC0u << 6) - 0x80u;
        }

        /// <summary>
        /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
        /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
        /// </summary>
        /// <param name="value">A four-byte UTF-8 sequence read into a DWORD in machine-endian order.</param>
        /// <returns>
        /// The UTF-16 surrogate pair for the same scalar, packed into a DWORD so that the high surrogate
        /// comes first in memory order.
        /// </returns>
        /// <remarks>
        /// This method itself performs no validation of <paramref name="value"/>.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractCharsFromFourByteSequence(uint value)
        {
            if (BitConverter.IsLittleEndian)
            {
                // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
                // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
                // where wwww = uuuuu - 1
                uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
                retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
                retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
                retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
                retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
                retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
                retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
                retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
                retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
                return retVal;
            }
            else
            {
                // input is UTF8 [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
                // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ]
                // where wwww = uuuuu - 1
                uint retVal = value & 0xFF00_0000u; // retVal = [ 11110uuu 00000000 00000000 00000000 ]
                retVal |= (value & 0x003F_0000u) << 2; // retVal = [ 11110uuu uuzzzz00 00000000 00000000 ]
                retVal |= (value & 0x0000_3000u) << 4; // retVal = [ 11110uuu uuzzzzyy 00000000 00000000 ]
                retVal |= (value & 0x0000_0F00u) >> 2; // retVal = [ 11110uuu uuzzzzyy 000000yy yy000000 ]
                retVal |= (value & 0x0000_003Fu); // retVal = [ 11110uuu uuzzzzyy 000000yy yyxxxxxx ]
                retVal -= 0x2000_0000u; // retVal = [ 11010uuu uuzzzzyy 000000yy yyxxxxxx ]
                retVal -= 0x0040_0000u; // retVal = [ 110100ww wwzzzzyy 000000yy yyxxxxxx ]
                retVal += 0x0000_DC00u; // retVal = [ 110100ww wwzzzzyy 110111yy yyxxxxxx ]
                retVal += 0x0800_0000u; // retVal = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ]
                return retVal;
            }
        }

        /// <summary>
        /// Given a 32-bit integer that represents a valid packed UTF-16 surrogate pair, all in machine-endian order,
        /// returns the packed 4-byte UTF-8 representation of this scalar value, also in machine-endian order.
        /// </summary>
        /// <param name="value">
        /// A well-formed surrogate pair (high surrogate first in memory order) packed into a DWORD.
        /// </param>
        /// <returns>The four UTF-8 bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] in memory order.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
        {
            Debug.Assert(IsWellFormedUtf16SurrogatePair(value));

            if (BitConverter.IsLittleEndian)
            {
                // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
                // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1

                value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]

                uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
                tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]

                uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
                uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
                tempC |= tempB;

                uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
                tempD |= 0x8080_80F0u;

                return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
            }
            else
            {
                // input = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ], where wwww = uuuuu - 1
                // must return [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ], where wwww = uuuuu - 1

                value -= 0xD800_DC00u; // = [ 000000wwwwzzzzyy 000000yyyyxxxxxx ]
                value += 0x0040_0000u; // = [ 00000uuuuuzzzzyy 000000yyyyxxxxxx ]

                uint tempA = value & 0x0700_0000u; // = [ 00000uuu 00000000 00000000 00000000 ]
                uint tempB = (value >> 2) & 0x003F_0000u; // = [ 00000000 00uuzzzz 00000000 00000000 ]
                tempB |= tempA;

                uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ]

                // The two high y bits live at bits 16..17 of the adjusted value and belong at
                // bits 12..13 of the result (the top of the payload in the third UTF-8 byte),
                // hence a right shift by 4 and a mask of 0x3000.
                uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ]
                tempD |= tempC;

                uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
                return (tempE | tempB | tempD); // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
            }
        }

        /// <summary>
        /// Given a machine-endian DWORD which represents two adjacent UTF-8 two-byte sequences,
        /// returns the machine-endian DWORD representation of that same data as two adjacent
        /// UTF-16 byte sequences.
        /// </summary>
        /// <param name="value">Two two-byte UTF-8 sequences read into a DWORD in machine-endian order.</param>
        /// <returns>Two UTF-16 code units packed into a DWORD, in the same memory order as the input.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value)
        {
            // The WORD halves are deliberately not swapped: the buffer was read in machine
            // order and the result will be written back in machine order. Both sequences
            // are decoded in parallel, one per WORD.

            uint leadPayload;  // the yyyyy bits of both sequences, moved to bits 6..10 of each WORD
            uint trailPayload; // the xxxxxx bits of both sequences, moved to bits 0..5 of each WORD

            if (BitConverter.IsLittleEndian)
            {
                // value = [ 10xxxxxx 110yyyyy | 10xxxxxx 110yyyyy ]
                trailPayload = (value & 0x3F00_3F00u) >> 8;
                leadPayload = (value & 0x001F_001Fu) << 6;
            }
            else
            {
                // value = [ 110yyyyy 10xxxxxx | 110yyyyy 10xxxxxx ]
                leadPayload = (value & 0x1F00_1F00u) >> 2;
                trailPayload = value & 0x003F_003Fu;
            }

            return leadPayload | trailPayload;
        }

        /// <summary>
        /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences,
        /// returns the machine-endian DWORD representation of that same data as two
        /// adjacent UTF-8 two-byte sequences.
        /// </summary>
        /// <param name="value">
        /// Two UTF-16 code units packed into a DWORD; each must encode as exactly two UTF-8 bytes.
        /// </param>
        /// <returns>Two two-byte UTF-8 sequences packed into a DWORD, in the same memory order as the input.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value)
        {
            // The result stays in machine-endian order; both chars are converted in parallel.

            Debug.Assert(IsFirstCharTwoUtf8Bytes(value) && IsSecondCharTwoUtf8Bytes(value));

            uint leadPayload;   // yyyyy bits, positioned inside each lead byte
            uint trailPayload;  // xxxxxx bits, positioned inside each trail byte
            uint headerMarkers; // 110 / 10 header bits for every output byte

            if (BitConverter.IsLittleEndian)
            {
                // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
                // want to return [ 10XXXXXX 110YYYYY 10xxxxxx 110yyyyy ]
                leadPayload = (value >> 6) & 0x001F_001Fu;
                trailPayload = (value << 8) & 0x3F00_3F00u;
                headerMarkers = 0x80C0_80C0u;
            }
            else
            {
                // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
                // want to return [ 110YYYYY 10XXXXXX 110yyyyy 10xxxxxx ]
                leadPayload = (value << 2) & 0x1F00_1F00u;
                trailPayload = value & 0x003F_003Fu;
                headerMarkers = 0xC080_C080u;
            }

            return leadPayload + trailPayload + headerMarkers;
        }

        /// <summary>
        /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences,
        /// returns the machine-endian DWORD representation of the first UTF-16 char
        /// as a UTF-8 two-byte sequence packed into a WORD and zero-extended to DWORD.
        /// </summary>
        /// <param name="value">
        /// Two UTF-16 code units packed into a DWORD; the first must encode as exactly two UTF-8 bytes.
        /// The second code unit is ignored.
        /// </param>
        /// <returns>
        /// The two UTF-8 bytes of the first code unit in the low WORD (machine-endian order); the high WORD is zero.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
        {
            // stays in machine endian

            Debug.Assert(IsFirstCharTwoUtf8Bytes(value));

            if (BitConverter.IsLittleEndian)
            {
                // value = [ ######## ######## 00000yyy yyxxxxxx ]
                // want to return [ 00000000 00000000 10xxxxxx 110yyyyy ]

                uint temp = (value << 2) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ]
                value &= 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]

                // temp + value + 0xC080 is [ 110yyyyy 10xxxxxx ]; swapping the two bytes puts the
                // lead byte in the low byte, which is first in memory on little-endian.
                return BinaryPrimitives.ReverseEndianness((ushort)(temp + value + 0xC080u)); // [ 00000000 00000000 10xxxxxx 110yyyyy ]
            }
            else
            {
                // value = [ 00000yyy yyxxxxxx ######## ######## ]
                // want to return [ 00000000 00000000 110yyyyy 10xxxxxx ]

                uint temp = (value >> 16) & 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]

                // The yyyyy bits sit at bits 22..26 and must land at bits 8..12, i.e. a right
                // shift of 14. (A shift of 22 would move them to bits 0..4, where the 0x1F00
                // mask would discard them entirely.)
                value = (value >> 14) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ]
                return value + temp + 0xC080u;
            }
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the first UTF-16 character is ASCII.
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the first code unit is in the range [ 0000..007F ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsFirstCharAscii(uint value)
        {
            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0000..007F ].
            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0000..007F ].
            //
            // Little-endian: AAAA is the low WORD, so it is ASCII iff bits 7..15 are all clear.
            // Big-endian: AAAA is the high WORD, so one unsigned compare of the whole DWORD is enough;
            // the low WORD can never carry the value across the 0x0080_0000 boundary.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0)
                || (!BitConverter.IsLittleEndian && value < 0x0080_0000u);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the first UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
        /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the first code unit is in the range [ 0800..FFFF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
        {
            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
            //
            // Little-endian: AAAA >= 0x0800 iff any of bits 11..15 of the low WORD is set.
            // Big-endian: AAAA is the high WORD, so compare the whole DWORD against 0x0800_0000.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
                || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the first code unit is in the range [ D800..DFFF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsFirstCharSurrogate(uint value)
        {
            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
            //
            // Little-endian: after subtracting 0xD800, the low WORD is below 0x0800 iff bits 11..15
            // are clear; only those bits are tested, so the contents of the high WORD do not matter.
            // Big-endian: the subtraction wraps for values below 0xD800_0000, so a single unsigned
            // compare against 0x0800_0000 performs the whole range check.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
                || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the first code unit is in the range [ 0080..07FF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsFirstCharTwoUtf8Bytes(uint value)
        {
            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
            //
            // Little-endian: subtract the lower bound, keep only the low WORD, and compare against the
            // size of the range (0x07FF - 0x0080 + 1 = 0x0780).
            // Big-endian: AAAA is the high WORD, so the bounds are widened to cover every possible
            // low WORD (0x0080_0000 .. 0x07FF_FFFF).

            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
            // Tracked as https://github.com/dotnet/coreclr/issues/18022.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
                || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
        }

        /// <summary>
        /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
        /// is a UTF-8 continuation byte.
        /// </summary>
        /// <param name="value">A DWORD whose least significant byte is tested; the upper three bytes are ignored.</param>
        /// <returns><see langword="true"/> iff the low byte is in the range [ 80..BF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsLowByteUtf8ContinuationByte(uint value)
        {
            // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
            // so the best we can do for now is the lea / cmp pair.
            // Tracked as https://github.com/dotnet/coreclr/issues/18022.

            // Subtracting 0x80 and truncating to a byte maps [ 80..BF ] onto [ 00..3F ];
            // every other low byte maps to [ 40..FF ] and fails the comparison.
            return (byte)(value - 0x80u) <= 0x3Fu;
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the second UTF-16 character is ASCII.
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the second code unit is in the range [ 0000..007F ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsSecondCharAscii(uint value)
        {
            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
            //
            // Little-endian: BBBB is the high WORD, so one unsigned compare of the whole DWORD is enough.
            // Big-endian: BBBB is the low WORD, so it is ASCII iff bits 7..15 are all clear.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
                || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
        /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the second code unit is in the range [ 0800..FFFF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
        {
            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ].
            //
            // In both layouts BBBB >= 0x0800 iff any of the top five bits of its WORD is set;
            // only the position of that WORD within the DWORD differs.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
                || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the second code unit is in the range [ D800..DFFF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsSecondCharSurrogate(uint value)
        {
            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
            //
            // Little-endian: BBBB is the high WORD; the subtraction wraps for values below
            // 0xD800_0000, so a single unsigned compare against 0x0800_0000 does the range check.
            // Big-endian: BBBB is the low WORD; after subtracting 0xD800 it is below 0x0800 iff
            // bits 11..15 are clear, and only those bits are tested.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
                || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the second code unit is in the range [ 0080..07FF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsSecondCharTwoUtf8Bytes(uint value)
        {
            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
            //
            // Little-endian: BBBB is the high WORD, so the bounds are widened to cover every possible
            // low WORD (0x0080_0000 .. 0x07FF_FFFF).
            // Big-endian: subtract the lower bound, keep only the low WORD, and compare against the
            // size of the range (0x07FF - 0x0080 + 1 = 0x0780).

            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
            // Tracked as https://github.com/dotnet/coreclr/issues/18022.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
                || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
        }

        /// <summary>
        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
        /// i.e., has binary representation 10xxxxxx, where x is any bit.
        /// </summary>
        /// <param name="value">The byte to test, passed by readonly reference.</param>
        /// <returns><see langword="true"/> iff the byte is in the range [ 80..BF ].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsUtf8ContinuationByte(in byte value)
        {
            // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
            // directly rather than bounce a temporary through a register. That is, we want the JIT to be
            // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
            // to see if it's a continuation byte. Data that's already enregistered will go through the
            // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
            //
            // The below check takes advantage of the two's complement representation of negative numbers.
            // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ]

            return ((sbyte)value < -64);
        }

        /// <summary>
        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
        /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair.
        /// </summary>
        /// <param name="value">Two UTF-16 code units packed into a DWORD in machine-endian order.</param>
        /// <returns>
        /// <see langword="true"/> iff the first code unit (in memory order) is a high surrogate [ D800..DBFF ]
        /// and the second is a low surrogate [ DC00..DFFF ].
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool IsWellFormedUtf16SurrogatePair(uint value)
        {
            // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
            // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
            //
            // We're essentially performing a range check on each component of the input in parallel. The allowed range
            // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
            // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
            // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
            // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
            //
            // A borrow out of the low WORD can only occur when the low WORD is itself below its lower bound, in
            // which case the low-WORD test already fails; so the borrow cannot produce a false positive.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
                || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
        }

        /// <summary>
        /// Converts a DWORD from machine-endian to little-endian.
        /// </summary>
        /// <param name="value">The DWORD in machine-endian order.</param>
        /// <returns>
        /// <paramref name="value"/> unchanged on little-endian machines; the byte-swapped value otherwise.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint ToLittleEndian(uint value)
        {
            // Byte order only needs to be swapped when the machine is big-endian.
            return BitConverter.IsLittleEndian
                ? value
                : BinaryPrimitives.ReverseEndianness(value);
        }

        /// <summary>
        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
        /// returns <see langword="true"/> iff the first two bytes of the buffer are
        /// an overlong representation of a sequence that should be represented as one byte.
        /// This method *does not* validate that the sequence matches the appropriate
        /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns><see langword="true"/> iff the leading byte of the buffer is below C2.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
        {
            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));

            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
            // Since we already validated it's C0 <= ?? <= DF (per mask check earlier), now only need
            // to check that it's < C2.
            //
            // Little-endian: the leading byte is the low byte of the DWORD.
            // Big-endian: the leading byte is the high byte, so compare the whole DWORD against C200_0000.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
                || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
        }

        /// <summary>
        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
        /// returns <see langword="true"/> iff the first four bytes of the buffer match
        /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
        /// method *does not* validate that the sequence is well-formed; the caller must
        /// still perform overlong form or out-of-range checking.
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns>
        /// <see langword="true"/> iff the first byte has the form 11110www and each of the
        /// remaining three bytes has the form 10xxxxxx.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
        {
            // The code in this method is equivalent to the code
            // below but is slightly more optimized.
            //
            // if (BitConverter.IsLittleEndian)
            // {
            //     const uint mask = 0xC0C0C0F8U;
            //     const uint comparand = 0x808080F0U;
            //     return ((value & mask) == comparand);
            // }
            // else
            // {
            //     const uint mask = 0xF8C0C0C0U;
            //     const uint comparand = 0xF0808080U;
            //     return ((value & mask) == comparand);
            // }
            //
            // Note that the big-endian comparand must carry the 10xxxxxx marker (0x80) for all
            // three continuation bytes, including the last one in the low byte of the DWORD.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0));
        }

        /// <summary>
        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
        /// returns <see langword="true"/> iff the first three bytes of the buffer match
        /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
        /// validate that the sequence is well-formed; the caller must still perform
        /// overlong form or surrogate checking.
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns>
        /// <see langword="true"/> iff the first byte has the form 1110zzzz and the next two bytes
        /// each have the form 10xxxxxx. The fourth byte is not examined.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
        {
            // The code in this method is equivalent to the code
            // below but is slightly more optimized.
            //
            // if (BitConverter.IsLittleEndian)
            // {
            //     const uint mask = 0x00C0C0F0U;
            //     const uint comparand = 0x008080E0U;
            //     return ((value & mask) == comparand);
            // }
            // else
            // {
            //     const uint mask = 0xF0C0C000U;
            //     const uint comparand = 0xE0808000U;
            //     return ((value & mask) == comparand);
            // }
            //
            // The optimized form subtracts the comparand first and then tests the masked bits for zero.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
                || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
        }

        /// <summary>
        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
        /// returns <see langword="true"/> iff the first two bytes of the buffer match
        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
        /// validate that the sequence is well-formed; the caller must still perform
        /// overlong form checking.
        /// </summary>
        /// <param name="value">Four UTF-8 bytes read into a DWORD in machine-endian order.</param>
        /// <returns>
        /// <see langword="true"/> iff the first byte has the form 110yyyyy and the second byte
        /// has the form 10xxxxxx. The third and fourth bytes are not examined.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
        {
            // The code in this method is equivalent to the code
            // below but is slightly more optimized.
            //
            // if (BitConverter.IsLittleEndian)
            // {
            //     const uint mask = 0x0000C0E0U;
            //     const uint comparand = 0x000080C0U;
            //     return ((value & mask) == comparand);
            // }
            // else
            // {
            //     const uint mask = 0xE0C00000U;
            //     const uint comparand = 0xC0800000U;
            //     return ((value & mask) == comparand);
            // }
            //
            // The optimized form subtracts the comparand first and then tests the masked bits for zero.

            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
                || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
        }

        /// <summary>
        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
        /// returns <see langword="true"/> iff the last two bytes of the buffer are
        /// an overlong representation of a sequence that should be represented as one byte.
        /// This method *does not* validate that the sequence matches the appropriate
        /// 2-byte sequence mask (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>).
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns>
        /// <see langword="true"/> iff the lead byte of the trailing two-byte sequence is C0 or C1.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
        {
            // PRECONDITION: the caller has already confirmed the '110yyyyy 10xxxxxx' shape
            // of the last two bytes.
            Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));

            // Per Table 3-7, the lead byte of a two-byte sequence must be within C2 .. DF.
            // The mask check already restricts it to C0 .. DF (110yyyyy), so the only
            // overlong lead bytes left to reject are C0 and C1:
            //   C0 = 1100 0000
            //   C1 = 1100 0001
            //   C2 = 1100 0010
            //   DF = 1101 1111
            // Bits 0001 1110 (1E) of the lead byte are all clear for C0 and C1 only.
            //
            // The lead byte of the trailing sequence is the third buffer byte: bits 16..23 of
            // the DWORD on little-endian machines, bits 8..15 on big-endian machines.
            const uint littleEndianLeadByteBits = 0x001E_0000u;
            const uint bigEndianLeadByteBits = 0x1E00u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value & littleEndianLeadByteBits) == 0))
                || (!BitConverter.IsLittleEndian && ((value & bigEndianLeadByteBits) == 0));
        }

        /// <summary>
        /// Tests whether a DWORD that was loaded from a UTF-8 buffer in machine endianness
        /// finishes with two bytes shaped like a UTF-8 2-byte sequence, [ 110yyyyy 10xxxxxx ].
        /// Only the bit pattern is examined: overlong encodings also satisfy this check,
        /// so the caller remains responsible for rejecting them.
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns>
        /// <see langword="true"/> iff the last two bytes of the buffer match the
        /// [ 110yyyyy 10xxxxxx ] mask.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
        {
            // On little-endian machines the last two buffer bytes are the high WORD of the
            // DWORD; on big-endian machines they are the low WORD. Hence two sets of constants.
            const uint littleEndianMask = 0xC0E0_0000u;
            const uint littleEndianComparand = 0x80C0_0000u;
            const uint bigEndianMask = 0x0000_E0C0u;
            const uint bigEndianComparand = 0x0000_C080u;

            // The straightforward test would be "(value & mask) == comparand". Subtracting the
            // comparand and then requiring every masked bit to be clear gives the same answer
            // and is slightly more optimized: a byte that does not match the pattern always
            // leaves at least one masked bit set after the subtraction.
            //
            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && (((value - littleEndianComparand) & littleEndianMask) == 0))
                || (!BitConverter.IsLittleEndian && (((value - bigEndianComparand) & bigEndianMask) == 0));
        }

        /// <summary>
        /// Little-endian-only check that the first two bytes of a UTF-8 buffer, read into a
        /// DWORD, form a well-formed UTF-8 two-byte sequence. The bit-pattern check and the
        /// overlong check are folded into a single range comparison.
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD on a little-endian machine.</param>
        /// <returns>
        /// <see langword="true"/> iff the first two bytes are a well-formed two-byte sequence.
        /// Always <see langword="false"/> on a big-endian machine (debug builds additionally
        /// assert that the machine is little-endian).
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
        {
            Debug.Assert(BitConverter.IsLittleEndian);

            // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
            // Read little-endian, the DWORD looks like:
            //   [ ######## ######## 10xxxxxx 110yyyyy ]
            // The lead byte sits in the least significant position, so after keeping the whole
            // lead byte and only the top two bits of the continuation byte (mask
            // [ 11000000 11111111 ]), every valid sequence lands in the contiguous range
            // [ 10000000_11000010, 10000000_11011111 ]. One range test therefore covers both
            // the 2-byte-sequence bitmask check and overlong form validation.
            const uint lowWordMask = 0xC0FFu;
            const uint smallestValid = 0x80C2u;
            const uint largestValid = 0x80DFu;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & lowWordMask, smallestValid, largestValid))
                || (!BitConverter.IsLittleEndian && false);
        }

        /// <summary>
        /// Little-endian-only check that the last two bytes of a UTF-8 buffer, read into a
        /// DWORD, form a well-formed UTF-8 two-byte sequence. The bit-pattern check and the
        /// overlong check are folded into a single range comparison.
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD on a little-endian machine.</param>
        /// <returns>
        /// <see langword="true"/> iff the last two bytes are a well-formed two-byte sequence.
        /// Always <see langword="false"/> on a big-endian machine (debug builds additionally
        /// assert that the machine is little-endian).
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
        {
            Debug.Assert(BitConverter.IsLittleEndian);

            // Same trick as UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian, shifted up by
            // 16 bits because the last two buffer bytes occupy the high WORD on little-endian:
            //   [ 10xxxxxx 110yyyyy ######## ######## ]
            // Keep the whole lead byte plus the top two bits of the continuation byte, then
            // range-check against lead bytes C2..DF paired with a 10xxxxxx continuation byte.
            const uint highWordMask = 0xC0FF_0000u;
            const uint smallestValid = 0x80C2_0000u;
            const uint largestValid = 0x80DF_0000u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & highWordMask, smallestValid, largestValid))
                || (!BitConverter.IsLittleEndian && false);
        }

        /// <summary>
        /// Reports whether the first byte of a UTF-8 buffer, which has been read into a
        /// DWORD in machine endianness, is an ASCII byte (high bit clear).
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns><see langword="true"/> iff the first byte of the buffer is ASCII.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32FirstByteIsAscii(uint value)
        {
            // Little-endian: the first buffer byte is the low byte of the DWORD, so its high
            // bit is bit 7.
            // Big-endian: the first buffer byte is the top byte of the DWORD, so its high bit
            // is the sign bit; the byte is ASCII exactly when the value is non-negative as an int.
            const uint littleEndianHighBit = 0x80u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value & littleEndianHighBit) == 0))
                || (!BitConverter.IsLittleEndian && ((int)value >= 0));
        }

        /// <summary>
        /// Reports whether the fourth byte of a UTF-8 buffer, which has been read into a
        /// DWORD in machine endianness, is an ASCII byte (high bit clear).
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns><see langword="true"/> iff the fourth byte of the buffer is ASCII.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32FourthByteIsAscii(uint value)
        {
            // Little-endian: the fourth buffer byte is the top byte of the DWORD, so its high
            // bit is the sign bit; the byte is ASCII exactly when the value is non-negative as an int.
            // Big-endian: the fourth buffer byte is the low byte of the DWORD, so its high
            // bit is bit 7.
            const uint bigEndianHighBit = 0x80u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((int)value >= 0))
                || (!BitConverter.IsLittleEndian && ((value & bigEndianHighBit) == 0));
        }

        /// <summary>
        /// Reports whether the second byte of a UTF-8 buffer, which has been read into a
        /// DWORD in machine endianness, is an ASCII byte (high bit clear).
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns><see langword="true"/> iff the second byte of the buffer is ASCII.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32SecondByteIsAscii(uint value)
        {
            // Little-endian: the second buffer byte occupies bits 8..15, so its high bit is bit 15.
            // Big-endian: the second buffer byte occupies bits 16..23, so its high bit is bit 23.
            const uint littleEndianHighBit = 0x8000u;
            const uint bigEndianHighBit = 0x0080_0000u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value & littleEndianHighBit) == 0))
                || (!BitConverter.IsLittleEndian && ((value & bigEndianHighBit) == 0));
        }

        /// <summary>
        /// Reports whether the third byte of a UTF-8 buffer, which has been read into a
        /// DWORD in machine endianness, is an ASCII byte (high bit clear).
        /// </summary>
        /// <param name="value">Four UTF-8 code units packed into a DWORD in machine endianness.</param>
        /// <returns><see langword="true"/> iff the third byte of the buffer is ASCII.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool UInt32ThirdByteIsAscii(uint value)
        {
            // Little-endian: the third buffer byte occupies bits 16..23, so its high bit is bit 23.
            // Big-endian: the third buffer byte occupies bits 8..15, so its high bit is bit 15.
            const uint littleEndianHighBit = 0x0080_0000u;
            const uint bigEndianHighBit = 0x8000u;

            // Keep this as one boolean return expression; that shape is a deliberate
            // workaround for https://github.com/dotnet/coreclr/issues/914.

            return (BitConverter.IsLittleEndian && ((value & littleEndianHighBit) == 0))
                || (!BitConverter.IsLittleEndian && ((value & bigEndianHighBit) == 0));
        }

        /// <summary>
        /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
        /// converts those scalar values to their 3-byte UTF-8 representation and writes the
        /// resulting 6 bytes to the destination buffer.
        /// </summary>
        /// <param name="outputBuffer">
        /// Reference to the first destination byte. Six bytes are stored starting here;
        /// this method performs no length check of its own.
        /// </param>
        /// <param name="value">
        /// Two UTF-16 code units packed in machine endianness. Debug builds assert that each
        /// one needs at least three UTF-8 bytes and is not a surrogate (0800..D7FF or E000..FFFF).
        /// </param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
        {
            Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
            Debug.Assert(IsSecondCharAtLeastThreeUtf8Bytes(value) && !IsSecondCharSurrogate(value), "Second half of value should've been 0800..D7FF or E000..FFFF");

            if (BitConverter.IsLittleEndian)
            {
                // The first char is the low WORD (lowercase letters), the second char is the
                // high WORD (uppercase letters). The output is produced with one DWORD store
                // followed by one WORD store; the diagrams show each stored integer with its
                // most significant byte on the left, so memory order is right-to-left within each.

                // value = [ ZZZZYYYY YYXXXXXX zzzzyyyy yyxxxxxx ]
                // want to write [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ] [ 10XXXXXX 10YYYYYY ]

                // tempA gathers the two middle bytes of the DWORD, tempB the two outer bytes.
                // Their bit ranges do not overlap each other or the marker bits, so the
                // additions below behave like ORs.
                uint tempA = ((value << 2) & 0x3F00u) | ((value & 0x3Fu) << 16); // = [ 00000000 00xxxxxx 00yyyyyy 00000000 ]
                uint tempB = ((value >> 4) & 0x0F00_0000u) | ((value >> 12) & 0x0Fu); // = [ 0000ZZZZ 00000000 00000000 0000zzzz ]
                Unsafe.WriteUnaligned<uint>(ref outputBuffer, tempA + tempB + 0xE080_80E0u); // = [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ]
                // Remaining two continuation bytes of the second char go to offsets 4 and 5.
                Unsafe.WriteUnaligned<ushort>(ref Unsafe.Add(ref outputBuffer, 4), (ushort)(((value >> 22) & 0x3Fu) + ((value >> 8) & 0x3F00u) + 0x8080u)); // = [ 10XXXXXX 10YYYYYY ]
            }
            else
            {
                // The first char is the high WORD (lowercase letters), the second char is the
                // low WORD (uppercase letters).

                // value = [ zzzzyyyy yyxxxxxx ZZZZYYYY YYXXXXXX ]
                // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ] [ 1110ZZZZ ] [ 10YYYYYY ] [ 10XXXXXX ]

                // Bytes are emitted from the last offset to the first. Each '>>=' discards the
                // bits that were just written, so these statements must stay in this order.
                Unsafe.Add(ref outputBuffer, 5) = (byte)((value & 0x3Fu) | 0x80u);
                Unsafe.Add(ref outputBuffer, 4) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
                Unsafe.Add(ref outputBuffer, 3) = (byte)(((value >>= 6) & 0x0Fu) | 0xE0u);
                Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >>= 4) & 0x3Fu) | 0x80u);
                Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
                // After a total shift of 28 only the four 'zzzz' bits remain, so no mask is needed.
                outputBuffer = (byte)((value >>= 6) | 0xE0u);
            }
        }


        /// <summary>
        /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
        /// converts the first UTF-16 value to its 3-byte UTF-8 representation and writes the
        /// resulting 3 bytes to the destination buffer.
        /// </summary>
        /// <param name="outputBuffer">
        /// Reference to the first destination byte. Three bytes are stored starting here;
        /// this method performs no length check of its own.
        /// </param>
        /// <param name="value">
        /// Two UTF-16 code units packed in machine endianness; only the first is converted.
        /// Debug builds assert that it needs at least three UTF-8 bytes and is not a
        /// surrogate (0800..D7FF or E000..FFFF).
        /// </param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
        {
            Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");

            if (BitConverter.IsLittleEndian)
            {
                // The first char is the low WORD; the high WORD ('#') is ignored. The first two
                // output bytes are produced with one WORD store, shown below with its most
                // significant byte on the left, followed by a single byte store at offset 2.

                // value = [ ######## ######## zzzzyyyy yyxxxxxx ]
                // want to write [ 10yyyyyy 1110zzzz ] [ 10xxxxxx ]

                uint tempA = (value << 2) & 0x3F00u; // [ 00yyyyyy 00000000 ]
                // The (ushort) conversion drops the second char before the shift isolates 'zzzz'.
                uint tempB = ((uint)(ushort)value >> 12); // [ 00000000 0000zzzz ]
                Unsafe.WriteUnaligned<ushort>(ref outputBuffer, (ushort)(tempA + tempB + 0x80E0u)); // [ 10yyyyyy 1110zzzz ]
                // ~0x7Fu is 0xFFFFFF80; the (byte) conversion keeps only the low 8 bits, so the
                // net effect is OR-ing in 0x80 (assumes the default unchecked conversion context).
                Unsafe.Add(ref outputBuffer, 2) = (byte)((value & 0x3Fu) | ~0x7Fu); // [ 10xxxxxx ]
            }
            else
            {
                // The first char is the high WORD; the low WORD ('#') is shifted out first.

                // value = [ zzzzyyyy yyxxxxxx ######## ######## ]
                // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ]

                // Bytes are emitted from the last offset to the first. Each '>>=' discards the
                // bits that were just written, so these statements must stay in this order.
                Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >>= 16) & 0x3Fu) | 0x80u);
                Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
                // After a total shift of 28 only the four 'zzzz' bits remain, so no mask is needed.
                outputBuffer = (byte)((value >>= 6) | 0xE0u);
            }
        }
    }
}