18 files changed, 4113 insertions, 2425 deletions
diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
index 19d8105baf..02656f57ad 100644
--- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
+++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
@@ -768,6 +768,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.Helpers.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -799,13 +800,17 @@
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
-    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.Validation.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
diff --git a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
index f5bba908b5..ef2eb4945a 100644
--- a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
+++ b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
@@ -8,7 +8,7 @@ using System.Reflection;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Serialization;
-using System.Text;
+using System.Text.Unicode;
 using Internal.Runtime.CompilerServices;
 
 namespace System.Globalization
diff --git a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
index cf89dff6a2..4391dec044 100644
--- a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
+++ b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
@@ -8,6 +8,7 @@ using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Serialization;
 using System.Text;
+using System.Text.Unicode;
 using Internal.Runtime.CompilerServices;
 
 #if BIT64
diff --git a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
index beab0cfe02..9e9bb31623 100644
--- a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
+++ b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
@@ -5,7 +5,7 @@
 using System.Buffers;
 using System.Diagnostics;
 using System.Runtime.InteropServices;
-using System.Text;
+using System.Text.Unicode;
 using Internal.Runtime.CompilerServices;
 
 #if BIT64
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs
new file mode 100644
index 0000000000..b48a001d48
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs
@@ -0,0 +1,77 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+    internal static partial class ASCIIUtility
+    {
+        /// <summary>
+        /// A mask which, when ANDed with a <see cref="uint"/>, keeps only the high bit of each of its four bytes.
+        /// </summary>
+        private const uint UInt32HighBitsOnlyMask = 0x80808080u;
+
+        /// <summary>
+        /// A mask which, when ANDed with a <see cref="ulong"/>, keeps only the high bit of each of its eight bytes.
+        /// </summary>
+        private const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul;
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff all four bytes packed in <paramref name="value"/> are ASCII (00..7F).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool AllBytesInUInt32AreAscii(uint value)
+        {
+            // If the high bit of any byte is set, that byte is non-ASCII.
+
+            return (value & UInt32HighBitsOnlyMask) == 0;
+        }
+
+
+        /// <summary>
+        /// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
+        /// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
+        /// Returns a value 0 - 3, inclusive. The caller must ensure that at least one byte of
+        /// <paramref name="value"/> has its high bit set; debug builds assert this precondition.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
+        {
+            Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");
+
+            if (BitConverter.IsLittleEndian)
+            {
+                return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; // bit index of the lowest set high bit (7, 15, 23 or 31), divided by 8
+            }
+            else
+            {
+                // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
+                // on whether all processed bytes were ASCII. Then we accumulate all of the
+                // results to calculate how many consecutive ASCII bytes are present.
+
+                value = ~value; // after inversion, a set high bit marks an ASCII byte
+
+                // Read first byte
+                value = BitOperations.RotateLeft(value, 1); // rotates the most significant bit into bit 0
+                uint allBytesUpToNowAreAscii = value & 1;
+                uint numAsciiBytes = allBytesUpToNowAreAscii;
+
+                // Read second byte
+                value = BitOperations.RotateLeft(value, 8); // brings the next byte's high bit into bit 0
+                allBytesUpToNowAreAscii &= value; // accumulator is 0 or 1, so only bit 0 of 'value' can survive
+                numAsciiBytes += allBytesUpToNowAreAscii;
+
+                // Read third byte
+                value = BitOperations.RotateLeft(value, 8);
+                allBytesUpToNowAreAscii &= value;
+                numAsciiBytes += allBytesUpToNowAreAscii;
+
+                return numAsciiBytes;
+            }
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
index 755f925610..6193a0a5ee 100644
--- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
@@ -21,19 +21,12 @@ namespace System.Text
 {
     internal static partial class ASCIIUtility
     {
-        /// <summary>
-        /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool AllBytesInUInt32AreAscii(uint value)
-        {
-            return ((value & 0x80808080u) == 0);
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllBytesInUInt64AreAscii(ulong value)
         {
-            return ((value & 0x80808080_80808080ul) == 0);
+            // If the high bit of any byte is set, that byte is non-ASCII.
+
+            return ((value & UInt64HighBitsOnlyMask) == 0);
         }
 
         /// <summary>
@@ -55,56 +48,6 @@ namespace System.Text
         }
 
         /// <summary>
-        /// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
-        /// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
-        /// Returns a value 0 - 3, inclusive.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
-        {
-            // This implementation seems to have better performance than tzcnt.
-
-            // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
-            // on whether all processed bytes were ASCII. Then we accumulate all of the
-            // results to calculate how many consecutive ASCII bytes are present.
-
-            value = ~value;
-
-            if (BitConverter.IsLittleEndian)
-            {
-                // Read first byte
-                uint allBytesUpToNowAreAscii = (value >>= 7) & 1;
-                uint numAsciiBytes = allBytesUpToNowAreAscii;
-
-                // Read second byte
-                allBytesUpToNowAreAscii &= (value >>= 8);
-                numAsciiBytes += allBytesUpToNowAreAscii;
-
-                // Read third byte
-                allBytesUpToNowAreAscii &= (value >>= 8);
-                numAsciiBytes += allBytesUpToNowAreAscii;
-
-                return numAsciiBytes;
-            }
-            else
-            {
-                // Read first byte
-                uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1;
-                uint numAsciiBytes = allBytesUpToNowAreAscii;
-
-                // Read second byte
-                allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
-                numAsciiBytes += allBytesUpToNowAreAscii;
-
-                // Read third byte
-                allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
-                numAsciiBytes += allBytesUpToNowAreAscii;
-
-                return numAsciiBytes;
-            }
-        }
-
-        /// <summary>
         /// Given a DWORD which represents two packed chars in machine-endian order,
         /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
         /// </summary>
@@ -461,7 +404,7 @@ namespace System.Text
                         // Clear everything but the high bit of each byte, then tzcnt.
                         // Remember the / 8 at the end to convert bit count to byte count.
 
-                        candidateUInt64 &= 0x80808080_80808080ul;
+                        candidateUInt64 &= UInt64HighBitsOnlyMask;
                         pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
                         goto Finish;
                     }
@@ -1395,17 +1338,7 @@ namespace System.Text
             // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
 
             Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
-            if (Sse41.X64.IsSupported)
-            {
-                // Use PEXTRQ instruction if available, since it can extract from the vector directly to the destination address.
-                Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
-            }
-            else
-            {
-                // Bounce this through a temporary register (with potential stack spillage) before writing to memory.
-                Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, asciiVector.AsUInt64().GetElement(0));
-            }
+            Sse2.StoreLow((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
 
             nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
 
@@ -1444,16 +1377,7 @@ namespace System.Text
 
                 // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
                 asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
-                // See comments earlier in this method for information about how this works.
-                if (Sse41.X64.IsSupported)
-                {
-                    Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
-                }
-                else
-                {
-                    Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, asciiVector.AsUInt64().GetElement(0));
-                }
+                Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
             }
 
             // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
@@ -1529,27 +1453,13 @@ namespace System.Text
 
             Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
 
-            // See comments earlier in this method for information about how this works.
-            if (Sse41.X64.IsSupported)
-            {
-                *(ulong*)(pAsciiBuffer + currentOffsetInElements) = Sse41.X64.Extract(asciiVector.AsUInt64(), 0);
-            }
-            else
-            {
-                *(ulong*)(pAsciiBuffer + currentOffsetInElements) = asciiVector.AsUInt64().GetElement(0);
-            }
+            Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
             currentOffsetInElements += SizeOfVector128 / 2;
 
             goto Finish;
         }
 
         /// <summary>
-        /// Rotates a <see cref="uint"/> left. The JIT is smart enough to turn this into a ROL / ROR instruction.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static uint ROL32(uint value, int shift) => (value << shift) | (value >> (32 - shift));
-
-        /// <summary>
         /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
         /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
         /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
diff --git a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
index 9040a94f0f..bb5aa5f0ac 100644
--- a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
@@ -266,6 +266,7 @@ namespace System.Text
             // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
 
             Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+            Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
 
             // Copy the existing leftover data plus as many bytes as possible of the new incoming data
             // into a temporary concated buffer, then get its char count by decoding it.
@@ -319,6 +320,7 @@ namespace System.Text
             // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
 
             Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+            Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
 
             // Copy the existing leftover data plus as many bytes as possible of the new incoming data
             // into a temporary concated buffer, then transcode it from bytes to chars.
@@ -370,6 +372,14 @@ namespace System.Text
 
         Finish:
 
+            // Report back the number of bytes (from the new incoming span) we consumed just now.
+            // This calculation is simple: it's the difference between the original leftover byte
+            // count and the number of bytes from the combined buffer we needed to decode the first
+            // scalar value. We need to report this before the call to SetLeftoverData /
+            // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field.
+
+            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;
+
             if (persistNewCombinedBuffer)
             {
                 Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
@@ -380,7 +390,6 @@ namespace System.Text
                 ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
             }
 
-            bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
             return charsWritten;
 
         DestinationTooSmall:
diff --git a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
index 0e32167957..ca740a1adc 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
@@ -850,8 +850,14 @@ namespace System.Text
 
             ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar);
 
-            int totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out int bytesConsumedJustNow);
-            bytes = bytes.Slice(bytesConsumedJustNow);
+            int bytesConsumedJustNow = 0;
+            int totalCharCount = 0;
+
+            if (decoder.HasLeftoverData)
+            {
+                totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out bytesConsumedJustNow);
+                bytes = bytes.Slice(bytesConsumedJustNow);
+            }
 
             // Now try invoking the "fast path" (no fallback) implementation.
             // We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers).
@@ -1120,10 +1126,15 @@ namespace System.Text
             ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar);
             Span<char> chars = new Span<char>(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar);
 
-            int charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out int bytesConsumedJustNow);
+            int bytesConsumedJustNow = 0;
+            int charsWrittenJustNow = 0;
 
-            bytes = bytes.Slice(bytesConsumedJustNow);
-            chars = chars.Slice(charsWrittenJustNow);
+            if (decoder.HasLeftoverData)
+            {
+                charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out bytesConsumedJustNow);
+                bytes = bytes.Slice(bytesConsumedJustNow);
+                chars = chars.Slice(charsWrittenJustNow);
+            }
 
             Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Should be no remaining fallback data at this point.");
 
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
index a91c0fcb99..a71750eaa5 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -6,6 +6,7 @@ using System.Buffers;
 using System.Diagnostics;
 using System.Globalization;
 using System.Runtime.CompilerServices;
+using System.Text.Unicode;
 
 namespace System.Text
 {
diff --git a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
index aaac975ec8..7a3a1f7de5 100644
--- a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
@@ -15,9 +15,11 @@
 #define FASTLOOP
 
 using System;
+using System.Buffers;
 using System.Diagnostics;
-using System.Globalization;
+using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Text.Unicode;
 
 namespace System.Text
 {
@@ -129,22 +131,26 @@ namespace System.Text
         public override unsafe int GetByteCount(char[] chars, int index, int count)
         {
             // Validate input parameters
-            if (chars == null)
-                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
 
-            if (index < 0 || count < 0)
-                throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
+            if (chars is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
+            }
 
-            if (chars.Length - index < count)
-                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
+            if ((index | count) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            // If no input, return 0, avoid fixed empty array problem
-            if (count == 0)
-                return 0;
+            if (chars.Length - index < count)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+            }
 
-            // Just call the pointer version
             fixed (char* pChars = chars)
-                return GetByteCount(pChars + index, count, null);
+            {
+                return GetByteCountCommon(pChars + index, count);
+            }
         }
 
         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -154,12 +160,17 @@ namespace System.Text
 
         public override unsafe int GetByteCount(string chars)
         {
-            // Validate input
-            if (chars==null)
-                throw new ArgumentNullException("s");
+            // Validate input parameters
+
+            if (chars is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
+            }
 
             fixed (char* pChars = chars)
-                return GetByteCount(pChars, chars.Length, null);
+            {
+                return GetByteCountCommon(pChars, chars.Length);
+            }
         }
 
         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -170,22 +181,78 @@ namespace System.Text
         public override unsafe int GetByteCount(char* chars, int count)
         {
             // Validate Parameters
+
             if (chars == null)
-                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
+            }
 
             if (count < 0)
-                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            // Call it with empty encoder
-            return GetByteCount(chars, count, null);
+            return GetByteCountCommon(chars, count);
         }
 
         public override unsafe int GetByteCount(ReadOnlySpan<char> chars)
         {
-            fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
+            // It's ok for us to pass null pointers down to the workhorse below.
+
+            fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+            {
+                return GetByteCountCommon(charsPtr, chars.Length);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe int GetByteCountCommon(char* pChars, int charCount)
+        {
+            // Common helper method for all non-EncoderNLS entry points to GetByteCount: returns the byte count for 'charCount' chars at 'pChars'.
+            // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
+
+            Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
+            Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+
+            // First call into the fast path.
+            // Don't bother providing a fallback mechanism; our fast path doesn't use it.
+
+            int totalByteCount = GetByteCountFast(pChars, charCount, fallback: null, out int charsConsumed);
+
+            if (charsConsumed != charCount)
+            {
+                // If there's still data remaining in the source buffer, go down the fallback path.
+                // We need to check for integer overflow since the fallback could change the required
+                // output count in unexpected ways.
+
+                totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed);
+                if (totalByteCount < 0) // a negative sum is treated as Int32 overflow
+                {
+                    ThrowConversionOverflow();
+                }
+            }
+
+            return totalByteCount;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon
+        private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed)
+        {
+            // The number of UTF-8 code units may exceed the number of UTF-16 code units,
+            // so we'll need to check for overflow before casting to Int32.
+
+            char* ptrToFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _);
+
+            int tempCharsConsumed = (int)(ptrToFirstInvalidChar - pChars); // number of chars preceding the returned pointer
+            charsConsumed = tempCharsConsumed;
+
+            long totalUtf8Bytes = tempCharsConsumed + utf8CodeUnitCountAdjustment; // sum is carried out in 64-bit arithmetic
+            if ((ulong)totalUtf8Bytes > int.MaxValue) // unsigned compare also rejects a negative total
             {
-                return GetByteCount(charsPtr, chars.Length, baseEncoder: null);
+                ThrowConversionOverflow();
             }
+
+            return (int)totalUtf8Bytes;
         }
 
         // Parent method is safe.
@@ -196,22 +263,37 @@ namespace System.Text
         public override unsafe int GetBytes(string s, int charIndex, int charCount,
                                               byte[] bytes, int byteIndex)
         {
-            if (s == null || bytes == null)
-                throw new ArgumentNullException((s == null ? nameof(s) : nameof(bytes)), SR.ArgumentNull_Array);
+            // Validate Parameters
+
+            if (s is null || bytes is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(
+                    argument: (s is null) ? ExceptionArgument.s : ExceptionArgument.bytes,
+                    resource: ExceptionResource.ArgumentNull_Array);
+            }
 
-            if (charIndex < 0 || charCount < 0)
-                throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
+            if ((charIndex | charCount) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
             if (s.Length - charIndex < charCount)
-                throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
-
-            if (byteIndex < 0 || byteIndex > bytes.Length)
-                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount);
+            }
 
-            int byteCount = bytes.Length - byteIndex;
+            if ((uint)byteIndex > bytes.Length)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
+            }
 
-            fixed (char* pChars = s) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
-                return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+            fixed (char* pChars = s)
+            fixed (byte* pBytes = bytes)
+            {
+                return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex);
+            }
         }
 
         // Encodes a range of characters in a character array into a range of bytes
@@ -232,28 +314,36 @@ namespace System.Text
                                                byte[] bytes, int byteIndex)
         {
             // Validate parameters
-            if (chars == null || bytes == null)
-                throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array);
-
-            if (charIndex < 0 || charCount < 0)
-                throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
 
-            if (chars.Length - charIndex < charCount)
-                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
+            if (chars is null || bytes is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(
+                    argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes,
+                    resource: ExceptionResource.ArgumentNull_Array);
+            }
 
-            if (byteIndex < 0 || byteIndex > bytes.Length)
-                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
+            if ((charIndex | charCount) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            // If nothing to encode return 0, avoid fixed problem
-            if (charCount == 0)
-                return 0;
+            if (chars.Length - charIndex < charCount)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount);
+            }
 
-            // Just call pointer version
-            int byteCount = bytes.Length - byteIndex;
+            if ((uint)byteIndex > bytes.Length)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
+            }
 
-            fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
-                // Remember that byteCount is # to decode, not size of array.
-                return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+            fixed (char* pChars = chars)
+            fixed (byte* pBytes = bytes)
+            {
+                return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex);
+            }
         }
 
         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -264,24 +354,77 @@ namespace System.Text
         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
         {
             // Validate Parameters
-            if (bytes == null || chars == null)
-                throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
 
-            if (charCount < 0 || byteCount < 0)
-                throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
+            if (chars == null || bytes == null)
+            {
+                ThrowHelper.ThrowArgumentNullException(
+                    argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes,
+                    resource: ExceptionResource.ArgumentNull_Array);
+            }
+
+            if ((charCount | byteCount) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            return GetBytes(chars, charCount, bytes, byteCount, null);
+            return GetBytesCommon(chars, charCount, bytes, byteCount);
         }
 
         public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes)
         {
-            fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
-            fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
+            // It's ok for us to operate on null / empty spans.
+
+            fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+            fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
             {
-                return GetBytes(charsPtr, chars.Length, bytesPtr, bytes.Length, baseEncoder: null);
+                return GetBytesCommon(charsPtr, chars.Length, bytesPtr, bytes.Length);
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount)
+        {
+            // Common helper method for all non-EncoderNLS entry points to GetBytes.
+            // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
+
+            Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
+            Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+            Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+            Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+
+            // First call into the fast path.
+
+            int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed);
+
+            if (charsConsumed == charCount)
+            {
+                // All elements converted - return immediately.
+
+                return bytesWritten;
+            }
+            else
+            {
+                // Fast-path transcoding couldn't consume the entire input buffer - invoke fallback.
+
+                return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon
+        private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed)
+        {
+            // We don't care about the exact OperationStatus value returned by the workhorse routine; we only
+            // care if the workhorse was able to consume the entire input payload. If we're unable to do so,
+            // we'll handle the remainder in the fallback routine.
+
+            Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining);
+
+            charsConsumed = (int)(pInputBufferRemaining - pChars);
+            return (int)(pOutputBufferRemaining - pBytes);
+        }
+
         // Returns the number of characters produced by decoding a range of bytes
         // in a byte array.
         //
@@ -293,22 +436,26 @@ namespace System.Text
         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
         {
             // Validate Parameters
-            if (bytes == null)
-                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
 
-            if (index < 0 || count < 0)
-                throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
+            if (bytes is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
+            }
 
-            if (bytes.Length - index < count)
-                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
+            if ((index | count) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            // If no input just return 0, fixed doesn't like 0 length arrays.
-            if (count == 0)
-                return 0;
+            if (bytes.Length - index < count)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+            }
 
-            // Just call pointer version
             fixed (byte* pBytes = bytes)
-                return GetCharCount(pBytes + index, count, null);
+            {
+                return GetCharCountCommon(pBytes + index, count);
+            }
         }
 
         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -319,20 +466,27 @@ namespace System.Text
         public override unsafe int GetCharCount(byte* bytes, int count)
         {
             // Validate Parameters
+
             if (bytes == null)
-                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
+            {
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
+            }
 
             if (count < 0)
-                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            return GetCharCount(bytes, count, null);
+            return GetCharCountCommon(bytes, count);
         }
 
         public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes)
         {
-            fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
+            // It's ok for us to pass null pointers down to the workhorse routine.
+
+            fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
             {
-                return GetCharCount(bytesPtr, bytes.Length, baseDecoder: null);
+                return GetCharCountCommon(bytesPtr, bytes.Length);
             }
         }
 
@@ -345,28 +499,36 @@ namespace System.Text
                                               char[] chars, int charIndex)
         {
             // Validate Parameters
-            if (bytes == null || chars == null)
-                throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
-
-            if (byteIndex < 0 || byteCount < 0)
-                throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
 
-            if ( bytes.Length - byteIndex < byteCount)
-                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
+            if (bytes is null || chars is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(
+                    argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars,
+                    resource: ExceptionResource.ArgumentNull_Array);
+            }
 
-            if (charIndex < 0 || charIndex > chars.Length)
-                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
+            if ((byteIndex | byteCount) < 0)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (byteIndex < 0) ? ExceptionArgument.byteIndex : ExceptionArgument.byteCount,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+            }
 
-            // If no input, return 0 & avoid fixed problem
-            if (byteCount == 0)
-                return 0;
+            if (bytes.Length - byteIndex < byteCount)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+            }
 
-            // Just call pointer version
-            int charCount = chars.Length - charIndex;
+            if ((uint)charIndex > (uint)chars.Length)
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index);
+            }
 
-            fixed (byte* pBytes = bytes) fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
-                // Remember that charCount is # to decode, not size of array
-                return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
+            fixed (byte* pBytes = bytes)
+            fixed (char* pChars = chars)
+            {
+                return GetCharsCommon(pBytes + byteIndex, byteCount, pChars + charIndex, chars.Length - charIndex);
+            }
         }
 
         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -377,2120 +539,245 @@ namespace System.Text
         public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
         {
             // Validate Parameters
-            if (bytes == null || chars == null)
-                throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
-
-            if (charCount < 0 || byteCount < 0)
-                throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
 
-            return GetChars(bytes, byteCount, chars, charCount, null);
-        }
+            if (bytes is null || chars is null)
+            {
+                ThrowHelper.ThrowArgumentNullException(
+                    argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars,
+                    resource: ExceptionResource.ArgumentNull_Array);
+            }
 
-        public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
-        {
-            fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
-            fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
+            if ((byteCount | charCount) < 0)
             {
-                return GetChars(bytesPtr, bytes.Length, charsPtr, chars.Length, baseDecoder: null);
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
             }
-        }
 
-        // Returns a string containing the decoded representation of a range of
-        // bytes in a byte array.
-        //
-        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
-        // So if you fix this, fix the others.  Currently those include:
-        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
-        // parent method is safe
+            return GetCharsCommon(bytes, byteCount, chars, charCount);
+        }
 
-        public override unsafe string GetString(byte[] bytes, int index, int count)
+        public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
         {
-            // Validate Parameters
-            if (bytes == null)
-                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
-
-            if (index < 0 || count < 0)
-                throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
-
-            if (bytes.Length - index < count)
-                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
-
-            // Avoid problems with empty input buffer
-            if (count == 0) return string.Empty;
+            // It's ok for us to pass null pointers down to the workhorse below.
 
-            fixed (byte* pBytes = bytes)
-                return string.CreateStringFromEncoding(
-                    pBytes + index, count, this);
+            fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
+            fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+            {
+                return GetCharsCommon(bytesPtr, bytes.Length, charsPtr, chars.Length);
+            }
         }
 
+        // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
+        //           So if we're really broken, then that could also throw an error... recursively.
+        //           So try to make sure GetChars can at least process all uses by
+        //           System.Resources.ResourceReader!
         //
-        // End of standard methods copied from EncodingNLS.cs
-        //
-
-        // To simplify maintenance, the structure of GetByteCount and GetBytes should be
-        // kept the same as much as possible
-        internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
+        // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
+        //        If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount)
         {
-            // For fallback we may need a fallback buffer.
-            // We wait to initialize it though in case we don't have any broken input unicode
-            EncoderFallbackBuffer fallbackBuffer = null;
-            char* pSrcForFallback;
+            // Common helper method for all non-DecoderNLS entry points to GetChars.
+            // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
 
-            char* pSrc = chars;
-            char* pEnd = pSrc + count;
+            Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+            Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+            Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
+            Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
 
-            // Start by assuming we have as many as count
-            int byteCount = count;
+            // First call into the fast path.
 
-            int ch = 0;
+            int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed);
 
-            if (baseEncoder != null)
+            if (bytesConsumed == byteCount)
             {
-                UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
-                ch = encoder.surrogateChar;
-
-                // We mustn't have left over fallback data when counting
-                if (encoder.InternalHasFallbackBuffer)
-                {
-                    fallbackBuffer = encoder.FallbackBuffer;
-                    if (fallbackBuffer.Remaining > 0)
-                        throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
+                // All elements converted - return immediately.
 
-                    // Set our internal fallback interesting things.
-                    fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
-                }
+                return charsWritten;
             }
-
-            for (;;)
+            else
             {
-                // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-                if (pSrc >= pEnd)
-                {
-                    if (ch == 0)
-                    {
-                        // Unroll any fallback that happens at the end
-                        ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
-                        if (ch > 0)
-                        {
-                            byteCount++;
-                            goto ProcessChar;
-                        }
-                    }
-                    else
-                    {
-                        // Case of surrogates in the fallback.
-                        if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
-                        {
-                            Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                                "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                            ch = fallbackBuffer.InternalGetNextChar();
-                            byteCount++;
-
-                            if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                            {
-                                ch = 0xfffd;
-                                byteCount++;
-                                goto EncodeChar;
-                            }
-                            else if (ch > 0)
-                            {
-                                goto ProcessChar;
-                            }
-                            else
-                            {
-                                byteCount--; // ignore last one.
-                                break;
-                            }
-                        }
-                    }
-
-                    if (ch <= 0)
-                    {
-                        break;
-                    }
-                    if (baseEncoder != null && !baseEncoder.MustFlush)
-                    {
-                        break;
-                    }
-
-                    // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
-                    byteCount++;
-                    goto EncodeChar;
-                }
-
-                if (ch > 0)
-                {
-                    Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                        "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                    // use separate helper variables for local contexts so that the jit optimizations
-                    // won't get confused about the variable lifetimes
-                    int cha = *pSrc;
-
-                    // count the pending surrogate
-                    byteCount++;
-
-                    // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
-                    // if (IsLowSurrogate(cha)) {
-                    if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                    {
-                        // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
-                        ch = 0xfffd;
-                        //                        ch = cha + (ch << 10) +
-                        //                            (0x10000
-                        //                            - CharUnicodeInfo.LOW_SURROGATE_START
-                        //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
-
-                        // Use this next char
-                        pSrc++;
-                    }
-                    // else ch is still high surrogate and encoding will fail (so don't add count)
-
-                    // attempt to encode the surrogate or partial surrogate
-                    goto EncodeChar;
-                }
-
-                // If we've used a fallback, then we have to check for it
-                if (fallbackBuffer != null)
-                {
-                    ch = fallbackBuffer.InternalGetNextChar();
-                    if (ch > 0)
-                    {
-                        // We have an extra byte we weren't expecting.
-                        byteCount++;
-                        goto ProcessChar;
-                    }
-                }
-
-                // read next char. The JIT optimization seems to be getting confused when
-                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-                ch = *pSrc;
-                pSrc++;
-
-            ProcessChar:
-                // if (IsHighSurrogate(ch)) {
-                if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
-                {
-                    // we will count this surrogate next time around
-                    byteCount--;
-                    continue;
-                }
-            // either good char or partial surrogate
-
-            EncodeChar:
-                // throw exception on partial surrogate if necessary
-                // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
-                if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                {
-                    // Lone surrogates aren't allowed
-                    // Have to make a fallback buffer if we don't have one
-                    if (fallbackBuffer == null)
-                    {
-                        // wait on fallbacks if we can
-                        // For fallback we may need a fallback buffer
-                        if (baseEncoder == null)
-                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
-                        else
-                            fallbackBuffer = baseEncoder.FallbackBuffer;
-
-                        // Set our internal fallback interesting things.
-                        fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
-                    }
-
-                    // Do our fallback.  Actually we already know its a mixed up surrogate,
-                    // so the ref pSrc isn't gonna do anything.
-                    pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
-                    fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
-                    pSrc = pSrcForFallback;
-
-                    // Ignore it if we don't throw (we had preallocated this ch)
-                    byteCount--;
-                    ch = 0;
-                    continue;
-                }
-
-                // Count them
-                if (ch > 0x7F)
-                {
-                    if (ch > 0x7FF)
-                    {
-                        // the extra surrogate byte was compensated by the second surrogate character
-                        // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
-                        byteCount++;
-                    }
-                    byteCount++;
-                }
-
-#if BIT64
-                // check for overflow
-                if (byteCount < 0)
-                {
-                    break;
-                }
-#endif
-
-#if FASTLOOP
-                // If still have fallback don't do fast loop
-                if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
-                {
-                    // We're reserving 1 byte for each char by default
-                    byteCount++;
-                    goto ProcessChar;
-                }
-
-                int availableChars = PtrDiff(pEnd, pSrc);
+                // Fast path couldn't consume the entire input buffer - invoke fallback.
 
-                // don't fall into the fast decoding loop if we don't have enough characters
-                if (availableChars <= 13)
-                {
-                    // try to get over the remainder of the ascii characters fast though
-                    char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
-                    while (pSrc < pLocalEnd)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-                        if (ch > 0x7F)
-                            goto ProcessChar;
-                    }
-
-                    // we are done
-                    break;
-                }
-
-#if BIT64
-                // make sure that we won't get a silent overflow inside the fast loop
-                // (Fall out to slow loop if we have this many characters)
-                availableChars &= 0x0FFFFFFF;
-#endif
-
-                // To compute the upper bound, assume that all characters are ASCII characters at this point,
-                //  the boundary will be decreased for every non-ASCII character we encounter
-                // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
-                char* pStop = pSrc + availableChars - (3 + 4);
-
-                while (pSrc < pStop)
-                {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)                                                  // Not ASCII
-                    {
-                        if (ch > 0x7FF)                                             // Not 2 Byte
-                        {
-                            if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
-                                goto LongCode;
-                            byteCount++;
-                        }
-                        byteCount++;
-                    }
-
-                    // get pSrc aligned
-                    if ((unchecked((int)pSrc) & 0x2) != 0)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-                        if (ch > 0x7F)                                              // Not ASCII
-                        {
-                            if (ch > 0x7FF)                                         // Not 2 Byte
-                            {
-                                if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
-                                    goto LongCode;
-                                byteCount++;
-                            }
-                            byteCount++;
-                        }
-                    }
-
-                    // Run 2 * 4 characters at a time!
-                    while (pSrc < pStop)
-                    {
-                        ch = *(int*)pSrc;
-                        int chc = *(int*)(pSrc + 2);
-                        if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
-                        {
-                            if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
-                            {
-                                goto LongCodeWithMask;
-                            }
-
-
-                            if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
-                                byteCount++;
-                            if ((ch & unchecked((int)0xFF80)) != 0)
-                                byteCount++;
-                            if ((chc & unchecked((int)0xFF800000)) != 0)
-                                byteCount++;
-                            if ((chc & unchecked((int)0xFF80)) != 0)
-                                byteCount++;
-                        }
-                        pSrc += 4;
-
-                        ch = *(int*)pSrc;
-                        chc = *(int*)(pSrc + 2);
-                        if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
-                        {
-                            if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
-                            {
-                                goto LongCodeWithMask;
-                            }
-
-                            if ((ch & unchecked((int)0xFF800000)) != 0)
-                                byteCount++;
-                            if ((ch & unchecked((int)0xFF80)) != 0)
-                                byteCount++;
-                            if ((chc & unchecked((int)0xFF800000)) != 0)
-                                byteCount++;
-                            if ((chc & unchecked((int)0xFF80)) != 0)
-                                byteCount++;
-                        }
-                        pSrc += 4;
-                    }
-                    break;
-
-                LongCodeWithMask:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch = (char)ch;
-                    }
-                    else
-                    {
-                        // be careful about the sign extension
-                        ch = (int)(((uint)ch) >> 16);
-                    }
-                    pSrc++;
-
-                    if (ch <= 0x7F)
-                    {
-                        continue;
-                    }
-
-                LongCode:
-                    // use separate helper variables for slow and fast loop so that the jit optimizations
-                    // won't get confused about the variable lifetimes
-                    if (ch > 0x7FF)
-                    {
-                        // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
-                        if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                        {
-                            // 4 byte encoding - high surrogate + low surrogate
-
-                            int chd = *pSrc;
-                            if (
-                                // !IsHighSurrogate(ch) // low without high -> bad
-                                ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
-                                // !IsLowSurrogate(chd) // high not followed by low -> bad
-                                !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                            {
-                                // Back up and drop out to slow loop to figure out error
-                                pSrc--;
-                                break;
-                            }
-                            pSrc++;
-
-                            // byteCount - this byte is compensated by the second surrogate character
-                        }
-                        byteCount++;
-                    }
-                    byteCount++;
-
-                    // byteCount - the last byte is already included
-                }
-#endif // FASTLOOP
-
-                // no pending char at this point
-                ch = 0;
+                return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten);
             }
-
-#if BIT64
-            // check for overflow
-            if (byteCount < 0)
-            {
-                throw new ArgumentException(
-                        SR.Argument_ConversionOverflow);
-            }
-#endif
-
-            Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
-                "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
-
-            return byteCount;
         }
 
-        // diffs two char pointers using unsigned arithmetic. The unsigned arithmetic
-        // is good enough for us, and it tends to generate better code than the signed
-        // arithmetic generated by default
-        private static unsafe int PtrDiff(char* a, char* b)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon
+        private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed)
         {
-            return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
-        }
+            // We don't care about the exact OperationStatus value returned by the workhorse routine; we only
+            // care if the workhorse was able to consume the entire input payload. If we're unable to do so,
+            // we'll handle the remainder in the fallback routine.
 
-        // byte* flavor just for parity
-        private static unsafe int PtrDiff(byte* a, byte* b)
-        {
-            return (int)(a - b);
-        }
+            Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining);
 
-        private static bool InRange(int ch, int start, int end)
-        {
-            return (uint)(ch - start) <= (uint)(end - start);
+            bytesConsumed = (int)(pInputBufferRemaining - pBytes);
+            return (int)(pOutputBufferRemaining - pChars);
         }
 
-        // Our workhorse
-        // Note:  We ignore mismatched surrogates, unless the exception flag is set in which case we throw
-        internal sealed override unsafe int GetBytes(
-            char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS baseEncoder)
+        private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan<byte> bytes, int originalBytesLength, Span<char> chars, int originalCharsLength, DecoderNLS decoder)
         {
-            Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
-            Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
-            Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
-            Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
-
-            UTF8Encoder encoder = null;
-
-            // For fallback we may need a fallback buffer.
-            // We wait to initialize it though in case we don't have any broken input unicode
-            EncoderFallbackBuffer fallbackBuffer = null;
-            char* pSrcForFallback;
-
-            char* pSrc = chars;
-            byte* pTarget = bytes;
-
-            char* pEnd = pSrc + charCount;
-            byte* pAllocatedBufferEnd = pTarget + byteCount;
-
-            int ch = 0;
-
-            // assume that JIT will en-register pSrc, pTarget and ch
+            // We special-case DecoderReplacementFallback if it's telling us to write a single U+FFFD char,
+            // since we believe this to be relatively common and we can handle it more efficiently than
+            // the base implementation.
 
-            if (baseEncoder != null)
+            if (((decoder is null) ? this.DecoderFallback : decoder.Fallback) is DecoderReplacementFallback replacementFallback
+                && replacementFallback.MaxCharCount == 1
+                && replacementFallback.DefaultString[0] == UnicodeUtility.ReplacementChar)
             {
-                encoder = (UTF8Encoder)baseEncoder;
-                ch = encoder.surrogateChar;
-
-                // We mustn't have left over fallback data when counting
-                if (encoder.InternalHasFallbackBuffer)
-                {
-                    // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
-                    fallbackBuffer = encoder.FallbackBuffer;
-                    if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
-                        throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
-
-                    // Set our internal fallback interesting things.
-                    fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
-                }
-            }
-
-            for (;;)
-            {
-                // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
-                if (pSrc >= pEnd)
-                {
-                    if (ch == 0)
-                    {
-                        // Check if there's anything left to get out of the fallback buffer
-                        ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
-                        if (ch > 0)
-                        {
-                            goto ProcessChar;
-                        }
-                    }
-                    else
-                    {
-                        // Case of leftover surrogates in the fallback buffer
-                        if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
-                        {
-                            Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                                "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                            int cha = ch;
-
-                            ch = fallbackBuffer.InternalGetNextChar();
-
-                            if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                            {
-                                ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
-                                goto EncodeChar;
-                            }
-                            else if (ch > 0)
-                            {
-                                goto ProcessChar;
-                            }
-                            else
-                            {
-                                break;
-                            }
-                        }
-                    }
-
-                    // attempt to encode the partial surrogate (will fail or ignore)
-                    if (ch > 0 && (encoder == null || encoder.MustFlush))
-                        goto EncodeChar;
-
-                    // We're done
-                    break;
-                }
-
-                if (ch > 0)
-                {
-                    // We have a high surrogate left over from a previous loop.
-                    Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                        "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                    // use separate helper variables for local contexts so that the jit optimizations
-                    // won't get confused about the variable lifetimes
-                    int cha = *pSrc;
-
-                    // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
-                    // if (IsLowSurrogate(cha)) {
-                    if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                    {
-                        ch = cha + (ch << 10) +
-                            (0x10000
-                            - CharUnicodeInfo.LOW_SURROGATE_START
-                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
-
-                        pSrc++;
-                    }
-                    // else ch is still high surrogate and encoding will fail
-
-                    // attempt to encode the surrogate or partial surrogate
-                    goto EncodeChar;
-                }
-
-                // If we've used a fallback, then we have to check for it
-                if (fallbackBuffer != null)
-                {
-                    ch = fallbackBuffer.InternalGetNextChar();
-                    if (ch > 0) goto ProcessChar;
-                }
-
-                // read next char. The JIT optimization seems to be getting confused when
-                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-                ch = *pSrc;
-                pSrc++;
-
-            ProcessChar:
-                // if (IsHighSurrogate(ch)) {
-                if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
-                {
-                    continue;
-                }
-            // either good char or partial surrogate
-
-            EncodeChar:
-                // throw exception on partial surrogate if necessary
-                // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
-                if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                {
-                    // Lone surrogates aren't allowed, we have to do fallback for them
-                    // Have to make a fallback buffer if we don't have one
-                    if (fallbackBuffer == null)
-                    {
-                        // wait on fallbacks if we can
-                        // For fallback we may need a fallback buffer
-                        if (baseEncoder == null)
-                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
-                        else
-                            fallbackBuffer = baseEncoder.FallbackBuffer;
-
-                        // Set our internal fallback interesting things.
-                        fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
-                    }
-
-                    // Do our fallback.  Actually we already know its a mixed up surrogate,
-                    // so the ref pSrc isn't gonna do anything.
-                    pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
-                    fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
-                    pSrc = pSrcForFallback;
-
-                    // Ignore it if we don't throw
-                    ch = 0;
-                    continue;
-                }
-
-                // Count bytes needed
-                int bytesNeeded = 1;
-                if (ch > 0x7F)
-                {
-                    if (ch > 0x7FF)
-                    {
-                        if (ch > 0xFFFF)
-                        {
-                            bytesNeeded++;  // 4 bytes (surrogate pair)
-                        }
-                        bytesNeeded++;      // 3 bytes (800-FFFF)
-                    }
-                    bytesNeeded++;          // 2 bytes (80-7FF)
-                }
-
-                if (pTarget > pAllocatedBufferEnd - bytesNeeded)
-                {
-                    // Left over surrogate from last time will cause pSrc == chars, so we'll throw
-                    if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
-                    {
-                        fallbackBuffer.MovePrevious();              // Didn't use this fallback char
-                        if (ch > 0xFFFF)
-                            fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
-                    }
-                    else
-                    {
-                        pSrc--;                                     // Didn't use this char
-                        if (ch > 0xFFFF)
-                            pSrc--;                                 // Was surrogate, didn't use 2nd part either
-                    }
-                    Debug.Assert(pSrc >= chars || pTarget == bytes,
-                        "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
-                    ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
-                    ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
-                    break;
-                }
-
-                if (ch <= 0x7F)
-                {
-                    *pTarget = (byte)ch;
-                }
-                else
-                {
-                    // use separate helper variables for local contexts so that the jit optimizations
-                    // won't get confused about the variable lifetimes
-                    int chb;
-                    if (ch <= 0x7FF)
-                    {
-                        // 2 byte encoding
-                        chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
-                    }
-                    else
-                    {
-                        if (ch <= 0xFFFF)
-                        {
-                            chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
-                        }
-                        else
-                        {
-                            *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
-                            pTarget++;
-
-                            chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
-                        }
-                        *pTarget = (byte)chb;
-                        pTarget++;
-
-                        chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
-                    }
-                    *pTarget = (byte)chb;
-                    pTarget++;
-
-                    *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
-                }
-                pTarget++;
-
-
-#if FASTLOOP
-                // If still have fallback don't do fast loop
-                if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
-                    goto ProcessChar;
-
-                int availableChars = PtrDiff(pEnd, pSrc);
-                int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
-
-                // don't fall into the fast decoding loop if we don't have enough characters
-                // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
-                if (availableChars <= 13)
-                {
-                    // we are hoping for 1 byte per char
-                    if (availableBytes < availableChars)
-                    {
-                        // not enough output room.  no pending bits at this point
-                        ch = 0;
-                        continue;
-                    }
-
-                    // try to get over the remainder of the ascii characters fast though
-                    char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
-                    while (pSrc < pLocalEnd)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-
-                        // Not ASCII, need more than 1 byte per char
-                        if (ch > 0x7F)
-                            goto ProcessChar;
-
-                        *pTarget = (byte)ch;
-                        pTarget++;
-                    }
-                    // we are done, let ch be 0 to clear encoder
-                    ch = 0;
-                    break;
-                }
-
-                // we need at least 1 byte per character, but Convert might allow us to convert
-                // only part of the input, so try as much as we can.  Reduce charCount if necessary
-                if (availableBytes < availableChars)
-                {
-                    availableChars = availableBytes;
-                }
-
-                // FASTLOOP:
-                // - optimistic range checks
-                // - fallbacks to the slow loop for all special cases, exception throwing, etc.
+                // Don't care about the exact OperationStatus, just how much of the payload we were able
+                // to process.
 
-                // To compute the upper bound, assume that all characters are ASCII characters at this point,
-                //  the boundary will be decreased for every non-ASCII character we encounter
-                // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
-                // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
-                char* pStop = pSrc + availableChars - 5;
+                Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: decoder is null || decoder.MustFlush);
 
-                while (pSrc < pStop)
-                {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                    {
-                        goto LongCode;
-                    }
-                    *pTarget = (byte)ch;
-                    pTarget++;
-
-                    // get pSrc aligned
-                    if ((unchecked((int)pSrc) & 0x2) != 0)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-                        if (ch > 0x7F)
-                        {
-                            goto LongCode;
-                        }
-                        *pTarget = (byte)ch;
-                        pTarget++;
-                    }
-
-                    // Run 4 characters at a time!
-                    while (pSrc < pStop)
-                    {
-                        ch = *(int*)pSrc;
-                        int chc = *(int*)(pSrc + 2);
-                        if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
-                        {
-                            goto LongCodeWithMask;
-                        }
-
-                        // Unfortunately, this is endianess sensitive
-                        if (BitConverter.IsLittleEndian)
-                        {
-                            *pTarget = (byte)ch;
-                            *(pTarget + 1) = (byte)(ch >> 16);
-                            pSrc += 4;
-                            *(pTarget + 2) = (byte)chc;
-                            *(pTarget + 3) = (byte)(chc >> 16);
-                            pTarget += 4;
-                        }
-                        else
-                        {
-                            *pTarget = (byte)(ch>>16);
-                            *(pTarget+1) = (byte)ch;
-                            pSrc += 4;
-                            *(pTarget+2) = (byte)(chc>>16);
-                            *(pTarget+3) = (byte)chc;
-                            pTarget += 4;
-                        }
-                    }
-                    continue;
-
-                LongCodeWithMask:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch = (char)ch;
-                    }
-                    else
-                    {
-                        // be careful about the sign extension
-                        ch = (int)(((uint)ch) >> 16);
-                    }
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                    {
-                        goto LongCode;
-                    }
-                    *pTarget = (byte)ch;
-                    pTarget++;
-                    continue;
-
-                LongCode:
-                    // use separate helper variables for slow and fast loop so that the jit optimizations
-                    // won't get confused about the variable lifetimes
-                    int chd;
-                    if (ch <= 0x7FF)
-                    {
-                        // 2 byte encoding
-                        chd = unchecked((sbyte)0xC0) | (ch >> 6);
-                    }
-                    else
-                    {
-                        // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
-                        if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                        {
-                            // 3 byte encoding
-                            chd = unchecked((sbyte)0xE0) | (ch >> 12);
-                        }
-                        else
-                        {
-                            // 4 byte encoding - high surrogate + low surrogate
-                            // if (!IsHighSurrogate(ch))
-                            if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
-                            {
-                                // low without high -> bad, try again in slow loop
-                                pSrc -= 1;
-                                break;
-                            }
-
-                            chd = *pSrc;
-                            pSrc++;
-
-                            // if (!IsLowSurrogate(chd)) {
-                            if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
-                            {
-                                // high not followed by low -> bad, try again in slow loop
-                                pSrc -= 2;
-                                break;
-                            }
-
-                            ch = chd + (ch << 10) +
-                                (0x10000
-                                - CharUnicodeInfo.LOW_SURROGATE_START
-                                - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
-
-                            *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
-                            // pStop - this byte is compensated by the second surrogate character
-                            // 2 input chars require 4 output bytes.  2 have been anticipated already
-                            // and 2 more will be accounted for by the 2 pStop-- calls below.
-                            pTarget++;
-
-                            chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
-                        }
-                        *pTarget = (byte)chd;
-                        pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
-                        pTarget++;
-
-                        chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
-                    }
-                    *pTarget = (byte)chd;
-                    pStop--;                        // 2 byte sequence for 1 char so need pStop--.
-                    pTarget++;
-
-                    *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
-                    // pStop - this byte is already included
-                    pTarget++;
-                }
+                // Slice off how much we consumed / wrote.
 
-                Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
+                bytes = bytes.Slice(bytesRead);
+                chars = chars.Slice(charsWritten);
+            }
 
-#endif // FASTLOOP
+            // If we couldn't go through our fast fallback mechanism, or if we still have leftover
+            // data because the fast path above couldn't consume everything, we need to go down the
+            // slow fallback path.
 
-                // no pending char at this point
-                ch = 0;
+            if (bytes.IsEmpty)
+            {
+                return originalCharsLength - chars.Length; // total number of chars written
             }
-
-            // Do we have to set the encoder bytes?
-            if (encoder != null)
+            else
             {
-                Debug.Assert(!encoder.MustFlush || ch == 0,
-                    "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
-
-                encoder.surrogateChar = ch;
-                encoder._charsUsed = (int)(pSrc - chars);
+                return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder);
             }
-
-            Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
-                baseEncoder == null || !baseEncoder._throwOnOverflow,
-                "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
-
-            return (int)(pTarget - bytes);
         }
 
-
-        // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
-        // while the actual character is being built in the lower bits. They are shifted together
-        // with the actual bits of the character.
-
-        // bits 30 & 31 are used for pending bits fixup
-        private const int FinalByte = 1 << 29;
-        private const int SupplimentarySeq = 1 << 28;
-        private const int ThreeByteSeq = 1 << 27;
-
-        // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
-        //        If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+        // Returns a string containing the decoded representation of a range of
+        // bytes in a byte array.
         //
-        // To simplify maintenance, the structure of GetCharCount and GetChars should be
-        // kept the same as much as possible
-        internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
-        {
-            Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
-            Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
-
-            // Initialize stuff
-            byte* pSrc = bytes;
-            byte* pEnd = pSrc + count;
+        // All of our public Encodings that don't use EncodingNLS, as well as EncodingNLS itself, must have this.
+        // So if you fix this, fix the others.  Currently those include:
+        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+        // parent method is safe
 
-            // Start by assuming we have as many as count, charCount always includes the adjustment
-            // for the character being decoded
-            int charCount = count;
-            int ch = 0;
-            DecoderFallbackBuffer fallback = null;
+        public override unsafe string GetString(byte[] bytes, int index, int count)
+        {
+            // Validate Parameters
 
-            if (baseDecoder != null)
+            if (bytes is null)
             {
-                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
-                ch = decoder.bits;
-                charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
-
-                // Shouldn't have anything in fallback buffer for GetCharCount
-                // (don't have to check _throwOnOverflow for count)
-                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
-                    "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
+                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
             }
 
-            for (;;)
+            if ((index | count) < 0)
             {
-                // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
-                if (pSrc >= pEnd)
-                {
-                    break;
-                }
-
-                if (ch == 0)
-                {
-                    // no pending bits
-                    goto ReadChar;
-                }
-
-                // read next byte. The JIT optimization seems to be getting confused when
-                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-                int cha = *pSrc;
-                pSrc++;
-
-                // we are expecting to see trailing bytes like 10vvvvvv
-                if ((cha & unchecked((sbyte)0xC0)) != 0x80)
-                {
-                    // This can be a valid starting byte for another UTF8 byte sequence, so let's put
-                    // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
-                    pSrc--;
-                    charCount += (ch >> 30);
-                    goto InvalidByteSequence;
-                }
-
-                // fold in the new byte
-                ch = (ch << 6) | (cha & 0x3F);
-
-                if ((ch & FinalByte) == 0)
-                {
-                    Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
-                        "[UTF8Encoding.GetChars]Invariant volation");
-
-                    if ((ch & SupplimentarySeq) != 0)
-                    {
-                        if ((ch & (FinalByte >> 6)) != 0)
-                        {
-                            // this is 3rd byte (of 4 byte supplementary) - nothing to do
-                            continue;
-                        }
-
-                        // 2nd byte, check for non-shortest form of supplementary char and the valid
-                        // supplementary characters in range 0x010000 - 0x10FFFF at the same time
-                        if (!InRange(ch & 0x1F0, 0x10, 0x100))
-                        {
-                            goto InvalidByteSequence;
-                        }
-                    }
-                    else
-                    {
-                        // Must be 2nd byte of a 3-byte sequence
-                        // check for non-shortest form of 3 byte seq
-                        if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
-                            (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
-                        {
-                            goto InvalidByteSequence;
-                        }
-                    }
-                    continue;
-                }
-
-                // ready to punch
-
-                // adjust for surrogates in non-shortest form
-                if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
-                {
-                    charCount--;
-                }
-                goto EncodeChar;
-
-            InvalidByteSequence:
-                // this code fragment should be close to the goto referencing it
-                // Have to do fallback for invalid bytes
-                if (fallback == null)
-                {
-                    if (baseDecoder == null)
-                        fallback = this.decoderFallback.CreateFallbackBuffer();
-                    else
-                        fallback = baseDecoder.FallbackBuffer;
-                    fallback.InternalInitialize(bytes, null);
-                }
-                charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
-
-                ch = 0;
-                continue;
-
-            ReadChar:
-                ch = *pSrc;
-                pSrc++;
-
-            ProcessChar:
-                if (ch > 0x7F)
-                {
-                    // If its > 0x7F, its start of a new multi-byte sequence
-
-                    // Long sequence, so unreserve our char.
-                    charCount--;
-
-                    // bit 6 has to be non-zero for start of multibyte chars.
-                    if ((ch & 0x40) == 0)
-                    {
-                        // Unexpected trail byte
-                        goto InvalidByteSequence;
-                    }
-
-                    // start a new long code
-                    if ((ch & 0x20) != 0)
-                    {
-                        if ((ch & 0x10) != 0)
-                        {
-                            // 4 byte encoding - supplimentary character (2 surrogates)
-
-                            ch &= 0x0F;
-
-                            // check that bit 4 is zero and the valid supplimentary character
-                            // range 0x000000 - 0x10FFFF at the same time
-                            if (ch > 0x04)
-                            {
-                                ch |= 0xf0;
-                                goto InvalidByteSequence;
-                            }
-
-                            // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
-                            // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
-                            ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
-                                  (1 << 30) |           // If it dies on next byte we'll need an extra char
-                                  (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
-                                (SupplimentarySeq) | (SupplimentarySeq >> 6) |
-                                (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-
-                            // Our character count will be 2 characters for these 4 bytes, so subtract another char
-                            charCount--;
-                        }
-                        else
-                        {
-                            // 3 byte encoding
-                            // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
-                            ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
-                                (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-
-                            // We'll expect 1 character for these 3 bytes, so subtract another char.
-                            charCount--;
-                        }
-                    }
-                    else
-                    {
-                        // 2 byte encoding
-
-                        ch &= 0x1F;
-
-                        // check for non-shortest form
-                        if (ch <= 1)
-                        {
-                            ch |= 0xc0;
-                            goto InvalidByteSequence;
-                        }
-
-                        // Add bit flags so we'll be flagged correctly
-                        ch |= (FinalByte >> 6);
-                    }
-                    continue;
-                }
-
-            EncodeChar:
-
-#if FASTLOOP
-                int availableBytes = PtrDiff(pEnd, pSrc);
-
-                // don't fall into the fast decoding loop if we don't have enough bytes
-                if (availableBytes <= 13)
-                {
-                    // try to get over the remainder of the ascii characters fast though
-                    byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
-                    while (pSrc < pLocalEnd)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-
-                        if (ch > 0x7F)
-                            goto ProcessChar;
-                    }
-                    // we are done
-                    ch = 0;
-                    break;
-                }
-
-                // To compute the upper bound, assume that all characters are ASCII characters at this point,
-                //  the boundary will be decreased for every non-ASCII character we encounter
-                // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
-                byte* pStop = pSrc + availableBytes - 7;
-
-                while (pSrc < pStop)
-                {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                    {
-                        goto LongCode;
-                    }
-
-                    // get pSrc 2-byte aligned
-                    if ((unchecked((int)pSrc) & 0x1) != 0)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-                        if (ch > 0x7F)
-                        {
-                            goto LongCode;
-                        }
-                    }
-
-                    // get pSrc 4-byte aligned
-                    if ((unchecked((int)pSrc) & 0x2) != 0)
-                    {
-                        ch = *(ushort*)pSrc;
-                        if ((ch & 0x8080) != 0)
-                        {
-                            goto LongCodeWithMask16;
-                        }
-                        pSrc += 2;
-                    }
-
-                    // Run 8 + 8 characters at a time!
-                    while (pSrc < pStop)
-                    {
-                        ch = *(int*)pSrc;
-                        int chb = *(int*)(pSrc + 4);
-                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
-                        {
-                            goto LongCodeWithMask32;
-                        }
-                        pSrc += 8;
-
-                        // This is a really small loop - unroll it
-                        if (pSrc >= pStop)
-                            break;
-
-                        ch = *(int*)pSrc;
-                        chb = *(int*)(pSrc + 4);
-                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
-                        {
-                            goto LongCodeWithMask32;
-                        }
-                        pSrc += 8;
-                    }
-                    break;
-
-                LongCodeWithMask32:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch &= 0xFF;
-                    }
-                    else
-                    {
-                        // be careful about the sign extension
-                        ch = (int)(((uint)ch) >> 16);
-                    }
-                LongCodeWithMask16:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch &= 0xFF;
-                    }
-                    else
-                    {
-                        ch = (int)(((uint)ch) >> 8);
-                    }
-
-                    pSrc++;
-                    if (ch <= 0x7F)
-                    {
-                        continue;
-                    }
-
-                LongCode:
-                    int chc = *pSrc;
-                    pSrc++;
-
-                    if (
-                        // bit 6 has to be zero
-                        (ch & 0x40) == 0 ||
-                        // we are expecting to see trailing bytes like 10vvvvvv
-                        (chc & unchecked((sbyte)0xC0)) != 0x80)
-                    {
-                        goto BadLongCode;
-                    }
-
-                    chc &= 0x3F;
-
-                    // start a new long code
-                    if ((ch & 0x20) != 0)
-                    {
-                        // fold the first two bytes together
-                        chc |= (ch & 0x0F) << 6;
-
-                        if ((ch & 0x10) != 0)
-                        {
-                            // 4 byte encoding - surrogate
-                            ch = *pSrc;
-                            if (
-                                // check that bit 4 is zero, the non-shortest form of surrogate
-                                // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
-                                !InRange(chc >> 4, 0x01, 0x10) ||
-                                // we are expecting to see trailing bytes like 10vvvvvv
-                                (ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-
-                            chc = (chc << 6) | (ch & 0x3F);
-
-                            ch = *(pSrc + 1);
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            if ((ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-                            pSrc += 2;
-
-                            // extra byte
-                            charCount--;
-                        }
-                        else
-                        {
-                            // 3 byte encoding
-                            ch = *pSrc;
-                            if (
-                                // check for non-shortest form of 3 byte seq
-                                (chc & (0x1F << 5)) == 0 ||
-                                // Can't have surrogates here.
-                                (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
-                                // we are expecting to see trailing bytes like 10vvvvvv
-                                (ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-                            pSrc++;
-
-                            // extra byte
-                            charCount--;
-                        }
-                    }
-                    else
-                    {
-                        // 2 byte encoding
-
-                        // check for non-shortest form
-                        if ((ch & 0x1E) == 0)
-                        {
-                            goto BadLongCode;
-                        }
-                    }
-
-                    // extra byte
-                    charCount--;
-                }
-#endif // FASTLOOP
-
-                // no pending bits at this point
-                ch = 0;
-                continue;
-
-            BadLongCode:
-                pSrc -= 2;
-                ch = 0;
-                continue;
+                ThrowHelper.ThrowArgumentOutOfRangeException(
+                    argument: (index < 0) ? ExceptionArgument.index : ExceptionArgument.count,
+                    resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
             }
 
-            // May have a problem if we have to flush
-            if (ch != 0)
+            if (bytes.Length - index < count)
             {
-                // We were already adjusting for these, so need to un-adjust
-                charCount += (ch >> 30);
-                if (baseDecoder == null || baseDecoder.MustFlush)
-                {
-                    // Have to do fallback for invalid bytes
-                    if (fallback == null)
-                    {
-                        if (baseDecoder == null)
-                            fallback = this.decoderFallback.CreateFallbackBuffer();
-                        else
-                            fallback = baseDecoder.FallbackBuffer;
-                        fallback.InternalInitialize(bytes, null);
-                    }
-                    charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
-                }
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
             }
 
-            // Shouldn't have anything in fallback buffer for GetCharCount
-            // (don't have to check _throwOnOverflow for count)
-            Debug.Assert(fallback == null || fallback.Remaining == 0,
-                "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
+            // Avoid problems with empty input buffer
+            if (count == 0)
+                return string.Empty;
 
-            return charCount;
+            fixed (byte* pBytes = bytes)
+            {
+                return string.CreateStringFromEncoding(pBytes + index, count, this);
+            }
         }
 
-        // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
-        //           So if we're really broken, then that could also throw an error... recursively.
-        //           So try to make sure GetChars can at least process all uses by
-        //           System.Resources.ResourceReader!
         //
-        // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
-        //        If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+        // End of standard methods copied from EncodingNLS.cs
         //
-        // To simplify maintenance, the structure of GetCharCount and GetChars should be
-        // kept the same as much as possible
-        internal sealed override unsafe int GetChars(
-            byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder)
-        {
-            Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
-            Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
-            Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
-            Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
-
-            byte* pSrc = bytes;
-            char* pTarget = chars;
-
-            byte* pEnd = pSrc + byteCount;
-            char* pAllocatedBufferEnd = pTarget + charCount;
-
-            int ch = 0;
-
-            DecoderFallbackBuffer fallback = null;
-            byte* pSrcForFallback;
-            char* pTargetForFallback;
-            if (baseDecoder != null)
-            {
-                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
-                ch = decoder.bits;
-
-                // Shouldn't have anything in fallback buffer for GetChars
-                // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty)
-                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
-                    "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
-            }
 
-            for (;;)
-            {
-                // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
-                if (pSrc >= pEnd)
-                {
-                    break;
-                }
-
-                if (ch == 0)
-                {
-                    // no pending bits
-                    goto ReadChar;
-                }
-
-                // read next byte. The JIT optimization seems to be getting confused when
-                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-                int cha = *pSrc;
-                pSrc++;
-
-                // we are expecting to see trailing bytes like 10vvvvvv
-                if ((cha & unchecked((sbyte)0xC0)) != 0x80)
-                {
-                    // This can be a valid starting byte for another UTF8 byte sequence, so let's put
-                    // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
-                    pSrc--;
-                    goto InvalidByteSequence;
-                }
-
-                // fold in the new byte
-                ch = (ch << 6) | (cha & 0x3F);
-
-                if ((ch & FinalByte) == 0)
-                {
-                    // Not at last byte yet
-                    Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
-                        "[UTF8Encoding.GetChars]Invariant volation");
-
-                    if ((ch & SupplimentarySeq) != 0)
-                    {
-                        // Its a 4-byte supplimentary sequence
-                        if ((ch & (FinalByte >> 6)) != 0)
-                        {
-                            // this is 3rd byte of 4 byte sequence - nothing to do
-                            continue;
-                        }
-
-                        // 2nd byte of 4 bytes
-                        // check for non-shortest form of surrogate and the valid surrogate
-                        // range 0x000000 - 0x10FFFF at the same time
-                        if (!InRange(ch & 0x1F0, 0x10, 0x100))
-                        {
-                            goto InvalidByteSequence;
-                        }
-                    }
-                    else
-                    {
-                        // Must be 2nd byte of a 3-byte sequence
-                        // check for non-shortest form of 3 byte seq
-                        if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
-                            (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
-                        {
-                            goto InvalidByteSequence;
-                        }
-                    }
-                    continue;
-                }
-
-                // ready to punch
-
-                // surrogate in shortest form?
-                // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
-                if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
-                {
-                    // let the range check for the second char throw the exception
-                    if (pTarget < pAllocatedBufferEnd)
-                    {
-                        *pTarget = (char)(((ch >> 10) & 0x7FF) +
-                            unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
-                        pTarget++;
-
-                        ch = (ch & 0x3FF) +
-                            unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
-                    }
-                }
-
-                goto EncodeChar;
-
-            InvalidByteSequence:
-                // this code fragment should be close to the gotos referencing it
-                // Have to do fallback for invalid bytes
-                if (fallback == null)
-                {
-                    if (baseDecoder == null)
-                        fallback = this.decoderFallback.CreateFallbackBuffer();
-                    else
-                        fallback = baseDecoder.FallbackBuffer;
-                    fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
-                }
-                // That'll back us up the appropriate # of bytes if we didn't get anywhere
-                pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
-                pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
-                bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
-                pSrc = pSrcForFallback;
-                pTarget = pTargetForFallback;
-
-                if (!fallbackResult)
-                {
-                    // Ran out of buffer space
-                    // Need to throw an exception?
-                    Debug.Assert(pSrc >= bytes || pTarget == chars,
-                        "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
-                    fallback.InternalReset();
-                    ThrowCharsOverflow(baseDecoder, pTarget == chars);
-                    ch = 0;
-                    break;
-                }
-                Debug.Assert(pSrc >= bytes,
-                    "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
-                ch = 0;
-                continue;
-
-            ReadChar:
-                ch = *pSrc;
-                pSrc++;
-
-            ProcessChar:
-                if (ch > 0x7F)
-                {
-                    // If its > 0x7F, its start of a new multi-byte sequence
-
-                    // bit 6 has to be non-zero
-                    if ((ch & 0x40) == 0)
-                    {
-                        goto InvalidByteSequence;
-                    }
-
-                    // start a new long code
-                    if ((ch & 0x20) != 0)
-                    {
-                        if ((ch & 0x10) != 0)
-                        {
-                            // 4 byte encoding - supplimentary character (2 surrogates)
-
-                            ch &= 0x0F;
-
-                            // check that bit 4 is zero and the valid supplimentary character
-                            // range 0x000000 - 0x10FFFF at the same time
-                            if (ch > 0x04)
-                            {
-                                ch |= 0xf0;
-                                goto InvalidByteSequence;
-                            }
-
-                            ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
-                                (SupplimentarySeq) | (SupplimentarySeq >> 6) |
-                                (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-                        }
-                        else
-                        {
-                            // 3 byte encoding
-                            ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
-                                (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-                        }
-                    }
-                    else
-                    {
-                        // 2 byte encoding
-
-                        ch &= 0x1F;
-
-                        // check for non-shortest form
-                        if (ch <= 1)
-                        {
-                            ch |= 0xc0;
-                            goto InvalidByteSequence;
-                        }
-
-                        ch |= (FinalByte >> 6);
-                    }
-                    continue;
-                }
-
-            EncodeChar:
-                // write the pending character
-                if (pTarget >= pAllocatedBufferEnd)
-                {
-                    // Fix chars so we make sure to throw if we didn't output anything
-                    ch &= 0x1fffff;
-                    if (ch > 0x7f)
-                    {
-                        if (ch > 0x7ff)
-                        {
-                            if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
-                                ch <= CharUnicodeInfo.LOW_SURROGATE_END)
-                            {
-                                pSrc--;     // It was 4 bytes
-                                pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
-                            }
-                            else if (ch > 0xffff)
-                            {
-                                pSrc--;     // It was 4 bytes, nothing was stored
-                            }
-                            pSrc--;         // It was at least 3 bytes
-                        }
-                        pSrc--;             // It was at least 2 bytes
-                    }
-                    pSrc--;
-
-                    // Throw that we don't have enough room (pSrc could be < chars if we had started to process
-                    // a 4 byte sequence already)
-                    Debug.Assert(pSrc >= bytes || pTarget == chars,
-                        "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
-                    ThrowCharsOverflow(baseDecoder, pTarget == chars);
-
-                    // Don't store ch in decoder, we already backed up to its start
-                    ch = 0;
-
-                    // Didn't throw, just use this buffer size.
-                    break;
-                }
-                *pTarget = (char)ch;
-                pTarget++;
-
-#if FASTLOOP
-                int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
-                int availableBytes = PtrDiff(pEnd, pSrc);
-
-                // don't fall into the fast decoding loop if we don't have enough bytes
-                // Test for availableChars is done because pStop would be <= pTarget.
-                if (availableBytes <= 13)
-                {
-                    // we may need as many as 1 character per byte
-                    if (availableChars < availableBytes)
-                    {
-                        // not enough output room.  no pending bits at this point
-                        ch = 0;
-                        continue;
-                    }
-
-                    // try to get over the remainder of the ascii characters fast though
-                    byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
-                    while (pSrc < pLocalEnd)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-
-                        if (ch > 0x7F)
-                            goto ProcessChar;
-
-                        *pTarget = (char)ch;
-                        pTarget++;
-                    }
-                    // we are done
-                    ch = 0;
-                    break;
-                }
-
-                // we may need as many as 1 character per byte, so reduce the byte count if necessary.
-                // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
-                if (availableChars < availableBytes)
-                {
-                    availableBytes = availableChars;
-                }
-
-                // To compute the upper bound, assume that all characters are ASCII characters at this point,
-                //  the boundary will be decreased for every non-ASCII character we encounter
-                // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
-                char* pStop = pTarget + availableBytes - 7;
-
-                while (pTarget < pStop)
-                {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                    {
-                        goto LongCode;
-                    }
-                    *pTarget = (char)ch;
-                    pTarget++;
-
-                    // get pSrc to be 2-byte aligned
-                    if ((unchecked((int)pSrc) & 0x1) != 0)
-                    {
-                        ch = *pSrc;
-                        pSrc++;
-                        if (ch > 0x7F)
-                        {
-                            goto LongCode;
-                        }
-                        *pTarget = (char)ch;
-                        pTarget++;
-                    }
-
-                    // get pSrc to be 4-byte aligned
-                    if ((unchecked((int)pSrc) & 0x2) != 0)
-                    {
-                        ch = *(ushort*)pSrc;
-                        if ((ch & 0x8080) != 0)
-                        {
-                            goto LongCodeWithMask16;
-                        }
-
-                        // Unfortunately, this is endianess sensitive
-                        if (BitConverter.IsLittleEndian)
-                        {
-                            *pTarget = (char)(ch & 0x7F);
-                            pSrc += 2;
-                            *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
-                            pTarget += 2;
-                        }
-                        else
-                        {
-                            *pTarget = (char)((ch >> 8) & 0x7F);
-                            pSrc += 2;
-                            *(pTarget+1) = (char)(ch & 0x7F);
-                            pTarget += 2;
-                        }
-                    }
-
-                    // Run 8 characters at a time!
-                    while (pTarget < pStop)
-                    {
-                        ch = *(int*)pSrc;
-                        int chb = *(int*)(pSrc + 4);
-                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
-                        {
-                            goto LongCodeWithMask32;
-                        }
-
-                        // Unfortunately, this is endianess sensitive
-                        if (BitConverter.IsLittleEndian)
-                        {
-                            *pTarget = (char)(ch & 0x7F);
-                            *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
-                            *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
-                            *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
-                            pSrc += 8;
-                            *(pTarget + 4) = (char)(chb & 0x7F);
-                            *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
-                            *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
-                            *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
-                            pTarget += 8;
-                        }
-                        else
-                        {
-                            *pTarget = (char)((ch >> 24) & 0x7F);
-                            *(pTarget+1) = (char)((ch >> 16) & 0x7F);
-                            *(pTarget+2) = (char)((ch >> 8) & 0x7F);
-                            *(pTarget+3) = (char)(ch & 0x7F);
-                            pSrc += 8;
-                            *(pTarget+4) = (char)((chb >> 24) & 0x7F);
-                            *(pTarget+5) = (char)((chb >> 16) & 0x7F);
-                            *(pTarget+6) = (char)((chb >> 8) & 0x7F);
-                            *(pTarget+7) = (char)(chb & 0x7F);
-                            pTarget += 8;
-                        }
-                    }
-                    break;
-
-                LongCodeWithMask32:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch &= 0xFF;
-                    }
-                    else
-                    {
-                        // be careful about the sign extension
-                        ch = (int)(((uint)ch) >> 16);
-                    }
-                LongCodeWithMask16:
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        ch &= 0xFF;
-                    }
-                    else
-                    {
-                        ch = (int)(((uint)ch) >> 8);
-                    }
-                    pSrc++;
-                    if (ch <= 0x7F)
-                    {
-                        *pTarget = (char)ch;
-                        pTarget++;
-                        continue;
-                    }
-
-                LongCode:
-                    int chc = *pSrc;
-                    pSrc++;
-
-                    if (
-                        // bit 6 has to be zero
-                        (ch & 0x40) == 0 ||
-                        // we are expecting to see trailing bytes like 10vvvvvv
-                        (chc & unchecked((sbyte)0xC0)) != 0x80)
-                    {
-                        goto BadLongCode;
-                    }
-
-                    chc &= 0x3F;
-
-                    // start a new long code
-                    if ((ch & 0x20) != 0)
-                    {
-                        // fold the first two bytes together
-                        chc |= (ch & 0x0F) << 6;
-
-                        if ((ch & 0x10) != 0)
-                        {
-                            // 4 byte encoding - surrogate
-                            ch = *pSrc;
-                            if (
-                                // check that bit 4 is zero, the non-shortest form of surrogate
-                                // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
-                                !InRange(chc >> 4, 0x01, 0x10) ||
-                                // we are expecting to see trailing bytes like 10vvvvvv
-                                (ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-
-                            chc = (chc << 6) | (ch & 0x3F);
-
-                            ch = *(pSrc + 1);
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            if ((ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-                            pSrc += 2;
-
-                            ch = (chc << 6) | (ch & 0x3F);
-
-                            *pTarget = (char)(((ch >> 10) & 0x7FF) +
-                                unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
-                            pTarget++;
-
-                            ch = (ch & 0x3FF) +
-                                unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
-
-                            // extra byte, we're already planning 2 chars for 2 of these bytes,
-                            // but the big loop is testing the target against pStop, so we need
-                            // to subtract 2 more or we risk overrunning the input.  Subtract
-                            // one here and one below.
-                            pStop--;
-                        }
-                        else
-                        {
-                            // 3 byte encoding
-                            ch = *pSrc;
-                            if (
-                                // check for non-shortest form of 3 byte seq
-                                (chc & (0x1F << 5)) == 0 ||
-                                // Can't have surrogates here.
-                                (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
-                                // we are expecting to see trailing bytes like 10vvvvvv
-                                (ch & unchecked((sbyte)0xC0)) != 0x80)
-                            {
-                                goto BadLongCode;
-                            }
-                            pSrc++;
-
-                            ch = (chc << 6) | (ch & 0x3F);
-
-                            // extra byte, we're only expecting 1 char for each of these 3 bytes,
-                            // but the loop is testing the target (not source) against pStop, so
-                            // we need to subtract 2 more or we risk overrunning the input.
-                            // Subtract 1 here and one more below
-                            pStop--;
-                        }
-                    }
-                    else
-                    {
-                        // 2 byte encoding
-
-                        ch &= 0x1F;
-
-                        // check for non-shortest form
-                        if (ch <= 1)
-                        {
-                            goto BadLongCode;
-                        }
-                        ch = (ch << 6) | chc;
-                    }
-
-                    *pTarget = (char)ch;
-                    pTarget++;
-
-                    // extra byte, we're only expecting 1 char for each of these 2 bytes,
-                    // but the loop is testing the target (not source) against pStop.
-                    // subtract an extra count from pStop so that we don't overrun the input.
-                    pStop--;
-                }
-#endif // FASTLOOP
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe int GetCharCountCommon(byte* pBytes, int byteCount)
+        {
+            // Common helper method for all non-DecoderNLS entry points to GetCharCount.
+            // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
 
-                Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
+            Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+            Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
 
-                // no pending bits at this point
-                ch = 0;
-                continue;
+            // First call into the fast path.
+            // Don't bother providing a fallback mechanism; our fast path doesn't use it.
 
-            BadLongCode:
-                pSrc -= 2;
-                ch = 0;
-                continue;
-            }
+            int totalCharCount = GetCharCountFast(pBytes, byteCount, fallback: null, out int bytesConsumed);
 
-            if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
+            if (bytesConsumed != byteCount)
             {
-                // Have to do fallback for invalid bytes
-                if (fallback == null)
-                {
-                    if (baseDecoder == null)
-                        fallback = this.decoderFallback.CreateFallbackBuffer();
-                    else
-                        fallback = baseDecoder.FallbackBuffer;
-                    fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
-                }
-
-                // That'll back us up the appropriate # of bytes if we didn't get anywhere
-                pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
-                pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
-                bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
-                pSrc = pSrcForFallback;
-                pTarget = pTargetForFallback;
+                // If there's still data remaining in the source buffer, go down the fallback path.
+                // We need to check for integer overflow since the fallback could change the required
+                // output count in unexpected ways.
 
-                if (!fallbackResult)
+                totalCharCount += GetCharCountWithFallback(pBytes, byteCount, bytesConsumed);
+                if (totalCharCount < 0)
                 {
-                    Debug.Assert(pSrc >= bytes || pTarget == chars,
-                        "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
-
-                    // Ran out of buffer space
-                    // Need to throw an exception?
-                    fallback.InternalReset();
-                    ThrowCharsOverflow(baseDecoder, pTarget == chars);
+                    ThrowConversionOverflow();
                 }
-                Debug.Assert(pSrc >= bytes,
-                    "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
-                ch = 0;
             }
 
-            if (baseDecoder != null)
-            {
-                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
+            return totalCharCount;
+        }
 
-                // If we're storing flush data we expect all bits to be used or else
-                // we're stuck in the middle of a conversion
-                Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow,
-                    "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
+        private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed)
+        {
+            // The number of UTF-16 code units will never exceed the number of UTF-8 code units,
+            // so the addition at the end of this method will not overflow.
 
-                // Remember our leftover bits.
-                decoder.bits = ch;
+            byte* ptrToFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16CodeUnitCountAdjustment, out _);
 
-                baseDecoder._bytesUsed = (int)(pSrc - bytes);
-            }
+            int tempBytesConsumed = (int)(ptrToFirstInvalidByte - pBytes);
+            bytesConsumed = tempBytesConsumed;
 
-            // Shouldn't have anything in fallback buffer for GetChars
-            // (don't have to check _throwOnOverflow for chars)
-            Debug.Assert(fallback == null || fallback.Remaining == 0,
-                "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
-
-            return PtrDiff(pTarget, chars);
+            return tempBytesConsumed + utf16CodeUnitCountAdjustment;
         }
 
-        // During GetChars we had an invalid byte sequence
-        // pSrc is backed up to the start of the bad sequence if we didn't have room to
-        // fall it back.  Otherwise pSrc remains where it is.
-        private unsafe bool FallbackInvalidByteSequence(
-            ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
+        public override Decoder GetDecoder()
         {
-            // Get our byte[]
-            byte* pStart = pSrc;
-            byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
-
-            // Do the actual fallback
-            if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
-            {
-                // Oops, it failed, back up to pStart
-                pSrc = pStart;
-                return false;
-            }
-
-            // It worked
-            return true;
+            return new DecoderNLS(this);
         }
 
-        // During GetCharCount we had an invalid byte sequence
-        // pSrc is used to find the index that points to the invalid bytes,
-        // however the byte[] contains the fallback bytes (in case the index is -1)
-        private unsafe int FallbackInvalidByteSequence(
-            byte* pSrc, int ch, DecoderFallbackBuffer fallback)
+
+        public override Encoder GetEncoder()
         {
-            // Calling GetBytesUnknown can adjust the pSrc pointer but we need to pass the pointer before the adjustment
-            // to fallback.InternalFallback. The input pSrc to fallback.InternalFallback will only be used to calculate the
-            // index inside bytesUnknown and if we pass the adjusted pointer we can end up with negative index values.
-            // We store the original pSrc in pOriginalSrc and then pass pOriginalSrc to fallback.InternalFallback.
-            byte* pOriginalSrc = pSrc;
-
-            // Get our byte[]
-            byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
-
-            // Do the actual fallback
-            int count = fallback.InternalFallback(bytesUnknown, pOriginalSrc);
-
-            // # of fallback chars expected.
-            // Note that we only get here for "long" sequences, and have already unreserved
-            // the count that we prereserved for the input bytes
-            return count;
+            return new EncoderNLS(this);
         }
 
-        // Note that some of these bytes may have come from a previous fallback, so we cannot
-        // just decrement the pointer and use the values we read.  In those cases we have
-        // to regenerate the original values.
-        private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
-        {
-            // Get our byte[]
-            byte[] bytesUnknown = null;
+        //
+        // Beginning of methods used by shared fallback logic.
+        //
 
-            // See if it was a plain char
-            // (have to check >= 0 because we have all sorts of wierd bit flags)
-            if (ch < 0x100 && ch >= 0)
-            {
-                pSrc--;
-                bytesUnknown = new byte[] { unchecked((byte)ch) };
-            }
-            // See if its an unfinished 2 byte sequence
-            else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
-            {
-                pSrc--;
-                bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
-            }
-            // So now we're either 2nd byte of 3 or 4 byte sequence or
-            // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
-            // 1st check if its a 4 byte sequence
-            else if ((ch & SupplimentarySeq) != 0)
-            {
-                //  3rd byte of 4 byte sequence?
-                if ((ch & (FinalByte >> 6)) != 0)
-                {
-                    // 3rd byte of 4 byte sequence
-                    pSrc -= 3;
-                    bytesUnknown = new byte[] {
-                        unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
-                        unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
-                        unchecked((byte)(((ch) & 0x3F) | 0x80)) };
-                }
-                else if ((ch & (FinalByte >> 12)) != 0)
-                {
-                    // 2nd byte of a 4 byte sequence
-                    pSrc -= 2;
-                    bytesUnknown = new byte[] {
-                        unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
-                        unchecked((byte)(((ch) & 0x3F) | 0x80)) };
-                }
-                else
-                {
-                    // 4th byte of a 4 byte sequence
-                    pSrc--;
-                    bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
-                }
-            }
-            else
-            {
-                // 2nd byte of 3 byte sequence?
-                if ((ch & (FinalByte >> 6)) != 0)
-                {
-                    // So its 2nd byte of a 3 byte sequence
-                    pSrc -= 2;
-                    bytesUnknown = new byte[] {
-                        unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
-                }
-                else
-                {
-                    // 1st byte of a 3 byte sequence
-                    pSrc--;
-                    bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
-                }
-            }
+        internal sealed override bool TryGetByteCount(Rune value, out int byteCount)
+        {
+            // All well-formed Rune instances can be converted to 1..4 UTF-8 code units.
 
-            return bytesUnknown;
+            byteCount = value.Utf8SequenceLength;
+            return true;
         }
 
-
-        public override Decoder GetDecoder()
+        internal sealed override OperationStatus EncodeRune(Rune value, Span<byte> bytes, out int bytesWritten)
         {
-            return new UTF8Decoder(this);
-        }
+            // All well-formed Rune instances can be encoded as 1..4 UTF-8 code units.
+            // If there's an error, it's because the destination was too small.
 
+            return value.TryEncodeToUtf8(bytes, out bytesWritten) ? OperationStatus.Done : OperationStatus.DestinationTooSmall;
+        }
 
-        public override Encoder GetEncoder()
+        internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
         {
-            return new UTF8Encoder(this);
+            return Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
         }
 
+        //
+        // End of methods used by shared fallback logic.
+        //
 
         public override int GetMaxByteCount(int charCount)
         {
@@ -2571,62 +858,5 @@ namespace System.Text
             return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
                    UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
         }
-
-        private sealed class UTF8Encoder : EncoderNLS
-        {
-            // We must save a high surrogate value until the next call, looking
-            // for a low surrogate value.
-            internal int surrogateChar;
-
-            public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
-            {
-                // base calls reset
-            }
-
-            public override void Reset()
-
-            {
-                this.surrogateChar = 0;
-                if (_fallbackBuffer != null)
-                    _fallbackBuffer.Reset();
-            }
-
-            // Anything left in our encoder?
-            internal override bool HasState
-            {
-                get
-                {
-                    return (this.surrogateChar != 0);
-                }
-            }
-        }
-
-        private sealed class UTF8Decoder : DecoderNLS
-        {
-            // We'll need to remember the previous information. See the comments around definition
-            // of FinalByte for details.
-            internal int bits;
-
-            public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
-            {
-                // base calls reset
-            }
-
-            public override void Reset()
-            {
-                this.bits = 0;
-                if (_fallbackBuffer != null)
-                    _fallbackBuffer.Reset();
-            }
-
-            // Anything left in our decoder?
-            internal override bool HasState
-            {
-                get
-                {
-                    return (this.bits != 0);
-                }
-            }
-        }
     }
 }
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs
new file mode 100644
index 0000000000..878e593e3d
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs
@@ -0,0 +1,379 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Numerics;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf16Utility
+    {
+        // Returns &pInputBuffer[inputLength] if the input buffer is valid.
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+        /// </remarks>
+        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            // First, we'll handle the common case of all-ASCII. If this is able to
+            // consume the entire buffer, we'll skip the remainder of this method's logic.
+
+            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
+            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
+
+            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
+            inputLength -= numAsciiCharsConsumedJustNow;
+
+            if (inputLength == 0)
+            {
+                utf8CodeUnitCountAdjustment = 0;
+                scalarCountAdjustment = 0;
+                return pInputBuffer;
+            }
+
+            // If we got here, it means we saw some non-ASCII data, so within our
+            // vectorized code paths below we'll handle all non-surrogate UTF-16
+            // code points branchlessly. We'll only branch if we see surrogates.
+            // 
+            // We still optimistically assume the data is mostly ASCII. This means that the
+            // number of UTF-8 code units and the number of scalars almost matches the number
+            // of UTF-16 code units. As we go through the input and find non-ASCII
+            // characters, we'll keep track of these "adjustment" fixups. To get the
+            // total number of UTF-8 code units required to encode the input data, add
+            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
+            // seen.  To get the total number of scalars present in the input data,
+            // add the scalar count adjustment to the number of UTF-16 code units seen.
+
+            long tempUtf8CodeUnitCountAdjustment = 0;
+            int tempScalarCountAdjustment = 0;
+
+            if (Sse2.IsSupported)
+            {
+                if (inputLength >= Vector128<ushort>.Count)
+                {
+                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
+                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
+                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
+                    Vector128<ushort> vectorZero = Vector128<ushort>.Zero;
+
+                    do
+                    {
+                        Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
+                        uint mask;
+
+                        Vector128<ushort> charIsNonAscii;
+                        if (Sse41.IsSupported)
+                        {
+                            // sets 0x0080 bit if corresponding char element is >= 0x0080
+                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
+                        }
+                        else
+                        {
+                            // sets 0x8000 bit if corresponding char element is >= 0x0080
+                            charIsNonAscii = Sse2.AndNot(vector0080, Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 7)));
+                        }
+
+                        // sets 0x8080 bits if corresponding char element is >= 0x0800
+                        Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
+
+                        mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
+
+                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
+                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
+                        //
+                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
+                        //
+                        //            ,-- set if char[1] is non-ASCII
+                        //            |   ,-- set if char[0] is non-ASCII
+                        //            v   v
+                        // mask = ... 1 1 1 0
+                        //              ^   ^-- set if char[0] is >= 0x0800
+                        //              `-- set if char[1] is >= 0x0800
+                        //
+                        // (If the SSE4.1 code path is taken above, the meanings of the odd and even
+                        // bits are swapped, but the logic below otherwise holds.)
+                        //
+                        // This means we can popcnt the number of set bits, and the result is the
+                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
+                        // it expands. This results in the wrong count for UTF-16 surrogate code
+                        // units (we just counted that each individual code unit expands to 3 bytes,
+                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
+                        // We'll handle this in just a moment.
+                        //
+                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
+                        // cumulative UTF-8 adjustment factor once we determine that there are no
+                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
+                        // our computed result and we'd have to throw it away.)
+
+                        uint popcnt = (uint)BitOperations.PopCount(mask);
+
+                        // Surrogates need to be special-cased for two reasons: (a) we need
+                        // to account for the fact that we over-counted in the addition above;
+                        // and (b) they require separate validation.
+
+                        utf16Data = Sse2.Add(utf16Data, vectorA800);
+                        mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
+
+                        if (mask != 0)
+                        {
+                            // There's at least one UTF-16 surrogate code unit present.
+                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
+                            // the resulting bits of 'mask' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
+                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
+                            //
+                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
+                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
+                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
+                            // our surrogate code units will now have the bit pattern [ 10000q## ######## ].
+                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
+                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
+                            // determine whether a given char was a high or a low surrogate.
+                            //
+                            // Therefore the resulting bits of 'mask2' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
+                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
+                            // - ## (garbage) if the char was not a surrogate code unit; always AND with 'mask' (00 there) to discard it.
+
+                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
+
+                            uint lowSurrogatesMask = mask2 & mask; // 01 only if was a low surrogate char, else 00
+                            uint highSurrogatesMask = (mask2 ^ 0x5555u) & mask; // flip even bits (00 <-> 01), then drop non-surrogate garbage: 01 only if was a high surrogate char, else 00
+
+                            // Now check that each high surrogate is followed by a low surrogate and that each
+                            // low surrogate follows a high surrogate. We make an exception for the case where
+                            // the final char of the vector is a high surrogate, since we can't perform validation
+                            // on it until the next iteration of the loop when we hope to consume the matching
+                            // low surrogate.
+
+                            highSurrogatesMask <<= 2;
+                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
+                            {
+                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                            }
+
+                            if (highSurrogatesMask > ushort.MaxValue)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
+                                popcnt -= 2; // the final char contributed two set bits ('0xC000') to the original 16-bit mask; back them out since the char isn't consumed here
+                                pInputBuffer--;
+                                inputLength++;
+                            }
+
+                            int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= surrogatePairsCount;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
+                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
+                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
+                            // perform this adjustment now.
+
+                            nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt;
+                        pInputBuffer += Vector128<ushort>.Count;
+                        inputLength -= Vector128<ushort>.Count;
+                    } while (inputLength >= Vector128<ushort>.Count);
+                }
+            }
+            else if (Vector.IsHardwareAccelerated)
+            {
+                if (inputLength >= Vector<ushort>.Count)
+                {
+                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
+                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
+                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
+                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
+
+                    do
+                    {
+                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
+                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
+                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
+                        // vectors, each element of the sum will contain one of three values:
+                        //
+                        // 0x0000 ( 0) = original char was 0000..007F
+                        // 0xFFFF (-1) = original char was 0080..07FF
+                        // 0xFFFE (-2) = original char was 0800..FFFF
+                        //
+                        // We'll negate them to produce a value 0..2 for each element, then sum all the
+                        // elements together to produce the number of *additional* UTF-8 code units
+                        // required to represent this UTF-16 data. This is similar to the popcnt step
+                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
+                        // handle that shortly.
+
+                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
+                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
+                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
+                        Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+
+                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
+                        // which should halve the number of operations we must perform.
+
+                        nuint popcnt = 0;
+                        for (int i = 0; i < Vector<nuint>.Count; i++)
+                        {
+                            popcnt += sumVector[i];
+                        }
+
+                        uint popcnt32 = (uint)popcnt;
+                        if (IntPtr.Size == 8)
+                        {
+                            popcnt32 += (uint)(popcnt >> 32);
+                        }
+
+                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
+                        // know there aren't any unpaired surrogates in the input data.
+
+                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
+
+                        // Now check for surrogates.
+
+                        utf16Data -= vectorD800;
+                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
+                        if (surrogateChars != Vector<ushort>.Zero)
+                        {
+                            // There's at least one surrogate (high or low) UTF-16 code unit in
+                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
+                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
+                            // UTF-16 code unit was a high or low surrogate, respectively.
+
+                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
+                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
+
+                            // We want to make sure that each high surrogate code unit is followed by
+                            // a low surrogate code unit and each low surrogate code unit follows a
+                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
+                            // or palignr available to us, we'll do this as a loop. We won't look at
+                            // the very last high surrogate char element since we don't yet know if
+                            // the next vector read will have a low surrogate char element.
+
+                            ushort surrogatePairsCount = 0;
+                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
+                            {
+                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
+                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
+                                {
+                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                                }
+                            }
+
+                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                pInputBuffer--;
+                                inputLength++;
+                                popcnt32 -= 2;
+                            }
+
+                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
+                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
+                            // so we'll adjust this now.
+
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt32;
+                        pInputBuffer += Vector<ushort>.Count;
+                        inputLength -= Vector<ushort>.Count;
+                    } while (inputLength >= Vector<ushort>.Count);
+                }
+            }
+
+        NonVectorizedLoop:
+
+            // Vectorization isn't supported on our current platform, or the input was too small to benefit
+            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
+            // drain remaining valid chars before we report failure.
+
+            for (; inputLength > 0; pInputBuffer++, inputLength--)
+            {
+                uint thisChar = pInputBuffer[0];
+                if (thisChar <= 0x7F)
+                {
+                    continue;
+                }
+
+                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
+                // This optimistically assumes no surrogates, which we'll handle shortly.
+
+                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
+
+                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    continue;
+                }
+
+                // Found a surrogate char. Back out the adjustment we made above, then
+                // try to consume the entire surrogate pair all at once. We won't bother
+                // trying to interpret the surrogate pair as a scalar value; we'll only
+                // validate that its bit pattern matches what's expected for a surrogate pair.
+
+                tempUtf8CodeUnitCountAdjustment -= 2;
+
+                if (inputLength == 1)
+                {
+                    goto Error; // input buffer too small to read a surrogate pair
+                }
+
+                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
+                {
+                    goto Error; // not a well-formed surrogate pair
+                }
+
+                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
+                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
+
+                pInputBuffer++; // consumed one extra char
+                inputLength--;
+            }
+
+        Error:
+
+            // Also used for normal return.
+
+            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
+            scalarCountAdjustment = tempScalarCountAdjustment;
+            return pInputBuffer;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs
index bed39057e4..828776b436 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs
@@ -5,7 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Diagnostics;
 
-namespace System.Text
+namespace System.Text.Unicode
 {
     internal static partial class Utf16Utility
     {
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
index 6c8197d22b..b4cae379e2 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
@@ -4,6 +4,8 @@
 
 using System.Buffers;
 using System.Diagnostics;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
 
 namespace System.Text.Unicode
 {
@@ -37,79 +39,87 @@ namespace System.Text.Unicode
         /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
         /// this method will not return <see cref="OperationStatus.InvalidData"/>.
         /// </remarks>
-        public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+        public static unsafe OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
         {
-            int originalSourceLength = source.Length;
-            int originalDestinationLength = destination.Length;
-            OperationStatus status = OperationStatus.Done;
+            // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437
 
-            // In a loop, this is going to read and transcode one scalar value at a time
-            // from the source to the destination.
+            _ = source.Length;
+            _ = destination.Length;
 
-            while (!source.IsEmpty)
+            fixed (char* pOriginalSource = &MemoryMarshal.GetReference(source))
+            fixed (byte* pOriginalDestination = &MemoryMarshal.GetReference(destination))
             {
-                status = Rune.DecodeFromUtf16(source, out Rune firstScalarValue, out int charsConsumed);
+                // We're going to bulk transcode as much as we can in a loop, iterating
+                // every time we see bad data that requires replacement.
 
-                switch (status)
+                OperationStatus operationStatus = OperationStatus.Done;
+                char* pInputBufferRemaining = pOriginalSource;
+                byte* pOutputBufferRemaining = pOriginalDestination;
+
+                while (!source.IsEmpty)
                 {
-                    case OperationStatus.NeedMoreData:
-
-                        // Input buffer ended with a high surrogate. Only treat this as an error
-                        // if the caller told us that we shouldn't expect additional data in a
-                        // future call.
-
-                        if (!isFinalBlock)
-                        {
-                            goto Finish;
-                        }
-
-                        status = OperationStatus.InvalidData;
-                        goto case OperationStatus.InvalidData;
-
-                    case OperationStatus.InvalidData:
-
-                        // Input buffer contained invalid data. If the caller told us not to
-                        // perform U+FFFD replacement, terminate the loop immediately and return
-                        // an error to the caller.
-
-                        if (!replaceInvalidSequences)
-                        {
-                            goto Finish;
-                        }
-
-                        firstScalarValue = Rune.ReplacementChar;
-                        goto default;
-
-                    default:
-
-                        // We know which scalar value we need to transcode to UTF-8.
-                        // Do so now, and only terminate the loop if we ran out of space
-                        // in the destination buffer.
-
-                        if (firstScalarValue.TryEncodeToUtf8(destination, out int bytesWritten))
-                        {
-                            source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
-                            destination = destination.Slice(bytesWritten);
-                            status = OperationStatus.Done; // forcibly set success
-                            continue;
-                        }
-                        else
-                        {
-                            status = OperationStatus.DestinationTooSmall;
-                            goto Finish;
-                        }
+                    // We've pinned the spans at the entry point to this method.
+                    // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+                    operationStatus = Utf8Utility.TranscodeToUtf8(
+                        pInputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+                        inputLength: source.Length,
+                        pOutputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+                        outputBytesRemaining: destination.Length,
+                        pInputBufferRemaining: out pInputBufferRemaining,
+                        pOutputBufferRemaining: out pOutputBufferRemaining);
+
+                    // If we finished the operation entirely or we ran out of space in the destination buffer,
+                    // or if we need more input data and the caller told us that there's possibly more data
+                    // coming, return immediately.
+
+                    if (operationStatus <= OperationStatus.DestinationTooSmall // Done or DestinationTooSmall (depends on OperationStatus member ordering)
+                        || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+                    {
+                        break;
+                    }
+
+                    // We encountered invalid data, or we need more data but the caller told us we're
+                    // at the end of the stream. In either case treat this as truly invalid.
+                    // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+                    if (!replaceInvalidSequences)
+                    {
+                        operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+                        break;
+                    }
+
+                    // We're going to attempt to write U+FFFD to the destination buffer.
+                    // Do we even have enough space to do so? First, re-slice 'destination' to start at pOutputBufferRemaining.
+
+                    destination = destination.Slice((int)(pOutputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+                    if (2 >= (uint)destination.Length) // U+FFFD is 3 UTF-8 bytes; bail out if 2 or fewer remain
+                    {
+                        operationStatus = OperationStatus.DestinationTooSmall;
+                        break;
+                    }
+
+                    destination[0] = 0xEF; // U+FFFD = [ EF BF BD ] in UTF-8
+                    destination[1] = 0xBF;
+                    destination[2] = 0xBD;
+                    destination = destination.Slice(3);
+
+                    // Invalid UTF-16 sequences are always of length 1. Just skip the next character (the '+ 1' below).
+
+                    source = source.Slice((int)(pInputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))) + 1);
+
+                    operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+                    pInputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                    pOutputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
                 }
-            }
-
-        Finish:
 
-            numCharsRead = originalSourceLength - source.Length;
-            numBytesWritten = originalDestinationLength - destination.Length;
+                // Not possible to make any further progress - report to our caller how far we got.
 
-            Debug.Assert((status == OperationStatus.Done) == (numCharsRead == originalSourceLength),
-                "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");
-
-            return status;
+                charsRead = (int)(pInputBufferRemaining - pOriginalSource);
+                bytesWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+                return operationStatus;
+            }
         }
 
         /// <summary>
@@ -120,79 +130,92 @@ namespace System.Text.Unicode
         /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
         /// this method will not return <see cref="OperationStatus.InvalidData"/>.
         /// </remarks>
-        public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+        public static unsafe OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
         {
-            int originalSourceLength = source.Length;
-            int originalDestinationLength = destination.Length;
-            OperationStatus status = OperationStatus.Done;
+            // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437
+
+            _ = source.Length;
+            _ = destination.Length;
 
-            // In a loop, this is going to read and transcode one scalar value at a time
-            // from the source to the destination.
+            // The 'source' and 'destination' spans are re-sliced (mutated) throughout the loop below.
 
-            while (!source.IsEmpty)
+            fixed (byte* pOriginalSource = &MemoryMarshal.GetReference(source))
+            fixed (char* pOriginalDestination = &MemoryMarshal.GetReference(destination))
             {
-                status = Rune.DecodeFromUtf8(source, out Rune firstScalarValue, out int bytesConsumed);
+                // We're going to bulk transcode as much as we can in a loop, iterating
+                // every time we see bad data that requires replacement.
 
-                switch (status)
+                OperationStatus operationStatus = OperationStatus.Done;
+                byte* pInputBufferRemaining = pOriginalSource;
+                char* pOutputBufferRemaining = pOriginalDestination;
+
+                while (!source.IsEmpty)
                 {
-                    case OperationStatus.NeedMoreData:
-
-                        // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
-                        // if the caller told us that we shouldn't expect additional data in a
-                        // future call.
-
-                        if (!isFinalBlock)
-                        {
-                            goto Finish;
-                        }
-
-                        status = OperationStatus.InvalidData;
-                        goto case OperationStatus.InvalidData;
-
-                    case OperationStatus.InvalidData:
-
-                        // Input buffer contained invalid data. If the caller told us not to
-                        // perform U+FFFD replacement, terminate the loop immediately and return
-                        // an error to the caller.
-
-                        if (!replaceInvalidSequences)
-                        {
-                            goto Finish;
-                        }
-
-                        firstScalarValue = Rune.ReplacementChar;
-                        goto default;
-
-                    default:
-
-                        // We know which scalar value we need to transcode to UTF-16.
-                        // Do so now, and only terminate the loop if we ran out of space
-                        // in the destination buffer.
-
-                        if (firstScalarValue.TryEncodeToUtf16(destination, out int charsWritten))
-                        {
-                            source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
-                            destination = destination.Slice(charsWritten);
-                            status = OperationStatus.Done; // forcibly set success
-                            continue;
-                        }
-                        else
-                        {
-                            status = OperationStatus.DestinationTooSmall;
-                            goto Finish;
-                        }
+                    // We've pinned the spans at the entry point to this method.
+                    // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+                    operationStatus = Utf8Utility.TranscodeToUtf16(
+                        pInputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+                        inputLength: source.Length,
+                        pOutputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+                        outputCharsRemaining: destination.Length,
+                        pInputBufferRemaining: out pInputBufferRemaining,
+                        pOutputBufferRemaining: out pOutputBufferRemaining);
+
+                    // If we finished the operation entirely or we ran out of space in the destination buffer,
+                    // or if we need more input data and the caller told us that there's possibly more data
+                    // coming, return immediately.
+
+                    if (operationStatus <= OperationStatus.DestinationTooSmall // Done or DestinationTooSmall (depends on OperationStatus member ordering)
+                        || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+                    {
+                        break;
+                    }
+
+                    // We encountered invalid data, or we need more data but the caller told us we're
+                    // at the end of the stream. In either case treat this as truly invalid.
+                    // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+                    if (!replaceInvalidSequences)
+                    {
+                        operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+                        break;
+                    }
+
+                    // We're going to attempt to write U+FFFD to the destination buffer.
+                    // Do we even have enough space to do so? First, re-slice 'destination' to start at pOutputBufferRemaining.
+
+                    destination = destination.Slice((int)(pOutputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+                    if (destination.IsEmpty)
+                    {
+                        operationStatus = OperationStatus.DestinationTooSmall;
+                        break;
+                    }
+
+                    destination[0] = (char)UnicodeUtility.ReplacementChar;
+                    destination = destination.Slice(1);
+
+                    // Now figure out how many bytes of the source we must skip over before we should retry
+                    // the operation. This might be more than 1 byte.
+
+                    source = source.Slice((int)(pInputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))));
+                    Debug.Assert(!source.IsEmpty, "Expected 'Done' if source is fully consumed.");
+
+                    Rune.DecodeFromUtf8(source, out _, out int bytesConsumedJustNow); // status and decoded rune are discarded; only the consumed length is used
+                    source = source.Slice(bytesConsumedJustNow);
+
+                    operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+                    pInputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                    pOutputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
                 }
-            }
-
-        Finish:
 
-            numBytesRead = originalSourceLength - source.Length;
-            numCharsWritten = originalDestinationLength - destination.Length;
+                // Not possible to make any further progress - report to our caller how far we got.
 
-            Debug.Assert((status == OperationStatus.Done) == (numBytesRead == originalSourceLength),
-                    "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");
-
-            return status;
+                numBytesRead = (int)(pInputBufferRemaining - pOriginalSource);
+                numCharsWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+                return operationStatus;
+            }
         }
     }
 }
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
new file mode 100644
index 0000000000..c17c2cdce7
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -0,0 +1,861 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers.Binary;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+namespace System.Text.Unicode
+{
+    internal static partial class Utf8Utility
+    {
+        /// <summary>
+        /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the
+        /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractCharFromFirstThreeByteSequence(uint value)
+        {
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ]; result = [ zzzzyyyy yyxxxxxx ]
+                return ((value & 0x003F_0000u) >> 16)
+                    | ((value & 0x0000_3F00u) >> 2)
+                    | ((value & 0x0000_000Fu) << 12);
+            }
+            else
+            {
+                // value = [ 1110zzzz 10yyyyyy 10xxxxxx | ######## ]; result = [ zzzzyyyy yyxxxxxx ]
+                return ((value & 0x0F00_0000u) >> 12)
+                    | ((value & 0x003F_0000u) >> 10)
+                    | ((value & 0x0000_3F00u) >> 8);
+            }
+        }
+
+        /// <summary>
+        /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the
+        /// first two bytes as a two-byte UTF-8 subsequence and returns the UTF-16 representation.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractCharFromFirstTwoByteSequence(uint value)
+        {
+            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value) && !UInt32BeginsWithOverlongUtf8TwoByteSequence(value));
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ ######## ######## | 10xxxxxx 110yyyyy ]; result = [ 00000yyy yyxxxxxx ]
+                uint leadingByte = (uint)(byte)value << 6;
+                return (uint)(byte)(value >> 8) + leadingByte - (0xC0u << 6) - 0x80u; // subtract the 110 (lead) and 10 (trail) header bits
+            }
+            else
+            {
+                // value = [ 110yyyyy 10xxxxxx | ######## ######## ]; result = [ 00000yyy yyxxxxxx ]
+                return (char)(((value & 0x1F00_0000u) >> 18) | ((value & 0x003F_0000u) >> 16));
+            }
+        }
+
+        /// <summary>
+        /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the input as a
+        /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+        private static uint ExtractCharsFromFourByteSequence(uint value)
+        {
+            if (BitConverter.IsLittleEndian)
+            {
+                if (Bmi2.IsSupported)
+                {
+                    // need to reverse endianness for bit manipulation to work correctly
+                    value = BinaryPrimitives.ReverseEndianness(value);
+
+                    // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+                    // want to return [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] (low surrogate in the high WORD)
+                    // where wwww = uuuuu - 1
+
+                    uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
+                    uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
+
+                    uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
+                    combined -= 0x40u; // wwww = uuuuu - 1
+                    combined += 0xDC00_D800u; // add surrogate markers
+                    return combined;
+                }
+                else
+                {
+                    // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+                    // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                    // where wwww = uuuuu - 1
+                    uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
+                    retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
+                    retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
+                    retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
+                    retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
+                    retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
+                    retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
+                    retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
+                    retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                    return retVal;
+                }
+            }
+            else
+            {
+                // input is UTF8 [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+                // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ]
+                // where wwww = uuuuu - 1
+                uint retVal = value & 0xFF00_0000u; // retVal = [ 11110uuu 00000000 00000000 00000000 ]
+                retVal |= (value & 0x003F_0000u) << 2; // retVal = [ 11110uuu uuzzzz00 00000000 00000000 ]
+                retVal |= (value & 0x0000_3000u) << 4; // retVal = [ 11110uuu uuzzzzyy 00000000 00000000 ]
+                retVal |= (value & 0x0000_0F00u) >> 2; // retVal = [ 11110uuu uuzzzzyy 000000yy yy000000 ]
+                retVal |= (value & 0x0000_003Fu); // retVal = [ 11110uuu uuzzzzyy 000000yy yyxxxxxx ]
+                retVal -= 0x2000_0000u; // retVal = [ 11010uuu uuzzzzyy 000000yy yyxxxxxx ]
+                retVal -= 0x0040_0000u; // retVal = [ 110100ww wwzzzzyy 000000yy yyxxxxxx ]
+                retVal += 0x0000_DC00u; // retVal = [ 110100ww wwzzzzyy 110111yy yyxxxxxx ]
+                retVal += 0x0800_0000u; // retVal = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ]
+                return retVal;
+            }
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents a valid packed UTF-16 surrogate pair, all in machine-endian order,
+        /// returns the packed 4-byte UTF-8 representation of this scalar value, also in machine-endian order.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
+        {
+            Debug.Assert(IsWellFormedUtf16SurrogatePair(value));
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
+                // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
+
+                if (Bmi2.IsSupported)
+                {
+                    // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
+                    // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
+                    // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
+                    // all four output bytes.
+
+                    uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
+
+                    // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
+                    // that should normally be masked out via an and, but we'll just direct pdep to ignore it.
+
+                    uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
+                    return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+                }
+                else
+                {
+                    value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
+
+                    uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
+                    tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
+
+                    uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
+                    uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
+                    tempC |= tempB;
+
+                    uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
+                    tempD |= 0x8080_80F0u;
+
+                    return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+                }
+            }
+            else
+            {
+                // input = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ], where wwww = uuuuu - 1
+                // must return [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ], where wwww = uuuuu - 1
+
+                value -= 0xD800_DC00u; // = [ 000000wwwwzzzzyy 000000yyyyxxxxxx ]
+                value += 0x0040_0000u; // = [ 00000uuuuuzzzzyy 000000yyyyxxxxxx ]
+
+                uint tempA = value & 0x0700_0000u; // = [ 00000uuu 00000000 00000000 00000000 ]
+                uint tempB = (value >> 2) & 0x003F_0000u; // = [ 00000000 00uuzzzz 00000000 00000000 ]
+                tempB |= tempA;
+
+                uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ]
+                uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ] (high two y bits move from bits 16..17 to 12..13)
+                tempD |= tempC;
+
+                uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
+                return (tempE | tempB | tempD); // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+            }
+        }
+
+        /// <summary>
+        /// Given a machine-endian DWORD which represents two adjacent UTF-8 two-byte sequences,
+        /// returns the machine-endian DWORD representation of that same data as two adjacent
+        /// UTF-16 chars (the header bits of each UTF-8 byte are stripped; no validation is performed).
+        /// </summary>
+        /// <param name="value">Four UTF-8 bytes, read as a machine-endian DWORD, laid out as two two-byte sequences.</param>
+        /// <returns>Two packed UTF-16 chars in the same relative order (first sequence -> first char).</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value)
+        {
+            // We don't want to swap the position of the high and low WORDs,
+            // as the buffer was read in machine order and will be written in
+            // machine order.
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ 10xxxxxx 110yyyyy | 10xxxxxx 110yyyyy ]; each WORD becomes [ 00000yyy yyxxxxxx ]
+                return ((value & 0x3F003F00u) >> 8) | ((value & 0x001F001Fu) << 6);
+            }
+            else
+            {
+                // value = [ 110yyyyy 10xxxxxx | 110yyyyy 10xxxxxx ]; each WORD becomes [ 00000yyy yyxxxxxx ]
+                return ((value & 0x1F001F00u) >> 2) | (value & 0x003F003Fu);
+            }
+        }
+
+        /// <summary>
+        /// Given a machine-endian DWORD which represents two adjacent UTF-16 chars,
+        /// returns the machine-endian DWORD representation of that same data as two
+        /// adjacent UTF-8 two-byte sequences. Both chars must be in the range [ 0080..07FF ].
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value)
+        {
+            // Input and output both stay in machine-endian order; the two WORDs are never swapped.
+
+            Debug.Assert(IsFirstCharTwoUtf8Bytes(value) && IsSecondCharTwoUtf8Bytes(value));
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
+                // want to return [ 10XXXXXX 110YYYYY 10xxxxxx 110yyyyy ] (lead byte in the low byte of each WORD)
+
+                return ((value >> 6) & 0x001F_001Fu) + ((value << 8) & 0x3F00_3F00u) + 0x80C0_80C0u;
+            }
+            else
+            {
+                // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
+                // want to return [ 110YYYYY 10XXXXXX 110yyyyy 10xxxxxx ] (lead byte in the high byte of each WORD)
+
+                return ((value << 2) & 0x1F00_1F00u) + (value & 0x003F_003Fu) + 0xC080_C080u;
+            }
+        }
+
+        /// <summary>
+        /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences,
+        /// returns the machine-endian DWORD representation of the first UTF-16 char
+        /// as a UTF-8 two-byte sequence packed into a WORD and zero-extended to DWORD.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
+        {
+            // stays in machine endian
+
+            Debug.Assert(IsFirstCharTwoUtf8Bytes(value));
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ ######## ######## 00000yyy yyxxxxxx ]
+                // want to return [ 00000000 00000000 10xxxxxx 110yyyyy ]
+
+                uint temp = (value << 2) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ]
+                value &= 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]
+                return BinaryPrimitives.ReverseEndianness((ushort)(temp + value + 0xC080u)); // [ 00000000 00000000 10xxxxxx 110yyyyy ]
+            }
+            else
+            {
+                // value = [ 00000yyy yyxxxxxx ######## ######## ]
+                // want to return [ 00000000 00000000 110yyyyy 10xxxxxx ]
+
+                uint temp = (value >> 16) & 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]
+                value = (value >> 14) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ] (yyyyy moves from bits 22..26 to 8..12)
+                return value + temp + 0xC080u;
+            }
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character is ASCII (in the range [ 0000..007F ]).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharAscii(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], AAAA is in [ 0000..007F ] iff bits 7..15 of the DWORD are all clear.
+            // Big-endian: Given [ AAAA #### ], AAAA is in [ 0000..007F ] iff the whole DWORD is below 0080_0000.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0)
+                || (!BitConverter.IsLittleEndian && value < 0x0080_0000u);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+        /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], AAAA is in [ 0800..FFFF ] iff any of bits 11..15 of the DWORD is set.
+            // Big-endian: Given [ AAAA #### ], AAAA is in [ 0800..FFFF ] iff the whole DWORD is at least 0800_0000.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
+                || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharSurrogate(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], AAAA is in [ D800..DFFF ] iff (AAAA - D800) has bits 11..15 clear.
+            // Big-endian: Given [ AAAA #### ], AAAA is in [ D800..DFFF ] iff the unsigned (DWORD - D800_0000) is below 0800_0000.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
+                || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharTwoUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], AAAA is in [ 0080..07FF ] iff the 16-bit (AAAA - 0080) is below 0780.
+            // Big-endian: Given [ AAAA #### ], AAAA is in [ 0080..07FF ] iff the DWORD is in [ 0080_0000..07FF_FFFF ].
+
+            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
+            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
+                || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
+        /// is a UTF-8 continuation byte (in the range [ 80..BF ]); the upper 24 bits are ignored.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsLowByteUtf8ContinuationByte(uint value)
+        {
+            // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
+            // so the best we can do for now is the lea / cmp pair.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            return (byte)(value - 0x80u) <= 0x3Fu; // rebases [ 80..BF ] to [ 00..3F ]; other low bytes wrap above 3F
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character is ASCII (in the range [ 0000..007F ]).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharAscii(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], BBBB is in [ 0000..007F ] iff the whole DWORD is below 0080_0000.
+            // Big-endian: Given [ #### BBBB ], BBBB is in [ 0000..007F ] iff bits 7..15 of the DWORD are all clear.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
+                || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+        /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
+            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
+                || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharSurrogate(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], BBBB is in [ D800..DFFF ] iff the unsigned (DWORD - D800_0000) is below 0800_0000.
+            // Big-endian: Given [ #### BBBB ], BBBB is in [ D800..DFFF ] iff (BBBB - D800) has bits 11..15 clear.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
+                || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharTwoUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], BBBB is in [ 0080..07FF ] iff the DWORD is in [ 0080_0000..07FF_FFFF ].
+            // Big-endian: Given [ #### BBBB ], BBBB is in [ 0080..07FF ] iff the 16-bit (BBBB - 0080) is below 0780.
+
+            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
+            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
+                || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
+        /// i.e., has binary representation 10xxxxxx, where x is any bit.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsUtf8ContinuationByte(in byte value)
+        {
+            // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
+            // directly rather than bounce a temporary through a register. That is, we want the JIT to be
+            // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
+            // to see if it's a continuation byte. Data that's already enregistered will go through the
+            // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
+            //
+            // The below check takes advantage of the two's complement representation of negative numbers.
+            // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ]
+
+            return ((sbyte)value < -64);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair (high surrogate first, then low).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsWellFormedUtf16SurrogatePair(uint value)
+        {
+            // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
+            // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
+            //
+            // We're essentially performing a range check on each component of the input in parallel. The allowed range
+            // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
+            // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
+            // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
+            // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
+                || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
+        }
+
+        /// <summary>
+        /// Returns <paramref name="value"/> converted from machine-endian to little-endian byte order.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ToLittleEndian(uint value)
+        {
+            if (!BitConverter.IsLittleEndian)
+            {
+                // Big-endian host: swap the four bytes.
+                return BinaryPrimitives.ReverseEndianness(value);
+            }
+
+            // Little-endian host: the value is already in the requested order.
+            return value;
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer are
+        /// an overlong representation of a sequence that should be represented as one byte.
+        /// This method *does not* validate that the sequence matches the appropriate
+        /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
+        {
+            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
+
+            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+            // Since we already validated it's C0 <= ?? <= DF (per mask check earlier), now only need
+            // to check that it's < C2.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
+                || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first four bytes of the buffer match
+        /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
+        /// method *does not* validate that the sequence is well-formed; the caller must
+        /// still perform overlong form or out-of-range checking.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
+        {
+            // The code in this method is equivalent to the code
+            // below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0xC0C0C0F8U;
+            //     const uint comparand = 0x808080F0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xF8C0C0C0U;
+            //     const uint comparand = 0xF0808080U;
+            //     return ((value & mask) == comparand);
+            // }
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first three bytes of the buffer match
+        /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form or surrogate checking.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
+        {
+            // The return statement subtracts the comparand and then tests the masked bits for zero.
+            // That is equivalent to the reference code below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0x00C0C0F0U;
+            //     const uint comparand = 0x008080E0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xF0C0C000U;
+            //     const uint comparand = 0xE0808000U;
+            //     return ((value & mask) == comparand);
+            // }
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer match
+        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form checking.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
+        {
+            // The return statement subtracts the comparand and then tests the masked bits for zero.
+            // That is equivalent to the reference code below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0x0000C0E0U;
+            //     const uint comparand = 0x000080C0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xE0C00000U;
+            //     const uint comparand = 0xC0800000U;
+            //     return ((value & mask) == comparand);
+            // }
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer are
+        /// an overlong representation of a sequence that should be represented as one byte.
+        /// This method *does not* validate that the sequence matches the appropriate
+        /// 2-byte sequence mask (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
+        {
+            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+            Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
+
+            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+            // We already validated that it's C0 .. DF (per mask check earlier).
+            // C2 = 1100 0010
+            // DF = 1101 1111
+            // This means that we can AND the leading byte with the mask 0001 1110 (1E),
+            // and if the result is zero (leading byte is C0 or C1) the sequence is overlong.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer match
+        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form checking.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
+        {
+            // The return statement uses the form ((value - comparand) & mask) == 0, which for these
+            // constants is equivalent to the code below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0xC0E00000U;
+            //     const uint comparand = 0x80C00000U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0x0000E0C0U;
+            //     const uint comparand = 0x0000C080U;
+            //     return ((value & mask) == comparand);
+            // }
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
+        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+        /// single operation. Evaluates to <see langword="false"/> on a big-endian machine (debug builds assert).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+        {
+            // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
+            // In little-endian, that would be represented as:
+            // [ ######## ######## 10xxxxxx 110yyyyy ].
+            // Due to the little-endian representation we can perform a trick by ANDing the low
+            // WORD with the bitmask [ 11000000 11111111 ] (0xC0FF) and checking that the value is within
+            // the range [ 10000000_11000010, 10000000_11011111 ] (0x80C2 .. 0x80DF). This performs both the
+            // 2-byte-sequence bitmask check and overlong form validation with one comparison.
+            // Callers are expected to invoke this helper only on little-endian machines.
+            Debug.Assert(BitConverter.IsLittleEndian);
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu))
+                || (!BitConverter.IsLittleEndian && false);
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
+        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+        /// single operation. Evaluates to <see langword="false"/> on a big-endian machine (debug builds assert).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+        {
+            // Same trick as UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian, applied to the high WORD.
+
+            Debug.Assert(BitConverter.IsLittleEndian);
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u))
+                || (!BitConverter.IsLittleEndian && false);
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first byte of the buffer is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32FirstByteIsAscii(uint value)
+        {
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+            // Little-endian: the first byte is the low byte, so test bit 7. Big-endian: it is the high byte, so test the sign bit.
+            return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0))
+                || (!BitConverter.IsLittleEndian && ((int)value >= 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32FourthByteIsAscii(uint value)
+        {
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+            // Little-endian: the fourth byte is the high byte, so test the sign bit. Big-endian: it is the low byte, so test bit 7.
+            return (BitConverter.IsLittleEndian && ((int)value >= 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32SecondByteIsAscii(uint value)
+        {
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+            // Little-endian: the second byte occupies bits 8..15, so test bit 15. Big-endian: it occupies bits 16..23, so test bit 23.
+            return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the third byte of the buffer is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32ThirdByteIsAscii(uint value)
+        {
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+            // Little-endian: the third byte occupies bits 16..23, so test bit 23. Big-endian: it occupies bits 8..15, so test bit 15.
+            return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0));
+        }
+
+        /// <summary>
+        /// Given a DWORD which represents a buffer of 4 ASCII bytes, widens each byte to a 16-bit WORD
+        /// and writes the resulting QWORD into the destination with machine endianness.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+        private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
+        {
+            if (Bmi2.X64.IsSupported)
+            {
+                // PDEP with mask 0x00FF00FF_00FF00FF places each source byte in the low byte of a 16-bit lane. BMI2 will work regardless of the processor's endianness.
+                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
+            }
+            else
+            {
+                if (BitConverter.IsLittleEndian)
+                {
+                    outputBuffer = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 3) = (char)value;
+                }
+                else
+                {
+                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
+                    value >>= 8;
+                    outputBuffer = (char)value;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
+        /// converts those scalar values to their 3-byte UTF-8 representation and writes the
+        /// resulting 6 bytes to the destination buffer.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
+        {
+            Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
+            Debug.Assert(IsSecondCharAtLeastThreeUtf8Bytes(value) && !IsSecondCharSurrogate(value), "Second half of value should've been 0800..D7FF or E000..FFFF");
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ ZZZZYYYY YYXXXXXX zzzzyyyy yyxxxxxx ]
+                // want to write DWORD [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ] then WORD [ 10XXXXXX 10YYYYYY ] (shown most-significant byte first)
+
+                uint tempA = ((value << 2) & 0x3F00u) | ((value & 0x3Fu) << 16); // = [ 00000000 00xxxxxx 00yyyyyy 00000000 ]
+                uint tempB = ((value >> 4) & 0x0F00_0000u) | ((value >> 12) & 0x0Fu); // = [ 0000ZZZZ 00000000 00000000 0000zzzz ]
+                Unsafe.WriteUnaligned<uint>(ref outputBuffer, tempA + tempB + 0xE080_80E0u); // = [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ]
+                Unsafe.WriteUnaligned<ushort>(ref Unsafe.Add(ref outputBuffer, 4), (ushort)(((value >> 22) & 0x3Fu) + ((value >> 8) & 0x3F00u) + 0x8080u)); // = [ 10XXXXXX 10YYYYYY ]
+            }
+            else
+            {
+                // value = [ zzzzyyyy yyxxxxxx ZZZZYYYY YYXXXXXX ]
+                // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ] [ 1110ZZZZ ] [ 10YYYYYY ] [ 10XXXXXX ]
+
+                Unsafe.Add(ref outputBuffer, 5) = (byte)((value & 0x3Fu) | 0x80u);
+                Unsafe.Add(ref outputBuffer, 4) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
+                Unsafe.Add(ref outputBuffer, 3) = (byte)(((value >>= 6) & 0x0Fu) | 0xE0u);
+                Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >>= 4) & 0x3Fu) | 0x80u);
+                Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
+                outputBuffer = (byte)((value >>= 6) | 0xE0u);
+            }
+        }
+
+
+        /// <summary>
+        /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
+        /// converts the first UTF-16 value to its 3-byte UTF-8 representation and writes the
+        /// resulting 3 bytes to the destination buffer.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
+        {
+            Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // value = [ ######## ######## zzzzyyyy yyxxxxxx ]
+                // want to write WORD [ 10yyyyyy 1110zzzz ] then byte [ 10xxxxxx ] (WORD shown most-significant byte first)
+
+                uint tempA = (value << 2) & 0x3F00u; // [ 00yyyyyy 00000000 ]
+                uint tempB = ((uint)(ushort)value >> 12); // [ 00000000 0000zzzz ]
+                Unsafe.WriteUnaligned<ushort>(ref outputBuffer, (ushort)(tempA + tempB + 0x80E0u)); // [ 10yyyyyy 1110zzzz ]
+                Unsafe.Add(ref outputBuffer, 2) = (byte)((value & 0x3Fu) | ~0x7Fu); // [ 10xxxxxx ] (the byte cast keeps only the low 8 bits, so ~0x7Fu contributes just 0x80)
+            }
+            else
+            {
+                // value = [ zzzzyyyy yyxxxxxx ######## ######## ]
+                // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ]
+
+                Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >>= 16) & 0x3Fu) | 0x80u);
+                Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
+                outputBuffer = (byte)((value >>= 6) | 0xE0u);
+            }
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
new file mode 100644
index 0000000000..2baee48024
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -0,0 +1,1452 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Buffers.Binary;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf8Utility
+    {
+        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+        // the next byte would have been consumed from / the next char would have been written to.
+        // inputLength in bytes, outputCharsRemaining in chars.
+        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+        public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            // First, try vectorized conversion.
+
+            {
+                nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
+
+                pInputBuffer += numElementsConverted;
+                pOutputBuffer += numElementsConverted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                if ((int)numElementsConverted == inputLength)
+                {
+                    pInputBufferRemaining = pInputBuffer;
+                    pOutputBufferRemaining = pOutputBuffer;
+                    return OperationStatus.Done;
+                }
+
+                inputLength -= (int)numElementsConverted;
+                outputCharsRemaining -= (int)numElementsConverted;
+            }
+
+            if (inputLength < sizeof(uint))
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;
+
+            // Begin the main loop.
+
+#if DEBUG
+            byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
+
+                uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+                // First, check for the common case of all-ASCII bytes.
+
+                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence.
+
+                    if (outputCharsRemaining < sizeof(uint))
+                    {
+                        goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
+                    }
+
+                    Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                    pInputBuffer += 4;
+                    pOutputBuffer += 4;
+                    outputCharsRemaining -= 4;
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+                    uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
+                    uint secondDWord;
+                    int i;
+                    for (i = 0; (uint)i < maxIters; i++)
+                    {
+                        // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
+
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));
+
+                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
+                        {
+                            goto LoopTerminatedEarlyDueToNonAsciiData;
+                        }
+
+                        pInputBuffer += 8;
+
+                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
+                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+
+                        pOutputBuffer += 8;
+                    }
+
+                    outputCharsRemaining -= 8 * i;
+
+                    continue; // need to perform a bounds check because we might be running out of data
+
+                LoopTerminatedEarlyDueToNonAsciiData:
+
+                    if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                    {
+                        // The first DWORD contained all-ASCII bytes, so expand it.
+
+                        Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+
+                        // continue the outer loop from the second DWORD
+
+                        Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
+                        thisDWord = secondDWord;
+
+                        pInputBuffer += 4;
+                        pOutputBuffer += 4;
+                        outputCharsRemaining -= 4;
+                    }
+
+                    outputCharsRemaining -= 8 * i;
+
+                    // We know that there's *at least* one DWORD of data remaining in the buffer.
+                    // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
+
+                    goto AfterReadDWordSkipAllBytesAsciiCheck;
+                }
+
+            AfterReadDWordSkipAllBytesAsciiCheck:
+
+                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off ASCII bytes one at a time.
+                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
+
+                if (UInt32FirstByteIsAscii(thisDWord))
+                {
+                    if (outputCharsRemaining >= 3)
+                    {
+                        // Fast-track: we don't need to check the destination length for subsequent
+                        // ASCII bytes since we know we can write them all now.
+
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+                        nuint adjustment = 1;
+                        pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
+
+                        if (UInt32SecondByteIsAscii(thisDWord))
+                        {
+                            adjustment++;
+                            thisDWordLittleEndian >>= 8;
+                            pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
+
+                            if (UInt32ThirdByteIsAscii(thisDWord))
+                            {
+                                adjustment++;
+                                thisDWordLittleEndian >>= 8;
+                                pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
+                            }
+                        }
+
+                        pInputBuffer += adjustment;
+                        pOutputBuffer += adjustment;
+                        outputCharsRemaining -= (int)adjustment;
+                    }
+                    else
+                    {
+                        // Slow-track: we need to make sure each individual write has enough
+                        // of a buffer so that we don't overrun the destination.
+
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+                        pInputBuffer++;
+                        *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+                        outputCharsRemaining--;
+
+                        if (UInt32SecondByteIsAscii(thisDWord))
+                        {
+                            if (outputCharsRemaining == 0)
+                            {
+                                goto OutputBufferTooSmall;
+                            }
+
+                            pInputBuffer++;
+                            thisDWordLittleEndian >>= 8;
+                            *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+
+                            // We can perform a small optimization here. We know at this point that
+                            // the output buffer is fully consumed (we read two ASCII bytes and wrote
+                            // two ASCII chars, and we checked earlier that the destination buffer
+                            // can't store a third byte). If the next byte is ASCII, we can jump straight
+                            // to the return statement since the end-of-method logic only relies on the
+                            // destination buffer pointer -- NOT the output chars remaining count -- being
+                            // correct. If the next byte is not ASCII, we'll need to continue with the
+                            // rest of the main loop, but we can set the buffer length directly to zero
+                            // rather than decrementing it from 1 to 0.
+
+                            Debug.Assert(outputCharsRemaining == 1);
+
+                            if (UInt32ThirdByteIsAscii(thisDWord))
+                            {
+                                goto OutputBufferTooSmall;
+                            }
+                            else
+                            {
+                                outputCharsRemaining = 0;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII byte.
+                        // Read an entire DWORD and fall through to multi-byte consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+            BeforeProcessTwoByteSequence:
+
+                // At this point, we know we're working with a multi-byte code unit,
+                // but we haven't yet validated it.
+
+                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
+                // Additionally, we need to check for valid byte sequences per Table 3-7.
+
+                // Check the 2-byte case.
+
+                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                {
+                    // Per Table 3-7, valid sequences are:
+                    // [ C2..DF ] [ 80..BF ]
+
+                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                    {
+                        goto Error;
+                    }
+
+                ProcessTwoByteSequenceSkipOverlongFormCheck:
+
+                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
+                    // there's a good chance that if we see one two-byte run then there's another two-byte
+                    // run immediately after. Let's check that now.
+
+                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
+                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
+                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
+
+                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                        || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+                    {
+                        // We have two runs of two bytes each.
+
+                        if (outputCharsRemaining < 2)
+                        {
+                            goto ProcessRemainingBytesSlow; // running out of output buffer
+                        }
+
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
+
+                        pInputBuffer += 4;
+                        pOutputBuffer += 2;
+                        outputCharsRemaining -= 2;
+
+                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                                {
+                                    // The next sequence is a valid two-byte sequence.
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+                            else
+                            {
+                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                                {
+                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                                    {
+                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
+                                    }
+
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                        else
+                        {
+                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                        }
+                    }
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // bytes are ASCII?
+
+                    uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
+
+                    if (UInt32ThirdByteIsAscii(thisDWord))
+                    {
+                        if (UInt32FourthByteIsAscii(thisDWord))
+                        {
+                            if (outputCharsRemaining < 3)
+                            {
+                                goto ProcessRemainingBytesSlow; // running out of output buffer
+                            }
+
+                            pOutputBuffer[0] = (char)charToWrite;
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                thisDWord >>= 16;
+                                pOutputBuffer[1] = (char)(byte)thisDWord;
+                                thisDWord >>= 8;
+                                pOutputBuffer[2] = (char)thisDWord;
+                            }
+                            else
+                            {
+                                pOutputBuffer[2] = (char)(byte)thisDWord;
+                                pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
+                            }
+                            pInputBuffer += 4;
+                            pOutputBuffer += 3;
+                            outputCharsRemaining -= 3;
+
+                            continue; // go back to original bounds check and check for ASCII
+                        }
+                        else
+                        {
+                            if (outputCharsRemaining < 2)
+                            {
+                                goto ProcessRemainingBytesSlow; // running out of output buffer
+                            }
+
+                            pOutputBuffer[0] = (char)charToWrite;
+                            pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
+                            pInputBuffer += 3;
+                            pOutputBuffer += 2;
+                            outputCharsRemaining -= 2;
+
+                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
+                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
+
+                            if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+                            {
+                                goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                            }
+                            else
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                                goto BeforeProcessTwoByteSequence;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto ProcessRemainingBytesSlow; // running out of output buffer
+                        }
+
+                        pOutputBuffer[0] = (char)charToWrite;
+                        pInputBuffer += 2;
+                        pOutputBuffer += 1;
+                        outputCharsRemaining--;
+
+                        if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+                        {
+                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+                        }
+                    }
+                }
+
+            // Check the 3-byte case.
+
+            BeforeProcessThreeByteSequence:
+
+                if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                {
+                ProcessThreeByteSequenceWithCheck:
+
+                    // We need to check for overlong or surrogate three-byte sequences.
+                    //
+                    // Per Table 3-7, valid sequences are:
+                    // [   E0   ] [ A0..BF ] [ 80..BF ]
+                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
+                    // [   ED   ] [ 80..9F ] [ 80..BF ]
+                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
+                    //
+                    // Big-endian examples of using the above validation table:
+                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
+                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
+                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
+                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
+                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
+                        // some overhead to moving the bits into the correct locations in order to perform the
+                        // correct comparison, and in practice the processor's branch prediction capability is
+                        // good enough that we shouldn't bother. So we'll use two jumps instead.
+
+                        // Can't extract this check into its own helper method because JITter produces suboptimal
+                        // assembly, even with aggressive inlining.
+
+                        // Code below becomes 5 instructions: test, jz, lea, test, jz
+
+                        if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+                    else
+                    {
+                        if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+
+                    // At this point, we know the incoming scalar is well-formed.
+
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // not enough space in the destination buffer to write
+                    }
+
+                    // As an optimization, on compatible platforms check if a second three-byte sequence immediately
+                    // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
+
+                    if (BitConverter.IsLittleEndian && Bmi2.X64.IsSupported)
+                    {
+                        if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
+                        {
+                            if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 7)
+                            {
+                                // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
+
+                                uint nextDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
+                                if (((nextDWord & 0x0000_200Fu) != 0) && (((nextDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
+                                {
+                                    // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
+                                    ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(nextDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
+                                    thisDWord = nextDWord; // store this value in the correct local for the ASCII drain logic
+
+                                    // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
+                                    ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
+
+                                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
+                                    pInputBuffer += 6;
+                                    pOutputBuffer += 2;
+                                    outputCharsRemaining -= 2;
+
+                                    // Drain any ASCII data following the second three-byte sequence.
+
+                                    goto CheckForAsciiByteAfterThreeByteSequence;
+                                }
+                            }
+                        }
+                    }
+
+                    // Couldn't extract 2x three-byte sequences together, just do this one by itself.
+
+                    *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+                    pInputBuffer += 3;
+                    pOutputBuffer += 1;
+                    outputCharsRemaining -= 1;
+
+                CheckForAsciiByteAfterThreeByteSequence:
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // in to the text. If this happens strip it off now before seeing if the next character
+                    // consists of three code units.
+
+                    if (UInt32FourthByteIsAscii(thisDWord))
+                    {
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        if (BitConverter.IsLittleEndian)
+                        {
+                            *pOutputBuffer = (char)(thisDWord >> 24);
+                        }
+                        else
+                        {
+                            *pOutputBuffer = (char)(byte)thisDWord;
+                        }
+
+                        pInputBuffer += 1;
+                        pOutputBuffer += 1;
+                        outputCharsRemaining -= 1;
+                    }
+
+                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                        // that the character following this one is also CJK. We'll check for a three-byte sequence
+                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
+                        // all of the logic at the beginning of the loop.
+
+                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                        {
+                            goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
+                        }
+                        else
+                        {
+                            goto AfterReadDWord; // probably ASCII punctuation or whitespace
+                        }
+                    }
+                    else
+                    {
+                        goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                    }
+                }
+
+                // Assume the 4-byte case, but we need to validate.
+
+                {
+                    // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
+                    //
+                    // Per Table 3-7, valid sequences are:
+                    // [   F0   ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
+                    // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
+                    // [   F4   ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
+
+                    if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
+                    {
+                        goto Error;
+                    }
+
+                    // Now check for overlong / out-of-range sequences.
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
+                        // We want to get the 'w' byte in front of the 'z' byte so that we can perform
+                        // a single range comparison. We'll take advantage of the fact that the JITter
+                        // can detect a ROR / ROL operation, then we'll just zero out the bytes that
+                        // aren't involved in the range check.
+
+                        uint toCheck = thisDWord & 0x0000_FFFFu;
+
+                        // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
+
+                        toCheck = BitOperations.RotateRight(toCheck, 8);
+
+                        // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
+
+                        if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
+                        {
+                            goto Error;
+                        }
+                    }
+                    else
+                    {
+                        if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
+                        {
+                            goto Error;
+                        }
+                    }
+
+                    // Validation complete.
+
+                    if (outputCharsRemaining < 2)
+                    {
+                        // There's no point to falling back to the "drain the input buffer" logic, since we know
+                        // we can't write anything to the destination. So we'll just exit immediately.
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));
+
+                    pInputBuffer += 4;
+                    pOutputBuffer += 2;
+                    outputCharsRemaining -= 2;
+
+                    continue; // go back to beginning of loop for processing
+                }
+            }
+
+        ProcessRemainingBytesSlow:
+            inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+
+        ProcessInputOfLessThanDWordSize:
+            while (inputLength > 0)
+            {
+                uint firstByte = pInputBuffer[0];
+                if (firstByte <= 0x7Fu)
+                {
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 1-byte (ASCII) case
+                    *pOutputBuffer = (char)firstByte;
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 1;
+                    inputLength -= 1;
+                    outputCharsRemaining -= 1;
+                    continue;
+                }
+
+                // Potentially the start of a multi-byte sequence?
+
+                firstByte -= 0xC2u;
+                if ((byte)firstByte <= (0xDFu - 0xC2u))
+                {
+                    // Potentially a 2-byte sequence?
+                    if (inputLength < 2)
+                    {
+                        goto InputBufferTooSmall; // out of data
+                    }
+
+                    uint secondByte = pInputBuffer[1];
+                    if (!IsLowByteUtf8ContinuationByte(secondByte))
+                    {
+                        goto Error; // 2-byte marker not followed by continuation byte
+                    }
+
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
+                    *pOutputBuffer = (char)asChar;
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 1;
+                    inputLength -= 2;
+                    outputCharsRemaining -= 1;
+                    continue;
+                }
+                else if ((byte)firstByte <= (0xEFu - 0xC2u))
+                {
+                    // Potentially a 3-byte sequence?
+                    if (inputLength >= 3)
+                    {
+                        uint secondByte = pInputBuffer[1];
+                        uint thirdByte = pInputBuffer[2];
+                        if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
+                        {
+                            goto Error; // 3-byte marker not followed by 2 continuation bytes
+                        }
+
+                        // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
+                        // We account for this in the comparisons below.
+
+                        uint partialChar = (firstByte << 12) + (secondByte << 6);
+                        if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
+                        {
+                            goto Error; // this is an overlong encoding; fail
+                        }
+
+                        partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); //if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
+                        if (partialChar < (0x0800u /* number of code points in UTF-16 surrogate code point range */))
+                        {
+                            goto Error; // attempted to encode a UTF-16 surrogate code point; fail
+                        }
+
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                        }
+
+                        // Now restore the full scalar value.
+
+                        partialChar += thirdByte;
+                        partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
+                        partialChar -= 0x80u; // remove third byte continuation marker
+
+                        *pOutputBuffer = (char)partialChar;
+
+                        pInputBuffer += 3;
+                        pOutputBuffer += 1;
+                        inputLength -= 3;
+                        outputCharsRemaining -= 1;
+                        continue;
+                    }
+                    else if (inputLength >= 2)
+                    {
+                        uint secondByte = pInputBuffer[1];
+                        if (!IsLowByteUtf8ContinuationByte(secondByte))
+                        {
+                            goto Error; // 3-byte marker not followed by continuation byte
+                        }
+
+                        // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
+                        // from just the first two bytes.
+
+                        uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+                        if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
+                        {
+                            goto Error; // failed overlong check
+                        }
+                        if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
+                        {
+                            goto Error; // failed surrogate check
+                        }
+                    }
+
+                    goto InputBufferTooSmall; // out of data
+                }
+                else if ((byte)firstByte <= (0xF4u - 0xC2u))
+                {
+                    // Potentially a 4-byte sequence?
+
+                    if (inputLength < 2)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    uint nextByte = pInputBuffer[1];
+                    if (!IsLowByteUtf8ContinuationByte(nextByte))
+                    {
+                        goto Error; // 4-byte marker not followed by a continuation byte
+                    }
+
+                    uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+                    if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
+                    {
+                        goto Error; // failed overlong / out-of-range check
+                    }
+
+                    if (inputLength < 3)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
+                    {
+                        goto Error; // third byte in 4-byte sequence not a continuation byte
+                    }
+
+                    if (inputLength < 4)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
+                    {
+                        goto Error; // fourth byte in 4-byte sequence not a continuation byte
+                    }
+
+                    // If we read a valid astral scalar value, the only way we could've fallen down this code path
+                    // is that we didn't have enough output buffer to write the result.
+
+                    goto OutputBufferTooSmall;
+                }
+                else
+                {
+                    goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
+                }
+            }
+
+            OperationStatus retVal = OperationStatus.Done;
+            goto ReturnCommon;
+
+        InputBufferTooSmall:
+            retVal = OperationStatus.NeedMoreData;
+            goto ReturnCommon;
+
+        OutputBufferTooSmall:
+            retVal = OperationStatus.DestinationTooSmall;
+            goto ReturnCommon;
+
+        Error:
+            retVal = OperationStatus.InvalidData;
+            goto ReturnCommon;
+
+        ReturnCommon:
+            pInputBufferRemaining = pInputBuffer;
+            pOutputBufferRemaining = pOutputBuffer;
+            return retVal;
+        }
+
+        // On method return, pInputBufferRemaining points to where the next char would have been
+        // consumed from, and pOutputBufferRemaining points to where the next byte would have been
+        // written to. inputLength is measured in chars; outputBytesRemaining is measured in bytes.
+        public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
+        {
+            const int CharsPerDWord = sizeof(uint) / sizeof(char);
+
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            // First, try vectorized conversion.
+
+            {
+                nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
+
+                pInputBuffer += numElementsConverted;
+                pOutputBuffer += numElementsConverted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                if ((int)numElementsConverted == inputLength)
+                {
+                    pInputBufferRemaining = pInputBuffer;
+                    pOutputBufferRemaining = pOutputBuffer;
+                    return OperationStatus.Done;
+                }
+
+                inputLength -= (int)numElementsConverted;
+                outputBytesRemaining -= (int)numElementsConverted;
+            }
+
+            if (inputLength < CharsPerDWord)
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
+
+            // Begin the main loop.
+
+#if DEBUG
+            char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            uint thisDWord;
+
+            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
+
+                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+                // First, check for the common case of all-ASCII chars.
+
+                if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence (2 chars).
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
+                    }
+
+                    // The high WORD of the local declared below might be populated with garbage
+                    // as a result of our shifts below, but that's ok since we're only going to
+                    // write the low WORD.
+                    //
+                    // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                    // (Same logic works regardless of endianness.)
+                    uint valueToWrite = thisDWord | (thisDWord >> 8);
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 2;
+                    outputBytesRemaining -= 2;
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
+                    uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
+
+                    if (BitConverter.IsLittleEndian && Bmi2.X64.IsSupported)
+                    {
+                        const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+
+                        // Try reading and writing 8 elements per iteration.
+                        uint maxIters = minElementsRemaining / 8;
+                        ulong firstQWord, secondQWord;
+                        int i;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
+
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiData;
+                            }
+
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+                            pInputBuffer += 8;
+                            pOutputBuffer += 8;
+                        }
+
+                        outputBytesRemaining -= 8 * i;
+
+                        // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
+
+                        if ((minElementsRemaining & 4) != 0)
+                        {
+                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+                            }
+
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                        }
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiData:
+
+                        outputBytesRemaining -= 8 * i;
+
+                        // First, see if we can drain any ASCII data from the first QWORD.
+
+                        if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+                        {
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                        }
+                        else
+                        {
+                            secondQWord = firstQWord;
+                        }
+
+                    LoopTerminatedDueToNonAsciiDataInSecondQWord:
+
+                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+
+                        thisDWord = (uint)secondQWord;
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = (uint)(secondQWord >> 32);
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                    else
+                    {
+                        // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+                        uint maxIters = minElementsRemaining / 4;
+                        uint secondDWord;
+                        int i;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
+
+                            if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiData;
+                            }
+
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                        }
+
+                        outputBytesRemaining -= 4 * i;
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiData:
+
+                        outputBytesRemaining -= 4 * i;
+
+                        // First, see if we can drain any ASCII data from the first DWORD.
+
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = secondDWord;
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                }
+
+            AfterReadDWordSkipAllCharsAsciiCheck:
+
+                Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off the first ASCII char if it exists.
+                // We don't check for a second ASCII char since that should have been handled above.
+
+                if (IsFirstCharAscii(thisDWord))
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
+                    }
+                    else
+                    {
+                        pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
+                    }
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 1;
+                    outputBytesRemaining -= 1;
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII char.
+                        // Read an entire DWORD and fall through to non-ASCII consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+                // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
+
+                if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                {
+                TryConsumeMultipleTwoByteSequences:
+
+                    // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
+                    // a tight loop without falling back to the main loop.
+
+                    if (IsSecondCharTwoUtf8Bytes(thisDWord))
+                    {
+                        // We have two runs of two bytes each.
+
+                        if (outputBytesRemaining < 4)
+                        {
+                            goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
+                        }
+
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
+
+                        pInputBuffer += 2;
+                        pOutputBuffer += 4;
+                        outputBytesRemaining -= 4;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharTwoUtf8Bytes(thisDWord))
+                            {
+                                // Validated we have a two-byte sequence coming up
+                                goto TryConsumeMultipleTwoByteSequences;
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                    }
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // char is ASCII?
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining >= 3)
+                        {
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                thisDWord >>= 16;
+                            }
+                            pOutputBuffer[2] = (byte)thisDWord;
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 3;
+                            outputBytesRemaining -= 3;
+
+                            continue; // go back to original bounds check and check for ASCII
+                        }
+                        else
+                        {
+                            pInputBuffer += 1;
+                            pOutputBuffer += 2;
+                            goto OutputBufferTooSmall;
+                        }
+                    }
+                    else
+                    {
+                        pInputBuffer += 1;
+                        pOutputBuffer += 2;
+                        outputBytesRemaining -= 2;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+                        }
+                    }
+                }
+
+            // Check the 3-byte case.
+
+            BeforeProcessThreeByteSequence:
+
+                if (!IsFirstCharSurrogate(thisDWord))
+                {
+                    // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                    // that the character following this one is also CJK. We'll perform the check now
+                    // rather than jumping to the beginning of the main loop.
+
+                    if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
+                    {
+                        if (!IsSecondCharSurrogate(thisDWord))
+                        {
+                            if (outputBytesRemaining < 6)
+                            {
+                                goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
+                            }
+
+                            WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 6;
+                            outputBytesRemaining -= 6;
+
+                            // Try to remain in the 3-byte processing loop if at all possible.
+
+                            if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                            {
+                                goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                            }
+                            else
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                                if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                                {
+                                    goto BeforeProcessThreeByteSequence;
+                                }
+                                else
+                                {
+                                    // Fall back to standard processing loop since we don't know how to optimize this.
+                                    goto AfterReadDWord;
+                                }
+                            }
+                        }
+                    }
+
+                ConsumeSingleThreeByteRun:
+
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 3;
+                    outputBytesRemaining -= 3;
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // into the text. If this happens, strip the ASCII character off now before seeing if the next
+                    // character consists of three code units.
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        if (BitConverter.IsLittleEndian)
+                        {
+                            *pOutputBuffer = (byte)(thisDWord >> 16);
+                        }
+                        else
+                        {
+                            *pOutputBuffer = (byte)(thisDWord);
+                        }
+
+                        pInputBuffer += 1;
+                        pOutputBuffer += 1;
+                        outputBytesRemaining -= 1;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                            {
+                                goto BeforeProcessThreeByteSequence;
+                            }
+                            else
+                            {
+                                // Fall back to standard processing loop since we don't know how to optimize this.
+                                goto AfterReadDWord;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                    }
+                    else
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
+                    }
+                }
+
+                // Four byte sequence processing
+
+                if (IsWellFormedUtf16SurrogatePair(thisDWord))
+                {
+                    if (outputBytesRemaining < 4)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 4;
+                    outputBytesRemaining -= 4;
+
+                    continue; // go back to beginning of loop for processing
+                }
+
+                goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
+            }
+
+        ProcessNextCharAndFinish:
+            inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
+
+        ProcessInputOfLessThanDWordSize:
+            Debug.Assert(inputLength < CharsPerDWord);
+
+            if (inputLength == 0)
+            {
+                goto InputBufferFullyConsumed;
+            }
+
+            uint thisChar = *pInputBuffer;
+            goto ProcessFinalChar;
+
+        ProcessOneCharFromCurrentDWordAndFinish:
+            if (BitConverter.IsLittleEndian)
+            {
+                thisChar = thisDWord & 0xFFFFu; // preserve only the first char
+            }
+            else
+            {
+                thisChar = thisDWord >> 16; // preserve only the first char
+            }
+
+        ProcessFinalChar:
+            {
+                if (thisChar <= 0x7Fu)
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 1-byte (ASCII) case
+                    *pOutputBuffer = (byte)thisChar;
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 1;
+                }
+                else if (thisChar < 0x0800u)
+                {
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 2-byte case
+                    pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 2;
+                }
+                else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 3-byte case
+                    pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 3;
+                }
+                else if (thisChar <= 0xDBFFu)
+                {
+                    // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
+                    goto InputBufferTooSmall;
+                }
+                else
+                {
+                    // UTF-16 low surrogate code point with no leading data, report error
+                    goto Error;
+                }
+            }
+
+            // There are two ways we can end up here. Either we were running low on input data,
+            // or we were running low on space in the destination buffer. If we're running low on
+            // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
+            // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
+            // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
+            // then we didn't modify inputLength since entering the main loop, which means it should
+            // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
+            // which of the two scenarios we're in.
+
+            if (inputLength > 1)
+            {
+                goto OutputBufferTooSmall;
+            }
+
+        InputBufferFullyConsumed:
+            OperationStatus retVal = OperationStatus.Done;
+            goto ReturnCommon;
+
+        InputBufferTooSmall:
+            retVal = OperationStatus.NeedMoreData;
+            goto ReturnCommon;
+
+        OutputBufferTooSmall:
+            retVal = OperationStatus.DestinationTooSmall;
+            goto ReturnCommon;
+
+        Error:
+            retVal = OperationStatus.InvalidData;
+            goto ReturnCommon;
+
+        ReturnCommon:
+            pInputBufferRemaining = pInputBuffer;
+            pOutputBufferRemaining = pOutputBuffer;
+            return retVal;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
new file mode 100644
index 0000000000..671bf1fc60
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -0,0 +1,729 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf8Utility
+    {
+        // Returns &pInputBuffer[inputLength] if the input buffer is valid.
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+        /// </remarks>
+        public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            // First, try to drain off as many ASCII bytes as we can from the beginning.
+
+            {
+                nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
+                pInputBuffer += numAsciiBytesCounted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                inputLength -= (int)numAsciiBytesCounted;
+                if (inputLength == 0)
+                {
+                    utf16CodeUnitCountAdjustment = 0;
+                    scalarCountAdjustment = 0;
+                    return pInputBuffer;
+                }
+            }
+
+#if DEBUG
+            // Keep these around for final validation at the end of the method.
+            byte* pOriginalInputBuffer = pInputBuffer;
+            int originalInputLength = inputLength;
+#endif
+
+            // Enregistered locals whose values we'll eventually copy to our caller's 'out' parameters.
+
+            int tempUtf16CodeUnitCountAdjustment = 0;
+            int tempScalarCountAdjustment = 0;
+
+            if (inputLength < sizeof(uint))
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint);
+
+            // Begin the main loop.
+
+#if DEBUG
+            byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
+
+                uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+                // First, check for the common case of all-ASCII bytes.
+
+                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence.
+
+                    pInputBuffer += sizeof(uint);
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    // Below check is "can I read at least five DWORDs from the input stream?"
+                    // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value,
+                    // hence using nint instead of nuint.
+
+                    if ((nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint))
+                    {
+                        // We want reads in the inner loop to be aligned. So let's perform a quick
+                        // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump
+                        // the read pointer up to the next aligned address.
+
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                        {
+                            goto AfterReadDWordSkipAllBytesAsciiCheck;
+                        }
+
+                        pInputBuffer = (byte*)((nuint)(pInputBuffer + 4) & ~(nuint)3);
+
+                        // At this point, the input buffer offset points to an aligned DWORD. We also know that there's
+                        // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above:
+                        // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and
+                        // the alignment check consumes at most a single DWORD.)
+
+                        byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
+                        uint mask;
+
+                        do
+                        {
+                            if (Sse2.IsSupported && Bmi1.IsSupported)
+                            {
+                                // pInputBuffer is 32-bit aligned but not necessarily 128-bit aligned, so we're
+                                // going to perform an unaligned load. We don't necessarily care about aligning
+                                // this because we pessimistically assume we'll encounter non-ASCII data at some
+                                // point in the not-too-distant future (otherwise we would've stayed entirely
+                                // within the all-ASCII vectorized code at the entry to this method).
+
+                                mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer));
+                                if (mask != 0)
+                                {
+                                    goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
+                                }
+                            }
+                            else
+                            {
+                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1]))
+                                {
+                                    goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair;
+                                }
+
+                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3]))
+                                {
+                                    goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair;
+                                }
+                            }
+
+                            pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
+                        } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);
+
+                        continue; // need to perform a bounds check because we might be running out of data
+
+                    Sse2LoopTerminatedEarlyDueToNonAsciiData:
+
+                        Debug.Assert(BitConverter.IsLittleEndian);
+                        Debug.Assert(Sse2.IsSupported);
+                        Debug.Assert(Bmi1.IsSupported);
+
+                        // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
+                        // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
+                        // bump our input counter by that amount, and resume processing from the
+                        // "the first byte is no longer ASCII" portion of the main loop.
+
+                        Debug.Assert(mask != 0);
+
+                        pInputBuffer += Bmi1.TrailingZeroCount(mask);
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessRemainingBytesSlow;
+                        }
+
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); // no longer guaranteed to be aligned
+                        goto BeforeProcessTwoByteSequence;
+
+                    LoopTerminatedEarlyDueToNonAsciiDataInSecondPair:
+
+                        pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs
+
+                    LoopTerminatedEarlyDueToNonAsciiDataInFirstPair:
+
+                        // We know that there's *at least* two DWORDs of data remaining in the buffer.
+                        // We also know that one of them (or both of them) contains non-ASCII data somewhere.
+                        // Let's perform a quick check here to bypass the logic at the beginning of the main loop.
+
+                        thisDWord = *(uint*)pInputBuffer; // still aligned here
+                        if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                        {
+                            pInputBuffer += sizeof(uint); // consumed 1 more DWORD
+                            thisDWord = *(uint*)pInputBuffer; // still aligned here
+                        }
+
+                        goto AfterReadDWordSkipAllBytesAsciiCheck;
+                    }
+
+                    continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks
+                }
+
+            AfterReadDWordSkipAllBytesAsciiCheck:
+
+                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off ASCII bytes one at a time.
+                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
+
+                {
+                    uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFrom24BitInteger(thisDWord);
+                    pInputBuffer += numLeadingAsciiBytes;
+
+                    if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+                    {
+                        goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII byte.
+                        // Read an entire DWORD and fall through to multi-byte consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+            BeforeProcessTwoByteSequence:
+
+                // At this point, we suspect we're working with a multi-byte code unit sequence,
+                // but we haven't yet validated it for well-formedness.
+
+                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
+                // Additionally, we need to check for valid byte sequences per Table 3-7.
+
+                // Check the 2-byte case.
+
+                thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u;
+                if ((thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)) == 0)
+                {
+                    // Per Table 3-7, valid sequences are:
+                    // [ C2..DF ] [ 80..BF ]
+                    //
+                    // Due to our modification of 'thisDWord' above, this becomes:
+                    // [ 02..1F ] [ 00..3F ]
+                    //
+                    // We've already checked that the leading byte was originally in the range [ C0..DF ]
+                    // and that the trailing byte was originally in the range [ 80..BF ], so now we only need
+                    // to check that the modified leading byte is >= [ 02 ].
+
+                    if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u)
+                        || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u))
+                    {
+                        goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ]
+                    }
+
+                ProcessTwoByteSequenceSkipOverlongFormCheck:
+
+                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
+                    // there's a good chance that if we see one two-byte run then there's another two-byte
+                    // run immediately after. Let's check that now.
+
+                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
+                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
+                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
+
+                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                        || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+                    {
+                        // We have two runs of two bytes each.
+                        pInputBuffer += 4;
+                        tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars)
+
+                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                                {
+                                    // The next sequence is a valid two-byte sequence.
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+                            else
+                            {
+                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                                {
+                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                                    {
+                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
+                                    }
+
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                        else
+                        {
+                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                        }
+                    }
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // bytes are ASCII?
+
+                    tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing]
+
+                    if (UInt32ThirdByteIsAscii(thisDWord))
+                    {
+                        if (UInt32FourthByteIsAscii(thisDWord))
+                        {
+                            pInputBuffer += 4;
+                        }
+                        else
+                        {
+                            pInputBuffer += 3;
+
+                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
+                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
+
+                            if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                                goto BeforeProcessTwoByteSequence;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        pInputBuffer += 2;
+                    }
+
+                    continue;
+                }
+
+                // Check the 3-byte case.
+                // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte.
+
+                thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u);
+                if ((thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)) == 0)
+                {
+                ProcessThreeByteSequenceWithCheck:
+
+                    // We assume the caller has confirmed that the bit pattern is representative of a three-byte
+                    // sequence, but it may still be overlong or surrogate. We need to check for these possibilities.
+                    //
+                    // Per Table 3-7, valid sequences are:
+                    // [   E0   ] [ A0..BF ] [ 80..BF ]
+                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
+                    // [   ED   ] [ 80..9F ] [ 80..BF ]
+                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
+                    //
+                    // Big-endian examples of using the above validation table:
+                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
+                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
+                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
+                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
+                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
+                    //
+                    // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80)
+                    // as long as they haven't touched the bits we're about to use in our mask checking below.
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
+                        // some overhead to moving the bits into the correct locations in order to perform the
+                        // correct comparison, and in practice the processor's branch prediction capability is
+                        // good enough that we shouldn't bother. So we'll use two jumps instead.
+
+                        // Can't extract this check into its own helper method because JITter produces suboptimal
+                        // assembly, even with aggressive inlining.
+
+                        // Code below becomes 5 instructions: test, jz, add, test, jz
+
+                        if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord -= 0x0000_200Du) & 0x0000_200Fu) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+                    else
+                    {
+                        if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord -= 0x0D20_0000u) & 0x0F20_0000u) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+
+                ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks:
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // in to the text. If this happens strip it off now before seeing if the next character
+                    // consists of three code units.
+
+                    // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end
+
+                    nint asciiAdjustment;
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value
+                    }
+                    else
+                    {
+                        asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value
+                    }
+
+                    // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise
+
+                    // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method
+                    // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid
+                    // to add 4 before backing up since we already checked previously that the input buffer contains at
+                    // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can
+                    // no longer track the reference. However, we can't back up before adding 4, since we might back up to
+                    // before the start of the buffer, and the GC isn't guaranteed to be able to track this.
+
+                    pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte
+                    pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte
+
+                    tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar)
+
+                SuccessfullyProcessedThreeByteSequence:
+
+                    if (IntPtr.Size >= 8 && BitConverter.IsLittleEndian)
+                    {
+                        // x64 little-endian optimization: A three-byte character could indicate CJK text,
+                        // which makes it likely that the character following this one is also CJK.
+                        // We'll try to process several three-byte sequences at a time.
+
+                        // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset
+                        // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so
+                        // use nint instead of nuint.
+
+                        if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5)
+                        {
+                            ulong thisQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+
+                            // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward
+                            // to a previous location in the loop. This offers defense against reading main memory again (which may
+                            // have been modified and could lead to a race condition).
+
+                            thisDWord = (uint)thisQWord;
+
+                            // Is this three 3-byte sequences in a row?
+                            // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
+                            //               ---- CHAR 3  ----   --------- CHAR 2 ---------   --------- CHAR 1 ---------     -CHAR 3-
+                            if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8]))
+                            {
+                                // Saw a proper bitmask for three incoming 3-byte sequences, perform the
+                                // overlong and surrogate sequence checking now.
+
+                                // Check the first character.
+                                // If the first character is overlong or a surrogate, fail immediately.
+
+                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+                                {
+                                    goto Error;
+                                }
+
+                                // Check the second character.
+                                // At this point, we now know the first three bytes represent a well-formed sequence.
+                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
+                                // logic.
+
+                                thisQWord >>= 24;
+                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+                                {
+                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+                                }
+
+                                // Check the third character (we already checked that it's followed by a continuation byte).
+
+                                thisQWord >>= 24;
+                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+                                {
+                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+                                }
+
+                                pInputBuffer += 9;
+                                tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars)
+
+                                goto SuccessfullyProcessedThreeByteSequence;
+                            }
+
+                            // Is this two 3-byte sequences in a row?
+                            // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ]
+                            //                                   --------- CHAR 2 ---------   --------- CHAR 1 ---------
+                            if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul)
+                            {
+                                // Saw a proper bitmask for two incoming 3-byte sequences, perform the
+                                // overlong and surrogate sequence checking now.
+
+                                // Check the first character.
+                                // If the first character is overlong or a surrogate, fail immediately.
+
+                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+                                {
+                                    goto Error;
+                                }
+
+                                // Check the second character.
+                                // At this point, we now know the first three bytes represent a well-formed sequence.
+                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
+                                // logic.
+
+                                thisQWord >>= 24;
+                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+                                {
+                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+                                }
+
+                                pInputBuffer += 6;
+                                tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
+
+                                // The next byte in the sequence didn't have a 3-byte marker, so it's probably
+                                // an ASCII character. Jump back to the beginning of loop processing.
+
+                                continue;
+                            }
+
+                            if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                            {
+                                // A single three-byte sequence.
+                                goto ProcessThreeByteSequenceWithCheck;
+                            }
+                            else
+                            {
+                                // Not a three-byte sequence; perhaps ASCII?
+                                goto AfterReadDWord;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                        // that the character following this one is also CJK. We'll check for a three-byte sequence
+                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
+                        // all of the logic at the beginning of the loop.
+
+                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                        {
+                            goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
+                        }
+                        else
+                        {
+                            goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop
+                        }
+                    }
+                    else
+                    {
+                        goto ProcessRemainingBytesSlow; // Running out of data
+                    }
+                }
+
+                // Assume the 4-byte case, but we need to validate.
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    thisDWord &= 0xC0C0_FFFFu;
+
+                    // After the above modifications earlier in this method, we expect 'thisDWord'
+                    // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now
+                    // perform two checks to confirm this. The first will verify the
+                    // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's
+                    // complement representation to perform a single *signed* integer check.
+
+                    if ((int)thisDWord > unchecked((int)0x8000_3FFF))
+                    {
+                        goto Error; // didn't have three trailing bytes
+                    }
+
+                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
+                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
+
+                    thisDWord = BitOperations.RotateRight(thisDWord, 8);
+
+                    // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ].
+                    // The check is now a simple add / cmp / jcc combo.
+
+                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu))
+                    {
+                        goto Error; // overlong or out-of-range
+                    }
+                }
+                else
+                {
+                    thisDWord -= 0x80u;
+
+                    // After the above modifications earlier in this method, we expect 'thisDWord'
+                    // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now
+                    // perform two checks to confirm this. The first will verify the
+                    // [ ######## 00###### 00###### 00###### ] structure.
+
+                    if ((thisDWord & 0x00C0_C0C0u) != 0)
+                    {
+                        goto Error; // didn't have three trailing bytes
+                    }
+
+                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
+                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
+                    // This is a simple range check. (We don't care about the low two bytes.)
+
+                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu))
+                    {
+                        goto Error; // overlong or out-of-range
+                    }
+                }
+
+                // Validation of 4-byte case complete.
+
+                pInputBuffer += 4;
+                tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units
+                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
+
+                continue; // go back to beginning of loop for processing
+            }
+
+            goto ProcessRemainingBytesSlow;
+
+        ProcessInputOfLessThanDWordSize:
+
+            Debug.Assert(inputLength < 4);
+            nuint inputBufferRemainingBytes = (uint)inputLength;
+            goto ProcessSmallBufferCommon;
+
+        ProcessRemainingBytesSlow:
+
+            inputBufferRemainingBytes = (nuint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+
+        ProcessSmallBufferCommon:
+
+            Debug.Assert(inputBufferRemainingBytes < 4);
+            while (inputBufferRemainingBytes > 0)
+            {
+                uint firstByte = pInputBuffer[0];
+
+                if ((byte)firstByte < 0x80u)
+                {
+                    // 1-byte (ASCII) case
+                    pInputBuffer++;
+                    inputBufferRemainingBytes--;
+                    continue;
+                }
+                else if (inputBufferRemainingBytes >= 2)
+                {
+                    uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value
+                    if ((byte)firstByte < 0xE0u)
+                    {
+                        // 2-byte case
+                        if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte))
+                        {
+                            pInputBuffer += 2;
+                            tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
+                            inputBufferRemainingBytes -= 2;
+                            continue;
+                        }
+                    }
+                    else if (inputBufferRemainingBytes >= 3)
+                    {
+                        if ((byte)firstByte < 0xF0u)
+                        {
+                            if ((byte)firstByte == 0xE0u)
+                            {
+                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu))
+                                {
+                                    goto Error; // overlong encoding
+                                }
+                            }
+                            else if ((byte)firstByte == 0xEDu)
+                            {
+                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu))
+                                {
+                                    goto Error; // would be a UTF-16 surrogate code point
+                                }
+                            }
+                            else
+                            {
+                                if (!IsLowByteUtf8ContinuationByte(secondByte))
+                                {
+                                    goto Error; // first trailing byte doesn't have proper continuation marker
+                                }
+                            }
+
+                            if (IsUtf8ContinuationByte(in pInputBuffer[2]))
+                            {
+                                pInputBuffer += 3;
+                                tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
+                                inputBufferRemainingBytes -= 3;
+                                continue;
+                            }
+                        }
+                    }
+                }
+
+                // Error - no match.
+
+                goto Error;
+            }
+
+            // If we reached this point, we're out of data, and we saw no bad UTF8 sequence.
+
+#if DEBUG
+            // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength].
+            Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value.");
+#endif
+
+        Error:
+
+            // Report back to our caller how far we got before seeing invalid data.
+            // (Also used for normal termination when falling out of the loop above.)
+
+            utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment;
+            scalarCountAdjustment = tempScalarCountAdjustment;
+            return pInputBuffer;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
index 6ee9ca05a6..d24f766474 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
@@ -6,10 +6,12 @@ using System.Buffers;
 using System.Diagnostics;
 using System.IO;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
 
 namespace System.Text.Unicode
 {
-    internal static class Utf8Utility
+    internal static partial class Utf8Utility
     {
         /// <summary>
         /// The maximum number of bytes that can result from UTF-8 transcoding
@@ -29,26 +31,16 @@ namespace System.Text.Unicode
         /// comes first) is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
+        public static unsafe int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
         {
-            // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948).
-
-            bool tempIsAscii = true;
-            int originalDataLength = utf8Data.Length;
-
-            while (!utf8Data.IsEmpty)
+            fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data)) // pin the span so the pointer-based validator can walk it
             {
-                if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done)
-                {
-                    break;
-                }
+                byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _); // scalar count adjustment is not needed here
+                int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte); // byte distance from the start of the span to where validation stopped
 
-                tempIsAscii &= result.IsAscii;
-                utf8Data = utf8Data.Slice(bytesConsumed);
+                isAscii = (utf16CodeUnitCountAdjustment == 0); // A zero adjustment means UTF-16 char count == UTF-8 byte count, i.e. every validated byte was ASCII.
+                return (index < utf8Data.Length) ? index : -1; // validator stops at the end of the input when everything is well-formed; report that as -1
             }
-
-            isAscii = tempIsAscii;
-            return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length);
         }
 
 #if FEATURE_UTF8STRING
diff --git a/tests/CoreFX/CoreFX.issues.json b/tests/CoreFX/CoreFX.issues.json
index cfd18acfd0..6d613d96b9 100644
--- a/tests/CoreFX/CoreFX.issues.json
+++ b/tests/CoreFX/CoreFX.issues.json
@@ -896,15 +896,23 @@
             "methods": [
                 {
                     "name": "System.Text.Tests.EncoderConvert2.EncoderASCIIConvertMixedASCIIUnicodeCharArrayPartial",
-                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                    "reason": "https://github.com/dotnet/coreclr/issues/23864"
                 },
                 {
                     "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertMixedASCIIUnicodeCharArrayPartial",
-                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                    "reason": "https://github.com/dotnet/coreclr/issues/23864"
                 },
                 {
                     "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertUnicodeCharArrayPartial",
-                    "reason": "https://github.com/dotnet/coreclr/issues/23020"
+                    "reason": "https://github.com/dotnet/coreclr/issues/23864"
+                },
+                {
+                    "name": "System.Text.Tests.NegativeEncodingTests.GetByteCount_Invalid",
+                    "reason": "https://github.com/dotnet/coreclr/issues/23864"
+                },
+                {
+                    "name": "System.Text.Tests.UTF8EncodingDecode.Decode_InvalidBytes",
+                    "reason": "https://github.com/dotnet/coreclr/issues/23864"
                 }
             ]
         }