summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLevi Broderick <GrabYourPitchforks@users.noreply.github.com>2019-04-11 20:50:16 -0700
committerGitHub <noreply@github.com>2019-04-11 20:50:16 -0700
commit77a09eb013b7e2d66b84fc0a0008f31caee58c76 (patch)
tree6143d22041543ca14ba4db80f4be152fe74ef36d /src
parentdaa688d788fa470133ef21b0b51f18e9d91d1228 (diff)
parente307d1462e5a279f90d43fdf78be25f7f6092f1c (diff)
downloadcoreclr-77a09eb013b7e2d66b84fc0a0008f31caee58c76.tar.gz
coreclr-77a09eb013b7e2d66b84fc0a0008f31caee58c76.tar.bz2
coreclr-77a09eb013b7e2d66b84fc0a0008f31caee58c76.zip
Add optimized UTF-8 validation and transcoding apis, hook them up to UTF8Encoding (#21948)
* Add optimized UTF-8 validation and transcoding logic - Hook it up through the existing Utf8 public static APIs - Move some shared methods out of ASCIIUtility - Hook it up through the Utf8String ctor * Hook up new UTF-8 logic through UTF8Encoding - Add vectorized UTF-16 validation and transcoded byte counts - Move Utf16Utility into Unicode namespace alongside Utf8Utility - Fix some bugs in DecoderNLS's draining logic * Improve perf of "is ASCII?" inner loop in UTF-8 validation. * Remove SSE41.X64 optimization from AsciiUtility; RyuJIT now handles this optimally * Clarify that vector read is unaligned * Simplify vectorized logic; remove unnecessary adjustment * PR feedback: GetElement(0) -> Sse2.StoreLow * PR feedback - Simplify CountNumberOfLeadingAsciiBytesFrom24BitInteger - Extract some consts out to top of file w/ comments * PR feedback: Enable SSE2 in Utf16Utility code * Expand masks in Utf8Utility, fix const in fallback path * Temporarily disable failing CoreFX tests * Fix incorrect Debug.Assert statements * Add comments tracking JIT workarounds. * Rename DWORD -> UInt32 throughout API surface * Re-flow Utf8Utility.Helpers * PR feedback: Fix typos * PR feedback: CountNumberOfLeadingAsciiBytesFrom24BitInteger * PR feedback: Remove redundant endianness checks * PR feedback: Validate nint definitions * PR feedback: Clarify charIsNonAscii vector usage * PR feedback: document tempUtf8CodeUnitCountAdjustment usage * Fix compilation failure in Utf16Utility * PR feedback: Clarify 3-byte sequence processing * Add missing check to 3-byte processing logic * Clarify comment in 3-byte processing
Diffstat (limited to 'src')
-rw-r--r--src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems7
-rw-r--r--src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs2
-rw-r--r--src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs1
-rw-r--r--src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs2
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs12
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs108
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs110
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs11
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs21
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Rune.cs1
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs2570
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs411
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs (renamed from src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs)2
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs279
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs863
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs1480
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs737
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs26
18 files changed, 4216 insertions, 2427 deletions
diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
index 19d8105baf..02656f57ad 100644
--- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
+++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
@@ -768,6 +768,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.Helpers.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -799,13 +800,17 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
- <Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.Validation.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />
diff --git a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
index f5bba908b5..ef2eb4945a 100644
--- a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
+++ b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs
@@ -8,7 +8,7 @@ using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
-using System.Text;
+using System.Text.Unicode;
using Internal.Runtime.CompilerServices;
namespace System.Globalization
diff --git a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
index cf89dff6a2..4391dec044 100644
--- a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
+++ b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs
@@ -8,6 +8,7 @@ using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
using System.Text;
+using System.Text.Unicode;
using Internal.Runtime.CompilerServices;
#if BIT64
diff --git a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
index beab0cfe02..9e9bb31623 100644
--- a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
+++ b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs
@@ -5,7 +5,7 @@
using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
-using System.Text;
+using System.Text.Unicode;
using Internal.Runtime.CompilerServices;
#if BIT64
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs
index 8cf1f57ccb..f0fcc3b3a6 100644
--- a/src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs
@@ -159,7 +159,7 @@ namespace System.Text
// Common helper method for all non-EncoderNLS entry points to GetByteCount.
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
// First call into the fast path.
@@ -340,9 +340,9 @@ namespace System.Text
// Common helper method for all non-EncoderNLS entry points to GetBytes.
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
- Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
// First call into the fast path.
@@ -496,7 +496,7 @@ namespace System.Text
// Common helper method for all non-DecoderNLS entry points to GetCharCount.
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
// First call into the fast path.
@@ -624,9 +624,9 @@ namespace System.Text
// Common helper method for all non-DecoderNLS entry points to GetChars.
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
- Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
// First call into the fast path.
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs
new file mode 100644
index 0000000000..731d52ab82
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs
@@ -0,0 +1,108 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.X86;
+
+namespace System.Text
+{
+ internal static partial class ASCIIUtility
+ {
+ /// <summary>
+ /// A mask which selects only the high bit of each byte of the given <see cref="uint"/>.
+ /// </summary>
+ private const uint UInt32HighBitsOnlyMask = 0x80808080u;
+
+ /// <summary>
+ /// A mask which selects only the high bit of each byte of the given <see cref="ulong"/>.
+ /// </summary>
+ private const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul;
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static bool AllBytesInUInt32AreAscii(uint value)
+ {
+ // If the high bit of any byte is set, that byte is non-ASCII.
+
+ return (value & UInt32HighBitsOnlyMask) == 0;
+ }
+
+ /// <summary>
+ /// Given a DWORD which represents a four-byte buffer read in machine endianness, and which
+ /// the caller has asserted contains a non-ASCII byte *somewhere* in the data, counts the
+ /// number of consecutive ASCII bytes starting from the beginning of the buffer. Returns
+ /// a value 0 - 3, inclusive. (The caller is responsible for ensuring that the buffer doesn't
+ /// contain all-ASCII data.)
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(uint value)
+ {
+ Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");
+
+ // Use BMI1 directly rather than going through BitOperations. We only see a perf gain here
+ // if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations
+ // is too slow for our purposes since we can provide our own faster, specialized software fallback.
+
+ if (Bmi1.IsSupported)
+ {
+ Debug.Assert(BitConverter.IsLittleEndian);
+ return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
+ }
+
+ // Couldn't emit tzcnt, use specialized software fallback.
+ // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
+ // on whether all processed bytes were ASCII. Then we accumulate all of the
+ // results to calculate how many consecutive ASCII bytes are present.
+
+ value = ~value;
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // Read first byte
+ value >>= 7;
+ uint allBytesUpToNowAreAscii = value & 1;
+ uint numAsciiBytes = allBytesUpToNowAreAscii;
+
+ // Read second byte
+ value >>= 8;
+ allBytesUpToNowAreAscii &= value;
+ numAsciiBytes += allBytesUpToNowAreAscii;
+
+ // Read third byte
+ value >>= 8;
+ allBytesUpToNowAreAscii &= value;
+ numAsciiBytes += allBytesUpToNowAreAscii;
+
+ return numAsciiBytes;
+ }
+ else
+ {
+ // BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on
+ // little-endian platforms, so using it in this big-endian path would be too
+ // expensive. Instead we'll just change how we perform the shifts.
+
+ // Read first byte
+ value = BitOperations.RotateLeft(value, 1);
+ uint allBytesUpToNowAreAscii = value & 1;
+ uint numAsciiBytes = allBytesUpToNowAreAscii;
+
+ // Read second byte
+ value = BitOperations.RotateLeft(value, 8);
+ allBytesUpToNowAreAscii &= value;
+ numAsciiBytes += allBytesUpToNowAreAscii;
+
+ // Read third byte
+ value = BitOperations.RotateLeft(value, 8);
+ allBytesUpToNowAreAscii &= value;
+ numAsciiBytes += allBytesUpToNowAreAscii;
+
+ return numAsciiBytes;
+ }
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
index 755f925610..8ff5b05429 100644
--- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
@@ -21,19 +21,20 @@ namespace System.Text
{
internal static partial class ASCIIUtility
{
- /// <summary>
- /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool AllBytesInUInt32AreAscii(uint value)
+#if DEBUG
+ static ASCIIUtility()
{
- return ((value & 0x80808080u) == 0);
+ Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
+ Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
}
+#endif // DEBUG
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool AllBytesInUInt64AreAscii(ulong value)
{
- return ((value & 0x80808080_80808080ul) == 0);
+ // If the high bit of any byte is set, that byte is non-ASCII.
+
+ return ((value & UInt64HighBitsOnlyMask) == 0);
}
/// <summary>
@@ -55,56 +56,6 @@ namespace System.Text
}
/// <summary>
- /// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
- /// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
- /// Returns a value 0 - 3, inclusive.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
- {
- // This implementation seems to have better performance than tzcnt.
-
- // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
- // on whether all processed bytes were ASCII. Then we accumulate all of the
- // results to calculate how many consecutive ASCII bytes are present.
-
- value = ~value;
-
- if (BitConverter.IsLittleEndian)
- {
- // Read first byte
- uint allBytesUpToNowAreAscii = (value >>= 7) & 1;
- uint numAsciiBytes = allBytesUpToNowAreAscii;
-
- // Read second byte
- allBytesUpToNowAreAscii &= (value >>= 8);
- numAsciiBytes += allBytesUpToNowAreAscii;
-
- // Read third byte
- allBytesUpToNowAreAscii &= (value >>= 8);
- numAsciiBytes += allBytesUpToNowAreAscii;
-
- return numAsciiBytes;
- }
- else
- {
- // Read first byte
- uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1;
- uint numAsciiBytes = allBytesUpToNowAreAscii;
-
- // Read second byte
- allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
- numAsciiBytes += allBytesUpToNowAreAscii;
-
- // Read third byte
- allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
- numAsciiBytes += allBytesUpToNowAreAscii;
-
- return numAsciiBytes;
- }
- }
-
- /// <summary>
/// Given a DWORD which represents two packed chars in machine-endian order,
/// returns <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
/// </summary>
@@ -273,7 +224,7 @@ namespace System.Text
// we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
// non-ASCII. In both cases we only care about the low 24 bits.
- pBuffer += CountNumberOfLeadingAsciiBytesFrom24BitInteger(currentUInt32);
+ pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
goto Finish;
}
@@ -435,7 +386,7 @@ namespace System.Text
uint currentDWord;
Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
- pBuffer += CountNumberOfLeadingAsciiBytesFrom24BitInteger(currentDWord);
+ pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
goto Finish;
@@ -461,7 +412,7 @@ namespace System.Text
// Clear everything but the high bit of each byte, then tzcnt.
// Remember the / 8 at the end to convert bit count to byte count.
- candidateUInt64 &= 0x80808080_80808080ul;
+ candidateUInt64 &= UInt64HighBitsOnlyMask;
pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
goto Finish;
}
@@ -1395,17 +1346,7 @@ namespace System.Text
// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
- if (Sse41.X64.IsSupported)
- {
- // Use PEXTRQ instruction if available, since it can extract from the vector directly to the destination address.
- Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
- }
- else
- {
- // Bounce this through a temporary register (with potential stack spillage) before writing to memory.
- Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, asciiVector.AsUInt64().GetElement(0));
- }
+ Sse2.StoreLow((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
@@ -1444,16 +1385,7 @@ namespace System.Text
// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
- // See comments earlier in this method for information about how this works.
- if (Sse41.X64.IsSupported)
- {
- Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
- }
- else
- {
- Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, asciiVector.AsUInt64().GetElement(0));
- }
+ Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
}
// Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
@@ -1529,27 +1461,13 @@ namespace System.Text
Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
- // See comments earlier in this method for information about how this works.
- if (Sse41.X64.IsSupported)
- {
- *(ulong*)(pAsciiBuffer + currentOffsetInElements) = Sse41.X64.Extract(asciiVector.AsUInt64(), 0);
- }
- else
- {
- *(ulong*)(pAsciiBuffer + currentOffsetInElements) = asciiVector.AsUInt64().GetElement(0);
- }
+ Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
currentOffsetInElements += SizeOfVector128 / 2;
goto Finish;
}
/// <summary>
- /// Rotates a <see cref="uint"/> left. The JIT is smart enough to turn this into a ROL / ROR instruction.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static uint ROL32(uint value, int shift) => (value << shift) | (value >> (32 - shift));
-
- /// <summary>
/// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
/// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
/// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
diff --git a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
index 9040a94f0f..bb5aa5f0ac 100644
--- a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs
@@ -266,6 +266,7 @@ namespace System.Text
// to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+ Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
// Copy the existing leftover data plus as many bytes as possible of the new incoming data
// into a temporary concated buffer, then get its char count by decoding it.
@@ -319,6 +320,7 @@ namespace System.Text
// to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
+ Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
// Copy the existing leftover data plus as many bytes as possible of the new incoming data
// into a temporary concatenated buffer, then transcode it from bytes to chars.
@@ -370,6 +372,14 @@ namespace System.Text
Finish:
+ // Report back the number of bytes (from the new incoming span) we consumed just now.
+ // This calculation is simple: it's the difference between the original leftover byte
+ // count and the number of bytes from the combined buffer we needed to decode the first
+ // scalar value. We need to report this before the call to SetLeftoverData /
+ // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field.
+
+ bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;
+
if (persistNewCombinedBuffer)
{
Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
@@ -380,7 +390,6 @@ namespace System.Text
ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
}
- bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
return charsWritten;
DestinationTooSmall:
diff --git a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
index 0e32167957..ca740a1adc 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs
@@ -850,8 +850,14 @@ namespace System.Text
ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar);
- int totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out int bytesConsumedJustNow);
- bytes = bytes.Slice(bytesConsumedJustNow);
+ int bytesConsumedJustNow = 0;
+ int totalCharCount = 0;
+
+ if (decoder.HasLeftoverData)
+ {
+ totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out bytesConsumedJustNow);
+ bytes = bytes.Slice(bytesConsumedJustNow);
+ }
// Now try invoking the "fast path" (no fallback) implementation.
// We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers).
@@ -1120,10 +1126,15 @@ namespace System.Text
ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar);
Span<char> chars = new Span<char>(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar);
- int charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out int bytesConsumedJustNow);
+ int bytesConsumedJustNow = 0;
+ int charsWrittenJustNow = 0;
- bytes = bytes.Slice(bytesConsumedJustNow);
- chars = chars.Slice(charsWrittenJustNow);
+ if (decoder.HasLeftoverData)
+ {
+ charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out bytesConsumedJustNow);
+ bytes = bytes.Slice(bytesConsumedJustNow);
+ chars = chars.Slice(charsWrittenJustNow);
+ }
Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Should be no remaining fallback data at this point.");
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
index a91c0fcb99..a71750eaa5 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -6,6 +6,7 @@ using System.Buffers;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
+using System.Text.Unicode;
namespace System.Text
{
diff --git a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
index aaac975ec8..72d0a58f11 100644
--- a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs
@@ -15,9 +15,11 @@
#define FASTLOOP
using System;
+using System.Buffers;
using System.Diagnostics;
-using System.Globalization;
+using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+using System.Text.Unicode;
namespace System.Text
{
@@ -129,22 +131,26 @@ namespace System.Text
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
// Validate input parameters
- if (chars == null)
- throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
+ if (chars is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
+ }
- if (chars.Length - index < count)
- throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
+ if ((index | count) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- // If no input, return 0, avoid fixed empty array problem
- if (count == 0)
- return 0;
+ if (chars.Length - index < count)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+ }
- // Just call the pointer version
fixed (char* pChars = chars)
- return GetByteCount(pChars + index, count, null);
+ {
+ return GetByteCountCommon(pChars + index, count);
+ }
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -154,12 +160,17 @@ namespace System.Text
public override unsafe int GetByteCount(string chars)
{
- // Validate input
- if (chars==null)
- throw new ArgumentNullException("s");
+ // Validate input parameters
+
+ if (chars is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
+ }
fixed (char* pChars = chars)
- return GetByteCount(pChars, chars.Length, null);
+ {
+ return GetByteCountCommon(pChars, chars.Length);
+ }
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -170,22 +181,78 @@ namespace System.Text
public override unsafe int GetByteCount(char* chars, int count)
{
// Validate Parameters
+
if (chars == null)
- throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
+ }
if (count < 0)
- throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- // Call it with empty encoder
- return GetByteCount(chars, count, null);
+ return GetByteCountCommon(chars, count);
}
public override unsafe int GetByteCount(ReadOnlySpan<char> chars)
{
- fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
+ // It's ok for us to pass null pointers down to the workhorse below.
+
+ fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+ {
+ return GetByteCountCommon(charsPtr, chars.Length);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe int GetByteCountCommon(char* pChars, int charCount)
+ {
+ // Common helper method for all non-EncoderNLS entry points to GetByteCount.
+ // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
+
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+
+ // First call into the fast path.
+ // Don't bother providing a fallback mechanism; our fast path doesn't use it.
+
+ int totalByteCount = GetByteCountFast(pChars, charCount, fallback: null, out int charsConsumed);
+
+ if (charsConsumed != charCount)
+ {
+ // If there's still data remaining in the source buffer, go down the fallback path.
+ // We need to check for integer overflow since the fallback could change the required
+ // output count in unexpected ways.
+
+ totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed);
+ if (totalByteCount < 0)
+ {
+ ThrowConversionOverflow();
+ }
+ }
+
+ return totalByteCount;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon
+ private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed)
+ {
+ // The number of UTF-8 code units may exceed the number of UTF-16 code units,
+ // so we'll need to check for overflow before casting to Int32.
+
+ char* ptrToFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _);
+
+ int tempCharsConsumed = (int)(ptrToFirstInvalidChar - pChars);
+ charsConsumed = tempCharsConsumed;
+
+ long totalUtf8Bytes = tempCharsConsumed + utf8CodeUnitCountAdjustment;
+ if ((ulong)totalUtf8Bytes > int.MaxValue)
{
- return GetByteCount(charsPtr, chars.Length, baseEncoder: null);
+ ThrowConversionOverflow();
}
+
+ return (int)totalUtf8Bytes;
}
// Parent method is safe.
@@ -196,22 +263,37 @@ namespace System.Text
public override unsafe int GetBytes(string s, int charIndex, int charCount,
byte[] bytes, int byteIndex)
{
- if (s == null || bytes == null)
- throw new ArgumentNullException((s == null ? nameof(s) : nameof(bytes)), SR.ArgumentNull_Array);
+ // Validate Parameters
+
+ if (s is null || bytes is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(
+ argument: (s is null) ? ExceptionArgument.s : ExceptionArgument.bytes,
+ resource: ExceptionResource.ArgumentNull_Array);
+ }
- if (charIndex < 0 || charCount < 0)
- throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
+ if ((charIndex | charCount) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount,
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
if (s.Length - charIndex < charCount)
- throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
-
- if (byteIndex < 0 || byteIndex > bytes.Length)
- throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount);
+ }
- int byteCount = bytes.Length - byteIndex;
+ if ((uint)byteIndex > bytes.Length)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
+ }
- fixed (char* pChars = s) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
- return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+ fixed (char* pChars = s)
+ fixed (byte* pBytes = bytes)
+ {
+ return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex);
+ }
}
// Encodes a range of characters in a character array into a range of bytes
@@ -232,28 +314,36 @@ namespace System.Text
byte[] bytes, int byteIndex)
{
// Validate parameters
- if (chars == null || bytes == null)
- throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array);
-
- if (charIndex < 0 || charCount < 0)
- throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
- if (chars.Length - charIndex < charCount)
- throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
+ if (chars is null || bytes is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(
+ argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes,
+ resource: ExceptionResource.ArgumentNull_Array);
+ }
- if (byteIndex < 0 || byteIndex > bytes.Length)
- throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
+ if ((charIndex | charCount) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount,
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- // If nothing to encode return 0, avoid fixed problem
- if (charCount == 0)
- return 0;
+ if (chars.Length - charIndex < charCount)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount);
+ }
- // Just call pointer version
- int byteCount = bytes.Length - byteIndex;
+ if ((uint)byteIndex > bytes.Length)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
+ }
- fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
- // Remember that byteCount is # to decode, not size of array.
- return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+ fixed (char* pChars = chars)
+ fixed (byte* pBytes = bytes)
+ {
+ return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex);
+ }
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -264,24 +354,77 @@ namespace System.Text
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
// Validate Parameters
- if (bytes == null || chars == null)
- throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
- if (charCount < 0 || byteCount < 0)
- throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
+ if (chars == null || bytes == null)
+ {
+ ThrowHelper.ThrowArgumentNullException(
+ argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes,
+ resource: ExceptionResource.ArgumentNull_Array);
+ }
+
+ if ((charCount | byteCount) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount,
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- return GetBytes(chars, charCount, bytes, byteCount, null);
+ return GetBytesCommon(chars, charCount, bytes, byteCount);
}
public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes)
{
- fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
- fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
+ // It's ok for us to operate on null / empty spans.
+
+ fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+ fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
{
- return GetBytes(charsPtr, chars.Length, bytesPtr, bytes.Length, baseEncoder: null);
+ return GetBytesCommon(charsPtr, chars.Length, bytesPtr, bytes.Length);
}
}
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount)
+ {
+ // Common helper method for all non-EncoderNLS entry points to GetBytes.
+ // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
+
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+
+ // First call into the fast path.
+
+ int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed);
+
+ if (charsConsumed == charCount)
+ {
+ // All elements converted - return immediately.
+
+ return bytesWritten;
+ }
+ else
+ {
+ // The fast-path transcoder couldn't consume the entire input buffer - invoke fallback.
+
+ return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon
+ private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed)
+ {
+ // We don't care about the exact OperationStatus value returned by the workhorse routine; we only
+ // care if the workhorse was able to consume the entire input payload. If we're unable to do so,
+ // we'll handle the remainder in the fallback routine.
+
+ Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining);
+
+ charsConsumed = (int)(pInputBufferRemaining - pChars);
+ return (int)(pOutputBufferRemaining - pBytes);
+ }
+
// Returns the number of characters produced by decoding a range of bytes
// in a byte array.
//
@@ -293,22 +436,26 @@ namespace System.Text
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
// Validate Parameters
- if (bytes == null)
- throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
+ if (bytes is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
+ }
- if (bytes.Length - index < count)
- throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
+ if ((index | count) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- // If no input just return 0, fixed doesn't like 0 length arrays.
- if (count == 0)
- return 0;
+ if (bytes.Length - index < count)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+ }
- // Just call pointer version
fixed (byte* pBytes = bytes)
- return GetCharCount(pBytes + index, count, null);
+ {
+ return GetCharCountCommon(pBytes + index, count);
+ }
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -319,20 +466,27 @@ namespace System.Text
public override unsafe int GetCharCount(byte* bytes, int count)
{
// Validate Parameters
+
if (bytes == null)
- throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
+ }
if (count < 0)
- throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- return GetCharCount(bytes, count, null);
+ return GetCharCountCommon(bytes, count);
}
public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes)
{
- fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
+ // It's ok for us to pass null pointers down to the workhorse routine.
+
+ fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
{
- return GetCharCount(bytesPtr, bytes.Length, baseDecoder: null);
+ return GetCharCountCommon(bytesPtr, bytes.Length);
}
}
@@ -345,28 +499,36 @@ namespace System.Text
char[] chars, int charIndex)
{
// Validate Parameters
- if (bytes == null || chars == null)
- throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
-
- if (byteIndex < 0 || byteCount < 0)
- throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
- if ( bytes.Length - byteIndex < byteCount)
- throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
+ if (bytes is null || chars is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(
+ argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars,
+ resource: ExceptionResource.ArgumentNull_Array);
+ }
- if (charIndex < 0 || charIndex > chars.Length)
- throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
+ if ((byteIndex | byteCount) < 0)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (byteIndex < 0) ? ExceptionArgument.byteIndex : ExceptionArgument.byteCount,
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
+ }
- // If no input, return 0 & avoid fixed problem
- if (byteCount == 0)
- return 0;
+ if (bytes.Length - byteIndex < byteCount)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
+ }
- // Just call pointer version
- int charCount = chars.Length - charIndex;
+ if ((uint)charIndex > (uint)chars.Length)
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index);
+ }
- fixed (byte* pBytes = bytes) fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
- // Remember that charCount is # to decode, not size of array
- return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
+ fixed (byte* pBytes = bytes)
+ fixed (char* pChars = chars)
+ {
+ return GetCharsCommon(pBytes + byteIndex, byteCount, pChars + charIndex, chars.Length - charIndex);
+ }
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
@@ -377,2120 +539,245 @@ namespace System.Text
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
// Validate Parameters
- if (bytes == null || chars == null)
- throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
-
- if (charCount < 0 || byteCount < 0)
- throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
- return GetChars(bytes, byteCount, chars, charCount, null);
- }
+ if (bytes is null || chars is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(
+ argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars,
+ resource: ExceptionResource.ArgumentNull_Array);
+ }
- public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
- {
- fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes))
- fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars))
+ if ((byteCount | charCount) < 0)
{
- return GetChars(bytesPtr, bytes.Length, charsPtr, chars.Length, baseDecoder: null);
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount,
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
}
- }
- // Returns a string containing the decoded representation of a range of
- // bytes in a byte array.
- //
- // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
- // So if you fix this, fix the others. Currently those include:
- // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
- // parent method is safe
+ return GetCharsCommon(bytes, byteCount, chars, charCount);
+ }
- public override unsafe string GetString(byte[] bytes, int index, int count)
+ public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
{
- // Validate Parameters
- if (bytes == null)
- throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
-
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
-
- if (bytes.Length - index < count)
- throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
-
- // Avoid problems with empty input buffer
- if (count == 0) return string.Empty;
+ // It's ok for us to pass null pointers down to the workhorse below.
- fixed (byte* pBytes = bytes)
- return string.CreateStringFromEncoding(
- pBytes + index, count, this);
+ fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes))
+ fixed (char* charsPtr = &MemoryMarshal.GetReference(chars))
+ {
+ return GetCharsCommon(bytesPtr, bytes.Length, charsPtr, chars.Length);
+ }
}
+ // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
+ // So if we're really broken, then that could also throw an error... recursively.
+ // So try to make sure GetChars can at least process all uses by
+ // System.Resources.ResourceReader!
//
- // End of standard methods copied from EncodingNLS.cs
- //
-
- // To simplify maintenance, the structure of GetByteCount and GetBytes should be
- // kept the same as much as possible
- internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
+ // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
+ // If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount)
{
- // For fallback we may need a fallback buffer.
- // We wait to initialize it though in case we don't have any broken input unicode
- EncoderFallbackBuffer fallbackBuffer = null;
- char* pSrcForFallback;
+ // Common helper method for all non-DecoderNLS entry points to GetChars.
+ // A modification of this method should be copied into each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- char* pSrc = chars;
- char* pEnd = pSrc + count;
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
+ Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
- // Start by assuming we have as many as count
- int byteCount = count;
+ // First call into the fast path.
- int ch = 0;
+ int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed);
- if (baseEncoder != null)
+ if (bytesConsumed == byteCount)
{
- UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
- ch = encoder.surrogateChar;
-
- // We mustn't have left over fallback data when counting
- if (encoder.InternalHasFallbackBuffer)
- {
- fallbackBuffer = encoder.FallbackBuffer;
- if (fallbackBuffer.Remaining > 0)
- throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
+ // All elements converted - return immediately.
- // Set our internal fallback interesting things.
- fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
- }
+ return charsWritten;
}
-
- for (;;)
+ else
{
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
- if (pSrc >= pEnd)
- {
- if (ch == 0)
- {
- // Unroll any fallback that happens at the end
- ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
- if (ch > 0)
- {
- byteCount++;
- goto ProcessChar;
- }
- }
- else
- {
- // Case of surrogates in the fallback.
- if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
- {
- Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- ch = fallbackBuffer.InternalGetNextChar();
- byteCount++;
-
- if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- ch = 0xfffd;
- byteCount++;
- goto EncodeChar;
- }
- else if (ch > 0)
- {
- goto ProcessChar;
- }
- else
- {
- byteCount--; // ignore last one.
- break;
- }
- }
- }
-
- if (ch <= 0)
- {
- break;
- }
- if (baseEncoder != null && !baseEncoder.MustFlush)
- {
- break;
- }
-
- // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
- byteCount++;
- goto EncodeChar;
- }
-
- if (ch > 0)
- {
- Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int cha = *pSrc;
-
- // count the pending surrogate
- byteCount++;
-
- // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
- // if (IsLowSurrogate(cha)) {
- if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
- ch = 0xfffd;
- // ch = cha + (ch << 10) +
- // (0x10000
- // - CharUnicodeInfo.LOW_SURROGATE_START
- // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
-
- // Use this next char
- pSrc++;
- }
- // else ch is still high surrogate and encoding will fail (so don't add count)
-
- // attempt to encode the surrogate or partial surrogate
- goto EncodeChar;
- }
-
- // If we've used a fallback, then we have to check for it
- if (fallbackBuffer != null)
- {
- ch = fallbackBuffer.InternalGetNextChar();
- if (ch > 0)
- {
- // We have an extra byte we weren't expecting.
- byteCount++;
- goto ProcessChar;
- }
- }
-
- // read next char. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- // if (IsHighSurrogate(ch)) {
- if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
- {
- // we will count this surrogate next time around
- byteCount--;
- continue;
- }
- // either good char or partial surrogate
-
- EncodeChar:
- // throw exception on partial surrogate if necessary
- // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
- if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // Lone surrogates aren't allowed
- // Have to make a fallback buffer if we don't have one
- if (fallbackBuffer == null)
- {
- // wait on fallbacks if we can
- // For fallback we may need a fallback buffer
- if (baseEncoder == null)
- fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
- else
- fallbackBuffer = baseEncoder.FallbackBuffer;
-
- // Set our internal fallback interesting things.
- fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
- }
-
- // Do our fallback. Actually we already know its a mixed up surrogate,
- // so the ref pSrc isn't gonna do anything.
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
- fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
- pSrc = pSrcForFallback;
-
- // Ignore it if we don't throw (we had preallocated this ch)
- byteCount--;
- ch = 0;
- continue;
- }
-
- // Count them
- if (ch > 0x7F)
- {
- if (ch > 0x7FF)
- {
- // the extra surrogate byte was compensated by the second surrogate character
- // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
- byteCount++;
- }
- byteCount++;
- }
-
-#if BIT64
- // check for overflow
- if (byteCount < 0)
- {
- break;
- }
-#endif
-
-#if FASTLOOP
- // If still have fallback don't do fast loop
- if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
- {
- // We're reserving 1 byte for each char by default
- byteCount++;
- goto ProcessChar;
- }
-
- int availableChars = PtrDiff(pEnd, pSrc);
+ // The fast-path transcoder couldn't consume the entire input buffer - invoke fallback.
- // don't fall into the fast decoding loop if we don't have enough characters
- if (availableChars <= 13)
- {
- // try to get over the remainder of the ascii characters fast though
- char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
- while (pSrc < pLocalEnd)
- {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F)
- goto ProcessChar;
- }
-
- // we are done
- break;
- }
-
-#if BIT64
- // make sure that we won't get a silent overflow inside the fast loop
- // (Fall out to slow loop if we have this many characters)
- availableChars &= 0x0FFFFFFF;
-#endif
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
- char* pStop = pSrc + availableChars - (3 + 4);
-
- while (pSrc < pStop)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F) // Not ASCII
- {
- if (ch > 0x7FF) // Not 2 Byte
- {
- if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
- goto LongCode;
- byteCount++;
- }
- byteCount++;
- }
-
- // get pSrc aligned
- if ((unchecked((int)pSrc) & 0x2) != 0)
- {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F) // Not ASCII
- {
- if (ch > 0x7FF) // Not 2 Byte
- {
- if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
- goto LongCode;
- byteCount++;
- }
- byteCount++;
- }
- }
-
- // Run 2 * 4 characters at a time!
- while (pSrc < pStop)
- {
- ch = *(int*)pSrc;
- int chc = *(int*)(pSrc + 2);
- if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
- {
- if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
- {
- goto LongCodeWithMask;
- }
-
-
- if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
- byteCount++;
- if ((ch & unchecked((int)0xFF80)) != 0)
- byteCount++;
- if ((chc & unchecked((int)0xFF800000)) != 0)
- byteCount++;
- if ((chc & unchecked((int)0xFF80)) != 0)
- byteCount++;
- }
- pSrc += 4;
-
- ch = *(int*)pSrc;
- chc = *(int*)(pSrc + 2);
- if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
- {
- if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
- {
- goto LongCodeWithMask;
- }
-
- if ((ch & unchecked((int)0xFF800000)) != 0)
- byteCount++;
- if ((ch & unchecked((int)0xFF80)) != 0)
- byteCount++;
- if ((chc & unchecked((int)0xFF800000)) != 0)
- byteCount++;
- if ((chc & unchecked((int)0xFF80)) != 0)
- byteCount++;
- }
- pSrc += 4;
- }
- break;
-
- LongCodeWithMask:
- if (BitConverter.IsLittleEndian)
- {
- ch = (char)ch;
- }
- else
- {
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- }
- pSrc++;
-
- if (ch <= 0x7F)
- {
- continue;
- }
-
- LongCode:
- // use separate helper variables for slow and fast loop so that the jit optimizations
- // won't get confused about the variable lifetimes
- if (ch > 0x7FF)
- {
- // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
- if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // 4 byte encoding - high surrogate + low surrogate
-
- int chd = *pSrc;
- if (
- // !IsHighSurrogate(ch) // low without high -> bad
- ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
- // !IsLowSurrogate(chd) // high not followed by low -> bad
- !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // Back up and drop out to slow loop to figure out error
- pSrc--;
- break;
- }
- pSrc++;
-
- // byteCount - this byte is compensated by the second surrogate character
- }
- byteCount++;
- }
- byteCount++;
-
- // byteCount - the last byte is already included
- }
-#endif // FASTLOOP
-
- // no pending char at this point
- ch = 0;
+ return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten);
}
-
-#if BIT64
- // check for overflow
- if (byteCount < 0)
- {
- throw new ArgumentException(
- SR.Argument_ConversionOverflow);
- }
-#endif
-
- Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
- "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
-
- return byteCount;
}
- // diffs two char pointers using unsigned arithmetic. The unsigned arithmetic
- // is good enough for us, and it tends to generate better code than the signed
- // arithmetic generated by default
- private static unsafe int PtrDiff(char* a, char* b)
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon
+ private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed)
{
- return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
- }
+ // We don't care about the exact OperationStatus value returned by the workhorse routine; we only
+ // care if the workhorse was able to consume the entire input payload. If we're unable to do so,
+ // we'll handle the remainder in the fallback routine.
- // byte* flavor just for parity
- private static unsafe int PtrDiff(byte* a, byte* b)
- {
- return (int)(a - b);
- }
+ Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining);
- private static bool InRange(int ch, int start, int end)
- {
- return (uint)(ch - start) <= (uint)(end - start);
+ bytesConsumed = (int)(pInputBufferRemaining - pBytes);
+ return (int)(pOutputBufferRemaining - pChars);
}
- // Our workhorse
- // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
- internal sealed override unsafe int GetBytes(
- char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS baseEncoder)
+ private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan<byte> bytes, int originalBytesLength, Span<char> chars, int originalCharsLength, DecoderNLS decoder)
{
- Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
- Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
- Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
- Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
-
- UTF8Encoder encoder = null;
-
- // For fallback we may need a fallback buffer.
- // We wait to initialize it though in case we don't have any broken input unicode
- EncoderFallbackBuffer fallbackBuffer = null;
- char* pSrcForFallback;
-
- char* pSrc = chars;
- byte* pTarget = bytes;
-
- char* pEnd = pSrc + charCount;
- byte* pAllocatedBufferEnd = pTarget + byteCount;
-
- int ch = 0;
-
- // assume that JIT will en-register pSrc, pTarget and ch
+ // We special-case DecoderReplacementFallback if it's telling us to write a single U+FFFD char,
+ // since we believe this to be relatively common and we can handle it more efficiently than
+ // the base implementation.
- if (baseEncoder != null)
+ if (((decoder is null) ? this.DecoderFallback : decoder.Fallback) is DecoderReplacementFallback replacementFallback
+ && replacementFallback.MaxCharCount == 1
+ && replacementFallback.DefaultString[0] == UnicodeUtility.ReplacementChar)
{
- encoder = (UTF8Encoder)baseEncoder;
- ch = encoder.surrogateChar;
-
- // We mustn't have left over fallback data when counting
- if (encoder.InternalHasFallbackBuffer)
- {
- // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
- fallbackBuffer = encoder.FallbackBuffer;
- if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
- throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
-
- // Set our internal fallback interesting things.
- fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
- }
- }
-
- for (;;)
- {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
- if (pSrc >= pEnd)
- {
- if (ch == 0)
- {
- // Check if there's anything left to get out of the fallback buffer
- ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
- if (ch > 0)
- {
- goto ProcessChar;
- }
- }
- else
- {
- // Case of leftover surrogates in the fallback buffer
- if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
- {
- Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- int cha = ch;
-
- ch = fallbackBuffer.InternalGetNextChar();
-
- if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
- goto EncodeChar;
- }
- else if (ch > 0)
- {
- goto ProcessChar;
- }
- else
- {
- break;
- }
- }
- }
-
- // attempt to encode the partial surrogate (will fail or ignore)
- if (ch > 0 && (encoder == null || encoder.MustFlush))
- goto EncodeChar;
-
- // We're done
- break;
- }
-
- if (ch > 0)
- {
- // We have a high surrogate left over from a previous loop.
- Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int cha = *pSrc;
-
- // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
- // if (IsLowSurrogate(cha)) {
- if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- ch = cha + (ch << 10) +
- (0x10000
- - CharUnicodeInfo.LOW_SURROGATE_START
- - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
-
- pSrc++;
- }
- // else ch is still high surrogate and encoding will fail
-
- // attempt to encode the surrogate or partial surrogate
- goto EncodeChar;
- }
-
- // If we've used a fallback, then we have to check for it
- if (fallbackBuffer != null)
- {
- ch = fallbackBuffer.InternalGetNextChar();
- if (ch > 0) goto ProcessChar;
- }
-
- // read next char. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- // if (IsHighSurrogate(ch)) {
- if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
- {
- continue;
- }
- // either good char or partial surrogate
-
- EncodeChar:
- // throw exception on partial surrogate if necessary
- // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
- if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // Lone surrogates aren't allowed, we have to do fallback for them
- // Have to make a fallback buffer if we don't have one
- if (fallbackBuffer == null)
- {
- // wait on fallbacks if we can
- // For fallback we may need a fallback buffer
- if (baseEncoder == null)
- fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
- else
- fallbackBuffer = baseEncoder.FallbackBuffer;
-
- // Set our internal fallback interesting things.
- fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
- }
-
- // Do our fallback. Actually we already know its a mixed up surrogate,
- // so the ref pSrc isn't gonna do anything.
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
- fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
- pSrc = pSrcForFallback;
-
- // Ignore it if we don't throw
- ch = 0;
- continue;
- }
-
- // Count bytes needed
- int bytesNeeded = 1;
- if (ch > 0x7F)
- {
- if (ch > 0x7FF)
- {
- if (ch > 0xFFFF)
- {
- bytesNeeded++; // 4 bytes (surrogate pair)
- }
- bytesNeeded++; // 3 bytes (800-FFFF)
- }
- bytesNeeded++; // 2 bytes (80-7FF)
- }
-
- if (pTarget > pAllocatedBufferEnd - bytesNeeded)
- {
- // Left over surrogate from last time will cause pSrc == chars, so we'll throw
- if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
- {
- fallbackBuffer.MovePrevious(); // Didn't use this fallback char
- if (ch > 0xFFFF)
- fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
- }
- else
- {
- pSrc--; // Didn't use this char
- if (ch > 0xFFFF)
- pSrc--; // Was surrogate, didn't use 2nd part either
- }
- Debug.Assert(pSrc >= chars || pTarget == bytes,
- "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
- ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
- ch = 0; // Nothing left over (we backed up to start of pair if supplementary)
- break;
- }
-
- if (ch <= 0x7F)
- {
- *pTarget = (byte)ch;
- }
- else
- {
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int chb;
- if (ch <= 0x7FF)
- {
- // 2 byte encoding
- chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
- }
- else
- {
- if (ch <= 0xFFFF)
- {
- chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
- }
- else
- {
- *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
- pTarget++;
-
- chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
- }
- *pTarget = (byte)chb;
- pTarget++;
-
- chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
- }
- *pTarget = (byte)chb;
- pTarget++;
-
- *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
- }
- pTarget++;
-
-
-#if FASTLOOP
- // If still have fallback don't do fast loop
- if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
- goto ProcessChar;
-
- int availableChars = PtrDiff(pEnd, pSrc);
- int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
-
- // don't fall into the fast decoding loop if we don't have enough characters
- // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
- if (availableChars <= 13)
- {
- // we are hoping for 1 byte per char
- if (availableBytes < availableChars)
- {
- // not enough output room. no pending bits at this point
- ch = 0;
- continue;
- }
-
- // try to get over the remainder of the ascii characters fast though
- char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
- while (pSrc < pLocalEnd)
- {
- ch = *pSrc;
- pSrc++;
-
- // Not ASCII, need more than 1 byte per char
- if (ch > 0x7F)
- goto ProcessChar;
-
- *pTarget = (byte)ch;
- pTarget++;
- }
- // we are done, let ch be 0 to clear encoder
- ch = 0;
- break;
- }
-
- // we need at least 1 byte per character, but Convert might allow us to convert
- // only part of the input, so try as much as we can. Reduce charCount if necessary
- if (availableBytes < availableChars)
- {
- availableChars = availableBytes;
- }
-
- // FASTLOOP:
- // - optimistic range checks
- // - fallbacks to the slow loop for all special cases, exception throwing, etc.
+ // Don't care about the exact OperationStatus, just how much of the payload we were able
+ // to process.
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
- // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
- char* pStop = pSrc + availableChars - 5;
+ Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: decoder is null || decoder.MustFlush);
- while (pSrc < pStop)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- *pTarget = (byte)ch;
- pTarget++;
-
- // get pSrc aligned
- if ((unchecked((int)pSrc) & 0x2) != 0)
- {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- *pTarget = (byte)ch;
- pTarget++;
- }
-
- // Run 4 characters at a time!
- while (pSrc < pStop)
- {
- ch = *(int*)pSrc;
- int chc = *(int*)(pSrc + 2);
- if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
- {
- goto LongCodeWithMask;
- }
-
- // Unfortunately, this is endianess sensitive
- if (BitConverter.IsLittleEndian)
- {
- *pTarget = (byte)ch;
- *(pTarget + 1) = (byte)(ch >> 16);
- pSrc += 4;
- *(pTarget + 2) = (byte)chc;
- *(pTarget + 3) = (byte)(chc >> 16);
- pTarget += 4;
- }
- else
- {
- *pTarget = (byte)(ch>>16);
- *(pTarget+1) = (byte)ch;
- pSrc += 4;
- *(pTarget+2) = (byte)(chc>>16);
- *(pTarget+3) = (byte)chc;
- pTarget += 4;
- }
- }
- continue;
-
- LongCodeWithMask:
- if (BitConverter.IsLittleEndian)
- {
- ch = (char)ch;
- }
- else
- {
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- }
- pSrc++;
-
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- *pTarget = (byte)ch;
- pTarget++;
- continue;
-
- LongCode:
- // use separate helper variables for slow and fast loop so that the jit optimizations
- // won't get confused about the variable lifetimes
- int chd;
- if (ch <= 0x7FF)
- {
- // 2 byte encoding
- chd = unchecked((sbyte)0xC0) | (ch >> 6);
- }
- else
- {
- // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
- if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // 3 byte encoding
- chd = unchecked((sbyte)0xE0) | (ch >> 12);
- }
- else
- {
- // 4 byte encoding - high surrogate + low surrogate
- // if (!IsHighSurrogate(ch))
- if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
- {
- // low without high -> bad, try again in slow loop
- pSrc -= 1;
- break;
- }
-
- chd = *pSrc;
- pSrc++;
-
- // if (!IsLowSurrogate(chd)) {
- if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
- {
- // high not followed by low -> bad, try again in slow loop
- pSrc -= 2;
- break;
- }
-
- ch = chd + (ch << 10) +
- (0x10000
- - CharUnicodeInfo.LOW_SURROGATE_START
- - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
-
- *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
- // pStop - this byte is compensated by the second surrogate character
- // 2 input chars require 4 output bytes. 2 have been anticipated already
- // and 2 more will be accounted for by the 2 pStop-- calls below.
- pTarget++;
-
- chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
- }
- *pTarget = (byte)chd;
- pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
- pTarget++;
-
- chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
- }
- *pTarget = (byte)chd;
- pStop--; // 2 byte sequence for 1 char so need pStop--.
- pTarget++;
-
- *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
- // pStop - this byte is already included
- pTarget++;
- }
+ // Slice off how much we consumed / wrote.
- Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
+ bytes = bytes.Slice(bytesRead);
+ chars = chars.Slice(charsWritten);
+ }
-#endif // FASTLOOP
+ // If we couldn't go through our fast fallback mechanism, or if we still have leftover
+ // data because we couldn't consume everything in the loop above, we need to go down the
+ // slow fallback path.
- // no pending char at this point
- ch = 0;
+ if (bytes.IsEmpty)
+ {
+ return originalCharsLength - chars.Length; // total number of chars written
}
-
- // Do we have to set the encoder bytes?
- if (encoder != null)
+ else
{
- Debug.Assert(!encoder.MustFlush || ch == 0,
- "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
-
- encoder.surrogateChar = ch;
- encoder._charsUsed = (int)(pSrc - chars);
+ return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder);
}
-
- Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
- baseEncoder == null || !baseEncoder._throwOnOverflow,
- "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
-
- return (int)(pTarget - bytes);
}
-
- // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
- // while the actual character is being built in the lower bits. They are shifted together
- // with the actual bits of the character.
-
- // bits 30 & 31 are used for pending bits fixup
- private const int FinalByte = 1 << 29;
- private const int SupplimentarySeq = 1 << 28;
- private const int ThreeByteSeq = 1 << 27;
-
- // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
- // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+ // Returns a string containing the decoded representation of a range of
+ // bytes in a byte array.
//
- // To simplify maintenance, the structure of GetCharCount and GetChars should be
- // kept the same as much as possible
- internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
- {
- Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
- Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
-
- // Initialize stuff
- byte* pSrc = bytes;
- byte* pEnd = pSrc + count;
+ // All of our public Encodings that don't use EncodingNLS (as well as EncodingNLS itself) must
+ // have this method, so if you fix it here, fix the others too. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding.
+ // The parent method is safe.
- // Start by assuming we have as many as count, charCount always includes the adjustment
- // for the character being decoded
- int charCount = count;
- int ch = 0;
- DecoderFallbackBuffer fallback = null;
+ public override unsafe string GetString(byte[] bytes, int index, int count)
+ {
+ // Validate parameters: reject a null array, a negative index or count, and a range that runs past the end of the array
- if (baseDecoder != null)
+ if (bytes is null)
{
- UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
- ch = decoder.bits;
- charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.
-
- // Shouldn't have anything in fallback buffer for GetCharCount
- // (don't have to check _throwOnOverflow for count)
- Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
- "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
}
- for (;;)
+ if ((index | count) < 0) // the OR of the two values is negative exactly when at least one of them is negative
{
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
- if (pSrc >= pEnd)
- {
- break;
- }
-
- if (ch == 0)
- {
- // no pending bits
- goto ReadChar;
- }
-
- // read next byte. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- int cha = *pSrc;
- pSrc++;
-
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((cha & unchecked((sbyte)0xC0)) != 0x80)
- {
- // This can be a valid starting byte for another UTF8 byte sequence, so let's put
- // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
- pSrc--;
- charCount += (ch >> 30);
- goto InvalidByteSequence;
- }
-
- // fold in the new byte
- ch = (ch << 6) | (cha & 0x3F);
-
- if ((ch & FinalByte) == 0)
- {
- Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
- "[UTF8Encoding.GetChars]Invariant volation");
-
- if ((ch & SupplimentarySeq) != 0)
- {
- if ((ch & (FinalByte >> 6)) != 0)
- {
- // this is 3rd byte (of 4 byte supplementary) - nothing to do
- continue;
- }
-
- // 2nd byte, check for non-shortest form of supplementary char and the valid
- // supplementary characters in range 0x010000 - 0x10FFFF at the same time
- if (!InRange(ch & 0x1F0, 0x10, 0x100))
- {
- goto InvalidByteSequence;
- }
- }
- else
- {
- // Must be 2nd byte of a 3-byte sequence
- // check for non-shortest form of 3 byte seq
- if ((ch & (0x1F << 5)) == 0 || // non-shortest form
- (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
- {
- goto InvalidByteSequence;
- }
- }
- continue;
- }
-
- // ready to punch
-
- // adjust for surrogates in non-shortest form
- if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
- {
- charCount--;
- }
- goto EncodeChar;
-
- InvalidByteSequence:
- // this code fragment should be close to the goto referencing it
- // Have to do fallback for invalid bytes
- if (fallback == null)
- {
- if (baseDecoder == null)
- fallback = this.decoderFallback.CreateFallbackBuffer();
- else
- fallback = baseDecoder.FallbackBuffer;
- fallback.InternalInitialize(bytes, null);
- }
- charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
-
- ch = 0;
- continue;
-
- ReadChar:
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (ch > 0x7F)
- {
- // If its > 0x7F, its start of a new multi-byte sequence
-
- // Long sequence, so unreserve our char.
- charCount--;
-
- // bit 6 has to be non-zero for start of multibyte chars.
- if ((ch & 0x40) == 0)
- {
- // Unexpected trail byte
- goto InvalidByteSequence;
- }
-
- // start a new long code
- if ((ch & 0x20) != 0)
- {
- if ((ch & 0x10) != 0)
- {
- // 4 byte encoding - supplimentary character (2 surrogates)
-
- ch &= 0x0F;
-
- // check that bit 4 is zero and the valid supplimentary character
- // range 0x000000 - 0x10FFFF at the same time
- if (ch > 0x04)
- {
- ch |= 0xf0;
- goto InvalidByteSequence;
- }
-
- // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
- // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
- ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
- (1 << 30) | // If it dies on next byte we'll need an extra char
- (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
- (SupplimentarySeq) | (SupplimentarySeq >> 6) |
- (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-
- // Our character count will be 2 characters for these 4 bytes, so subtract another char
- charCount--;
- }
- else
- {
- // 3 byte encoding
- // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
- ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
- (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-
- // We'll expect 1 character for these 3 bytes, so subtract another char.
- charCount--;
- }
- }
- else
- {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1)
- {
- ch |= 0xc0;
- goto InvalidByteSequence;
- }
-
- // Add bit flags so we'll be flagged correctly
- ch |= (FinalByte >> 6);
- }
- continue;
- }
-
- EncodeChar:
-
-#if FASTLOOP
- int availableBytes = PtrDiff(pEnd, pSrc);
-
- // don't fall into the fast decoding loop if we don't have enough bytes
- if (availableBytes <= 13)
- {
- // try to get over the remainder of the ascii characters fast though
- byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
- while (pSrc < pLocalEnd)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- goto ProcessChar;
- }
- // we are done
- ch = 0;
- break;
- }
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
- byte* pStop = pSrc + availableBytes - 7;
-
- while (pSrc < pStop)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- {
- goto LongCode;
- }
-
- // get pSrc 2-byte aligned
- if ((unchecked((int)pSrc) & 0x1) != 0)
- {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- }
-
- // get pSrc 4-byte aligned
- if ((unchecked((int)pSrc) & 0x2) != 0)
- {
- ch = *(ushort*)pSrc;
- if ((ch & 0x8080) != 0)
- {
- goto LongCodeWithMask16;
- }
- pSrc += 2;
- }
-
- // Run 8 + 8 characters at a time!
- while (pSrc < pStop)
- {
- ch = *(int*)pSrc;
- int chb = *(int*)(pSrc + 4);
- if (((ch | chb) & unchecked((int)0x80808080)) != 0)
- {
- goto LongCodeWithMask32;
- }
- pSrc += 8;
-
- // This is a really small loop - unroll it
- if (pSrc >= pStop)
- break;
-
- ch = *(int*)pSrc;
- chb = *(int*)(pSrc + 4);
- if (((ch | chb) & unchecked((int)0x80808080)) != 0)
- {
- goto LongCodeWithMask32;
- }
- pSrc += 8;
- }
- break;
-
- LongCodeWithMask32:
- if (BitConverter.IsLittleEndian)
- {
- ch &= 0xFF;
- }
- else
- {
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- }
- LongCodeWithMask16:
- if (BitConverter.IsLittleEndian)
- {
- ch &= 0xFF;
- }
- else
- {
- ch = (int)(((uint)ch) >> 8);
- }
-
- pSrc++;
- if (ch <= 0x7F)
- {
- continue;
- }
-
- LongCode:
- int chc = *pSrc;
- pSrc++;
-
- if (
- // bit 6 has to be zero
- (ch & 0x40) == 0 ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (chc & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc &= 0x3F;
-
- // start a new long code
- if ((ch & 0x20) != 0)
- {
- // fold the first two bytes together
- chc |= (ch & 0x0F) << 6;
-
- if ((ch & 0x10) != 0)
- {
- // 4 byte encoding - surrogate
- ch = *pSrc;
- if (
- // check that bit 4 is zero, the non-shortest form of surrogate
- // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
- !InRange(chc >> 4, 0x01, 0x10) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc = (chc << 6) | (ch & 0x3F);
-
- ch = *(pSrc + 1);
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc += 2;
-
- // extra byte
- charCount--;
- }
- else
- {
- // 3 byte encoding
- ch = *pSrc;
- if (
- // check for non-shortest form of 3 byte seq
- (chc & (0x1F << 5)) == 0 ||
- // Can't have surrogates here.
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc++;
-
- // extra byte
- charCount--;
- }
- }
- else
- {
- // 2 byte encoding
-
- // check for non-shortest form
- if ((ch & 0x1E) == 0)
- {
- goto BadLongCode;
- }
- }
-
- // extra byte
- charCount--;
- }
-#endif // FASTLOOP
-
- // no pending bits at this point
- ch = 0;
- continue;
-
- BadLongCode:
- pSrc -= 2;
- ch = 0;
- continue;
+ ThrowHelper.ThrowArgumentOutOfRangeException(
+ argument: (index < 0) ? ExceptionArgument.index : ExceptionArgument.count, // name the offending argument; index is reported when both are negative
+ resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
}
- // May have a problem if we have to flush
- if (ch != 0)
+ if (bytes.Length - index < count) // fewer than count bytes remain after index
{
- // We were already adjusting for these, so need to un-adjust
- charCount += (ch >> 30);
- if (baseDecoder == null || baseDecoder.MustFlush)
- {
- // Have to do fallback for invalid bytes
- if (fallback == null)
- {
- if (baseDecoder == null)
- fallback = this.decoderFallback.CreateFallbackBuffer();
- else
- fallback = baseDecoder.FallbackBuffer;
- fallback.InternalInitialize(bytes, null);
- }
- charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
- }
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
}
- // Shouldn't have anything in fallback buffer for GetCharCount
- // (don't have to check _throwOnOverflow for count)
- Debug.Assert(fallback == null || fallback.Remaining == 0,
- "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
+ // Avoid problems with empty input buffer: a zero-length range returns string.Empty without pinning the array
+ if (count == 0)
+ return string.Empty;
- return charCount;
+ fixed (byte* pBytes = bytes) // pin the array so a raw pointer to its contents can be passed on
+ {
+ return string.CreateStringFromEncoding(pBytes + index, count, this);
+ }
}
- // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
- // So if we're really broken, then that could also throw an error... recursively.
- // So try to make sure GetChars can at least process all uses by
- // System.Resources.ResourceReader!
//
- // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
- // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+ // End of standard methods copied from EncodingNLS.cs
//
- // To simplify maintenance, the structure of GetCharCount and GetChars should be
- // kept the same as much as possible
- internal sealed override unsafe int GetChars(
- byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder)
- {
- Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
- Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
- Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
- Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
-
- byte* pSrc = bytes;
- char* pTarget = chars;
-
- byte* pEnd = pSrc + byteCount;
- char* pAllocatedBufferEnd = pTarget + charCount;
-
- int ch = 0;
-
- DecoderFallbackBuffer fallback = null;
- byte* pSrcForFallback;
- char* pTargetForFallback;
- if (baseDecoder != null)
- {
- UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
- ch = decoder.bits;
-
- // Shouldn't have anything in fallback buffer for GetChars
- // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty)
- Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
- "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
- }
- for (;;)
- {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
- if (pSrc >= pEnd)
- {
- break;
- }
-
- if (ch == 0)
- {
- // no pending bits
- goto ReadChar;
- }
-
- // read next byte. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- int cha = *pSrc;
- pSrc++;
-
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((cha & unchecked((sbyte)0xC0)) != 0x80)
- {
- // This can be a valid starting byte for another UTF8 byte sequence, so let's put
- // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
- pSrc--;
- goto InvalidByteSequence;
- }
-
- // fold in the new byte
- ch = (ch << 6) | (cha & 0x3F);
-
- if ((ch & FinalByte) == 0)
- {
- // Not at last byte yet
- Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
- "[UTF8Encoding.GetChars]Invariant volation");
-
- if ((ch & SupplimentarySeq) != 0)
- {
- // Its a 4-byte supplimentary sequence
- if ((ch & (FinalByte >> 6)) != 0)
- {
- // this is 3rd byte of 4 byte sequence - nothing to do
- continue;
- }
-
- // 2nd byte of 4 bytes
- // check for non-shortest form of surrogate and the valid surrogate
- // range 0x000000 - 0x10FFFF at the same time
- if (!InRange(ch & 0x1F0, 0x10, 0x100))
- {
- goto InvalidByteSequence;
- }
- }
- else
- {
- // Must be 2nd byte of a 3-byte sequence
- // check for non-shortest form of 3 byte seq
- if ((ch & (0x1F << 5)) == 0 || // non-shortest form
- (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
- {
- goto InvalidByteSequence;
- }
- }
- continue;
- }
-
- // ready to punch
-
- // surrogate in shortest form?
- // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
- {
- // let the range check for the second char throw the exception
- if (pTarget < pAllocatedBufferEnd)
- {
- *pTarget = (char)(((ch >> 10) & 0x7FF) +
- unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
- pTarget++;
-
- ch = (ch & 0x3FF) +
- unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
- }
- }
-
- goto EncodeChar;
-
- InvalidByteSequence:
- // this code fragment should be close to the gotos referencing it
- // Have to do fallback for invalid bytes
- if (fallback == null)
- {
- if (baseDecoder == null)
- fallback = this.decoderFallback.CreateFallbackBuffer();
- else
- fallback = baseDecoder.FallbackBuffer;
- fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
- }
- // That'll back us up the appropriate # of bytes if we didn't get anywhere
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
- pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
- bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
- pSrc = pSrcForFallback;
- pTarget = pTargetForFallback;
-
- if (!fallbackResult)
- {
- // Ran out of buffer space
- // Need to throw an exception?
- Debug.Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
- fallback.InternalReset();
- ThrowCharsOverflow(baseDecoder, pTarget == chars);
- ch = 0;
- break;
- }
- Debug.Assert(pSrc >= bytes,
- "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
- ch = 0;
- continue;
-
- ReadChar:
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (ch > 0x7F)
- {
- // If its > 0x7F, its start of a new multi-byte sequence
-
- // bit 6 has to be non-zero
- if ((ch & 0x40) == 0)
- {
- goto InvalidByteSequence;
- }
-
- // start a new long code
- if ((ch & 0x20) != 0)
- {
- if ((ch & 0x10) != 0)
- {
- // 4 byte encoding - supplimentary character (2 surrogates)
-
- ch &= 0x0F;
-
- // check that bit 4 is zero and the valid supplimentary character
- // range 0x000000 - 0x10FFFF at the same time
- if (ch > 0x04)
- {
- ch |= 0xf0;
- goto InvalidByteSequence;
- }
-
- ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
- (SupplimentarySeq) | (SupplimentarySeq >> 6) |
- (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
- }
- else
- {
- // 3 byte encoding
- ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
- (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
- }
- }
- else
- {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1)
- {
- ch |= 0xc0;
- goto InvalidByteSequence;
- }
-
- ch |= (FinalByte >> 6);
- }
- continue;
- }
-
- EncodeChar:
- // write the pending character
- if (pTarget >= pAllocatedBufferEnd)
- {
- // Fix chars so we make sure to throw if we didn't output anything
- ch &= 0x1fffff;
- if (ch > 0x7f)
- {
- if (ch > 0x7ff)
- {
- if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
- ch <= CharUnicodeInfo.LOW_SURROGATE_END)
- {
- pSrc--; // It was 4 bytes
- pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
- }
- else if (ch > 0xffff)
- {
- pSrc--; // It was 4 bytes, nothing was stored
- }
- pSrc--; // It was at least 3 bytes
- }
- pSrc--; // It was at least 2 bytes
- }
- pSrc--;
-
- // Throw that we don't have enough room (pSrc could be < chars if we had started to process
- // a 4 byte sequence already)
- Debug.Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
- ThrowCharsOverflow(baseDecoder, pTarget == chars);
-
- // Don't store ch in decoder, we already backed up to its start
- ch = 0;
-
- // Didn't throw, just use this buffer size.
- break;
- }
- *pTarget = (char)ch;
- pTarget++;
-
-#if FASTLOOP
- int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
- int availableBytes = PtrDiff(pEnd, pSrc);
-
- // don't fall into the fast decoding loop if we don't have enough bytes
- // Test for availableChars is done because pStop would be <= pTarget.
- if (availableBytes <= 13)
- {
- // we may need as many as 1 character per byte
- if (availableChars < availableBytes)
- {
- // not enough output room. no pending bits at this point
- ch = 0;
- continue;
- }
-
- // try to get over the remainder of the ascii characters fast though
- byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
- while (pSrc < pLocalEnd)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- goto ProcessChar;
-
- *pTarget = (char)ch;
- pTarget++;
- }
- // we are done
- ch = 0;
- break;
- }
-
- // we may need as many as 1 character per byte, so reduce the byte count if necessary.
- // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
- if (availableChars < availableBytes)
- {
- availableBytes = availableChars;
- }
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
- char* pStop = pTarget + availableBytes - 7;
-
- while (pTarget < pStop)
- {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- *pTarget = (char)ch;
- pTarget++;
-
- // get pSrc to be 2-byte aligned
- if ((unchecked((int)pSrc) & 0x1) != 0)
- {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F)
- {
- goto LongCode;
- }
- *pTarget = (char)ch;
- pTarget++;
- }
-
- // get pSrc to be 4-byte aligned
- if ((unchecked((int)pSrc) & 0x2) != 0)
- {
- ch = *(ushort*)pSrc;
- if ((ch & 0x8080) != 0)
- {
- goto LongCodeWithMask16;
- }
-
- // Unfortunately, this is endianess sensitive
- if (BitConverter.IsLittleEndian)
- {
- *pTarget = (char)(ch & 0x7F);
- pSrc += 2;
- *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
- pTarget += 2;
- }
- else
- {
- *pTarget = (char)((ch >> 8) & 0x7F);
- pSrc += 2;
- *(pTarget+1) = (char)(ch & 0x7F);
- pTarget += 2;
- }
- }
-
- // Run 8 characters at a time!
- while (pTarget < pStop)
- {
- ch = *(int*)pSrc;
- int chb = *(int*)(pSrc + 4);
- if (((ch | chb) & unchecked((int)0x80808080)) != 0)
- {
- goto LongCodeWithMask32;
- }
-
- // Unfortunately, this is endianess sensitive
- if (BitConverter.IsLittleEndian)
- {
- *pTarget = (char)(ch & 0x7F);
- *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
- *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
- *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
- pSrc += 8;
- *(pTarget + 4) = (char)(chb & 0x7F);
- *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
- *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
- *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
- pTarget += 8;
- }
- else
- {
- *pTarget = (char)((ch >> 24) & 0x7F);
- *(pTarget+1) = (char)((ch >> 16) & 0x7F);
- *(pTarget+2) = (char)((ch >> 8) & 0x7F);
- *(pTarget+3) = (char)(ch & 0x7F);
- pSrc += 8;
- *(pTarget+4) = (char)((chb >> 24) & 0x7F);
- *(pTarget+5) = (char)((chb >> 16) & 0x7F);
- *(pTarget+6) = (char)((chb >> 8) & 0x7F);
- *(pTarget+7) = (char)(chb & 0x7F);
- pTarget += 8;
- }
- }
- break;
-
- LongCodeWithMask32:
- if (BitConverter.IsLittleEndian)
- {
- ch &= 0xFF;
- }
- else
- {
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- }
- LongCodeWithMask16:
- if (BitConverter.IsLittleEndian)
- {
- ch &= 0xFF;
- }
- else
- {
- ch = (int)(((uint)ch) >> 8);
- }
- pSrc++;
- if (ch <= 0x7F)
- {
- *pTarget = (char)ch;
- pTarget++;
- continue;
- }
-
- LongCode:
- int chc = *pSrc;
- pSrc++;
-
- if (
- // bit 6 has to be zero
- (ch & 0x40) == 0 ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (chc & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc &= 0x3F;
-
- // start a new long code
- if ((ch & 0x20) != 0)
- {
- // fold the first two bytes together
- chc |= (ch & 0x0F) << 6;
-
- if ((ch & 0x10) != 0)
- {
- // 4 byte encoding - surrogate
- ch = *pSrc;
- if (
- // check that bit 4 is zero, the non-shortest form of surrogate
- // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
- !InRange(chc >> 4, 0x01, 0x10) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc = (chc << 6) | (ch & 0x3F);
-
- ch = *(pSrc + 1);
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc += 2;
-
- ch = (chc << 6) | (ch & 0x3F);
-
- *pTarget = (char)(((ch >> 10) & 0x7FF) +
- unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
- pTarget++;
-
- ch = (ch & 0x3FF) +
- unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
-
- // extra byte, we're already planning 2 chars for 2 of these bytes,
- // but the big loop is testing the target against pStop, so we need
- // to subtract 2 more or we risk overrunning the input. Subtract
- // one here and one below.
- pStop--;
- }
- else
- {
- // 3 byte encoding
- ch = *pSrc;
- if (
- // check for non-shortest form of 3 byte seq
- (chc & (0x1F << 5)) == 0 ||
- // Can't have surrogates here.
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & unchecked((sbyte)0xC0)) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc++;
-
- ch = (chc << 6) | (ch & 0x3F);
-
- // extra byte, we're only expecting 1 char for each of these 3 bytes,
- // but the loop is testing the target (not source) against pStop, so
- // we need to subtract 2 more or we risk overrunning the input.
- // Subtract 1 here and one more below
- pStop--;
- }
- }
- else
- {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1)
- {
- goto BadLongCode;
- }
- ch = (ch << 6) | chc;
- }
-
- *pTarget = (char)ch;
- pTarget++;
-
- // extra byte, we're only expecting 1 char for each of these 2 bytes,
- // but the loop is testing the target (not source) against pStop.
- // subtract an extra count from pStop so that we don't overrun the input.
- pStop--;
- }
-#endif // FASTLOOP
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private unsafe int GetCharCountCommon(byte* pBytes, int byteCount)
+ {
+ // Common helper method for all non-DecoderNLS entry points to GetCharCount.
+ // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
- Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
+ Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
+ Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
- // no pending bits at this point
- ch = 0;
- continue;
+ // First call into the fast path.
+ // Don't bother providing a fallback mechanism; our fast path doesn't use it.
- BadLongCode:
- pSrc -= 2;
- ch = 0;
- continue;
- }
+ int totalCharCount = GetCharCountFast(pBytes, byteCount, fallback: null, out int bytesConsumed);
- if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
+ if (bytesConsumed != byteCount)
{
- // Have to do fallback for invalid bytes
- if (fallback == null)
- {
- if (baseDecoder == null)
- fallback = this.decoderFallback.CreateFallbackBuffer();
- else
- fallback = baseDecoder.FallbackBuffer;
- fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
- }
-
- // That'll back us up the appropriate # of bytes if we didn't get anywhere
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
- pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
- bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
- pSrc = pSrcForFallback;
- pTarget = pTargetForFallback;
+ // If there's still data remaining in the source buffer, go down the fallback path.
+ // We need to check for integer overflow since the fallback could change the required
+ // output count in unexpected ways.
- if (!fallbackResult)
+ totalCharCount += GetCharCountWithFallback(pBytes, byteCount, bytesConsumed);
+ if (totalCharCount < 0)
{
- Debug.Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
-
- // Ran out of buffer space
- // Need to throw an exception?
- fallback.InternalReset();
- ThrowCharsOverflow(baseDecoder, pTarget == chars);
+ ThrowConversionOverflow();
}
- Debug.Assert(pSrc >= bytes,
- "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
- ch = 0;
}
- if (baseDecoder != null)
- {
- UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
+ return totalCharCount;
+ }
- // If we're storing flush data we expect all bits to be used or else
- // we're stuck in the middle of a conversion
- Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow,
- "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
+ private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed)
+ {
+ // The number of UTF-16 code units will never exceed the number of UTF-8 code units,
+ // so the addition at the end of this method will not overflow.
- // Remember our leftover bits.
- decoder.bits = ch;
+ byte* ptrToFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16CodeUnitCountAdjustment, out _);
- baseDecoder._bytesUsed = (int)(pSrc - bytes);
- }
+ int tempBytesConsumed = (int)(ptrToFirstInvalidByte - pBytes);
+ bytesConsumed = tempBytesConsumed;
- // Shouldn't have anything in fallback buffer for GetChars
- // (don't have to check _throwOnOverflow for chars)
- Debug.Assert(fallback == null || fallback.Remaining == 0,
- "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
-
- return PtrDiff(pTarget, chars);
+ return tempBytesConsumed + utf16CodeUnitCountAdjustment;
}
- // During GetChars we had an invalid byte sequence
- // pSrc is backed up to the start of the bad sequence if we didn't have room to
- // fall it back. Otherwise pSrc remains where it is.
- private unsafe bool FallbackInvalidByteSequence(
- ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
+ public override Decoder GetDecoder()
{
- // Get our byte[]
- byte* pStart = pSrc;
- byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
-
- // Do the actual fallback
- if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
- {
- // Oops, it failed, back up to pStart
- pSrc = pStart;
- return false;
- }
-
- // It worked
- return true;
+ return new DecoderNLS(this);
}
- // During GetCharCount we had an invalid byte sequence
- // pSrc is used to find the index that points to the invalid bytes,
- // however the byte[] contains the fallback bytes (in case the index is -1)
- private unsafe int FallbackInvalidByteSequence(
- byte* pSrc, int ch, DecoderFallbackBuffer fallback)
+
+ public override Encoder GetEncoder()
{
- // Calling GetBytesUnknown can adjust the pSrc pointer but we need to pass the pointer before the adjustment
- // to fallback.InternalFallback. The input pSrc to fallback.InternalFallback will only be used to calculate the
- // index inside bytesUnknown and if we pass the adjusted pointer we can end up with negative index values.
- // We store the original pSrc in pOriginalSrc and then pass pOriginalSrc to fallback.InternalFallback.
- byte* pOriginalSrc = pSrc;
-
- // Get our byte[]
- byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
-
- // Do the actual fallback
- int count = fallback.InternalFallback(bytesUnknown, pOriginalSrc);
-
- // # of fallback chars expected.
- // Note that we only get here for "long" sequences, and have already unreserved
- // the count that we prereserved for the input bytes
- return count;
+ return new EncoderNLS(this);
}
- // Note that some of these bytes may have come from a previous fallback, so we cannot
- // just decrement the pointer and use the values we read. In those cases we have
- // to regenerate the original values.
- private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
- {
- // Get our byte[]
- byte[] bytesUnknown = null;
+ //
+ // Beginning of methods used by shared fallback logic.
+ //
- // See if it was a plain char
- // (have to check >= 0 because we have all sorts of wierd bit flags)
- if (ch < 0x100 && ch >= 0)
- {
- pSrc--;
- bytesUnknown = new byte[] { unchecked((byte)ch) };
- }
- // See if its an unfinished 2 byte sequence
- else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
- {
- pSrc--;
- bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
- }
- // So now we're either 2nd byte of 3 or 4 byte sequence or
- // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
- // 1st check if its a 4 byte sequence
- else if ((ch & SupplimentarySeq) != 0)
- {
- // 3rd byte of 4 byte sequence?
- if ((ch & (FinalByte >> 6)) != 0)
- {
- // 3rd byte of 4 byte sequence
- pSrc -= 3;
- bytesUnknown = new byte[] {
- unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
- unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
- unchecked((byte)(((ch) & 0x3F) | 0x80)) };
- }
- else if ((ch & (FinalByte >> 12)) != 0)
- {
- // 2nd byte of a 4 byte sequence
- pSrc -= 2;
- bytesUnknown = new byte[] {
- unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
- unchecked((byte)(((ch) & 0x3F) | 0x80)) };
- }
- else
- {
- // 4th byte of a 4 byte sequence
- pSrc--;
- bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
- }
- }
- else
- {
- // 2nd byte of 3 byte sequence?
- if ((ch & (FinalByte >> 6)) != 0)
- {
- // So its 2nd byte of a 3 byte sequence
- pSrc -= 2;
- bytesUnknown = new byte[] {
- unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
- }
- else
- {
- // 1st byte of a 3 byte sequence
- pSrc--;
- bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
- }
- }
+ internal sealed override bool TryGetByteCount(Rune value, out int byteCount)
+ {
+ // All well-formed Rune instances can be converted to 1..4 UTF-8 code units.
- return bytesUnknown;
+ byteCount = value.Utf8SequenceLength;
+ return true;
}
-
- public override Decoder GetDecoder()
+ internal sealed override OperationStatus EncodeRune(Rune value, Span<byte> bytes, out int bytesWritten)
{
- return new UTF8Decoder(this);
- }
+ // All well-formed Rune instances can be encoded as 1..4 UTF-8 code units.
+ // If there's an error, it's because the destination was too small.
+ return value.TryEncodeToUtf8(bytes, out bytesWritten) ? OperationStatus.Done : OperationStatus.DestinationTooSmall;
+ }
- public override Encoder GetEncoder()
+ internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
{
- return new UTF8Encoder(this);
+ return Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
}
+ //
+ // End of methods used by shared fallback logic.
+ //
public override int GetMaxByteCount(int charCount)
{
@@ -2571,62 +858,5 @@ namespace System.Text
return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
}
-
- private sealed class UTF8Encoder : EncoderNLS
- {
- // We must save a high surrogate value until the next call, looking
- // for a low surrogate value.
- internal int surrogateChar;
-
- public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
- {
- // base calls reset
- }
-
- public override void Reset()
-
- {
- this.surrogateChar = 0;
- if (_fallbackBuffer != null)
- _fallbackBuffer.Reset();
- }
-
- // Anything left in our encoder?
- internal override bool HasState
- {
- get
- {
- return (this.surrogateChar != 0);
- }
- }
- }
-
- private sealed class UTF8Decoder : DecoderNLS
- {
- // We'll need to remember the previous information. See the comments around definition
- // of FinalByte for details.
- internal int bits;
-
- public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
- {
- // base calls reset
- }
-
- public override void Reset()
- {
- this.bits = 0;
- if (_fallbackBuffer != null)
- _fallbackBuffer.Reset();
- }
-
- // Anything left in our decoder?
- internal override bool HasState
- {
- get
- {
- return (this.bits != 0);
- }
- }
- }
}
}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs
new file mode 100644
index 0000000000..40e818e2b6
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs
@@ -0,0 +1,411 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Numerics;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf16Utility
+    {
+#if DEBUG
+        // Debug-only sanity check that the 'nint' / 'nuint' aliases declared at the top of this
+        // file have the native pointer width and the expected signedness.
+        static Utf16Utility()
+        {
+            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
+            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
+        }
+#endif // DEBUG
+
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <param name="pInputBuffer">
+        /// Pointer to the first UTF-16 code unit to validate. May be null only if
+        /// <paramref name="inputLength"/> is zero.
+        /// </param>
+        /// <param name="inputLength">Number of chars readable from <paramref name="pInputBuffer"/>; must not be negative.</param>
+        /// <param name="utf8CodeUnitCountAdjustment">
+        /// On return, the value to add to the number of UTF-16 code units consumed in order to get
+        /// the number of UTF-8 code units required to encode them.
+        /// </param>
+        /// <param name="scalarCountAdjustment">
+        /// On return, the value to add to the number of UTF-16 code units consumed in order to get
+        /// the number of Unicode scalar values they represent (decremented once per surrogate pair).
+        /// </param>
+        /// <returns>
+        /// A pointer to the first char that is not part of well-formed UTF-16 data (an unpaired
+        /// surrogate), or &amp;pInputBuffer[inputLength] if the entire buffer is well-formed.
+        /// </returns>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+        /// </remarks>
+        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            // First, we'll handle the common case of all-ASCII. If this is able to
+            // consume the entire buffer, we'll skip the remainder of this method's logic.
+
+            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
+            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
+
+            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
+            inputLength -= numAsciiCharsConsumedJustNow;
+
+            if (inputLength == 0)
+            {
+                utf8CodeUnitCountAdjustment = 0;
+                scalarCountAdjustment = 0;
+                return pInputBuffer;
+            }
+
+            // If we got here, it means we saw some non-ASCII data, so within our
+            // vectorized code paths below we'll handle all non-surrogate UTF-16
+            // code points branchlessly. We'll only branch if we see surrogates.
+            //
+            // We still optimistically assume the data is mostly ASCII. This means that the
+            // number of UTF-8 code units and the number of scalars almost matches the number
+            // of UTF-16 code units. As we go through the input and find non-ASCII
+            // characters, we'll keep track of these "adjustment" fixups. To get the
+            // total number of UTF-8 code units required to encode the input data, add
+            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
+            // seen. To get the total number of scalars present in the input data,
+            // add the scalar count adjustment to the number of UTF-16 code units seen.
+
+            long tempUtf8CodeUnitCountAdjustment = 0;
+            int tempScalarCountAdjustment = 0;
+
+            if (Sse2.IsSupported)
+            {
+                if (inputLength >= Vector128<ushort>.Count)
+                {
+                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
+                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
+                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
+                    Vector128<ushort> vectorZero = Vector128<ushort>.Zero;
+
+                    do
+                    {
+                        Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
+                        uint mask;
+
+                        // The 'charIsNonAscii' vector we're about to build will have the 0x8000 or the 0x0080
+                        // bit set (but not both!) only if the corresponding input char is non-ASCII. Which of
+                        // the two bits is set doesn't matter, as will be explained in the diagram a few lines
+                        // below.
+
+                        Vector128<ushort> charIsNonAscii;
+                        if (Sse41.IsSupported)
+                        {
+                            // sets 0x0080 bit if corresponding char element is >= 0x0080
+                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
+                        }
+                        else
+                        {
+                            // sets 0x8000 bit if corresponding char element is >= 0x0080
+                            charIsNonAscii = Sse2.AndNot(vector0080, Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 7)));
+                        }
+
+#if DEBUG
+                        // Quick check to ensure we didn't accidentally set both 0x8080 bits in any element.
+                        uint debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
+                        Debug.Assert((debugMask & (debugMask << 1)) == 0, "Two set bits shouldn't occur adjacent to each other in this mask.");
+#endif // DEBUG
+
+                        // sets 0x8080 bits if corresponding char element is >= 0x0800
+                        Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
+
+                        mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
+
+                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
+                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
+                        //
+                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
+                        //
+                        //            ,-- set if char[1] is non-ASCII
+                        //            |   ,-- set if char[0] is non-ASCII
+                        //            v   v
+                        // mask = ... 1 1 1 0
+                        //              ^   ^-- set if char[0] is >= 0x0800
+                        //              `-- set if char[1] is >= 0x0800
+                        //
+                        // (If the SSE4.1 code path is taken above, the meaning of the odd and even
+                        // bits are swapped, but the logic below otherwise holds.)
+                        //
+                        // This means we can popcnt the number of set bits, and the result is the
+                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
+                        // it expands. This results in the wrong count for UTF-16 surrogate code
+                        // units (we just counted that each individual code unit expands to 3 bytes,
+                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
+                        // We'll handle this in just a moment.
+                        //
+                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
+                        // cumulative UTF-8 adjustment factor once we determine that there are no
+                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
+                        // our computed result and we'd have to throw it away.)
+
+                        uint popcnt = (uint)BitOperations.PopCount(mask);
+
+                        // Surrogates need to be special-cased for two reasons: (a) we need
+                        // to account for the fact that we over-counted in the addition above;
+                        // and (b) they require separate validation.
+
+                        utf16Data = Sse2.Add(utf16Data, vectorA800);
+                        mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
+
+                        if (mask != 0)
+                        {
+                            // There's at least one UTF-16 surrogate code unit present.
+                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
+                            // the resulting bits of 'mask' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
+                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
+                            //
+                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
+                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
+                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
+                            // our surrogate code units will now have the bit pattern [ 10000q## ######## ].
+                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
+                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
+                            // determine whether a given char was a high or a low surrogate.
+                            //
+                            // Therefore the resulting bits of 'mask2' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
+                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
+                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
+
+                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
+
+                            uint lowSurrogatesMask = mask2 & mask; // 01 only if was a low surrogate char, else 00
+
+                            // Flip every even-numbered bit (00 -> 01, 01 -> 00), then AND with 'mask' so that
+                            // the garbage bits 'mask2' holds for non-surrogate chars are discarded. Masking with
+                            // 0x5555 instead of 'mask' would let a non-surrogate char such as U+0400 (whose
+                            // garbage even bit is set) masquerade as a high surrogate and "pair up" with a
+                            // standalone low surrogate that follows it.
+                            uint highSurrogatesMask = (mask2 ^ 0x5555u) & mask; // 01 only if was a high surrogate char, else 00
+
+                            // Now check that each high surrogate is followed by a low surrogate and that each
+                            // low surrogate follows a high surrogate. We make an exception for the case where
+                            // the final char of the vector is a high surrogate, since we can't perform validation
+                            // on it until the next iteration of the loop when we hope to consume the matching
+                            // low surrogate.
+
+                            highSurrogatesMask <<= 2;
+                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
+                            {
+                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                            }
+
+                            if (highSurrogatesMask > ushort.MaxValue)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
+                                popcnt -= 2; // the stray high surrogate (>= 0x0800) contributed 2 set bits to popcnt above; back them out
+                                pInputBuffer--;
+                                inputLength++;
+                            }
+
+                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
+                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
+                            // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
+                            // 64-bit extension a few lines below.
+                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
+                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
+                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
+                            // perform this adjustment now.
+
+                            if (IntPtr.Size == 8)
+                            {
+                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
+                                // sub + sub. It's more efficient than shl + sub.
+                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
+                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
+                            }
+                            else
+                            {
+                                // Take the hit of the 64-bit extension now.
+                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
+                            }
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt;
+                        pInputBuffer += Vector128<ushort>.Count;
+                        inputLength -= Vector128<ushort>.Count;
+                    } while (inputLength >= Vector128<ushort>.Count);
+                }
+            }
+            else if (Vector.IsHardwareAccelerated)
+            {
+                if (inputLength >= Vector<ushort>.Count)
+                {
+                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
+                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
+                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
+                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
+
+                    do
+                    {
+                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
+                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
+                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
+                        // vectors, each element of the sum will contain one of three values:
+                        //
+                        // 0x0000 ( 0) = original char was 0000..007F
+                        // 0xFFFF (-1) = original char was 0080..07FF
+                        // 0xFFFE (-2) = original char was 0800..FFFF
+                        //
+                        // We'll negate them to produce a value 0..2 for each element, then sum all the
+                        // elements together to produce the number of *additional* UTF-8 code units
+                        // required to represent this UTF-16 data. This is similar to the popcnt step
+                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
+                        // handle that shortly.
+
+                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
+                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
+                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
+                        Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+
+                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
+                        // which should halve the number of operations we must perform.
+
+                        nuint popcnt = 0;
+                        for (int i = 0; i < Vector<nuint>.Count; i++)
+                        {
+                            popcnt += sumVector[i];
+                        }
+
+                        uint popcnt32 = (uint)popcnt;
+                        if (IntPtr.Size == 8)
+                        {
+                            popcnt32 += (uint)(popcnt >> 32);
+                        }
+
+                        // As in the SSE2 code path, compute popcnt but don't fold it in until we
+                        // know there aren't any unpaired surrogates in the input data.
+
+                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
+
+                        // Now check for surrogates.
+
+                        utf16Data -= vectorD800;
+                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
+                        if (surrogateChars != Vector<ushort>.Zero)
+                        {
+                            // There's at least one surrogate (high or low) UTF-16 code unit in
+                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
+                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
+                            // UTF-16 code unit was a high or low surrogate, respectively.
+
+                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
+                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
+
+                            // We want to make sure that each high surrogate code unit is followed by
+                            // a low surrogate code unit and each low surrogate code unit follows a
+                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
+                            // or palignr available to us, we'll do this as a loop. We won't look at
+                            // the very last high surrogate char element since we don't yet know if
+                            // the next vector read will have a low surrogate char element.
+
+                            // The loop below only compares element i against element i + 1, so it never
+                            // examines a low surrogate in element 0. Such a char is always unpaired: a high
+                            // surrogate at the end of the previous vector is never consumed there (the
+                            // buffer is rewound so that it becomes element 0 of this vector instead).
+                            if (lowSurrogateChars[0] != 0)
+                            {
+                                // pInputBuffer points at the offending char and nothing from this
+                                // vector has been tallied yet, so we can report the failure directly.
+                                goto Error; // error: vector begins with a standalone low surrogate char
+                            }
+
+                            ushort surrogatePairsCount = 0;
+                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
+                            {
+                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
+                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
+                                {
+                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                                }
+                            }
+
+                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                pInputBuffer--;
+                                inputLength++;
+                                popcnt32 -= 2;
+                            }
+
+                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
+                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
+                            // so we'll adjust this now.
+
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt32;
+                        pInputBuffer += Vector<ushort>.Count;
+                        inputLength -= Vector<ushort>.Count;
+                    } while (inputLength >= Vector<ushort>.Count);
+                }
+            }
+
+        NonVectorizedLoop:
+
+            // Vectorization isn't supported on our current platform, or the input was too small to benefit
+            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
+            // drain remaining valid chars before we report failure.
+
+            for (; inputLength > 0; pInputBuffer++, inputLength--)
+            {
+                uint thisChar = pInputBuffer[0];
+                if (thisChar <= 0x7F)
+                {
+                    continue;
+                }
+
+                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
+                // This optimistically assumes no surrogates, which we'll handle shortly.
+
+                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
+
+                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    continue;
+                }
+
+                // Found a surrogate char. Back out the adjustment we made above, then
+                // try to consume the entire surrogate pair all at once. We won't bother
+                // trying to interpret the surrogate pair as a scalar value; we'll only
+                // validate that its bit pattern matches what's expected for a surrogate pair.
+
+                tempUtf8CodeUnitCountAdjustment -= 2;
+
+                if (inputLength == 1)
+                {
+                    goto Error; // input buffer too small to read a surrogate pair
+                }
+
+                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
+                {
+                    goto Error; // not a well-formed surrogate pair
+                }
+
+                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
+                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
+
+                pInputBuffer++; // consumed one extra char
+                inputLength--;
+            }
+
+        Error:
+
+            // Also used for normal return.
+
+            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
+            scalarCountAdjustment = tempScalarCountAdjustment;
+            return pInputBuffer;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs
index bed39057e4..828776b436 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Utf16Utility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.cs
@@ -5,7 +5,7 @@
using System.Runtime.CompilerServices;
using System.Diagnostics;
-namespace System.Text
+namespace System.Text.Unicode
{
internal static partial class Utf16Utility
{
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
index 6c8197d22b..b4cae379e2 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs
@@ -4,6 +4,8 @@
using System.Buffers;
using System.Diagnostics;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
namespace System.Text.Unicode
{
@@ -37,79 +39,87 @@ namespace System.Text.Unicode
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
- public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+ public static unsafe OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
- int originalSourceLength = source.Length;
- int originalDestinationLength = destination.Length;
- OperationStatus status = OperationStatus.Done;
+ // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437
- // In a loop, this is going to read and transcode one scalar value at a time
- // from the source to the destination.
+ _ = source.Length;
+ _ = destination.Length;
- while (!source.IsEmpty)
+ fixed (char* pOriginalSource = &MemoryMarshal.GetReference(source))
+ fixed (byte* pOriginalDestination = &MemoryMarshal.GetReference(destination))
{
- status = Rune.DecodeFromUtf16(source, out Rune firstScalarValue, out int charsConsumed);
+ // We're going to bulk transcode as much as we can in a loop, iterating
+ // every time we see bad data that requires replacement.
- switch (status)
+ OperationStatus operationStatus = OperationStatus.Done;
+ char* pInputBufferRemaining = pOriginalSource;
+ byte* pOutputBufferRemaining = pOriginalDestination;
+
+ while (!source.IsEmpty)
{
- case OperationStatus.NeedMoreData:
-
- // Input buffer ended with a high surrogate. Only treat this as an error
- // if the caller told us that we shouldn't expect additional data in a
- // future call.
-
- if (!isFinalBlock)
- {
- goto Finish;
- }
-
- status = OperationStatus.InvalidData;
- goto case OperationStatus.InvalidData;
-
- case OperationStatus.InvalidData:
-
- // Input buffer contained invalid data. If the caller told us not to
- // perform U+FFFD replacement, terminate the loop immediately and return
- // an error to the caller.
-
- if (!replaceInvalidSequences)
- {
- goto Finish;
- }
-
- firstScalarValue = Rune.ReplacementChar;
- goto default;
-
- default:
-
- // We know which scalar value we need to transcode to UTF-8.
- // Do so now, and only terminate the loop if we ran out of space
- // in the destination buffer.
-
- if (firstScalarValue.TryEncodeToUtf8(destination, out int bytesWritten))
- {
- source = source.Slice(charsConsumed); // don't use Rune.Utf8SequenceLength; we may have performed substitution
- destination = destination.Slice(bytesWritten);
- status = OperationStatus.Done; // forcibly set success
- continue;
- }
- else
- {
- status = OperationStatus.DestinationTooSmall;
- goto Finish;
- }
+ // We've pinned the spans at the entry point to this method.
+ // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+ operationStatus = Utf8Utility.TranscodeToUtf8(
+ pInputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+ inputLength: source.Length,
+ pOutputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+ outputBytesRemaining: destination.Length,
+ pInputBufferRemaining: out pInputBufferRemaining,
+ pOutputBufferRemaining: out pOutputBufferRemaining);
+
+ // If we finished the operation entirely or we ran out of space in the destination buffer,
+ // or if we need more input data and the caller told us that there's possibly more data
+ // coming, return immediately.
+
+ if (operationStatus <= OperationStatus.DestinationTooSmall
+ || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+ {
+ break;
+ }
+
+ // We encountered invalid data, or we need more data but the caller told us we're
+ // at the end of the stream. In either case treat this as truly invalid.
+ // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+ if (!replaceInvalidSequences)
+ {
+ operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+ break;
+ }
+
+ // We're going to attempt to write U+FFFD to the destination buffer.
+ // Do we even have enough space to do so?
+
+ destination = destination.Slice((int)(pOutputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+ if (2 >= (uint)destination.Length)
+ {
+ operationStatus = OperationStatus.DestinationTooSmall;
+ break;
+ }
+
+ destination[0] = 0xEF; // U+FFFD = [ EF BF BD ] in UTF-8
+ destination[1] = 0xBF;
+ destination[2] = 0xBD;
+ destination = destination.Slice(3);
+
+ // Invalid UTF-16 sequences are always of length 1. Just skip the next character.
+
+ source = source.Slice((int)(pInputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))) + 1);
+
+ operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+ pInputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+ pOutputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
}
- }
-
- Finish:
- numCharsRead = originalSourceLength - source.Length;
- numBytesWritten = originalDestinationLength - destination.Length;
+ // Not possible to make any further progress - report to our caller how far we got.
- Debug.Assert((status == OperationStatus.Done) == (numCharsRead == originalSourceLength),
- "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");
-
- return status;
+ charsRead = (int)(pInputBufferRemaining - pOriginalSource);
+ bytesWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+ return operationStatus;
+ }
}
/// <summary>
@@ -120,79 +130,92 @@ namespace System.Text.Unicode
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
- public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+ public static unsafe OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
- int originalSourceLength = source.Length;
- int originalDestinationLength = destination.Length;
- OperationStatus status = OperationStatus.Done;
+ // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437
+
+ _ = source.Length;
+ _ = destination.Length;
- // In a loop, this is going to read and transcode one scalar value at a time
- // from the source to the destination.
+ // We'll be mutating these values throughout our loop.
- while (!source.IsEmpty)
+ fixed (byte* pOriginalSource = &MemoryMarshal.GetReference(source))
+ fixed (char* pOriginalDestination = &MemoryMarshal.GetReference(destination))
{
- status = Rune.DecodeFromUtf8(source, out Rune firstScalarValue, out int bytesConsumed);
+ // We're going to bulk transcode as much as we can in a loop, iterating
+ // every time we see bad data that requires replacement.
- switch (status)
+ OperationStatus operationStatus = OperationStatus.Done;
+ byte* pInputBufferRemaining = pOriginalSource;
+ char* pOutputBufferRemaining = pOriginalDestination;
+
+ while (!source.IsEmpty)
{
- case OperationStatus.NeedMoreData:
-
- // Input buffer ended with a partial UTF-8 sequence. Only treat this as an error
- // if the caller told us that we shouldn't expect additional data in a
- // future call.
-
- if (!isFinalBlock)
- {
- goto Finish;
- }
-
- status = OperationStatus.InvalidData;
- goto case OperationStatus.InvalidData;
-
- case OperationStatus.InvalidData:
-
- // Input buffer contained invalid data. If the caller told us not to
- // perform U+FFFD replacement, terminate the loop immediately and return
- // an error to the caller.
-
- if (!replaceInvalidSequences)
- {
- goto Finish;
- }
-
- firstScalarValue = Rune.ReplacementChar;
- goto default;
-
- default:
-
- // We know which scalar value we need to transcode to UTF-16.
- // Do so now, and only terminate the loop if we ran out of space
- // in the destination buffer.
-
- if (firstScalarValue.TryEncodeToUtf16(destination, out int charsWritten))
- {
- source = source.Slice(bytesConsumed); // don't use Rune.Utf16SequenceLength; we may have performed substitution
- destination = destination.Slice(charsWritten);
- status = OperationStatus.Done; // forcibly set success
- continue;
- }
- else
- {
- status = OperationStatus.DestinationTooSmall;
- goto Finish;
- }
+ // We've pinned the spans at the entry point to this method.
+ // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+ operationStatus = Utf8Utility.TranscodeToUtf16(
+ pInputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+ inputLength: source.Length,
+ pOutputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+ outputCharsRemaining: destination.Length,
+ pInputBufferRemaining: out pInputBufferRemaining,
+ pOutputBufferRemaining: out pOutputBufferRemaining);
+
+ // If we finished the operation entirely or we ran out of space in the destination buffer,
+ // or if we need more input data and the caller told us that there's possibly more data
+ // coming, return immediately.
+
+ if (operationStatus <= OperationStatus.DestinationTooSmall
+ || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+ {
+ break;
+ }
+
+ // We encountered invalid data, or we need more data but the caller told us we're
+ // at the end of the stream. In either case treat this as truly invalid.
+ // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+ if (!replaceInvalidSequences)
+ {
+ operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+ break;
+ }
+
+ // We're going to attempt to write U+FFFD to the destination buffer.
+ // Do we even have enough space to do so?
+
+ destination = destination.Slice((int)(pOutputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+ if (destination.IsEmpty)
+ {
+ operationStatus = OperationStatus.DestinationTooSmall;
+ break;
+ }
+
+ destination[0] = (char)UnicodeUtility.ReplacementChar;
+ destination = destination.Slice(1);
+
+ // Now figure out how many bytes of the source we must skip over before we should retry
+ // the operation. This might be more than 1 byte.
+
+ source = source.Slice((int)(pInputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))));
+ Debug.Assert(!source.IsEmpty, "Expected 'Done' if source is fully consumed.");
+
+ Rune.DecodeFromUtf8(source, out _, out int bytesConsumedJustNow);
+ source = source.Slice(bytesConsumedJustNow);
+
+ operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+ pInputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+ pOutputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
}
- }
-
- Finish:
- numBytesRead = originalSourceLength - source.Length;
- numCharsWritten = originalDestinationLength - destination.Length;
+ // Not possible to make any further progress - report to our caller how far we got.
- Debug.Assert((status == OperationStatus.Done) == (numBytesRead == originalSourceLength),
- "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");
-
- return status;
+ numBytesRead = (int)(pInputBufferRemaining - pOriginalSource);
+ numCharsWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+ return operationStatus;
+ }
}
}
}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
new file mode 100644
index 0000000000..ab29fbe7a6
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -0,0 +1,863 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers.Binary;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+namespace System.Text.Unicode
+{
+ internal static partial class Utf8Utility
+ {
+ /// <summary>
+ /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the
+ /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractCharFromFirstThreeByteSequence(uint value)
+ {
+ Debug.Assert(UInt32BeginsWithUtf8ThreeByteMask(value));
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ]
+ return ((value & 0x003F_0000u) >> 16)
+ | ((value & 0x0000_3F00u) >> 2)
+ | ((value & 0x0000_000Fu) << 12);
+ }
+ else
+ {
+ // value = [ 1110zzzz 10yyyyyy 10xxxxxx | ######## ]
+ return ((value & 0x0F00_0000u) >> 12)
+ | ((value & 0x003F_0000u) >> 10)
+ | ((value & 0x0000_3F00u) >> 8);
+ }
+ }
+
+ /// <summary>
+ /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the
+ /// first two bytes as a two-byte UTF-8 subsequence and returns the UTF-16 representation.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractCharFromFirstTwoByteSequence(uint value)
+ {
+ Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value) && !UInt32BeginsWithOverlongUtf8TwoByteSequence(value));
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // value = [ ######## ######## | 10xxxxxx 110yyyyy ]
+ // The "110" and "10" markers are not masked off; instead their known values
+ // (0xC0 for the leading byte, shifted along with it, and 0x80 for the trailing byte)
+ // are subtracted back out of the sum.
+ uint leadingByte = (uint)(byte)value << 6;
+ return (uint)(byte)(value >> 8) + leadingByte - (0xC0u << 6) - 0x80u; // remove header bits
+ }
+ else
+ {
+ // value = [ 110yyyyy 10xxxxxx | ######## ######## ]
+ return (char)(((value & 0x1F00_0000u) >> 18) | ((value & 0x003F_0000u) >> 16));
+ }
+ }
+
+ /// <summary>
+ /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
+ /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+ private static uint ExtractCharsFromFourByteSequence(uint value)
+ {
+ if (BitConverter.IsLittleEndian)
+ {
+ if (Bmi2.IsSupported)
+ {
+ // need to reverse endianness for bit manipulation to work correctly
+ value = BinaryPrimitives.ReverseEndianness(value);
+
+ // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+ // want to return [ 110111yyyyxxxxxx 110110wwwwzzzzyy ]
+ // where wwww = uuuuu - 1
+
+ uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
+ uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
+
+ uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
+ combined -= 0x40u; // wwww = uuuuu - 1
+ combined += 0xDC00_D800u; // add surrogate markers
+ return combined;
+ }
+ else
+ {
+ // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+ // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+ // where wwww = uuuuu - 1
+ uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
+ retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
+ retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
+ retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
+ retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
+ retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
+ retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
+ retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
+ retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+ return retVal;
+ }
+ }
+ else
+ {
+ // input is UTF8 [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+ // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ]
+ // where wwww = uuuuu - 1
+ uint retVal = value & 0xFF00_0000u; // retVal = [ 11110uuu 00000000 00000000 00000000 ]
+ retVal |= (value & 0x003F_0000u) << 2; // retVal = [ 11110uuu uuzzzz00 00000000 00000000 ]
+ retVal |= (value & 0x0000_3000u) << 4; // retVal = [ 11110uuu uuzzzzyy 00000000 00000000 ]
+ retVal |= (value & 0x0000_0F00u) >> 2; // retVal = [ 11110uuu uuzzzzyy 000000yy yy000000 ]
+ retVal |= (value & 0x0000_003Fu); // retVal = [ 11110uuu uuzzzzyy 000000yy yyxxxxxx ]
+ retVal -= 0x2000_0000u; // retVal = [ 11010uuu uuzzzzyy 000000yy yyxxxxxx ]
+ retVal -= 0x0040_0000u; // retVal = [ 110100ww wwzzzzyy 000000yy yyxxxxxx ]
+ retVal += 0x0000_DC00u; // retVal = [ 110100ww wwzzzzyy 110111yy yyxxxxxx ]
+ retVal += 0x0800_0000u; // retVal = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ]
+ return retVal;
+ }
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents a valid packed UTF-16 surrogate pair, all in machine-endian order,
+ /// returns the packed 4-byte UTF-8 representation of this scalar value, also in machine-endian order.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
+ {
+ Debug.Assert(IsWellFormedUtf16SurrogatePair(value));
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
+ // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
+
+ if (Bmi2.IsSupported)
+ {
+ // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
+ // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
+ // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
+ // all four output bytes.
+
+ uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
+
+ // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
+ // that should normally be masked out via an and, but we'll just direct pdep to ignore it.
+
+ uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
+ return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+ }
+ else
+ {
+ value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
+
+ uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
+ tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
+
+ uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
+ uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
+ tempC |= tempB;
+
+ uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
+ tempD |= 0x8080_80F0u;
+
+ return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+ }
+ }
+ else
+ {
+ // input = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ], where wwww = uuuuu - 1
+ // must return [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ], where wwww = uuuuu - 1
+
+ value -= 0xD800_DC00u; // = [ 000000wwwwzzzzyy 000000yyyyxxxxxx ]
+ value += 0x0040_0000u; // = [ 00000uuuuuzzzzyy 000000yyyyxxxxxx ]
+
+ uint tempA = value & 0x0700_0000u; // = [ 00000uuu 00000000 00000000 00000000 ]
+ uint tempB = (value >> 2) & 0x003F_0000u; // = [ 00000000 00uuzzzz 00000000 00000000 ]
+ tempB |= tempA;
+
+ uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ]
+ uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ] (the two high 'y' bits sit at bits 16..17 and must land on bits 12..13)
+ tempD |= tempC;
+
+ uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
+ return (tempE | tempB | tempD); // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+ }
+ }
+
+ /// <summary>
+ /// Given a machine-endian DWORD which represents two adjacent UTF-8 two-byte sequences,
+ /// returns the machine-endian DWORD representation of that same data as two adjacent
+ /// UTF-16 chars.
+ /// </summary>
+ /// <param name="value">Machine-endian DWORD holding two adjacent two-byte UTF-8 sequences.</param>
+ /// <returns>Machine-endian DWORD holding the two decoded UTF-16 chars; the high and low WORD positions are not swapped.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value)
+ {
+ // We don't want to swap the position of the high and low WORDs,
+ // as the buffer was read in machine order and will be written in
+ // machine order.
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // value = [ 10xxxxxx 110yyyyy | 10xxxxxx 110yyyyy ]
+ return ((value & 0x3F003F00u) >> 8) | ((value & 0x001F001Fu) << 6);
+ }
+ else
+ {
+ // value = [ 110yyyyy 10xxxxxx | 110yyyyy 10xxxxxx ]
+ return ((value & 0x1F001F00u) >> 2) | (value & 0x003F003Fu);
+ }
+ }
+
+ /// <summary>
+ /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences,
+ /// returns the machine-endian DWORD representation of that same data as two
+ /// adjacent UTF-8 two-byte sequences.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value)
+ {
+ // stays in machine endian
+
+ Debug.Assert(IsFirstCharTwoUtf8Bytes(value) && IsSecondCharTwoUtf8Bytes(value));
+
+ // In both branches the two masked operands and the marker constant occupy
+ // disjoint bits, so '+' merges them without any carry.
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
+ // want to return [ 10XXXXXX 110YYYYY 10xxxxxx 110yyyyy ]
+
+ return ((value >> 6) & 0x001F_001Fu) + ((value << 8) & 0x3F00_3F00u) + 0x80C0_80C0u;
+ }
+ else
+ {
+ // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ]
+ // want to return [ 110YYYYY 10XXXXXX 110yyyyy 10xxxxxx ]
+
+ return ((value << 2) & 0x1F00_1F00u) + (value & 0x003F_003Fu) + 0xC080_C080u;
+ }
+ }
+
+ /// <summary>
+ /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences,
+ /// returns the machine-endian DWORD representation of the first UTF-16 char
+ /// as a UTF-8 two-byte sequence packed into a WORD and zero-extended to DWORD.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
+ {
+ // stays in machine endian
+
+ Debug.Assert(IsFirstCharTwoUtf8Bytes(value));
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // value = [ ######## ######## 00000yyy yyxxxxxx ]
+ // want to return [ ######## ######## 10xxxxxx 110yyyyy ]
+
+ uint temp = (value << 2) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ]
+ value &= 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]
+ return BinaryPrimitives.ReverseEndianness((ushort)(temp + value + 0xC080u)); // [ 00000000 00000000 10xxxxxx 110yyyyy ]
+ }
+ else
+ {
+ // value = [ 00000yyy yyxxxxxx ######## ######## ]
+ // want to return [ ######## ######## 110yyyyy 10xxxxxx ]
+
+ uint temp = (value >> 16) & 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ]
+ value = (value >> 14) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ] ('yyyyy' moves from bits 22..26 down to bits 8..12)
+ return value + temp + 0xC080u;
+ }
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character is ASCII.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharAscii(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0000..007F ].
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0000..007F ].
+
+ // Little-endian: AAAA is the low WORD, so it is ASCII iff bits 7..15 of the DWORD are all clear.
+ // Big-endian: AAAA is the high WORD, so the whole DWORD is below 0x0080_0000 iff AAAA is below 0080.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0)
+ || (!BitConverter.IsLittleEndian && value < 0x0080_0000u);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+ /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
+
+ // Little-endian: AAAA is the low WORD; it is >= 0800 iff any of its bits 11..15 is set.
+ // Big-endian: AAAA is the high WORD, so the whole DWORD is >= 0x0800_0000 iff AAAA >= 0800.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
+ || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharSurrogate(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
+
+ // Little-endian: subtracting D800 maps [ D800..DFFF ] in the low WORD onto [ 0000..07FF ]; the 0xF800 mask
+ // then checks that bits 11..15 of the low WORD are clear and ignores the high WORD entirely.
+ // Big-endian: the unsigned subtraction wraps around when AAAA is below D800, so a single unsigned
+ // compare against 0x0800_0000 covers both ends of the range.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
+ || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharTwoUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
+
+ // Little-endian: subtracting 0080 and masking to the low WORD maps [ 0080..07FF ] onto [ 0000..077F ],
+ // so one unsigned compare against 0780 performs the range check while ignoring the high WORD.
+ // Big-endian: AAAA is the high WORD; the upper bound 0x07FF_FFFF has an all-ones low WORD so that
+ // the second char cannot influence the result (assumes IsInRangeInclusive is an inclusive
+ // lower <= value <= upper check, as its name suggests).
+
+ // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
+ // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
+ || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+ }
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
+ /// is a UTF-8 continuation byte.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsLowByteUtf8ContinuationByte(uint value)
+ {
+ // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
+ // so the best we can do for now is the lea / cmp pair.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ // Subtracting 0x80 and truncating to a byte maps low bytes [ 80..BF ] onto [ 00..3F ];
+ // every other low byte ends up in [ 40..FF ] and fails the compare.
+
+ return (byte)(value - 0x80u) <= 0x3Fu;
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character is ASCII.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharAscii(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
+
+ // Little-endian: BBBB is the high WORD, so the whole DWORD is below 0x0080_0000 iff BBBB is below 0080.
+ // Big-endian: BBBB is the low WORD, so it is ASCII iff bits 7..15 of the DWORD are all clear.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
+ || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+ /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ].
+
+ // In both cases the mask selects bits 11..15 of the WORD that holds BBBB;
+ // BBBB is >= 0800 iff any of those bits is set.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
+ || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharSurrogate(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
+
+ // Little-endian: the unsigned subtraction wraps around when BBBB is below D800, so a single unsigned
+ // compare against 0x0800_0000 covers both ends of the range.
+ // Big-endian: subtracting D800 maps [ D800..DFFF ] in the low WORD onto [ 0000..07FF ]; the 0xF800 mask
+ // then checks that bits 11..15 of the low WORD are clear and ignores the high WORD entirely.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
+ || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharTwoUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
+
+ // Little-endian: BBBB is the high WORD; the upper bound 0x07FF_FFFF has an all-ones low WORD so that
+ // the first char cannot influence the result (assumes IsInRangeInclusive is an inclusive
+ // lower <= value <= upper check, as its name suggests).
+ // Big-endian: subtracting 0080 and masking to the low WORD maps [ 0080..07FF ] onto [ 0000..077F ],
+ // so one unsigned compare against 0780 performs the range check while ignoring the high WORD.
+
+ // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
+ // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
+ || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+ }
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
+ /// i.e., has binary representation 10xxxxxx, where x is any bit.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsUtf8ContinuationByte(in byte value)
+ {
+ // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
+ // directly rather than bounce a temporary through a register. That is, we want the JIT to be
+ // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
+ // to see if it's a continuation byte. Data that's already enregistered will go through the
+ // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
+ //
+ // The below check takes advantage of the two's complement representation of negative numbers.
+ // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ]
+
+ return ((sbyte)value < -64);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that holds two packed UTF-16 characters in machine-endian order,
+ /// returns <see langword="true"/> iff the first character is a high surrogate and the second
+ /// is a low surrogate, i.e. the two together form a well-formed UTF-16 surrogate pair.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsWellFormedUtf16SurrogatePair(uint value)
+ {
+     // Little-endian: value = [ LLLL HHHH ]; need HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
+     // Big-endian:    value = [ HHHH LLLL ]; need HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
+     //
+     // Both halves are range-checked at once. After subtracting the start of each allowed range,
+     // a half is valid iff the result is "< 0x0400". Two CMPs can't be done in parallel, but
+     // because 0x0400 is a power of two, "x < 0x0400" is the same as "(x & 0xFC00) == 0", and two
+     // such TESTs *can* be combined into one. The whole check is then "add/lea; test; jcc".
+
+     const uint rangeStartsLittleEndian = 0xDC00_D800u; // low surrogate start (high half), high surrogate start (low half)
+     const uint rangeStartsBigEndian = 0xD800_DC00u; // high surrogate start (high half), low surrogate start (low half)
+     const uint outOfRangeBits = 0xFC00_FC00u; // any bit set here means a half was not within 0x0400 of its range start
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((value - rangeStartsLittleEndian) & outOfRangeBits) == 0)
+         || (!BitConverter.IsLittleEndian && ((value - rangeStartsBigEndian) & outOfRangeBits) == 0);
+ }
+
+ /// <summary>
+ /// Converts a DWORD from machine-endian to little-endian byte order.
+ /// </summary>
+ /// <param name="value">The DWORD in the byte order of the current machine.</param>
+ /// <returns>
+ /// <paramref name="value"/> unchanged on a little-endian machine; its byte-reversed form otherwise.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ToLittleEndian(uint value)
+ {
+     return BitConverter.IsLittleEndian
+         ? value
+         : BinaryPrimitives.ReverseEndianness(value);
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, and whose
+ /// first two bytes already match the 2-byte sequence mask, returns <see langword="true"/> iff
+ /// those two bytes are an overlong encoding of a value that should be represented as one byte.
+ /// This method *does not* itself validate the 2-byte sequence mask
+ /// (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>); it only asserts it in debug builds.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
+ {
+     // PRECONDITION: the caller has already matched the '110yyyyy 10xxxxxx' mask.
+     Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
+
+     // Per Table 3-7, the lead byte of a two-byte sequence must be within C2 .. DF.
+     // The mask check has already restricted it to C0 .. DF, so the sequence is overlong
+     // exactly when the lead byte is below C2. The lead byte is the low-order byte on
+     // little-endian and the high-order byte on big-endian.
+
+     const uint smallestValidLeadByte = 0xC2u;
+     const uint smallestValidLeadByteInTopByte = 0xC200_0000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((byte)value < smallestValidLeadByte))
+         || (!BitConverter.IsLittleEndian && (value < smallestValidLeadByteInTopByte));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first four bytes of the buffer match
+ /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
+ /// method *does not* validate that the sequence is well-formed; the caller must
+ /// still perform overlong form or out-of-range checking.
+ /// </summary>
+ /// <param name="value">Four bytes of UTF-8 input, loaded as a machine-endian DWORD.</param>
+ /// <returns>
+ /// <see langword="true"/> if the lead byte is 11110www and each of the next three bytes is 10xxxxxx;
+ /// otherwise <see langword="false"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
+ {
+     // The code in this method is equivalent to the code
+     // below but is slightly more optimized.
+     //
+     // if (BitConverter.IsLittleEndian)
+     // {
+     //     const uint mask = 0xC0C0C0F8U;
+     //     const uint comparand = 0x808080F0U;
+     //     return ((value & mask) == comparand);
+     // }
+     // else
+     // {
+     //     const uint mask = 0xF8C0C0C0U;
+     //     const uint comparand = 0xF0808080U;
+     //     return ((value & mask) == comparand);
+     // }
+     //
+     // "(value - comparand) & mask" is zero exactly when "(value & mask) == comparand": a byte
+     // that is below its comparand byte wraps around and leaves at least one masked bit set in
+     // that same byte, so a borrow can never hide a mismatch.
+     //
+     // All three trailing bytes are continuation bytes, so the comparand carries 0x80 in each of
+     // those positions on both endiannesses. (The big-endian comparand previously had 0x00 in
+     // the last position, which tested the fourth byte for 00xxxxxx instead of 10xxxxxx.)
+
+     // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+     return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
+         || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first three bytes of the buffer have the shape
+ /// of a UTF-8 3-byte sequence, [ 1110zzzz 10yyyyyy 10xxxxxx ]. The fourth byte is ignored.
+ /// This method *does not* validate that the sequence is well-formed; the caller must
+ /// still perform overlong form or surrogate checking.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
+ {
+     // Logically this is "(value & mask) == comparand" with the constants below, written as
+     // "((value - comparand) & mask) == 0", which is slightly more optimized. The first buffer
+     // byte is the low-order byte on little-endian and the high-order byte on big-endian; the
+     // byte position of the ignored fourth byte has a zero mask.
+
+     const uint maskLittleEndian = 0x00C0_C0F0u;
+     const uint comparandLittleEndian = 0x0080_80E0u;
+     const uint maskBigEndian = 0xF0C0_C000u;
+     const uint comparandBigEndian = 0xE080_8000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && (((value - comparandLittleEndian) & maskLittleEndian) == 0))
+         || (!BitConverter.IsLittleEndian && (((value - comparandBigEndian) & maskBigEndian) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first two bytes of the buffer have the shape
+ /// of a UTF-8 2-byte sequence, [ 110yyyyy 10xxxxxx ]. The third and fourth bytes are ignored.
+ /// This method *does not* validate that the sequence is well-formed; the caller must
+ /// still perform overlong form checking.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
+ {
+     // Logically this is "(value & mask) == comparand" with the constants below, written as
+     // "((value - comparand) & mask) == 0", which is slightly more optimized. The first two
+     // buffer bytes are the low-order WORD on little-endian and the high-order WORD on big-endian.
+
+     const uint maskLittleEndian = 0x0000_C0E0u;
+     const uint comparandLittleEndian = 0x0000_80C0u;
+     const uint maskBigEndian = 0xE0C0_0000u;
+     const uint comparandBigEndian = 0xC080_0000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && (((value - comparandLittleEndian) & maskLittleEndian) == 0))
+         || (!BitConverter.IsLittleEndian && (((value - comparandBigEndian) & maskBigEndian) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, and whose
+ /// last two bytes already match the 2-byte sequence mask, returns <see langword="true"/> iff
+ /// those last two bytes are an overlong encoding of a value that should be represented as one byte.
+ /// This method *does not* itself validate the 2-byte sequence mask
+ /// (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>); it only asserts it in debug builds.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
+ {
+     // PRECONDITION: the caller has already matched the '110yyyyy 10xxxxxx' mask on the last two bytes.
+     Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
+
+     // Per Table 3-7, the lead byte of a two-byte sequence must be within C2 .. DF.
+     // The mask check has already restricted it to C0 .. DF.
+     //   C2 = 1100 0010
+     //   DF = 1101 1111
+     // Within C0 .. DF only C0 and C1 have all of the bits 0001 1110 (1E) clear, so ANDing the
+     // lead byte with 1E yields zero exactly when the sequence is overlong. The lead byte of the
+     // trailing sequence is the third buffer byte: bits 16..23 on little-endian, bits 8..15 on
+     // big-endian.
+
+     const uint leadByteTestBitsLittleEndian = 0x001E_0000u;
+     const uint leadByteTestBitsBigEndian = 0x1E00u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((value & leadByteTestBitsLittleEndian) == 0))
+         || (!BitConverter.IsLittleEndian && ((value & leadByteTestBitsBigEndian) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the last two bytes of the buffer have the shape
+ /// of a UTF-8 2-byte sequence, [ 110yyyyy 10xxxxxx ]. The first and second bytes are ignored.
+ /// This method *does not* validate that the sequence is well-formed; the caller must
+ /// still perform overlong form checking.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
+ {
+     // Logically this is "(value & mask) == comparand" with the constants below, written as
+     // "((value - comparand) & mask) == 0", which is slightly more optimized. The last two
+     // buffer bytes are the high-order WORD on little-endian and the low-order WORD on big-endian.
+
+     const uint maskLittleEndian = 0xC0E0_0000u;
+     const uint comparandLittleEndian = 0x80C0_0000u;
+     const uint maskBigEndian = 0x0000_E0C0u;
+     const uint comparandBigEndian = 0x0000_C080u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && (((value - comparandLittleEndian) & maskLittleEndian) == 0))
+         || (!BitConverter.IsLittleEndian && (((value - comparandBigEndian) & maskBigEndian) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+ /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
+ /// UTF-8 two-byte sequence. The mask check and the overlong check are folded into a
+ /// single comparison. Returns <see langword="false"/> if running on a big-endian machine
+ /// (and asserts in debug builds, since callers are expected to check endianness first).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+ {
+     // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
+     // Read as a little-endian DWORD, that is:
+     //   [ ######## ######## 10xxxxxx 110yyyyy ].
+     // Keeping all 8 bits of the lead byte and only the top 2 bits of the continuation byte
+     // (mask [ 11000000 11111111 ]) leaves a value that must fall within
+     // [ 10000000_11000010, 10000000_11011111 ]. One range check therefore covers both the
+     // 2-byte-sequence bitmask and the overlong-form validation.
+
+     Debug.Assert(BitConverter.IsLittleEndian);
+
+     const uint lowWordMask = 0xC0FFu;
+     const uint smallestValidSequence = 0x80C2u;
+     const uint largestValidSequence = 0x80DFu;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & lowWordMask, smallestValidSequence, largestValidSequence))
+         || (!BitConverter.IsLittleEndian && false);
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+ /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
+ /// UTF-8 two-byte sequence. The mask check and the overlong check are folded into a
+ /// single comparison. Returns <see langword="false"/> if running on a big-endian machine
+ /// (and asserts in debug builds, since callers are expected to check endianness first).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+ {
+     // Same technique as UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian, applied to the
+     // high-order WORD: keep the whole lead byte (bits 16..23) plus the top two bits of the
+     // continuation byte (bits 30..31), then range-check against [ C2..DF ] [ 10xxxxxx ].
+
+     Debug.Assert(BitConverter.IsLittleEndian);
+
+     const uint highWordMask = 0xC0FF_0000u;
+     const uint smallestValidSequence = 0x80C2_0000u;
+     const uint largestValidSequence = 0x80DF_0000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & highWordMask, smallestValidSequence, largestValidSequence))
+         || (!BitConverter.IsLittleEndian && false);
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first byte of the buffer is ASCII
+ /// (its high bit is clear).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32FirstByteIsAscii(uint value)
+ {
+     // Little-endian: the first buffer byte is the low-order byte, so test bit 7.
+     // Big-endian: the first buffer byte is the high-order byte, so its high bit is the
+     // DWORD's sign bit; "non-negative as an int" means that bit is clear.
+
+     const uint highBitOfLowOrderByte = 0x80u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((value & highBitOfLowOrderByte) == 0))
+         || (!BitConverter.IsLittleEndian && ((int)value >= 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII
+ /// (its high bit is clear).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32FourthByteIsAscii(uint value)
+ {
+     // Little-endian: the fourth buffer byte is the high-order byte, so its high bit is the
+     // DWORD's sign bit; "non-negative as an int" means that bit is clear.
+     // Big-endian: the fourth buffer byte is the low-order byte, so test bit 7.
+
+     const uint highBitOfLowOrderByte = 0x80u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((int)value >= 0))
+         || (!BitConverter.IsLittleEndian && ((value & highBitOfLowOrderByte) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the second byte of the buffer is ASCII
+ /// (its high bit is clear).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32SecondByteIsAscii(uint value)
+ {
+     // Little-endian: the second buffer byte occupies bits 8..15, so its high bit is bit 15.
+     // Big-endian: the second buffer byte occupies bits 16..23, so its high bit is bit 23.
+
+     const uint highBitOfBits8To15 = 0x8000u;
+     const uint highBitOfBits16To23 = 0x0080_0000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((value & highBitOfBits8To15) == 0))
+         || (!BitConverter.IsLittleEndian && ((value & highBitOfBits16To23) == 0));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the third byte of the buffer is ASCII
+ /// (its high bit is clear).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32ThirdByteIsAscii(uint value)
+ {
+     // Little-endian: the third buffer byte occupies bits 16..23, so its high bit is bit 23.
+     // Big-endian: the third buffer byte occupies bits 8..15, so its high bit is bit 15.
+
+     const uint highBitOfBits16To23 = 0x0080_0000u;
+     const uint highBitOfBits8To15 = 0x8000u;
+
+     // The "(LE && x) || (!LE && y)" shape of the return statement works around
+     // https://github.com/dotnet/coreclr/issues/914; keep it as one expression.
+
+     return (BitConverter.IsLittleEndian && ((value & highBitOfBits16To23) == 0))
+         || (!BitConverter.IsLittleEndian && ((value & highBitOfBits8To15) == 0));
+ }
+
+ /// <summary>
+ /// Given a DWORD which represents a buffer of 4 ASCII bytes, zero-extends each byte to a
+ /// 16-bit WORD and writes the resulting QWORD (4 chars) to the destination with machine endianness.
+ /// </summary>
+ /// <param name="outputBuffer">Reference to the first of the 4 chars that are written.</param>
+ /// <param name="value">Four bytes as read from the input buffer, in machine-endian order.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
+ private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
+ {
+     if (Bmi2.X64.IsSupported)
+     {
+         // Deposit each source byte into the low byte of its own 16-bit lane and store all
+         // four lanes with one unaligned 64-bit write.
+         // BMI2 will work regardless of the processor's endianness.
+         ulong fourWidenedChars = Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
+         Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), fourWidenedChars);
+         return;
+     }
+
+     // Scalar fallback: peel the bytes off one at a time. After shifting right by 24 only the
+     // top byte remains, so that store needs no (byte) truncation.
+
+     if (BitConverter.IsLittleEndian)
+     {
+         // The low-order byte is the first byte of the buffer.
+         outputBuffer = (char)(byte)value;
+         Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
+         Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
+         Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
+     }
+     else
+     {
+         // The low-order byte is the last byte of the buffer.
+         Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
+         Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
+         Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
+         outputBuffer = (char)(value >> 24);
+     }
+ }
+
+ /// <summary>
+ /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
+ /// converts those scalar values to their 3-byte UTF-8 representation and writes the
+ /// resulting 6 bytes to the destination buffer.
+ /// </summary>
+ /// <param name="outputBuffer">Reference to the first of the 6 destination bytes that are written.</param>
+ /// <param name="value">
+ /// Two packed UTF-16 code units. Each half is expected to be in 0800..D7FF or E000..FFFF;
+ /// this is only asserted in debug builds.
+ /// </param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
+ {
+ // Neither half may be a surrogate, nor a value that fits in fewer than three UTF-8 bytes.
+ Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
+ Debug.Assert(IsSecondCharAtLeastThreeUtf8Bytes(value) && !IsSecondCharSurrogate(value), "Second half of value should've been 0800..D7FF or E000..FFFF");
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // Lowercase letters are the bits of the first char (low half), uppercase the bits of the second char (high half).
+ // value = [ ZZZZYYYY YYXXXXXX zzzzyyyy yyxxxxxx ]
+ // want to write [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ] [ 10XXXXXX 10YYYYYY ]
+ // (a DWORD holding output bytes 0..3 followed by a WORD holding output bytes 4..5, each shown high-order byte first)
+
+ // tempA gathers the two continuation-byte payloads of the first char; tempB gathers the two lead-byte payloads.
+ // The payload fields never overlap each other or the marker bits, so '+' below acts as a bitwise OR.
+ uint tempA = ((value << 2) & 0x3F00u) | ((value & 0x3Fu) << 16); // = [ 00000000 00xxxxxx 00yyyyyy 00000000 ]
+ uint tempB = ((value >> 4) & 0x0F00_0000u) | ((value >> 12) & 0x0Fu); // = [ 0000ZZZZ 00000000 00000000 0000zzzz ]
+ Unsafe.WriteUnaligned<uint>(ref outputBuffer, tempA + tempB + 0xE080_80E0u); // = [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ]
+ Unsafe.WriteUnaligned<ushort>(ref Unsafe.Add(ref outputBuffer, 4), (ushort)(((value >> 22) & 0x3Fu) + ((value >> 8) & 0x3F00u) + 0x8080u)); // = [ 10XXXXXX 10YYYYYY ]
+ }
+ else
+ {
+ // Lowercase letters are the bits of the first char (high half), uppercase the bits of the second char (low half).
+ // value = [ zzzzyyyy yyxxxxxx ZZZZYYYY YYXXXXXX ]
+ // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ] [ 1110ZZZZ ] [ 10YYYYYY ] [ 10XXXXXX ]
+
+ // Bytes are produced last-to-first. Each '>>=' permanently consumes the bits just written,
+ // so these statements must stay in exactly this order.
+ Unsafe.Add(ref outputBuffer, 5) = (byte)((value & 0x3Fu) | 0x80u);
+ Unsafe.Add(ref outputBuffer, 4) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
+ Unsafe.Add(ref outputBuffer, 3) = (byte)(((value >>= 6) & 0x0Fu) | 0xE0u);
+ Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >>= 4) & 0x3Fu) | 0x80u);
+ Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >>= 6) & 0x3Fu) | 0x80u);
+ // Only the four 'zzzz' bits remain at this point, so no mask is needed.
+ outputBuffer = (byte)((value >>= 6) | 0xE0u);
+ }
+ }
+
+
+ /// <summary>
+ /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
+ /// converts the first UTF-16 value to its 3-byte UTF-8 representation and writes the
+ /// resulting 3 bytes to the destination buffer. The second UTF-16 value is ignored.
+ /// </summary>
+ /// <param name="outputBuffer">Reference to the first of the 3 destination bytes that are written.</param>
+ /// <param name="value">
+ /// Two packed UTF-16 code units. The first is expected to be in 0800..D7FF or E000..FFFF;
+ /// this is only asserted in debug builds.
+ /// </param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
+ {
+     Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
+
+     if (!BitConverter.IsLittleEndian)
+     {
+         // value = [ zzzzyyyy yyxxxxxx ######## ######## ]
+         // want to write [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ]
+
+         // Move the first char down to the low 16 bits, then emit the bytes last-to-first.
+         uint firstChar = value >> 16;
+         Unsafe.Add(ref outputBuffer, 2) = (byte)((firstChar & 0x3Fu) | 0x80u);
+         Unsafe.Add(ref outputBuffer, 1) = (byte)(((firstChar >> 6) & 0x3Fu) | 0x80u);
+         outputBuffer = (byte)((firstChar >> 12) | 0xE0u); // only 'zzzz' remains after the shift, so no mask is needed
+         return;
+     }
+
+     // value = [ ######## ######## zzzzyyyy yyxxxxxx ]
+     // want to write [ 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
+     // (a WORD holding output bytes 0..1, shown high-order byte first, followed by output byte 2)
+
+     uint middlePayload = (value << 2) & 0x3F00u; // [ 00yyyyyy 00000000 ]
+     uint leadPayload = ((uint)(ushort)value >> 12); // [ 00000000 0000zzzz ]
+
+     // The payload fields and the marker bits never overlap, so '+' acts as a bitwise OR here.
+     ushort leadAndMiddleBytes = (ushort)(middlePayload + leadPayload + 0x80E0u); // [ 10yyyyyy 1110zzzz ]
+     Unsafe.WriteUnaligned<ushort>(ref outputBuffer, leadAndMiddleBytes);
+
+     // The low byte of ~0x7Fu is 0x80; the truncating cast to byte discards the other set bits.
+     Unsafe.Add(ref outputBuffer, 2) = (byte)((value & 0x3Fu) | ~0x7Fu); // [ 10xxxxxx ]
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
new file mode 100644
index 0000000000..3b83a24559
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -0,0 +1,1480 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Buffers.Binary;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+ internal static unsafe partial class Utf8Utility
+ {
+#if DEBUG
+ // Debug-only type initializer: sanity-checks that the 'nint' / 'nuint' using-aliases declared
+ // at the top of this file are pointer-sized and have the expected signedness.
+ static Utf8Utility()
+ {
+ // nint must be as wide as IntPtr and signed (its MinValue is negative).
+ Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
+ // nuint must be as wide as IntPtr and unsigned (its MinValue is zero).
+ Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
+
+ // NOTE(review): defined outside this chunk; presumably validates the nint / nuint aliases
+ // declared in the other partial-class files -- confirm against its definition.
+ _ValidateAdditionalNIntDefinitions();
+ }
+#endif // DEBUG
+
+ // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+ // the next byte would have been consumed from / the next char would have been written to.
+ // inputLength in bytes, outputCharsRemaining in chars.
+ [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
+ {
+ Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+ Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+ Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
+ Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+ // First, try vectorized conversion.
+
+ {
+ nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
+
+ pInputBuffer += numElementsConverted;
+ pOutputBuffer += numElementsConverted;
+
+ // Quick check - did we just end up consuming the entire input buffer?
+ // If so, short-circuit the remainder of the method.
+
+ if ((int)numElementsConverted == inputLength)
+ {
+ pInputBufferRemaining = pInputBuffer;
+ pOutputBufferRemaining = pOutputBuffer;
+ return OperationStatus.Done;
+ }
+
+ inputLength -= (int)numElementsConverted;
+ outputCharsRemaining -= (int)numElementsConverted;
+ }
+
+ if (inputLength < sizeof(uint))
+ {
+ goto ProcessInputOfLessThanDWordSize;
+ }
+
+ byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;
+
+ // Begin the main loop.
+
+#if DEBUG
+ byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+ while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
+
+ uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ AfterReadDWord:
+
+#if DEBUG
+ Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+ pLastBufferPosProcessed = pInputBuffer;
+#endif
+ // First, check for the common case of all-ASCII bytes.
+
+ if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+ {
+ // We read an all-ASCII sequence.
+
+ if (outputCharsRemaining < sizeof(uint))
+ {
+ goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
+ }
+
+ Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+ pInputBuffer += 4;
+ pOutputBuffer += 4;
+ outputCharsRemaining -= 4;
+
+ // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+ // Below is basically unrolled loops with poor man's vectorization.
+
+ uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+ uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
+ uint secondDWord;
+ int i;
+ for (i = 0; (uint)i < maxIters; i++)
+ {
+ // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));
+
+ if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
+ {
+ goto LoopTerminatedEarlyDueToNonAsciiData;
+ }
+
+ pInputBuffer += 8;
+
+ Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
+ Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+
+ pOutputBuffer += 8;
+ }
+
+ outputCharsRemaining -= 8 * i;
+
+ continue; // need to perform a bounds check because we might be running out of data
+
+ LoopTerminatedEarlyDueToNonAsciiData:
+
+ if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+ {
+ // The first DWORD contained all-ASCII bytes, so expand it.
+
+ Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+
+ // continue the outer loop from the second DWORD
+
+ Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
+ thisDWord = secondDWord;
+
+ pInputBuffer += 4;
+ pOutputBuffer += 4;
+ outputCharsRemaining -= 4;
+ }
+
+ outputCharsRemaining -= 8 * i;
+
+ // We know that there's *at least* one DWORD of data remaining in the buffer.
+ // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
+
+ goto AfterReadDWordSkipAllBytesAsciiCheck;
+ }
+
+ AfterReadDWordSkipAllBytesAsciiCheck:
+
+ Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+ // Next, try stripping off ASCII bytes one at a time.
+ // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
+
+ if (UInt32FirstByteIsAscii(thisDWord))
+ {
+ if (outputCharsRemaining >= 3)
+ {
+ // Fast-track: we don't need to check the destination length for subsequent
+ // ASCII bytes since we know we can write them all now.
+
+ uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+ nuint adjustment = 1;
+ pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
+
+ if (UInt32SecondByteIsAscii(thisDWord))
+ {
+ adjustment++;
+ thisDWordLittleEndian >>= 8;
+ pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
+
+ if (UInt32ThirdByteIsAscii(thisDWord))
+ {
+ adjustment++;
+ thisDWordLittleEndian >>= 8;
+ pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
+ }
+ }
+
+ pInputBuffer += adjustment;
+ pOutputBuffer += adjustment;
+ outputCharsRemaining -= (int)adjustment;
+ }
+ else
+ {
+ // Slow-track: we need to make sure each individual write has enough
+ // of a buffer so that we don't overrun the destination.
+
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+ pInputBuffer++;
+ *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+ outputCharsRemaining--;
+
+ if (UInt32SecondByteIsAscii(thisDWord))
+ {
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ pInputBuffer++;
+ thisDWordLittleEndian >>= 8;
+ *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+
+ // We can perform a small optimization here. We know at this point that
+ // the output buffer is fully consumed (we read two ASCII bytes and wrote
+ // two ASCII chars, and we checked earlier that the destination buffer
+ // can't store a third byte). If the next byte is ASCII, we can jump straight
+ // to the return statement since the end-of-method logic only relies on the
+ // destination buffer pointer -- NOT the output chars remaining count -- being
+ // correct. If the next byte is not ASCII, we'll need to continue with the
+ // rest of the main loop, but we can set the buffer length directly to zero
+ // rather than decrementing it from 1 to 0.
+
+ Debug.Assert(outputCharsRemaining == 1);
+
+ if (UInt32ThirdByteIsAscii(thisDWord))
+ {
+ goto OutputBufferTooSmall;
+ }
+ else
+ {
+ outputCharsRemaining = 0;
+ }
+ }
+ }
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
+ }
+ else
+ {
+ // The input buffer at the current offset contains a non-ASCII byte.
+ // Read an entire DWORD and fall through to multi-byte consumption logic.
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ }
+ }
+
+ BeforeProcessTwoByteSequence:
+
+ // At this point, we know we're working with a multi-byte code unit,
+ // but we haven't yet validated it.
+
+ // The masks and comparands are derived from the Unicode Standard, Table 3-6.
+ // Additionally, we need to check for valid byte sequences per Table 3-7.
+
+ // Check the 2-byte case.
+
+ if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+ {
+ // Per Table 3-7, valid sequences are:
+ // [ C2..DF ] [ 80..BF ]
+
+ if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+ {
+ goto Error;
+ }
+
+ ProcessTwoByteSequenceSkipOverlongFormCheck:
+
+ // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
+ // there's a good chance that if we see one two-byte run then there's another two-byte
+ // run immediately after. Let's check that now.
+
+ // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
+ // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
+ // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
+
+ if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+ || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+ {
+ // We have two runs of two bytes each.
+
+ if (outputCharsRemaining < 2)
+ {
+ goto ProcessRemainingBytesSlow; // running out of output buffer
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
+
+ pInputBuffer += 4;
+ pOutputBuffer += 2;
+ outputCharsRemaining -= 2;
+
+ if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+ // also two bytes. Check for that first before going back to the beginning of the loop.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ if (BitConverter.IsLittleEndian)
+ {
+ if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+ {
+ // The next sequence is a valid two-byte sequence.
+ goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+ }
+ }
+ else
+ {
+ if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+ {
+ if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+ {
+ goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
+ }
+
+ goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+ }
+ }
+
+ // If we reached this point, the next sequence is something other than a valid
+ // two-byte sequence, so go back to the beginning of the loop.
+ goto AfterReadDWord;
+ }
+ else
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+ }
+ }
+
+ // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+ // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+ // bytes are ASCII?
+
+ uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
+
+ if (UInt32ThirdByteIsAscii(thisDWord))
+ {
+ if (UInt32FourthByteIsAscii(thisDWord))
+ {
+ if (outputCharsRemaining < 3)
+ {
+ goto ProcessRemainingBytesSlow; // running out of output buffer
+ }
+
+ pOutputBuffer[0] = (char)charToWrite;
+ if (BitConverter.IsLittleEndian)
+ {
+ thisDWord >>= 16;
+ pOutputBuffer[1] = (char)(byte)thisDWord;
+ thisDWord >>= 8;
+ pOutputBuffer[2] = (char)thisDWord;
+ }
+ else
+ {
+ pOutputBuffer[2] = (char)(byte)thisDWord;
+ pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
+ }
+ pInputBuffer += 4;
+ pOutputBuffer += 3;
+ outputCharsRemaining -= 3;
+
+ continue; // go back to original bounds check and check for ASCII
+ }
+ else
+ {
+ if (outputCharsRemaining < 2)
+ {
+ goto ProcessRemainingBytesSlow; // running out of output buffer
+ }
+
+ pOutputBuffer[0] = (char)charToWrite;
+ pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
+ pInputBuffer += 3;
+ pOutputBuffer += 2;
+ outputCharsRemaining -= 2;
+
+ // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
+ // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
+
+ if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ goto BeforeProcessTwoByteSequence;
+ }
+ }
+ }
+ else
+ {
+ if (outputCharsRemaining == 0)
+ {
+ goto ProcessRemainingBytesSlow; // running out of output buffer
+ }
+
+ pOutputBuffer[0] = (char)charToWrite;
+ pInputBuffer += 2;
+ pOutputBuffer += 1;
+ outputCharsRemaining--;
+
+ if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+ }
+ }
+ }
+
+ // Check the 3-byte case.
+
+ BeforeProcessThreeByteSequence:
+
+ if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+ {
+ ProcessThreeByteSequenceWithCheck:
+
+ // We need to check for overlong or surrogate three-byte sequences.
+ //
+ // Per Table 3-7, valid sequences are:
+ // [ E0 ] [ A0..BF ] [ 80..BF ]
+ // [ E1..EC ] [ 80..BF ] [ 80..BF ]
+ // [ ED ] [ 80..9F ] [ 80..BF ]
+ // [ EE..EF ] [ 80..BF ] [ 80..BF ]
+ //
+ // Big-endian examples of using the above validation table:
+ // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
+ // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
+ // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
+ // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
+ // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // The "overlong or surrogate" check can be implemented using a single jump, but there's
+ // some overhead to moving the bits into the correct locations in order to perform the
+ // correct comparison, and in practice the processor's branch prediction capability is
+ // good enough that we shouldn't bother. So we'll use two jumps instead.
+
+ // Can't extract this check into its own helper method because JITter produces suboptimal
+ // assembly, even with aggressive inlining.
+
+ // Code below becomes 5 instructions: test, jz, lea, test, jz
+
+ if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
+ {
+ goto Error; // overlong or surrogate
+ }
+ }
+ else
+ {
+ if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
+ {
+ goto Error; // overlong or surrogate
+ }
+ }
+
+ // At this point, we know the incoming scalar is well-formed.
+
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall; // not enough space in the destination buffer to write
+ }
+
+ // As an optimization, on compatible platforms check if a second three-byte sequence immediately
+ // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
+
+ if (Bmi2.X64.IsSupported)
+ {
+ Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
+
+ // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
+ // would indicate the potential start of a second three-byte sequence.
+
+ if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
+ {
+ // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
+ // the final place where we can safely perform a DWORD read, and we want to probe whether it's
+ // safe to read a DWORD beginning at address &pInputBuffer[3].
+
+ if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
+ {
+ // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
+ // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
+ // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
+ // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
+ // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
+ // fail, and redirect control flow to the error handling logic at the very end of this method.
+
+ uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
+
+ if (UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
+ && ((secondDWord & 0x0000_200Fu) != 0)
+ && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
+ {
+ // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
+ ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
+ thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
+
+ // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
+ ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
+ pInputBuffer += 6;
+ pOutputBuffer += 2;
+ outputCharsRemaining -= 2;
+
+ // Drain any ASCII data following the second three-byte sequence.
+
+ goto CheckForAsciiByteAfterThreeByteSequence;
+ }
+ }
+ }
+ }
+
+ // Couldn't extract 2x three-byte sequences together, just do this one by itself.
+
+ *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+ pInputBuffer += 3;
+ pOutputBuffer += 1;
+ outputCharsRemaining -= 1;
+
+ CheckForAsciiByteAfterThreeByteSequence:
+
+ // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+ // in to the text. If this happens strip it off now before seeing if the next character
+ // consists of three code units.
+
+ if (UInt32FourthByteIsAscii(thisDWord))
+ {
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ if (BitConverter.IsLittleEndian)
+ {
+ *pOutputBuffer = (char)(thisDWord >> 24);
+ }
+ else
+ {
+ *pOutputBuffer = (char)(byte)thisDWord;
+ }
+
+ pInputBuffer += 1;
+ pOutputBuffer += 1;
+ outputCharsRemaining -= 1;
+ }
+
+ if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ // Optimization: A three-byte character could indicate CJK text, which makes it likely
+ // that the character following this one is also CJK. We'll check for a three-byte sequence
+ // marker now and jump directly to three-byte sequence processing if we see one, skipping
+ // all of the logic at the beginning of the loop.
+
+ if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+ {
+ goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
+ }
+ else
+ {
+ goto AfterReadDWord; // probably ASCII punctuation or whitespace
+ }
+ }
+ else
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+ }
+ }
+
+ // Assume the 4-byte case, but we need to validate.
+
+ {
+ // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
+ //
+ // Per Table 3-7, valid sequences are:
+ // [ F0 ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
+ // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
+ // [ F4 ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
+
+ if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
+ {
+ goto Error;
+ }
+
+ // Now check for overlong / out-of-range sequences.
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
+ // We want to get the 'w' byte in front of the 'z' byte so that we can perform
+ // a single range comparison. We'll take advantage of the fact that the JITter
+ // can detect a ROR / ROL operation, then we'll just zero out the bytes that
+ // aren't involved in the range check.
+
+ uint toCheck = thisDWord & 0x0000_FFFFu;
+
+ // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
+
+ toCheck = BitOperations.RotateRight(toCheck, 8);
+
+ // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
+
+ if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
+ {
+ goto Error;
+ }
+ }
+ else
+ {
+ if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
+ {
+ goto Error;
+ }
+ }
+
+ // Validation complete.
+
+ if (outputCharsRemaining < 2)
+ {
+ // There's no point to falling back to the "drain the input buffer" logic, since we know
+ // we can't write anything to the destination. So we'll just exit immediately.
+ goto OutputBufferTooSmall;
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));
+
+ pInputBuffer += 4;
+ pOutputBuffer += 2;
+ outputCharsRemaining -= 2;
+
+ continue; // go back to beginning of loop for processing
+ }
+ }
+
+ ProcessRemainingBytesSlow:
+ inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+
+ ProcessInputOfLessThanDWordSize:
+ while (inputLength > 0)
+ {
+ uint firstByte = pInputBuffer[0];
+ if (firstByte <= 0x7Fu)
+ {
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ // 1-byte (ASCII) case
+ *pOutputBuffer = (char)firstByte;
+
+ pInputBuffer += 1;
+ pOutputBuffer += 1;
+ inputLength -= 1;
+ outputCharsRemaining -= 1;
+ continue;
+ }
+
+ // Potentially the start of a multi-byte sequence?
+
+ firstByte -= 0xC2u;
+ if ((byte)firstByte <= (0xDFu - 0xC2u))
+ {
+ // Potentially a 2-byte sequence?
+ if (inputLength < 2)
+ {
+ goto InputBufferTooSmall; // out of data
+ }
+
+ uint secondByte = pInputBuffer[1];
+ if (!IsLowByteUtf8ContinuationByte(secondByte))
+ {
+ goto Error; // 2-byte marker not followed by continuation byte
+ }
+
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
+ *pOutputBuffer = (char)asChar;
+
+ pInputBuffer += 2;
+ pOutputBuffer += 1;
+ inputLength -= 2;
+ outputCharsRemaining -= 1;
+ continue;
+ }
+ else if ((byte)firstByte <= (0xEFu - 0xC2u))
+ {
+ // Potentially a 3-byte sequence?
+ if (inputLength >= 3)
+ {
+ uint secondByte = pInputBuffer[1];
+ uint thirdByte = pInputBuffer[2];
+ if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
+ {
+ goto Error; // 3-byte marker not followed by 2 continuation bytes
+ }
+
+ // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
+ // We account for this in the comparisons below.
+
+ uint partialChar = (firstByte << 12) + (secondByte << 6);
+ if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
+ {
+ goto Error; // this is an overlong encoding; fail
+ }
+
+ partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); //if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
+ if (partialChar < (0x0800u /* number of code points in UTF-16 surrogate code point range */))
+ {
+ goto Error; // attempted to encode a UTF-16 surrogate code point; fail
+ }
+
+ if (outputCharsRemaining == 0)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ // Now restore the full scalar value.
+
+ partialChar += thirdByte;
+ partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
+ partialChar -= 0x80u; // remove third byte continuation marker
+
+ *pOutputBuffer = (char)partialChar;
+
+ pInputBuffer += 3;
+ pOutputBuffer += 1;
+ inputLength -= 3;
+ outputCharsRemaining -= 1;
+ continue;
+ }
+ else if (inputLength >= 2)
+ {
+ uint secondByte = pInputBuffer[1];
+ if (!IsLowByteUtf8ContinuationByte(secondByte))
+ {
+ goto Error; // 3-byte marker not followed by continuation byte
+ }
+
+ // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
+ // from just the first two bytes.
+
+ uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+ if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
+ {
+ goto Error; // failed overlong check
+ }
+ if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
+ {
+ goto Error; // failed surrogate check
+ }
+ }
+
+ goto InputBufferTooSmall; // out of data
+ }
+ else if ((byte)firstByte <= (0xF4u - 0xC2u))
+ {
+ // Potentially a 4-byte sequence?
+
+ if (inputLength < 2)
+ {
+ goto InputBufferTooSmall; // ran out of data
+ }
+
+ uint nextByte = pInputBuffer[1];
+ if (!IsLowByteUtf8ContinuationByte(nextByte))
+ {
+ goto Error; // 4-byte marker not followed by a continuation byte
+ }
+
+ uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+ if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
+ {
+ goto Error; // failed overlong / out-of-range check
+ }
+
+ if (inputLength < 3)
+ {
+ goto InputBufferTooSmall; // ran out of data
+ }
+
+ if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
+ {
+ goto Error; // third byte in 4-byte sequence not a continuation byte
+ }
+
+ if (inputLength < 4)
+ {
+ goto InputBufferTooSmall; // ran out of data
+ }
+
+ if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
+ {
+ goto Error; // fourth byte in 4-byte sequence not a continuation byte
+ }
+
+ // If we read a valid astral scalar value, the only way we could've fallen down this code path
+ // is that we didn't have enough output buffer to write the result.
+
+ goto OutputBufferTooSmall;
+ }
+ else
+ {
+ goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
+ }
+ }
+
+ OperationStatus retVal = OperationStatus.Done;
+ goto ReturnCommon;
+
+ InputBufferTooSmall:
+ retVal = OperationStatus.NeedMoreData;
+ goto ReturnCommon;
+
+ OutputBufferTooSmall:
+ retVal = OperationStatus.DestinationTooSmall;
+ goto ReturnCommon;
+
+ Error:
+ retVal = OperationStatus.InvalidData;
+ goto ReturnCommon;
+
+ ReturnCommon:
+ pInputBufferRemaining = pInputBuffer;
+ pOutputBufferRemaining = pOutputBuffer;
+ return retVal;
+ }
+
+ // Transcodes the UTF-16 chars in pInputBuffer to UTF-8 bytes in pOutputBuffer, stopping at the first char that is ill-formed, incomplete, or doesn't fit in the destination.
+ // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+ // the next char would have been consumed from / the next byte would have been written to.
+ // inputLength is in chars, outputBytesRemaining is in bytes. Returns Done, NeedMoreData (input ends with a lone high surrogate), DestinationTooSmall, or InvalidData (ill-formed surrogate sequence).
+ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
+ {
+ const int CharsPerDWord = sizeof(uint) / sizeof(char);
+
+ Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+ Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+ Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
+ Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+ // First, try vectorized narrowing of ASCII data (at most min(inputLength, outputBytesRemaining) elements); the helper reports how many elements it converted.
+
+ {
+ nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
+
+ pInputBuffer += numElementsConverted;
+ pOutputBuffer += numElementsConverted;
+
+ // Quick check - did we just end up consuming the entire input buffer?
+ // If so, short-circuit the remainder of the method.
+
+ if ((int)numElementsConverted == inputLength)
+ {
+ pInputBufferRemaining = pInputBuffer;
+ pOutputBufferRemaining = pOutputBuffer;
+ return OperationStatus.Done;
+ }
+
+ inputLength -= (int)numElementsConverted;
+ outputBytesRemaining -= (int)numElementsConverted;
+ }
+
+ if (inputLength < CharsPerDWord)
+ {
+ goto ProcessInputOfLessThanDWordSize;
+ }
+
+ char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord; // last position from which a full DWORD (2 chars) can be read without running past the input
+
+ // Begin the main loop.
+
+#if DEBUG
+ char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+ uint thisDWord;
+
+ while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ AfterReadDWord:
+
+#if DEBUG
+ Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+ pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+ // First, check for the common case of all-ASCII chars.
+
+ if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+ {
+ // We read an all-ASCII sequence (2 chars).
+
+ if (outputBytesRemaining < 2)
+ {
+ goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
+ }
+
+ // The high WORD of the local declared below might be populated with garbage
+ // as a result of our shifts below, but that's ok since we're only going to
+ // write the low WORD.
+ //
+ // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+ // (Same logic works regardless of endianness.)
+ uint valueToWrite = thisDWord | (thisDWord >> 8);
+
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
+
+ pInputBuffer += 2;
+ pOutputBuffer += 2;
+ outputBytesRemaining -= 2;
+
+ // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+ // Below is basically unrolled loops with poor man's vectorization.
+
+ uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
+ uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
+
+ if (Bmi2.X64.IsSupported)
+ {
+ Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
+ const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+
+ // Try reading and writing 8 elements per iteration.
+ uint maxIters = minElementsRemaining / 8;
+ ulong firstQWord, secondQWord;
+ int i;
+ for (i = 0; (uint)i < maxIters; i++)
+ {
+ firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+ secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
+
+ if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+ {
+ goto LoopTerminatedDueToNonAsciiData;
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+ pInputBuffer += 8;
+ pOutputBuffer += 8;
+ }
+
+ outputBytesRemaining -= 8 * i;
+
+ // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
+
+ if ((minElementsRemaining & 4) != 0)
+ {
+ secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+
+ if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+ {
+ goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+ pInputBuffer += 4;
+ pOutputBuffer += 4;
+ outputBytesRemaining -= 4;
+ }
+
+ continue; // Go back to beginning of main loop, read data, check for ASCII
+
+ LoopTerminatedDueToNonAsciiData:
+
+ outputBytesRemaining -= 8 * i;
+
+ // First, see if we can drain any ASCII data from the first QWORD.
+
+ if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+ {
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+ pInputBuffer += 4;
+ pOutputBuffer += 4;
+ outputBytesRemaining -= 4;
+ }
+ else
+ {
+ secondQWord = firstQWord;
+ }
+
+ LoopTerminatedDueToNonAsciiDataInSecondQWord:
+
+ Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+
+ thisDWord = (uint)secondQWord;
+ if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+ {
+ // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+ pInputBuffer += 2;
+ pOutputBuffer += 2;
+ outputBytesRemaining -= 2;
+ thisDWord = (uint)(secondQWord >> 32);
+ }
+
+ goto AfterReadDWordSkipAllCharsAsciiCheck;
+ }
+ else
+ {
+ // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+ uint maxIters = minElementsRemaining / 4;
+ uint secondDWord;
+ int i;
+ for (i = 0; (uint)i < maxIters; i++)
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
+
+ if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
+ {
+ goto LoopTerminatedDueToNonAsciiData;
+ }
+
+ // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+ // (Same logic works regardless of endianness.)
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
+
+ pInputBuffer += 4;
+ pOutputBuffer += 4;
+ }
+
+ outputBytesRemaining -= 4 * i;
+
+ continue; // Go back to beginning of main loop, read data, check for ASCII
+
+ LoopTerminatedDueToNonAsciiData:
+
+ outputBytesRemaining -= 4 * i;
+
+ // First, see if we can drain any ASCII data from the first DWORD.
+
+ if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+ {
+ // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+ // (Same logic works regardless of endianness.)
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+ pInputBuffer += 2;
+ pOutputBuffer += 2;
+ outputBytesRemaining -= 2;
+ thisDWord = secondDWord;
+ }
+
+ goto AfterReadDWordSkipAllCharsAsciiCheck;
+ }
+ }
+
+ AfterReadDWordSkipAllCharsAsciiCheck:
+
+ Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+ // Next, try stripping off the first ASCII char if it exists.
+ // We don't check for a second ASCII char since that should have been handled above.
+
+ if (IsFirstCharAscii(thisDWord))
+ {
+ if (outputBytesRemaining == 0)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ if (BitConverter.IsLittleEndian)
+ {
+ pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
+ }
+ else
+ {
+ pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
+ }
+
+ pInputBuffer += 1;
+ pOutputBuffer += 1;
+ outputBytesRemaining -= 1;
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
+ }
+ else
+ {
+ // The input buffer at the current offset contains a non-ASCII char.
+ // Read an entire DWORD and fall through to non-ASCII consumption logic.
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ }
+ }
+
+ // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
+
+ if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+ {
+ TryConsumeMultipleTwoByteSequences:
+
+ // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
+ // a tight loop without falling back to the main loop.
+
+ if (IsSecondCharTwoUtf8Bytes(thisDWord))
+ {
+ // We have two runs of two bytes each.
+
+ if (outputBytesRemaining < 4)
+ {
+ goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
+
+ pInputBuffer += 2;
+ pOutputBuffer += 4;
+ outputBytesRemaining -= 4;
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+ }
+ else
+ {
+ // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+ // also two bytes. Check for that first before going back to the beginning of the loop.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ if (IsFirstCharTwoUtf8Bytes(thisDWord))
+ {
+ // Validated we have a two-byte sequence coming up
+ goto TryConsumeMultipleTwoByteSequences;
+ }
+
+ // If we reached this point, the next sequence is something other than a valid
+ // two-byte sequence, so go back to the beginning of the loop.
+ goto AfterReadDWord;
+ }
+ }
+
+ if (outputBytesRemaining < 2)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
+
+ // The DWORD contains a char that transcodes to a 2-byte sequence followed by a char that doesn't.
+ // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+ // char is ASCII?
+
+ if (IsSecondCharAscii(thisDWord))
+ {
+ if (outputBytesRemaining >= 3)
+ {
+ if (BitConverter.IsLittleEndian)
+ {
+ thisDWord >>= 16;
+ }
+ pOutputBuffer[2] = (byte)thisDWord;
+
+ pInputBuffer += 2;
+ pOutputBuffer += 3;
+ outputBytesRemaining -= 3;
+
+ continue; // go back to original bounds check and check for ASCII
+ }
+ else
+ {
+ pInputBuffer += 1; // the 2-byte sequence for the first char was already written above, so report that char as consumed
+ pOutputBuffer += 2;
+ goto OutputBufferTooSmall;
+ }
+ }
+ else
+ {
+ pInputBuffer += 1;
+ pOutputBuffer += 2;
+ outputBytesRemaining -= 2;
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ goto BeforeProcessThreeByteSequence; // we know the next char isn't ASCII, and it doesn't transcode to a 2-byte sequence (both were checked above)
+ }
+ }
+ }
+
+ // Check the 3-byte case.
+
+ BeforeProcessThreeByteSequence:
+
+ if (!IsFirstCharSurrogate(thisDWord))
+ {
+ // Optimization: A three-byte character could indicate CJK text, which makes it likely
+ // that the character following this one is also CJK. We'll perform the check now
+ // rather than jumping to the beginning of the main loop.
+
+ if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
+ {
+ if (!IsSecondCharSurrogate(thisDWord))
+ {
+ if (outputBytesRemaining < 6)
+ {
+ goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
+ }
+
+ WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
+
+ pInputBuffer += 2;
+ pOutputBuffer += 6;
+ outputBytesRemaining -= 6;
+
+ // Try to remain in the 3-byte processing loop if at all possible.
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+ {
+ goto BeforeProcessThreeByteSequence;
+ }
+ else
+ {
+ // Fall back to standard processing loop since we don't know how to optimize this.
+ goto AfterReadDWord;
+ }
+ }
+ }
+ }
+
+ ConsumeSingleThreeByteRun:
+
+ if (outputBytesRemaining < 3)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
+
+ pInputBuffer += 1;
+ pOutputBuffer += 3;
+ outputBytesRemaining -= 3;
+
+ // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+ // in to the text. If this happens strip it off now before seeing if the next character
+ // transcodes to three UTF-8 bytes.
+
+ if (IsSecondCharAscii(thisDWord))
+ {
+ if (outputBytesRemaining == 0)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ if (BitConverter.IsLittleEndian)
+ {
+ *pOutputBuffer = (byte)(thisDWord >> 16);
+ }
+ else
+ {
+ *pOutputBuffer = (byte)(thisDWord);
+ }
+
+ pInputBuffer += 1;
+ pOutputBuffer += 1;
+ outputBytesRemaining -= 1;
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+ {
+ goto BeforeProcessThreeByteSequence;
+ }
+ else
+ {
+ // Fall back to standard processing loop since we don't know how to optimize this.
+ goto AfterReadDWord;
+ }
+ }
+ }
+
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+ }
+ else
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
+ }
+ }
+
+ // Four byte sequence processing
+
+ if (IsWellFormedUtf16SurrogatePair(thisDWord))
+ {
+ if (outputBytesRemaining < 4)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
+
+ pInputBuffer += 2;
+ pOutputBuffer += 4;
+ outputBytesRemaining -= 4;
+
+ continue; // go back to beginning of loop for processing
+ }
+
+ goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
+ }
+
+ ProcessNextCharAndFinish:
+ inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord; // recompute the number of chars left in the input (0 or 1 at this point)
+
+ ProcessInputOfLessThanDWordSize:
+ Debug.Assert(inputLength < CharsPerDWord);
+
+ if (inputLength == 0)
+ {
+ goto InputBufferFullyConsumed;
+ }
+
+ uint thisChar = *pInputBuffer;
+ goto ProcessFinalChar;
+
+ ProcessOneCharFromCurrentDWordAndFinish:
+ if (BitConverter.IsLittleEndian)
+ {
+ thisChar = thisDWord & 0xFFFFu; // preserve only the first char
+ }
+ else
+ {
+ thisChar = thisDWord >> 16; // preserve only the first char
+ }
+
+ ProcessFinalChar:
+ {
+ if (thisChar <= 0x7Fu)
+ {
+ if (outputBytesRemaining == 0)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ // 1-byte (ASCII) case
+ *pOutputBuffer = (byte)thisChar;
+
+ pInputBuffer += 1;
+ pOutputBuffer += 1;
+ }
+ else if (thisChar < 0x0800u)
+ {
+ if (outputBytesRemaining < 2)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ // 2-byte case
+ pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+ pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
+
+ pInputBuffer += 1;
+ pOutputBuffer += 2;
+ }
+ else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+ {
+ if (outputBytesRemaining < 3)
+ {
+ goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+ }
+
+ // 3-byte case
+ pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+ pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
+ pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
+
+ pInputBuffer += 1;
+ pOutputBuffer += 3;
+ }
+ else if (thisChar <= 0xDBFFu)
+ {
+ // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
+ goto InputBufferTooSmall;
+ }
+ else
+ {
+ // UTF-16 low surrogate code point with no leading data, report error
+ goto Error;
+ }
+ }
+
+ // There are two ways we can end up here. Either we were running low on input data,
+ // or we were running low on space in the destination buffer. If we're running low on
+ // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
+ // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
+ // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
+ // then we didn't modify inputLength since entering the main loop, which means it should
+ // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
+ // which of the two scenarios we're in.
+
+ if (inputLength > 1)
+ {
+ goto OutputBufferTooSmall;
+ }
+
+ InputBufferFullyConsumed:
+ OperationStatus retVal = OperationStatus.Done;
+ goto ReturnCommon;
+
+ InputBufferTooSmall:
+ retVal = OperationStatus.NeedMoreData;
+ goto ReturnCommon;
+
+ OutputBufferTooSmall:
+ retVal = OperationStatus.DestinationTooSmall;
+ goto ReturnCommon;
+
+ Error:
+ retVal = OperationStatus.InvalidData;
+ goto ReturnCommon;
+
+ ReturnCommon:
+ pInputBufferRemaining = pInputBuffer; // report how far the input and output cursors advanced, regardless of which status is returned
+ pOutputBufferRemaining = pOutputBuffer;
+ return retVal;
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
new file mode 100644
index 0000000000..6425ae1da3
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -0,0 +1,737 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+ internal static unsafe partial class Utf8Utility
+ {
+#if DEBUG
+ // Debug-only sanity check that the file-level 'nint' / 'nuint' aliases really are
+ // pointer-sized and have the expected signedness on the platform being compiled for.
+ private static void _ValidateAdditionalNIntDefinitions()
+ {
+ // 'nint' must be pointer-sized and able to represent negative values.
+ bool nintIsSignedAndPointerSized = (IntPtr.Size == sizeof(nint)) && (0 > nint.MinValue);
+ Debug.Assert(nintIsSignedAndPointerSized, "nint is defined incorrectly.");
+
+ // 'nuint' must be pointer-sized and unable to represent negative values.
+ bool nuintIsUnsignedAndPointerSized = (IntPtr.Size == sizeof(nuint)) && (0 == nuint.MinValue);
+ Debug.Assert(nuintIsUnsignedAndPointerSized, "nuint is defined incorrectly.");
+ }
+#endif // DEBUG
+
+ // Returns &inputBuffer[inputLength] if the input buffer is valid.
+ /// <summary>
+ /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+ /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+ /// </summary>
+ /// <param name="pInputBuffer">Pointer to the first byte of the UTF-8 data to validate. May be null only if <paramref name="inputLength"/> is zero.</param>
+ /// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>. Must not be negative.</param>
+ /// <param name="utf16CodeUnitCountAdjustment">
+ /// Receives a zero-or-negative value which, when added to the number of bytes that were validated,
+ /// gives the number of UTF-16 code units those bytes represent. Zero means every validated byte was ASCII.
+ /// </param>
+ /// <param name="scalarCountAdjustment">
+ /// Receives a zero-or-negative value which, when added to the UTF-16 code unit count, gives the number
+ /// of scalar values (it is decremented once per validated 4-byte sequence).
+ /// </param>
+ /// <remarks>
+ /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+ /// </remarks>
+ /// <returns>
+ /// A pointer to the first byte that could not be validated (which is also where an incomplete trailing
+ /// sequence begins), or a pointer to the end of the buffer if all data is well-formed.
+ /// </returns>
+ public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+ {
+ Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+ Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+ // First, try to drain off as many ASCII bytes as we can from the beginning.
+
+ {
+ nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
+ pInputBuffer += numAsciiBytesCounted;
+
+ // Quick check - did we just end up consuming the entire input buffer?
+ // If so, short-circuit the remainder of the method.
+
+ inputLength -= (int)numAsciiBytesCounted;
+ if (inputLength == 0)
+ {
+ utf16CodeUnitCountAdjustment = 0;
+ scalarCountAdjustment = 0;
+ return pInputBuffer;
+ }
+ }
+
+#if DEBUG
+ // Keep these around for final validation at the end of the method.
+ byte* pOriginalInputBuffer = pInputBuffer;
+ int originalInputLength = inputLength;
+#endif
+
+ // Enregistered locals that we'll eventually out to our caller.
+
+ int tempUtf16CodeUnitCountAdjustment = 0;
+ int tempScalarCountAdjustment = 0;
+
+ if (inputLength < sizeof(uint))
+ {
+ goto ProcessInputOfLessThanDWordSize;
+ }
+
+ byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint);
+
+ // Begin the main loop.
+
+#if DEBUG
+ byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+ while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
+
+ uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ AfterReadDWord:
+
+#if DEBUG
+ Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+ pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+ // First, check for the common case of all-ASCII bytes.
+
+ if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+ {
+ // We read an all-ASCII sequence.
+
+ pInputBuffer += sizeof(uint);
+
+ // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+ // Below is basically unrolled loops with poor man's vectorization.
+
+ // Below check is "can I read at least five DWORDs from the input stream?"
+ // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value,
+ // hence using nint instead of nuint.
+
+ if ((nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint))
+ {
+ // We want reads in the inner loop to be aligned. So let's perform a quick
+ // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump
+ // the read pointer up to the next aligned address.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+ {
+ goto AfterReadDWordSkipAllBytesAsciiCheck;
+ }
+
+ pInputBuffer = (byte*)((nuint)(pInputBuffer + 4) & ~(nuint)3);
+
+ // At this point, the input buffer offset points to an aligned DWORD. We also know that there's
+ // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above:
+ // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and
+ // the alignment check consumes at most a single DWORD.)
+
+ byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
+ uint mask;
+
+ do
+ {
+ if (Sse2.IsSupported && Bmi1.IsSupported)
+ {
+ // pInputBuffer is 32-bit aligned but not necessarily 128-bit aligned, so we're
+ // going to perform an unaligned load. We don't necessarily care about aligning
+ // this because we pessimistically assume we'll encounter non-ASCII data at some
+ // point in the not-too-distant future (otherwise we would've stayed entirely
+ // within the all-ASCII vectorized code at the entry to this method).
+
+ mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer));
+ if (mask != 0)
+ {
+ goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
+ }
+ }
+ else
+ {
+ if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1]))
+ {
+ goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair;
+ }
+
+ if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3]))
+ {
+ goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair;
+ }
+ }
+
+ pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
+ } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);
+
+ continue; // need to perform a bounds check because we might be running out of data
+
+ Sse2LoopTerminatedEarlyDueToNonAsciiData:
+
+ Debug.Assert(BitConverter.IsLittleEndian);
+ Debug.Assert(Sse2.IsSupported);
+ Debug.Assert(Bmi1.IsSupported);
+
+ // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
+ // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
+ // bump our input counter by that amount, and resume processing from the
+ // "the first byte is no longer ASCII" portion of the main loop.
+
+ Debug.Assert(mask != 0);
+
+ pInputBuffer += Bmi1.TrailingZeroCount(mask);
+ if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ goto ProcessRemainingBytesSlow;
+ }
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); // no longer guaranteed to be aligned
+ goto BeforeProcessTwoByteSequence;
+
+ LoopTerminatedEarlyDueToNonAsciiDataInSecondPair:
+
+ pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs
+
+ LoopTerminatedEarlyDueToNonAsciiDataInFirstPair:
+
+ // We know that there's *at least* two DWORDs of data remaining in the buffer.
+ // We also know that one of them (or both of them) contains non-ASCII data somewhere.
+ // Let's perform a quick check here to bypass the logic at the beginning of the main loop.
+
+ thisDWord = *(uint*)pInputBuffer; // still aligned here
+ if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+ {
+ pInputBuffer += sizeof(uint); // consumed 1 more DWORD
+ thisDWord = *(uint*)pInputBuffer; // still aligned here
+ }
+
+ goto AfterReadDWordSkipAllBytesAsciiCheck;
+ }
+
+ continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks
+ }
+
+ AfterReadDWordSkipAllBytesAsciiCheck:
+
+ Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+ // Next, try stripping off ASCII bytes one at a time.
+ // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
+
+ {
+ uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord);
+ pInputBuffer += numLeadingAsciiBytes;
+
+ if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+ {
+ goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD
+ }
+ else
+ {
+ // The input buffer at the current offset contains a non-ASCII byte.
+ // Read an entire DWORD and fall through to multi-byte consumption logic.
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ }
+ }
+
+ BeforeProcessTwoByteSequence:
+
+ // At this point, we suspect we're working with a multi-byte code unit sequence,
+ // but we haven't yet validated it for well-formedness.
+
+ // The masks and comparands are derived from the Unicode Standard, Table 3-6.
+ // Additionally, we need to check for valid byte sequences per Table 3-7.
+
+ // Check the 2-byte case.
+
+ thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u;
+ if ((thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)) == 0)
+ {
+ // Per Table 3-7, valid sequences are:
+ // [ C2..DF ] [ 80..BF ]
+ //
+ // Due to our modification of 'thisDWord' above, this becomes:
+ // [ 02..1F ] [ 00..3F ]
+ //
+ // We've already checked that the leading byte was originally in the range [ C0..DF ]
+ // and that the trailing byte was originally in the range [ 80..BF ], so now we only need
+ // to check that the modified leading byte is >= [ 02 ].
+
+ if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u)
+ || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u))
+ {
+ goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ]
+ }
+
+ ProcessTwoByteSequenceSkipOverlongFormCheck:
+
+ // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
+ // there's a good chance that if we see one two-byte run then there's another two-byte
+ // run immediately after. Let's check that now.
+
+ // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
+ // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
+ // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
+
+ if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+ || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+ {
+ // We have two runs of two bytes each.
+ pInputBuffer += 4;
+ tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars)
+
+ if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+ // also two bytes. Check for that first before going back to the beginning of the loop.
+
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ if (BitConverter.IsLittleEndian)
+ {
+ if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+ {
+ // The next sequence is a valid two-byte sequence.
+ goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+ }
+ }
+ else
+ {
+ if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+ {
+ if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+ {
+ goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
+ }
+
+ goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+ }
+ }
+
+ // If we reached this point, the next sequence is something other than a valid
+ // two-byte sequence, so go back to the beginning of the loop.
+ goto AfterReadDWord;
+ }
+ else
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+ }
+ }
+
+ // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+ // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+ // bytes are ASCII?
+
+ tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing]
+
+ if (UInt32ThirdByteIsAscii(thisDWord))
+ {
+ if (UInt32FourthByteIsAscii(thisDWord))
+ {
+ pInputBuffer += 4;
+ }
+ else
+ {
+ pInputBuffer += 3;
+
+ // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
+ // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
+
+ if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+ goto BeforeProcessTwoByteSequence;
+ }
+ }
+ }
+ else
+ {
+ pInputBuffer += 2;
+ }
+
+ continue;
+ }
+
+ // Check the 3-byte case.
+ // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte.
+
+ thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u);
+ if ((thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)) == 0)
+ {
+ ProcessThreeByteSequenceWithCheck:
+
+ // We assume the caller has confirmed that the bit pattern is representative of a three-byte
+ // sequence, but it may still be overlong or surrogate. We need to check for these possibilities.
+ //
+ // Per Table 3-7, valid sequences are:
+ // [ E0 ] [ A0..BF ] [ 80..BF ]
+ // [ E1..EC ] [ 80..BF ] [ 80..BF ]
+ // [ ED ] [ 80..9F ] [ 80..BF ]
+ // [ EE..EF ] [ 80..BF ] [ 80..BF ]
+ //
+ // Big-endian examples of using the above validation table:
+ // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
+ // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
+ // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
+ // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
+ // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
+ //
+ // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80)
+ // as long as they haven't touched the bits we're about to use in our mask checking below.
+
+ if (BitConverter.IsLittleEndian)
+ {
+ // The "overlong or surrogate" check can be implemented using a single jump, but there's
+ // some overhead to moving the bits into the correct locations in order to perform the
+ // correct comparison, and in practice the processor's branch prediction capability is
+ // good enough that we shouldn't bother. So we'll use two jumps instead.
+
+ // Can't extract this check into its own helper method because JITter produces suboptimal
+ // assembly, even with aggressive inlining.
+
+ // n.b. The subtraction below must NOT be written back to 'thisDWord'. Whenever the low
+ // nibble of the leading byte is less than 0xD the subtraction borrows, and that borrow can
+ // ripple through the second and third bytes into the fourth byte and flip its high bit.
+ // The "is the fourth byte ASCII?" smear further below reads that bit, so a write-back would
+ // let a stray continuation byte pass as ASCII (e.g., [ E1 80 80 80 ] would be accepted).
+
+ // Expected codegen is roughly 5 instructions: test, jz, lea, test, jz (not re-verified)
+
+ if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
+ {
+ goto Error; // overlong or surrogate
+ }
+ }
+ else
+ {
+ // Non-mutating subtraction, for parity with the little-endian branch above.
+
+ if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
+ {
+ goto Error; // overlong or surrogate
+ }
+ }
+
+ ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks:
+
+ // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+ // in to the text. If this happens strip it off now before seeing if the next character
+ // consists of three code units.
+
+ // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end
+
+ nint asciiAdjustment;
+ if (BitConverter.IsLittleEndian)
+ {
+ asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value
+ }
+ else
+ {
+ asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value
+ }
+
+ // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise
+
+ // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method
+ // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid
+ // to add 4 before backing up since we already checked previously that the input buffer contains at
+ // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can
+ // no longer track the reference. However, we can't back up before adding 4, since we might back up to
+ // before the start of the buffer, and the GC isn't guaranteed to be able to track this.
+
+ pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte
+ pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte
+
+ tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar)
+
+ SuccessfullyProcessedThreeByteSequence:
+
+ if (IntPtr.Size >= 8 && BitConverter.IsLittleEndian)
+ {
+ // x64 little-endian optimization: A three-byte character could indicate CJK text,
+ // which makes it likely that the character following this one is also CJK.
+ // We'll try to process several three-byte sequences at a time.
+
+ // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset
+ // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so
+ // use nint instead of nuint.
+
+ if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5)
+ {
+ ulong thisQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+
+ // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward
+ // to a previous location in the loop. This offers defense against reading main memory again (which may
+ // have been modified and could lead to a race condition).
+
+ thisDWord = (uint)thisQWord;
+
+ // Is this three 3-byte sequences in a row?
+ // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
+ // ---- CHAR 3 ---- --------- CHAR 2 --------- --------- CHAR 1 --------- -CHAR 3-
+ if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8]))
+ {
+ // Saw a proper bitmask for three incoming 3-byte sequences, perform the
+ // overlong and surrogate sequence checking now.
+
+ // Check the first character.
+ // If the first character is overlong or a surrogate, fail immediately.
+
+ if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+ {
+ goto Error;
+ }
+
+ // Check the second character.
+ // At this point, we now know the first three bytes represent a well-formed sequence.
+ // If there's an error beyond here, we'll jump back to the "process three known good bytes"
+ // logic.
+
+ thisQWord >>= 24;
+ if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+ {
+ goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+ }
+
+ // Check the third character (we already checked that it's followed by a continuation byte).
+
+ thisQWord >>= 24;
+ if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+ {
+ goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+ }
+
+ pInputBuffer += 9;
+ tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars)
+
+ goto SuccessfullyProcessedThreeByteSequence;
+ }
+
+ // Is this two 3-byte sequences in a row?
+ // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ]
+ // --------- CHAR 2 --------- --------- CHAR 1 ---------
+ if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul)
+ {
+ // Saw a proper bitmask for two incoming 3-byte sequences, perform the
+ // overlong and surrogate sequence checking now.
+
+ // Check the first character.
+ // If the first character is overlong or a surrogate, fail immediately.
+
+ if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+ {
+ goto Error;
+ }
+
+ // Check the second character.
+ // At this point, we now know the first three bytes represent a well-formed sequence.
+ // If there's an error beyond here, we'll jump back to the "process three known good bytes"
+ // logic.
+
+ thisQWord >>= 24;
+ if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
+ {
+ goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
+ }
+
+ pInputBuffer += 6;
+ tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
+
+ // The next byte in the sequence didn't have a 3-byte marker, so it's probably
+ // an ASCII character. Jump back to the beginning of loop processing.
+
+ continue;
+ }
+
+ if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+ {
+ // A single three-byte sequence.
+ goto ProcessThreeByteSequenceWithCheck;
+ }
+ else
+ {
+ // Not a three-byte sequence; perhaps ASCII?
+ goto AfterReadDWord;
+ }
+ }
+ }
+
+ if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ {
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+ // Optimization: A three-byte character could indicate CJK text, which makes it likely
+ // that the character following this one is also CJK. We'll check for a three-byte sequence
+ // marker now and jump directly to three-byte sequence processing if we see one, skipping
+ // all of the logic at the beginning of the loop.
+
+ if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+ {
+ goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
+ }
+ else
+ {
+ goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop
+ }
+ }
+ else
+ {
+ goto ProcessRemainingBytesSlow; // Running out of data
+ }
+ }
+
+ // Assume the 4-byte case, but we need to validate.
+
+ if (BitConverter.IsLittleEndian)
+ {
+ thisDWord &= 0xC0C0_FFFFu;
+
+ // After the above modifications earlier in this method, we expect 'thisDWord'
+ // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now
+ // perform two checks to confirm this. The first will verify the
+ // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's
+ // complement representation to perform a single *signed* integer check.
+
+ if ((int)thisDWord > unchecked((int)0x8000_3FFF))
+ {
+ goto Error; // didn't have three trailing bytes
+ }
+
+ // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
+ // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
+
+ thisDWord = BitOperations.RotateRight(thisDWord, 8);
+
+ // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ].
+ // The check is now a simple add / cmp / jcc combo.
+
+ if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu))
+ {
+ goto Error; // overlong or out-of-range
+ }
+ }
+ else
+ {
+ thisDWord -= 0x80u;
+
+ // After the above modifications earlier in this method, we expect 'thisDWord'
+ // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now
+ // perform two checks to confirm this. The first will verify the
+ // [ ######## 00###### 00###### 00###### ] structure.
+
+ if ((thisDWord & 0x00C0_C0C0u) != 0)
+ {
+ goto Error; // didn't have three trailing bytes
+ }
+
+ // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
+ // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
+ // This is a simple range check. (We don't care about the low two bytes.)
+
+ if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu))
+ {
+ goto Error; // overlong or out-of-range
+ }
+ }
+
+ // Validation of 4-byte case complete.
+
+ pInputBuffer += 4;
+ tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units
+ tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
+
+ continue; // go back to beginning of loop for processing
+ }
+
+ goto ProcessRemainingBytesSlow;
+
+ ProcessInputOfLessThanDWordSize:
+
+ Debug.Assert(inputLength < 4);
+ nuint inputBufferRemainingBytes = (uint)inputLength;
+ goto ProcessSmallBufferCommon;
+
+ ProcessRemainingBytesSlow:
+
+ inputBufferRemainingBytes = (nuint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+
+ ProcessSmallBufferCommon:
+
+ Debug.Assert(inputBufferRemainingBytes < 4);
+ while (inputBufferRemainingBytes > 0)
+ {
+ uint firstByte = pInputBuffer[0];
+
+ if ((byte)firstByte < 0x80u)
+ {
+ // 1-byte (ASCII) case
+ pInputBuffer++;
+ inputBufferRemainingBytes--;
+ continue;
+ }
+ else if (inputBufferRemainingBytes >= 2)
+ {
+ uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value
+ if ((byte)firstByte < 0xE0u)
+ {
+ // 2-byte case
+ if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte))
+ {
+ pInputBuffer += 2;
+ tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
+ inputBufferRemainingBytes -= 2;
+ continue;
+ }
+ }
+ else if (inputBufferRemainingBytes >= 3)
+ {
+ if ((byte)firstByte < 0xF0u)
+ {
+ if ((byte)firstByte == 0xE0u)
+ {
+ if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu))
+ {
+ goto Error; // overlong encoding
+ }
+ }
+ else if ((byte)firstByte == 0xEDu)
+ {
+ if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu))
+ {
+ goto Error; // would be a UTF-16 surrogate code point
+ }
+ }
+ else
+ {
+ if (!IsLowByteUtf8ContinuationByte(secondByte))
+ {
+ goto Error; // first trailing byte doesn't have proper continuation marker
+ }
+ }
+
+ if (IsUtf8ContinuationByte(in pInputBuffer[2]))
+ {
+ pInputBuffer += 3;
+ tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
+ inputBufferRemainingBytes -= 3;
+ continue;
+ }
+ }
+ }
+ }
+
+ // Error - no match.
+
+ goto Error;
+ }
+
+ // If we reached this point, we're out of data, and we saw no bad UTF8 sequence.
+
+#if DEBUG
+ // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength].
+ Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value.");
+#endif
+
+ Error:
+
+ // Report back to our caller how far we got before seeing invalid data.
+ // (Also used for normal termination when falling out of the loop above.)
+
+ utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment;
+ scalarCountAdjustment = tempScalarCountAdjustment;
+ return pInputBuffer;
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
index 6ee9ca05a6..d24f766474 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs
@@ -6,10 +6,12 @@ using System.Buffers;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Internal.Runtime.CompilerServices;
namespace System.Text.Unicode
{
- internal static class Utf8Utility
+ internal static partial class Utf8Utility
{
/// <summary>
/// The maximum number of bytes that can result from UTF-8 transcoding
@@ -29,26 +31,16 @@ namespace System.Text.Unicode
/// comes first) is ASCII.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
+ public unsafe static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
{
- // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948).
-
- bool tempIsAscii = true;
- int originalDataLength = utf8Data.Length;
-
- while (!utf8Data.IsEmpty)
+ fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data))
{
- if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done)
- {
- break;
- }
+ byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _);
+ int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte); // byte distance from the start of the span to the returned pointer
- tempIsAscii &= result.IsAscii;
- utf8Data = utf8Data.Slice(bytesConsumed);
+ isAscii = (utf16CodeUnitCountAdjustment == 0); // If UTF-16 char count == UTF-8 byte count, it's ASCII. (Only covers the data that precedes the returned index.)
+ return (index < utf8Data.Length) ? index : -1; // -1 => the returned pointer is at the end of the span, i.e. no invalid sequence was found
}
-
- isAscii = tempIsAscii;
- return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length);
}
#if FEATURE_UTF8STRING