diff options
18 files changed, 4113 insertions, 2425 deletions
diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index 19d8105baf..02656f57ad 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -768,6 +768,7 @@ <Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.Helpers.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" /> @@ -799,13 +800,17 @@ <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" /> - <Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.Validation.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" /> + <Compile 
Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" /> diff --git a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs index f5bba908b5..ef2eb4945a 100644 --- a/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs +++ b/src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs @@ -8,7 +8,7 @@ using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Serialization; -using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; namespace System.Globalization diff --git a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs index cf89dff6a2..4391dec044 100644 --- a/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs +++ b/src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs @@ -8,6 +8,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Serialization; using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; #if BIT64 diff --git a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs index beab0cfe02..9e9bb31623 100644 --- a/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs +++ 
b/src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs @@ -5,7 +5,7 @@ using System.Buffers; using System.Diagnostics; using System.Runtime.InteropServices; -using System.Text; +using System.Text.Unicode; using Internal.Runtime.CompilerServices; #if BIT64 diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs new file mode 100644 index 0000000000..b48a001d48 --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs @@ -0,0 +1,77 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; + +namespace System.Text +{ + internal static partial class ASCIIUtility + { + /// <summary> + /// A mask which selects only the high bit of each byte of the given <see cref="uint"/>. + /// </summary> + private const uint UInt32HighBitsOnlyMask = 0x80808080u; + + /// <summary> + /// A mask which selects only the high bit of each byte of the given <see cref="ulong"/>. + /// </summary> + private const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul; + + /// <summary> + /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool AllBytesInUInt32AreAscii(uint value) + { + // If the high bit of any byte is set, that byte is non-ASCII. + + return (value & UInt32HighBitsOnlyMask) == 0; + } + + + /// <summary> + /// Given a 24-bit integer which represents a three-byte buffer read in machine endianness, + /// counts the number of consecutive ASCII bytes starting from the beginning of the buffer. + /// Returns a value 0 - 3, inclusive. 
(The caller is responsible for ensuring that an all- + /// ASCII value does not make its way to this method.) + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value) + { + Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value."); + + if (BitConverter.IsLittleEndian) + { + return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; + } + else + { + // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending + // on whether all processed bytes were ASCII. Then we accumulate all of the + // results to calculate how many consecutive ASCII bytes are present. + + value = ~value; + + // Read first byte + value = BitOperations.RotateLeft(value, 1); + uint allBytesUpToNowAreAscii = value & 1; + uint numAsciiBytes = allBytesUpToNowAreAscii; + + // Read second byte + value = BitOperations.RotateLeft(value, 8); + allBytesUpToNowAreAscii &= value; + numAsciiBytes += allBytesUpToNowAreAscii; + + // Read third byte + value = BitOperations.RotateLeft(value, 8); + allBytesUpToNowAreAscii &= value; + numAsciiBytes += allBytesUpToNowAreAscii; + + return numAsciiBytes; + } + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs index 755f925610..6193a0a5ee 100644 --- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs @@ -21,19 +21,12 @@ namespace System.Text { internal static partial class ASCIIUtility { - /// <summary> - /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII. 
- /// </summary> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool AllBytesInUInt32AreAscii(uint value) - { - return ((value & 0x80808080u) == 0); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool AllBytesInUInt64AreAscii(ulong value) { - return ((value & 0x80808080_80808080ul) == 0); + // If the high bit of any byte is set, that byte is non-ASCII. + + return ((value & UInt64HighBitsOnlyMask) == 0); } /// <summary> @@ -55,56 +48,6 @@ namespace System.Text } /// <summary> - /// Given a 24-bit integer which represents a three-byte buffer read in machine endianness, - /// counts the number of consecutive ASCII bytes starting from the beginning of the buffer. - /// Returns a value 0 - 3, inclusive. - /// </summary> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value) - { - // This implementation seems to have better performance than tzcnt. - - // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending - // on whether all processed bytes were ASCII. Then we accumulate all of the - // results to calculate how many consecutive ASCII bytes are present. 
- - value = ~value; - - if (BitConverter.IsLittleEndian) - { - // Read first byte - uint allBytesUpToNowAreAscii = (value >>= 7) & 1; - uint numAsciiBytes = allBytesUpToNowAreAscii; - - // Read second byte - allBytesUpToNowAreAscii &= (value >>= 8); - numAsciiBytes += allBytesUpToNowAreAscii; - - // Read third byte - allBytesUpToNowAreAscii &= (value >>= 8); - numAsciiBytes += allBytesUpToNowAreAscii; - - return numAsciiBytes; - } - else - { - // Read first byte - uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1; - uint numAsciiBytes = allBytesUpToNowAreAscii; - - // Read second byte - allBytesUpToNowAreAscii &= (value = ROL32(value, 8)); - numAsciiBytes += allBytesUpToNowAreAscii; - - // Read third byte - allBytesUpToNowAreAscii &= (value = ROL32(value, 8)); - numAsciiBytes += allBytesUpToNowAreAscii; - - return numAsciiBytes; - } - } - - /// <summary> /// Given a DWORD which represents two packed chars in machine-endian order, /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII. /// </summary> @@ -461,7 +404,7 @@ namespace System.Text // Clear everything but the high bit of each byte, then tzcnt. // Remember the / 8 at the end to convert bit count to byte count. - candidateUInt64 &= 0x80808080_80808080ul; + candidateUInt64 &= UInt64HighBitsOnlyMask; pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8); goto Finish; } @@ -1395,17 +1338,7 @@ namespace System.Text // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination. Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst); - - if (Sse41.X64.IsSupported) - { - // Use PEXTRQ instruction if available, since it can extract from the vector directly to the destination address. - Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, Sse41.X64.Extract(asciiVector.AsUInt64(), 0)); - } - else - { - // Bounce this through a temporary register (with potential stack spillage) before writing to memory. 
- Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, asciiVector.AsUInt64().GetElement(0)); - } + Sse2.StoreLow((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far @@ -1444,16 +1377,7 @@ namespace System.Text // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination. asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst); - - // See comments earlier in this method for information about how this works. - if (Sse41.X64.IsSupported) - { - Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, Sse41.X64.Extract(asciiVector.AsUInt64(), 0)); - } - else - { - Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, asciiVector.AsUInt64().GetElement(0)); - } + Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED } // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment @@ -1529,27 +1453,13 @@ namespace System.Text Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); - // See comments earlier in this method for information about how this works. - if (Sse41.X64.IsSupported) - { - *(ulong*)(pAsciiBuffer + currentOffsetInElements) = Sse41.X64.Extract(asciiVector.AsUInt64(), 0); - } - else - { - *(ulong*)(pAsciiBuffer + currentOffsetInElements) = asciiVector.AsUInt64().GetElement(0); - } + Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned currentOffsetInElements += SizeOfVector128 / 2; goto Finish; } /// <summary> - /// Rotates a <see cref="uint"/> left. The JIT is smart enough to turn this into a ROL / ROR instruction. 
- /// </summary> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint ROL32(uint value, int shift) => (value << shift) | (value >> (32 - shift)); - - /// <summary> /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/> /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number diff --git a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs index 9040a94f0f..bb5aa5f0ac 100644 --- a/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs +++ b/src/System.Private.CoreLib/shared/System/Text/DecoderNLS.cs @@ -266,6 +266,7 @@ namespace System.Text // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown. Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer."); + Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder."); // Copy the existing leftover data plus as many bytes as possible of the new incoming data // into a temporary concatenated buffer, then get its char count by decoding it. @@ -319,6 +320,7 @@ namespace System.Text // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown. Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer."); + Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder."); // Copy the existing leftover data plus as many bytes as possible of the new incoming data // into a temporary concatenated buffer, then transcode it from bytes to chars. 
@@ -370,6 +372,14 @@ namespace System.Text Finish: + // Report back the number of bytes (from the new incoming span) we consumed just now. + // This calculation is simple: it's the difference between the original leftover byte + // count and the number of bytes from the combined buffer we needed to decode the first + // scalar value. We need to report this before the call to SetLeftoverData / + // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field. + + bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; + if (persistNewCombinedBuffer) { Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer."); @@ -380,7 +390,6 @@ namespace System.Text ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths } - bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now return charsWritten; DestinationTooSmall: diff --git a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs index 0e32167957..ca740a1adc 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Encoding.Internal.cs @@ -850,8 +850,14 @@ namespace System.Text ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); - int totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out int bytesConsumedJustNow); - bytes = bytes.Slice(bytesConsumedJustNow); + int bytesConsumedJustNow = 0; + int totalCharCount = 0; + + if (decoder.HasLeftoverData) + { + totalCharCount = decoder.DrainLeftoverDataForGetCharCount(bytes, out bytesConsumedJustNow); + bytes = bytes.Slice(bytesConsumedJustNow); + } // Now try invoking the "fast path" (no fallback) implementation. 
// We can use Unsafe.AsPointer here since these spans are created from pinned data (raw pointers). @@ -1120,10 +1126,15 @@ namespace System.Text ReadOnlySpan<byte> bytes = new ReadOnlySpan<byte>(pOriginalBytes, originalByteCount).Slice(bytesConsumedSoFar); Span<char> chars = new Span<char>(pOriginalChars, originalCharCount).Slice(charsWrittenSoFar); - int charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out int bytesConsumedJustNow); + int bytesConsumedJustNow = 0; + int charsWrittenJustNow = 0; - bytes = bytes.Slice(bytesConsumedJustNow); - chars = chars.Slice(charsWrittenJustNow); + if (decoder.HasLeftoverData) + { + charsWrittenJustNow = decoder.DrainLeftoverDataForGetChars(bytes, chars, out bytesConsumedJustNow); + bytes = bytes.Slice(bytesConsumedJustNow); + chars = chars.Slice(charsWrittenJustNow); + } Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, "Should be no remaining fallback data at this point."); diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index a91c0fcb99..a71750eaa5 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -6,6 +6,7 @@ using System.Buffers; using System.Diagnostics; using System.Globalization; using System.Runtime.CompilerServices; +using System.Text.Unicode; namespace System.Text { diff --git a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs index aaac975ec8..7a3a1f7de5 100644 --- a/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/UTF8Encoding.cs @@ -15,9 +15,11 @@ #define FASTLOOP using System; +using System.Buffers; using System.Diagnostics; -using System.Globalization; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Text.Unicode; 
namespace System.Text { @@ -129,22 +131,26 @@ namespace System.Text public override unsafe int GetByteCount(char[] chars, int index, int count) { // Validate input parameters - if (chars == null) - throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array); + } - if (chars.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input, return 0, avoid fixed empty array problem - if (count == 0) - return 0; + if (chars.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call the pointer version fixed (char* pChars = chars) - return GetByteCount(pChars + index, count, null); + { + return GetByteCountCommon(pChars + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -154,12 +160,17 @@ namespace System.Text public override unsafe int GetByteCount(string chars) { - // Validate input - if (chars==null) - throw new ArgumentNullException("s"); + // Validate input parameters + + if (chars is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } fixed (char* pChars = chars) - return GetByteCount(pChars, chars.Length, null); + { + return GetByteCountCommon(pChars, chars.Length); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -170,22 +181,78 @@ 
namespace System.Text public override unsafe int GetByteCount(char* chars, int count) { // Validate Parameters + if (chars == null) - throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // Call it with empty encoder - return GetByteCount(chars, count, null); + return GetByteCountCommon(chars, count); } public override unsafe int GetByteCount(ReadOnlySpan<char> chars) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + // It's ok for us to pass null pointers down to the workhorse below. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + { + return GetByteCountCommon(charsPtr, chars.Length); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetByteCountCommon(char* pChars, int charCount) + { + // Common helper method for all non-EncoderNLS entry points to GetByteCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. + + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + // Don't bother providing a fallback mechanism; our fast path doesn't use it. + + int totalByteCount = GetByteCountFast(pChars, charCount, fallback: null, out int charsConsumed); + + if (charsConsumed != charCount) + { + // If there's still data remaining in the source buffer, go down the fallback path. + // We need to check for integer overflow since the fallback could change the required + // output count in unexpected ways. 
+ + totalByteCount += GetByteCountWithFallback(pChars, charCount, charsConsumed); + if (totalByteCount < 0) + { + ThrowConversionOverflow(); + } + } + + return totalByteCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon + private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback fallback, out int charsConsumed) + { + // The number of UTF-8 code units may exceed the number of UTF-16 code units, + // so we'll need to check for overflow before casting to Int32. + + char* ptrToFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _); + + int tempCharsConsumed = (int)(ptrToFirstInvalidChar - pChars); + charsConsumed = tempCharsConsumed; + + long totalUtf8Bytes = tempCharsConsumed + utf8CodeUnitCountAdjustment; + if ((ulong)totalUtf8Bytes > int.MaxValue) { - return GetByteCount(charsPtr, chars.Length, baseEncoder: null); + ThrowConversionOverflow(); } + + return (int)totalUtf8Bytes; } // Parent method is safe. @@ -196,22 +263,37 @@ namespace System.Text public override unsafe int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex) { - if (s == null || bytes == null) - throw new ArgumentNullException((s == null ? nameof(s) : nameof(bytes)), SR.ArgumentNull_Array); + // Validate Parameters + + if (s is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (s is null) ? ExceptionArgument.s : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? 
ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } if (s.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount); - - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = s) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes)) - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = s) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // Encodes a range of characters in a character array into a range of bytes @@ -232,28 +314,36 @@ namespace System.Text byte[] bytes, int byteIndex) { // Validate parameters - if (chars == null || bytes == null) - throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array); - - if (charIndex < 0 || charCount < 0) - throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - if (chars.Length - charIndex < charCount) - throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer); + if (chars is null || bytes is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? 
ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } - if (byteIndex < 0 || byteIndex > bytes.Length) - throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index); + if ((charIndex | charCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If nothing to encode return 0, avoid fixed problem - if (charCount == 0) - return 0; + if (chars.Length - charIndex < charCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount); + } - // Just call pointer version - int byteCount = bytes.Length - byteIndex; + if ((uint)byteIndex > bytes.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (char* pChars = chars) fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes)) - // Remember that byteCount is # to decode, not size of array. - return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + fixed (char* pChars = chars) + fixed (byte* pBytes = bytes) + { + return GetBytesCommon(pChars + charIndex, charCount, pBytes + byteIndex, bytes.Length - byteIndex); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -264,24 +354,77 @@ namespace System.Text public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? 
nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (chars == null || bytes == null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (chars is null) ? ExceptionArgument.chars : ExceptionArgument.bytes, + resource: ExceptionResource.ArgumentNull_Array); + } + + if ((charCount | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - return GetBytes(chars, charCount, bytes, byteCount, null); + return GetBytesCommon(chars, charCount, bytes, byteCount); } public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes) { - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to operate on null / empty spans. + + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetBytes(charsPtr, chars.Length, bytesPtr, bytes.Length, baseEncoder: null); + return GetBytesCommon(charsPtr, chars.Length, bytesPtr, bytes.Length); } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount) + { + // Common helper method for all non-EncoderNLS entry points to GetBytes. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. 
+ + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + + // First call into the fast path. + + int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed); + + if (charsConsumed == charCount) + { + // All elements converted - return immediately. + + return bytesWritten; + } + else + { + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. + + return GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon + private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed) + { + // We don't care about the exact OperationStatus value returned by the workhorse routine; we only + // care if the workhorse was able to consume the entire input payload. If we're unable to do so, + // we'll handle the remainder in the fallback routine. + + Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining); + + charsConsumed = (int)(pInputBufferRemaining - pChars); + return (int)(pOutputBufferRemaining - pBytes); + } + // Returns the number of characters produced by decoding a range of bytes // in a byte array. 
// @@ -293,22 +436,26 @@ namespace System.Text public override unsafe int GetCharCount(byte[] bytes, int index, int count) { // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); + if (bytes is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } - if (bytes.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + if ((index | count) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException((index < 0) ? ExceptionArgument.index : ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input just return 0, fixed doesn't like 0 length arrays. - if (count == 0) - return 0; + if (bytes.Length - index < count) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call pointer version fixed (byte* pBytes = bytes) - return GetCharCount(pBytes + index, count, null); + { + return GetCharCountCommon(pBytes + index, count); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -319,20 +466,27 @@ namespace System.Text public override unsafe int GetCharCount(byte* bytes, int count) { // Validate Parameters + if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); + } if (count < 0) - throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum); + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - return 
GetCharCount(bytes, count, null); + return GetCharCountCommon(bytes, count); } public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes) { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) + // It's ok for us to pass null pointers down to the workhorse routine. + + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) { - return GetCharCount(bytesPtr, bytes.Length, baseDecoder: null); + return GetCharCountCommon(bytesPtr, bytes.Length); } } @@ -345,28 +499,36 @@ namespace System.Text char[] chars, int charIndex) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - - if (byteIndex < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - if ( bytes.Length - byteIndex < byteCount) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); + if (bytes is null || chars is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); + } - if (charIndex < 0 || charIndex > chars.Length) - throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index); + if ((byteIndex | byteCount) < 0) + { + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteIndex < 0) ? 
ExceptionArgument.byteIndex : ExceptionArgument.byteCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); + } - // If no input, return 0 & avoid fixed problem - if (byteCount == 0) - return 0; + if (bytes.Length - byteIndex < byteCount) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); + } - // Just call pointer version - int charCount = chars.Length - charIndex; + if ((uint)charIndex > (uint)chars.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index); + } - fixed (byte* pBytes = bytes) fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars)) - // Remember that charCount is # to decode, not size of array - return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null); + fixed (byte* pBytes = bytes) + fixed (char* pChars = chars) + { + return GetCharsCommon(pBytes + byteIndex, byteCount, pChars + charIndex, chars.Length - charIndex); + } } // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) @@ -377,2120 +539,245 @@ namespace System.Text public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { // Validate Parameters - if (bytes == null || chars == null) - throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array); - - if (charCount < 0 || byteCount < 0) - throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum); - return GetChars(bytes, byteCount, chars, charCount, null); - } + if (bytes is null || chars is null) + { + ThrowHelper.ThrowArgumentNullException( + argument: (bytes is null) ? 
ExceptionArgument.bytes : ExceptionArgument.chars, + resource: ExceptionResource.ArgumentNull_Array); + } - public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars) - { - fixed (byte* bytesPtr = &MemoryMarshal.GetNonNullPinnableReference(bytes)) - fixed (char* charsPtr = &MemoryMarshal.GetNonNullPinnableReference(chars)) + if ((byteCount | charCount) < 0) { - return GetChars(bytesPtr, bytes.Length, charsPtr, chars.Length, baseDecoder: null); + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - } - // Returns a string containing the decoded representation of a range of - // bytes in a byte array. - // - // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) - // So if you fix this, fix the others. Currently those include: - // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding - // parent method is safe + return GetCharsCommon(bytes, byteCount, chars, charCount); + } - public override unsafe string GetString(byte[] bytes, int index, int count) + public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars) { - // Validate Parameters - if (bytes == null) - throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array); - - if (index < 0 || count < 0) - throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum); - - if (bytes.Length - index < count) - throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer); - - // Avoid problems with empty input buffer - if (count == 0) return string.Empty; + // It's ok for us to pass null pointers down to the workhorse below. 
- fixed (byte* pBytes = bytes) - return string.CreateStringFromEncoding( - pBytes + index, count, this); + fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) + fixed (char* charsPtr = &MemoryMarshal.GetReference(chars)) + { + return GetCharsCommon(bytesPtr, bytes.Length, charsPtr, chars.Length); + } } + // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method. + // So if we're really broken, then that could also throw an error... recursively. + // So try to make sure GetChars can at least process all uses by + // System.Resources.ResourceReader! // - // End of standard methods copied from EncodingNLS.cs - // - - // To simplify maintenance, the structure of GetByteCount and GetBytes should be - // kept the same as much as possible - internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) + // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. + // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount) { - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer fallbackBuffer = null; - char* pSrcForFallback; + // Common helper method for all non-DecoderNLS entry points to GetChars. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. 
- char* pSrc = chars; - char* pEnd = pSrc + count; + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); + Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - // Start by assuming we have as many as count - int byteCount = count; + // First call into the fast path. - int ch = 0; + int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed); - if (baseEncoder != null) + if (bytesConsumed == byteCount) { - UTF8Encoder encoder = (UTF8Encoder)baseEncoder; - ch = encoder.surrogateChar; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + // All elements converted - return immediately. - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false); - } + return charsWritten; } - - for (;;) + else { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) - { - if (ch == 0) - { - // Unroll any fallback that happens at the end - ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0; - if (ch > 0) - { - byteCount++; - goto ProcessChar; - } - } - else - { - // Case of surrogates in the fallback. 
- if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - ch = fallbackBuffer.InternalGetNextChar(); - byteCount++; - - if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - else - { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) - { - break; - } - if (baseEncoder != null && !baseEncoder.MustFlush) - { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
- ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - CharUnicodeInfo.LOW_SURROGATE_START - // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != null) - { - ch = fallbackBuffer.InternalGetNextChar(); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - // if (IsHighSurrogate(ch)) { - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) - { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == null) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - if (baseEncoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = baseEncoder.FallbackBuffer; - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback); - pSrc = pSrcForFallback; - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if BIT64 - // check for overflow - if (byteCount < 0) - { - break; - } -#endif - -#if FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - int availableChars = PtrDiff(pEnd, pSrc); + // Simple narrowing conversion couldn't operate on entire buffer - invoke fallback. 
- // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) - { - // try to get over the remainder of the ascii characters fast though - char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - goto ProcessChar; - } - - // we are done - break; - } - -#if BIT64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - char* pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! 
- while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII - { - if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & unchecked((int)0xFF80)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF80)) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII - { - if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((ch & unchecked((int)0xFF80)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF800000)) != 0) - byteCount++; - if ((chc & unchecked((int)0xFF80)) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: - if (BitConverter.IsLittleEndian) - { - ch = (char)ch; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - pSrc++; - - if (ch <= 0x7F) - { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) - { - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - // !IsHighSurrogate(ch) // low without high -> bad - ch > CharUnicodeInfo.HIGH_SURROGATE_END || - // !IsLowSurrogate(chd) // high not followed by low -> bad - !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop 
to figure out error - pSrc--; - break; - } - pSrc++; - - // byteCount - this byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } -#endif // FASTLOOP - - // no pending char at this point - ch = 0; + return GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten); } - -#if BIT64 - // check for overflow - if (byteCount < 0) - { - throw new ArgumentException( - SR.Argument_ConversionOverflow); - } -#endif - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); - - return byteCount; } - // diffs two char pointers using unsigned arithmetic. The unsigned arithmetic - // is good enough for us, and it tends to generate better code than the signed - // arithmetic generated by default - private static unsafe int PtrDiff(char* a, char* b) + [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon + private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed) { - return (int)(((uint)((byte*)a - (byte*)b)) >> 1); - } + // We don't care about the exact OperationStatus value returned by the workhorse routine; we only + // care if the workhorse was able to consume the entire input payload. If we're unable to do so, + // we'll handle the remainder in the fallback routine. 
- // byte* flavor just for parity - private static unsafe int PtrDiff(byte* a, byte* b) - { - return (int)(a - b); - } + Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining); - private static bool InRange(int ch, int start, int end) - { - return (uint)(ch - start) <= (uint)(end - start); + bytesConsumed = (int)(pInputBufferRemaining - pBytes); + return (int)(pOutputBufferRemaining - pChars); } - // Our workhorse - // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw - internal sealed override unsafe int GetBytes( - char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS baseEncoder) + private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan<byte> bytes, int originalBytesLength, Span<char> chars, int originalCharsLength, DecoderNLS decoder) { - Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null"); - Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); - Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null"); - - UTF8Encoder encoder = null; - - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer fallbackBuffer = null; - char* pSrcForFallback; - - char* pSrc = chars; - byte* pTarget = bytes; - - char* pEnd = pSrc + charCount; - byte* pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - - // assume that JIT will en-register pSrc, pTarget and ch + // We special-case DecoderReplacementFallback if it's telling us to write a single U+FFFD char, + // since we believe this to be relatively common and we can handle it more efficiently than + // the base implementation. - if (baseEncoder != null) + if (((decoder is null) ? 
this.DecoderFallback : decoder.Fallback) is DecoderReplacementFallback replacementFallback + && replacementFallback.MaxCharCount == 1 + && replacementFallback.DefaultString[0] == UnicodeUtility.ReplacementChar) { - encoder = (UTF8Encoder)baseEncoder; - ch = encoder.surrogateChar; - - // We mustn't have left over fallback data when counting - if (encoder.InternalHasFallbackBuffer) - { - // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary - fallbackBuffer = encoder.FallbackBuffer; - if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow) - throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true); - } - } - - for (;;) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - if (ch == 0) - { - // Check if there's anything left to get out of the fallback buffer - ch = fallbackBuffer != null ? 
fallbackBuffer.InternalGetNextChar() : 0; - if (ch > 0) - { - goto ProcessChar; - } - } - else - { - // Case of leftover surrogates in the fallback buffer - if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - int cha = ch; - - ch = fallbackBuffer.InternalGetNextChar(); - - if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - else - { - break; - } - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0 && (encoder == null || encoder.MustFlush)) - goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) - { - // We have a high surrogate left over from a previous loop. - Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. 
- // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - ch = cha + (ch << 10) + - (0x10000 - - CharUnicodeInfo.LOW_SURROGATE_START - - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != null) - { - ch = fallbackBuffer.InternalGetNextChar(); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - // if (IsHighSurrogate(ch)) { - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) - { - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - // if (IsLowSurrogate(ch) || IsHighSurrogate(ch)) - if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == null) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - if (baseEncoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = baseEncoder.FallbackBuffer; - - // Set our internal fallback interesting things. - fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback); - pSrc = pSrcForFallback; - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - if (ch > 0xFFFF) - { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) - { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackBuffer != null && fallbackBuffer.bFallingBack) - { - fallbackBuffer.MovePrevious(); // Didn't use this fallback char - if (ch > 0xFFFF) - fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - Debug.Assert(pSrc >= chars || pTarget == bytes, - "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); - ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must - ch = 0; // Nothing left over (we backed up to start of pair if supplementary) - break; - } - - if (ch <= 0x7F) - { - *pTarget = (byte)ch; - } - else - { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) - { - // 2 byte encoding - chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) - { - chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12)); - } - else - { - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - pTarget++; - - chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chb; - pTarget++; - - chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - *pTarget = (byte)chb; - pTarget++; - - *pTarget = 
(byte)(unchecked((sbyte)0x80) | ch & 0x3F); - } - pTarget++; - - -#if FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0) - goto ProcessChar; - - int availableChars = PtrDiff(pEnd, pSrc); - int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) - { - // we are hoping for 1 byte per char - if (availableBytes < availableChars) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 byte per char - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (byte)ch; - pTarget++; - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 byte per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. + // Don't care about the exact OperationStatus, just how much of the payload we were able + // to process. - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
- char* pStop = pSrc + availableChars - 5; + Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: decoder is null || decoder.MustFlush); - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - - // get pSrc aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) - { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (byte)ch; - *(pTarget + 1) = (byte)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (byte)chc; - *(pTarget + 3) = (byte)(chc >> 16); - pTarget += 4; - } - else - { - *pTarget = (byte)(ch>>16); - *(pTarget+1) = (byte)ch; - pSrc += 4; - *(pTarget+2) = (byte)(chc>>16); - *(pTarget+3) = (byte)chc; - pTarget += 4; - } - } - continue; - - LongCodeWithMask: - if (BitConverter.IsLittleEndian) - { - ch = (char)ch; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) - { - // 2 byte encoding - chd = unchecked((sbyte)0xC0) | (ch >> 6); - } - else - { - // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch)) - if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // 3 byte encoding - chd = unchecked((sbyte)0xE0) | (ch >> 12); - } - else - { - // 4 byte encoding - high surrogate + low surrogate - // if (!IsHighSurrogate(ch)) - if (ch > 
CharUnicodeInfo.HIGH_SURROGATE_END) - { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - - // if (!IsLowSurrogate(chd)) { - if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) - { - // high not followed by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - CharUnicodeInfo.LOW_SURROGATE_START - - (CharUnicodeInfo.HIGH_SURROGATE_START << 10)); - - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - // pStop - this byte is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 2 byte sequence for 1 char so need pStop--. - pTarget++; - - *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F); - // pStop - this byte is already included - pTarget++; - } + // Slice off how much we consumed / wrote. - Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); + bytes = bytes.Slice(bytesRead); + chars = chars.Slice(charsWritten); + } -#endif // FASTLOOP + // If we couldn't go through our fast fallback mechanism, or if we still have leftover + // data because we couldn't consume everything in the loop above, we need to go down the + // slow fallback path. - // no pending char at this point - ch = 0; + if (bytes.IsEmpty) + { + return originalCharsLength - chars.Length; // total number of chars written } - - // Do we have to set the encoder bytes? 
- if (encoder != null) + else { - Debug.Assert(!encoder.MustFlush || ch == 0, - "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture)); - - encoder.surrogateChar = ch; - encoder._charsUsed = (int)(pSrc - chars); + return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder); } - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || - baseEncoder == null || !baseEncoder._throwOnOverflow, - "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting"); - - return (int)(pTarget - bytes); } - - // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits - // while the actual character is being built in the lower bits. They are shifted together - // with the actual bits of the character. - - // bits 30 & 31 are used for pending bits fixup - private const int FinalByte = 1 << 29; - private const int SupplimentarySeq = 1 << 28; - private const int ThreeByteSeq = 1 << 27; - - // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. - // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates. + // Returns a string containing the decoded representation of a range of + // bytes in a byte array. // - // To simplify maintenance, the structure of GetCharCount and GetChars should be - // kept the same as much as possible - internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null"); - - // Initialize stuff - byte* pSrc = bytes; - byte* pEnd = pSrc + count; + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - int charCount = count; - int ch = 0; - DecoderFallbackBuffer fallback = null; + public override unsafe string GetString(byte[] bytes, int index, int count) + { + // Validate Parameters - if (baseDecoder != null) + if (bytes is null) { - UTF8Decoder decoder = (UTF8Decoder)baseDecoder; - ch = decoder.bits; - charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars. - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check _throwOnOverflow for count) - Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start"); + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array); } - for (;;) + if ((index | count) < 0) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - break; - } - - if (ch == 0) - { - // no pending bits - goto ReadChar; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & unchecked((sbyte)0xC0)) != 0x80) - { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) - { - Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) - { - if ((ch & (FinalByte >> 6)) != 0) - { - // this is 3rd byte (of 4 byte supplementary) - nothing to do - continue; - } - - // 2nd byte, check for non-shortest form of supplementary char and the valid - // supplementary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) - { - goto InvalidByteSequence; - } - } - else - { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) - { - charCount--; - } - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the goto referencing it - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, null); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, 
fallback); - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) - { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) - { - // Unexpected trail byte - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) - { - if ((ch & 0x10) != 0) - { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) - { - ch |= 0xf0; - goto InvalidByteSequence; - } - - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. - ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; - } - else - { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. 
- charCount--; - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - -#if FASTLOOP - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) - { - // try to get over the remainder of the ascii characters fast though - byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - byte* pStop = pSrc + availableBytes - 7; - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - - // get pSrc 2-byte aligned - if ((unchecked((int)pSrc) & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - } - - // get pSrc 4-byte aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *(ushort*)pSrc; - if ((ch & 0x8080) != 0) - { - goto LongCodeWithMask16; - } - pSrc += 2; - } - - // Run 8 + 8 characters at a time! 
- while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - pSrc += 8; - - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; - - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - pSrc += 8; - } - break; - - LongCodeWithMask32: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - LongCodeWithMask16: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - ch = (int)(((uint)ch) >> 8); - } - - pSrc++; - if (ch <= 0x7F) - { - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) - { - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. 
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else - { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) - { - goto BadLongCode; - } - } - - // extra byte - charCount--; - } -#endif // FASTLOOP - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; + ThrowHelper.ThrowArgumentOutOfRangeException( + argument: (index < 0) ? ExceptionArgument.index : ExceptionArgument.count, + resource: ExceptionResource.ArgumentOutOfRange_NeedNonNegNum); } - // May have a problem if we have to flush - if (ch != 0) + if (bytes.Length - index < count) { - // We were already adjusting for these, so need to un-adjust - charCount += (ch >> 30); - if (baseDecoder == null || baseDecoder.MustFlush) - { - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, null); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - } + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer); } - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check _throwOnOverflow for count) - Debug.Assert(fallback == null || fallback.Remaining == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); + // Avoid problems with empty input buffer + if (count == 0) + return string.Empty; - return charCount; + fixed (byte* pBytes = bytes) + { + return string.CreateStringFromEncoding(pBytes + index, count, this); + } } - // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method. 
- // So if we're really broken, then that could also throw an error... recursively. - // So try to make sure GetChars can at least process all uses by - // System.Resources.ResourceReader! // - // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms. - // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates. + // End of standard methods copied from EncodingNLS.cs // - // To simplify maintenance, the structure of GetCharCount and GetChars should be - // kept the same as much as possible - internal sealed override unsafe int GetChars( - byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder) - { - Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null"); - Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0"); - Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); - Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null"); - - byte* pSrc = bytes; - char* pTarget = chars; - - byte* pEnd = pSrc + byteCount; - char* pAllocatedBufferEnd = pTarget + charCount; - - int ch = 0; - - DecoderFallbackBuffer fallback = null; - byte* pSrcForFallback; - char* pTargetForFallback; - if (baseDecoder != null) - { - UTF8Decoder decoder = (UTF8Decoder)baseDecoder; - ch = decoder.bits; - - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty) - Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at start"); - } - for (;;) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - break; - } - - if (ch == 0) - { - // no pending bits - goto ReadChar; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & unchecked((sbyte)0xC0)) != 0x80) - { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) - { - // Not at last byte yet - Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) - { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) - { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) - { - goto InvalidByteSequence; - } - } - else - { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) - { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) - { - *pTarget = (char)(((ch >> 10) & 0x7FF) + - unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))))); - pTarget++; - - ch = (ch & 0x3FF) + - unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START)); - } - } - - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, pAllocatedBufferEnd); - } - // That'll back us up the appropriate # of bytes if we didn't get anywhere - pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered - bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback); - pSrc = pSrcForFallback; - pTarget = pTargetForFallback; - - if (!fallbackResult) - { - // Ran out of buffer space - // Need to throw an exception? 
- Debug.Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); - fallback.InternalReset(); - ThrowCharsOverflow(baseDecoder, pTarget == chars); - ch = 0; - break; - } - Debug.Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) - { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) - { - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) - { - if ((ch & 0x10) != 0) - { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) - { - ch |= 0xf0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - } - else - { - // 3 byte encoding - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) - { - if (ch > 0x7ff) - { - if (ch >= CharUnicodeInfo.LOW_SURROGATE_START && - ch <= CharUnicodeInfo.LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) - { - pSrc--; // It was 4 bytes, nothing 
was stored - } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - Debug.Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); - ThrowCharsOverflow(baseDecoder, pTarget == chars); - - // Don't store ch in decoder, we already backed up to its start - ch = 0; - - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (char)ch; - pTarget++; - -#if FASTLOOP - int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) - { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (char)ch; - pTarget++; - } - // we are done - ch = 0; - break; - } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. 
- if (availableChars < availableBytes) - { - availableBytes = availableChars; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - char* pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (char)ch; - pTarget++; - - // get pSrc to be 2-byte aligned - if ((unchecked((int)pSrc) & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (char)ch; - pTarget++; - } - - // get pSrc to be 4-byte aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *(ushort*)pSrc; - if ((ch & 0x8080) != 0) - { - goto LongCodeWithMask16; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (char)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - pTarget += 2; - } - else - { - *pTarget = (char)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget+1) = (char)(ch & 0x7F); - pTarget += 2; - } - } - - // Run 8 characters at a time! 
- while (pTarget < pStop) - { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & unchecked((int)0x80808080)) != 0) - { - goto LongCodeWithMask32; - } - - // Unfortunately, this is endianess sensitive - if (BitConverter.IsLittleEndian) - { - *pTarget = (char)(ch & 0x7F); - *(pTarget + 1) = (char)((ch >> 8) & 0x7F); - *(pTarget + 2) = (char)((ch >> 16) & 0x7F); - *(pTarget + 3) = (char)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (char)(chb & 0x7F); - *(pTarget + 5) = (char)((chb >> 8) & 0x7F); - *(pTarget + 6) = (char)((chb >> 16) & 0x7F); - *(pTarget + 7) = (char)((chb >> 24) & 0x7F); - pTarget += 8; - } - else - { - *pTarget = (char)((ch >> 24) & 0x7F); - *(pTarget+1) = (char)((ch >> 16) & 0x7F); - *(pTarget+2) = (char)((ch >> 8) & 0x7F); - *(pTarget+3) = (char)(ch & 0x7F); - pSrc += 8; - *(pTarget+4) = (char)((chb >> 24) & 0x7F); - *(pTarget+5) = (char)((chb >> 16) & 0x7F); - *(pTarget+6) = (char)((chb >> 8) & 0x7F); - *(pTarget+7) = (char)(chb & 0x7F); - pTarget += 8; - } - } - break; - - LongCodeWithMask32: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - LongCodeWithMask16: - if (BitConverter.IsLittleEndian) - { - ch &= 0xFF; - } - else - { - ch = (int)(((uint)ch) >> 8); - } - pSrc++; - if (ch <= 0x7F) - { - *pTarget = (char)ch; - pTarget++; - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) - { - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time 
- !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (char)(((ch >> 10) & 0x7FF) + - unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))); - pTarget++; - - ch = (ch & 0x3FF) + - unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START)); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & unchecked((sbyte)0xC0)) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } - } - else - { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - goto BadLongCode; - } - ch = (ch << 6) | chc; - } - - *pTarget = (char)ch; - pTarget++; - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. 
- pStop--; - } -#endif // FASTLOOP + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe int GetCharCountCommon(byte* pBytes, int byteCount) + { + // Common helper method for all non-DecoderNLS entry points to GetCharCount. + // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32. - Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); + Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer."); + Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified."); - // no pending bits at this point - ch = 0; - continue; + // First call into the fast path. + // Don't bother providing a fallback mechanism; our fast path doesn't use it. - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } + int totalCharCount = GetCharCountFast(pBytes, byteCount, fallback: null, out int bytesConsumed); - if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush)) + if (bytesConsumed != byteCount) { - // Have to do fallback for invalid bytes - if (fallback == null) - { - if (baseDecoder == null) - fallback = this.decoderFallback.CreateFallbackBuffer(); - else - fallback = baseDecoder.FallbackBuffer; - fallback.InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered - pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered - bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback); - pSrc = pSrcForFallback; - pTarget = pTargetForFallback; + // If there's still data remaining in the source buffer, go down the fallback path. 
// Fast path for counting chars: validates as much of the input as is well-formed UTF-8 and
// reports how many UTF-16 code units that well-formed prefix would produce.
//   pBytes / bytesLength - the UTF-8 input buffer
//   fallback             - not consulted by this fast path
//   bytesConsumed        - receives the length (in bytes) of the well-formed prefix
[MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback fallback, out int bytesConsumed)
{
    byte* pFirstInvalid = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16Adjustment, out _);

    // Work from a local so that the out parameter is written exactly once.
    int validByteCount = (int)(pFirstInvalid - pBytes);
    bytesConsumed = validByteCount;

    // A UTF-8 sequence never decodes to more UTF-16 code units than it has UTF-8 code units,
    // so this sum cannot overflow.
    return validByteCount + utf16Adjustment;
}

public override Decoder GetDecoder() => new DecoderNLS(this);

public override Encoder GetEncoder() => new EncoderNLS(this);

//
// Beginning of methods used by shared fallback logic.
//

// Reports the UTF-8 length of a single scalar value. Always succeeds, because every
// well-formed Rune is representable as 1..4 UTF-8 code units.
internal sealed override bool TryGetByteCount(Rune value, out int byteCount)
{
    byteCount = value.Utf8SequenceLength;
    return true;
}

// Encodes a single scalar value as UTF-8 into 'bytes'. The only possible failure is that
// the destination is too small, since every well-formed Rune is encodable.
internal sealed override OperationStatus EncodeRune(Rune value, Span<byte> bytes, out int bytesWritten)
{
    if (value.TryEncodeToUtf8(bytes, out bytesWritten))
    {
        return OperationStatus.Done;
    }

    return OperationStatus.DestinationTooSmall;
}

// Decodes the first scalar value from a UTF-8 buffer; defers entirely to Rune.DecodeFromUtf8.
internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
    => Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);

//
// End of methods used by shared fallback logic.
//
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Numerics;
using Internal.Runtime.CompilerServices;

#if BIT64
using nint = System.Int64;
using nuint = System.UInt64;
#else // BIT64
using nint = System.Int32;
using nuint = System.UInt32;
#endif // BIT64

namespace System.Text.Unicode
{
    internal static unsafe partial class Utf16Utility
    {
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <param name="pInputBuffer">Start of the UTF-16 data. May be null only if <paramref name="inputLength"/> is zero.</param>
        /// <param name="inputLength">Number of UTF-16 code units to inspect. Must not be negative.</param>
        /// <param name="utf8CodeUnitCountAdjustment">
        /// Receives the value which, when added to the number of UTF-16 code units consumed (the distance between
        /// the returned pointer and <paramref name="pInputBuffer"/>), gives the number of UTF-8 code units
        /// needed to encode the consumed data.
        /// </param>
        /// <param name="scalarCountAdjustment">
        /// Receives the value which, when added to the number of UTF-16 code units consumed, gives the number
        /// of Unicode scalar values in the consumed data.
        /// </param>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment = 0;
                return pInputBuffer;
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen. To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int tempScalarCountAdjustment = 0;

            if (Sse2.IsSupported)
            {
                if (inputLength >= Vector128<ushort>.Count)
                {
                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
                    Vector128<ushort> vectorZero = Vector128<ushort>.Zero;

                    do
                    {
                        Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
                        uint mask;

                        Vector128<ushort> charIsNonAscii;
                        if (Sse41.IsSupported)
                        {
                            // sets 0x0080 bit if corresponding char element is >= 0x0080
                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                        }
                        else
                        {
                            // sets 0x8000 bit if corresponding char element is >= 0x0080
                            charIsNonAscii = Sse2.AndNot(vector0080, Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 7)));
                        }

                        // sets 0x8080 bits if corresponding char element is >= 0x0800
                        Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));

                        mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());

                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is non-ASCII
                        //            |   ,-- set if char[0] is non-ASCII
                        //            v   v
                        // mask = ... 1 1 1 0
                        //              ^   ^-- set if char[0] is >= 0x0800
                        //              `-- set if char[1] is >= 0x0800
                        //
                        // (If the SSE4.1 code path is taken above, the meaning of the odd and even
                        // bits are swapped, but the logic below otherwise holds.)
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.

                        utf16Data = Sse2.Add(utf16Data, vectorA800);
                        mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.

                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());

                            // 'lowSurrogatesMask' bit pairs: 01 only if the char was a low surrogate, else 00.
                            // ANDing with 'mask' discards the garbage bits of non-surrogate chars.

                            uint lowSurrogatesMask = mask2 & mask;

                            // 'highSurrogatesMask' bit pairs: 01 only if the char was a high surrogate, else 00.
                            // Flip the even-numbered bits of 'mask2' (00 <-> 01) and then AND with 'mask'.
                            // The AND with 'mask' is essential: without it the garbage bits that 'mask2'
                            // carries for non-surrogate chars would be misread as high surrogates, which
                            // would allow a non-surrogate char followed by a lone low surrogate to be
                            // accepted as a well-formed pair.

                            uint highSurrogatesMask = (mask2 ^ 0x5555u) & mask;

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                                pInputBuffer--;
                                inputLength++;
                            }

                            int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= surrogatePairsCount;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128<ushort>.Count;
                        inputLength -= Vector128<ushort>.Count;
                    } while (inputLength >= Vector128<ushort>.Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector<ushort>.Count)
                {
                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector<nuint>.Count; i++)
                        {
                            popcnt += sumVector[i];
                        }

                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the SSE2 path, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector<ushort>.Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            // The loop below only compares element [i] against element [i + 1], so it
                            // never inspects lowSurrogateChars[0]. A low surrogate in the first element
                            // can never be the trailing half of a pair: whenever a vector ends with a
                            // high surrogate we back up one char, so that high surrogate becomes
                            // element 0 of the next vector read.

                            if (lowSurrogateChars[0] != 0)
                            {
                                goto Error; // error: vector begins with a standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector<ushort>.Count;
                        inputLength -= Vector<ushort>.Count;
                    } while (inputLength >= Vector<ushort>.Count);
                }
            }

        NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++; // consumed one extra char
                inputLength--;
            }

        Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment = tempScalarCountAdjustment;
            return pInputBuffer;
        }
    }
}
/// <summary>
/// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
/// </summary>
/// <param name="source">The UTF-16 input.</param>
/// <param name="destination">The buffer which receives the UTF-8 output.</param>
/// <param name="charsRead">Receives the number of chars of <paramref name="source"/> that were consumed.</param>
/// <param name="bytesWritten">Receives the number of bytes written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for each invalid char; <see langword="false"/> to stop
/// and report <see cref="OperationStatus.InvalidData"/> instead.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="false"/> if more input may follow, in which case <see cref="OperationStatus.NeedMoreData"/>
/// is returned as-is rather than being treated as invalid data.
/// </param>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static unsafe OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437

    _ = source.Length;
    _ = destination.Length;

    fixed (char* pSourceStart = &MemoryMarshal.GetReference(source))
    fixed (byte* pDestinationStart = &MemoryMarshal.GetReference(destination))
    {
        // Transcode in bulk; each pass through the loop ends either with a final
        // status or with one invalid char having been patched over with U+FFFD.

        OperationStatus status = OperationStatus.Done;
        char* pSourceCursor = pSourceStart;
        byte* pDestinationCursor = pDestinationStart;

        while (!source.IsEmpty)
        {
            // Both spans were pinned on entry, so raw pointers into them remain
            // valid for the duration of this loop.

            char* pChunkSource = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
            byte* pChunkDestination = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));

            status = Utf8Utility.TranscodeToUtf8(
                pInputBuffer: pChunkSource,
                inputLength: source.Length,
                pOutputBuffer: pChunkDestination,
                outputBytesRemaining: destination.Length,
                pInputBufferRemaining: out pSourceCursor,
                pOutputBufferRemaining: out pDestinationCursor);

            // Done or DestinationTooSmall: nothing further we can do.

            if (status <= OperationStatus.DestinationTooSmall)
            {
                break;
            }

            // Truncated input, but the caller says more may be on its way.

            if (status == OperationStatus.NeedMoreData && !isFinalBlock)
            {
                break;
            }

            // From here on the input is genuinely invalid (bad data, or truncated at end of stream).

            if (!replaceInvalidSequences)
            {
                status = OperationStatus.InvalidData; // may have been NeedMoreData - report it as an error
                break;
            }

            // Advance past what was already written, then make sure that the
            // three-byte encoding of U+FFFD still fits.

            destination = destination.Slice((int)(pDestinationCursor - pChunkDestination));

            if ((uint)destination.Length <= 2)
            {
                status = OperationStatus.DestinationTooSmall;
                break;
            }

            destination[0] = 0xEF; // U+FFFD = [ EF BF BD ] in UTF-8
            destination[1] = 0xBF;
            destination[2] = 0xBD;
            destination = destination.Slice(3);

            // An invalid UTF-16 sequence is always exactly one char long: skip over it.

            source = source.Slice((int)(pSourceCursor - pChunkSource) + 1);

            // The error has been patched, so if the loop ends now this is a success.

            status = OperationStatus.Done;
            pSourceCursor = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
            pDestinationCursor = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
        }

        // No further progress is possible - tell the caller how far we got.

        charsRead = (int)(pSourceCursor - pSourceStart);
        bytesWritten = (int)(pDestinationCursor - pDestinationStart);
        return status;
    }
}

/// <summary>
/// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
/// </summary>
/// <param name="source">The UTF-8 input.</param>
/// <param name="destination">The buffer which receives the UTF-16 output.</param>
/// <param name="numBytesRead">Receives the number of bytes of <paramref name="source"/> that were consumed.</param>
/// <param name="numCharsWritten">Receives the number of chars written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">
/// <see langword="true"/> to substitute U+FFFD for each invalid sequence; <see langword="false"/> to stop
/// and report <see cref="OperationStatus.InvalidData"/> instead.
/// </param>
/// <param name="isFinalBlock">
/// <see langword="false"/> if more input may follow, in which case <see cref="OperationStatus.NeedMoreData"/>
/// is returned as-is rather than being treated as invalid data.
/// </param>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static unsafe OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/23437

    _ = source.Length;
    _ = destination.Length;

    fixed (byte* pSourceStart = &MemoryMarshal.GetReference(source))
    fixed (char* pDestinationStart = &MemoryMarshal.GetReference(destination))
    {
        // Transcode in bulk; each pass through the loop ends either with a final
        // status or with one invalid sequence having been patched over with U+FFFD.

        OperationStatus status = OperationStatus.Done;
        byte* pSourceCursor = pSourceStart;
        char* pDestinationCursor = pDestinationStart;

        while (!source.IsEmpty)
        {
            // Both spans were pinned on entry, so raw pointers into them remain
            // valid for the duration of this loop.

            byte* pChunkSource = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
            char* pChunkDestination = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));

            status = Utf8Utility.TranscodeToUtf16(
                pInputBuffer: pChunkSource,
                inputLength: source.Length,
                pOutputBuffer: pChunkDestination,
                outputCharsRemaining: destination.Length,
                pInputBufferRemaining: out pSourceCursor,
                pOutputBufferRemaining: out pDestinationCursor);

            // Done or DestinationTooSmall: nothing further we can do.

            if (status <= OperationStatus.DestinationTooSmall)
            {
                break;
            }

            // Truncated input, but the caller says more may be on its way.

            if (status == OperationStatus.NeedMoreData && !isFinalBlock)
            {
                break;
            }

            // From here on the input is genuinely invalid (bad data, or truncated at end of stream).

            if (!replaceInvalidSequences)
            {
                status = OperationStatus.InvalidData; // may have been NeedMoreData - report it as an error
                break;
            }

            // Advance past what was already written, then make sure that a
            // single replacement char still fits.

            destination = destination.Slice((int)(pDestinationCursor - pChunkDestination));

            if (destination.IsEmpty)
            {
                status = OperationStatus.DestinationTooSmall;
                break;
            }

            destination[0] = (char)UnicodeUtility.ReplacementChar;
            destination = destination.Slice(1);

            // Work out how many source bytes make up the invalid sequence so that
            // we can skip all of them before retrying. This may be more than one byte.

            source = source.Slice((int)(pSourceCursor - pChunkSource));
            Debug.Assert(!source.IsEmpty, "Expected 'Done' if source is fully consumed.");

            Rune.DecodeFromUtf8(source, out _, out int invalidSequenceLength);
            source = source.Slice(invalidSequenceLength);

            // The error has been patched, so if the loop ends now this is a success.

            status = OperationStatus.Done;
            pSourceCursor = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
            pDestinationCursor = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
        }

        // No further progress is possible - tell the caller how far we got.

        numBytesRead = (int)(pSourceCursor - pSourceStart);
        numCharsWritten = (int)(pDestinationCursor - pDestinationStart);
        return status;
    }
}
+ +using System.Buffers.Binary; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using Internal.Runtime.CompilerServices; + +namespace System.Text.Unicode +{ + internal static partial class Utf8Utility + { + /// <summary> + /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the + /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation. + /// The fourth byte is ignored, and no well-formedness validation is performed here. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractCharFromFirstThreeByteSequence(uint value) + { + if (BitConverter.IsLittleEndian) + { + // value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ] + return ((value & 0x003F_0000u) >> 16) + | ((value & 0x0000_3F00u) >> 2) + | ((value & 0x0000_000Fu) << 12); + } + else + { + // value = [ 1110zzzz 10yyyyyy 10xxxxxx | ######## ] + return ((value & 0x0F00_0000u) >> 12) + | ((value & 0x003F_0000u) >> 10) + | ((value & 0x0000_3F00u) >> 8); + } + } + + /// <summary> + /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the + /// first two bytes as a two-byte UTF-8 subsequence and returns the UTF-16 representation. 
+ /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractCharFromFirstTwoByteSequence(uint value) + { + Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value) && !UInt32BeginsWithOverlongUtf8TwoByteSequence(value)); + + if (BitConverter.IsLittleEndian) + { + // value = [ ######## ######## | 10xxxxxx 110yyyyy ] + uint leadingByte = (uint)(byte)value << 6; + return (uint)(byte)(value >> 8) + leadingByte - (0xC0u << 6) - 0x80u; // remove header bits + } + else + { + // value = [ 110yyyyy 10xxxxxx | ######## ######## ] + return (char)(((value & 0x1F00_0000u) >> 18) | ((value & 0x003F_0000u) >> 16)); + } + } + + /// <summary> + /// Given a machine-endian DWORD which contains four bytes of UTF-8 data, interprets the input as a + /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] + private static uint ExtractCharsFromFourByteSequence(uint value) + { + if (BitConverter.IsLittleEndian) + { + if (Bmi2.IsSupported) + { + // need to reverse endianness for bit manipulation to work correctly + value = BinaryPrimitives.ReverseEndianness(value); + + // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] + // want to return [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] (low surrogate in the high WORD) + // where wwww = uuuuu - 1 + + // The first extract gathers the 11 bits uuuuuzzzzyy; the second gathers the 10 bits yyyyxxxxxx. + uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u); + uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u); + + uint combined = (lowSurrogateChar << 16) + highSurrogateChar; + combined -= 0x40u; // wwww = uuuuu - 1 + combined += 0xDC00_D800u; // add surrogate markers + return combined; + } + else + { + // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx + // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + // where wwww = uuuuu - 1 + uint 
retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ] + retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ] + retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ] + retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ] + retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ] + retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ] + retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ] + retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ] + retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + return retVal; + } + } + else + { + // input is UTF8 [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] = scalar 000uuuuu zzzzyyyy yyxxxxxx + // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ] + // where wwww = uuuuu - 1 + uint retVal = value & 0xFF00_0000u; // retVal = [ 11110uuu 00000000 00000000 00000000 ] + retVal |= (value & 0x003F_0000u) << 2; // retVal = [ 11110uuu uuzzzz00 00000000 00000000 ] + retVal |= (value & 0x0000_3000u) << 4; // retVal = [ 11110uuu uuzzzzyy 00000000 00000000 ] + retVal |= (value & 0x0000_0F00u) >> 2; // retVal = [ 11110uuu uuzzzzyy 000000yy yy000000 ] + retVal |= (value & 0x0000_003Fu); // retVal = [ 11110uuu uuzzzzyy 000000yy yyxxxxxx ] + retVal -= 0x2000_0000u; // retVal = [ 11010uuu uuzzzzyy 000000yy yyxxxxxx ] + retVal -= 0x0040_0000u; // retVal = [ 110100ww wwzzzzyy 000000yy yyxxxxxx ] + retVal += 0x0000_DC00u; // retVal = [ 110100ww wwzzzzyy 110111yy yyxxxxxx ] + retVal += 0x0800_0000u; // retVal = [ 110110ww wwzzzzyy 110111yy yyxxxxxx ] + return retVal; + } + } + + /// <summary> + /// Given a 32-bit integer that represents a valid packed UTF-16 surrogate pair, all in machine-endian order, + /// returns the packed 4-byte UTF-8 
representation of this scalar value, also in machine-endian order. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value) + { + Debug.Assert(IsWellFormedUtf16SurrogatePair(value)); + + if (BitConverter.IsLittleEndian) + { + // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx) + // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1 + + if (Bmi2.IsSupported) + { + // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want + // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple + // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across + // all four output bytes. + + uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */; + + // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning + // that should normally be masked out via an and, but we'll just direct pdep to ignore it. 
+ + uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ] + return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] + } + else + { + value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] + + uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] + tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] + + uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] + uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] + tempC |= tempB; + + uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ] + tempD |= 0x8080_80F0u; + + return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] + } + } + else + { + // input = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ], where wwww = uuuuu - 1 + // must return [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ], where wwww = uuuuu - 1 + + value -= 0xD800_DC00u; // = [ 000000wwwwzzzzyy 000000yyyyxxxxxx ] + value += 0x0040_0000u; // = [ 00000uuuuuzzzzyy 000000yyyyxxxxxx ] + + uint tempA = value & 0x0700_0000u; // = [ 00000uuu 00000000 00000000 00000000 ] + uint tempB = (value >> 2) & 0x003F_0000u; // = [ 00000000 00uuzzzz 00000000 00000000 ] + tempB |= tempA; + + uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ] + // The two high 'y' bits sit in bits 16..17 of value (the low bits of the high WORD); they belong + // in bits 12..13 of the result (third output byte), so shift right by 4 and mask with 0x3000. 
+ uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ] + tempD |= tempC; + + uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ] + return (tempE | tempB | tempD); // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] + } + } + + /// <summary> + /// Given a machine-endian DWORD which represents two adjacent UTF-8 two-byte sequences, + /// returns the machine-endian DWORD representation of that same 
data as two adjacent + /// UTF-16 code units. + /// </summary> + /// <param name="value">Two adjacent two-byte UTF-8 sequences packed into a machine-endian DWORD.</param> + /// <returns>The two resulting UTF-16 code units packed into a DWORD, in the same WORD order as the input.</returns> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value) + { + // We don't want to swap the position of the high and low WORDs, + // as the buffer was read in machine order and will be written in + // machine order. + + if (BitConverter.IsLittleEndian) + { + // value = [ 10xxxxxx 110yyyyy | 10xxxxxx 110yyyyy ] + return ((value & 0x3F003F00u) >> 8) | ((value & 0x001F001Fu) << 6); + } + else + { + // value = [ 110yyyyy 10xxxxxx | 110yyyyy 10xxxxxx ] + return ((value & 0x1F001F00u) >> 2) | (value & 0x003F003Fu); + } + } + + /// <summary> + /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences, + /// returns the machine-endian DWORD representation of that same data as two + /// adjacent UTF-8 two-byte sequences. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value) + { + // stays in machine endian + + Debug.Assert(IsFirstCharTwoUtf8Bytes(value) && IsSecondCharTwoUtf8Bytes(value)); + + if (BitConverter.IsLittleEndian) + { + // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ] + // want to return [ 10XXXXXX 110YYYYY 10xxxxxx 110yyyyy ] + + return ((value >> 6) & 0x001F_001Fu) + ((value << 8) & 0x3F00_3F00u) + 0x80C0_80C0u; + } + else + { + // value = [ 00000YYY YYXXXXXX 00000yyy yyxxxxxx ] + // want to return [ 110YYYYY 10XXXXXX 110yyyyy 10xxxxxx ] + + return ((value << 2) & 0x1F00_1F00u) + (value & 0x003F_003Fu) + 0xC080_C080u; + } + } + + /// <summary> + /// Given a machine-endian DWORD which represents two adjacent UTF-16 sequences, + /// returns the machine-endian DWORD representation of the first UTF-16 char + /// as a UTF-8 two-byte sequence packed into a WORD and zero-extended to DWORD. 
+ /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value) + { + // stays in machine endian + + Debug.Assert(IsFirstCharTwoUtf8Bytes(value)); + + if (BitConverter.IsLittleEndian) + { + // value = [ ######## ######## 00000yyy yyxxxxxx ] + // want to return [ ######## ######## 10xxxxxx 110yyyyy ] + + uint temp = (value << 2) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ] + value &= 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ] + return BinaryPrimitives.ReverseEndianness((ushort)(temp + value + 0xC080u)); // [ 00000000 00000000 10xxxxxx 110yyyyy ] + } + else + { + // value = [ 00000yyy yyxxxxxx ######## ######## ] + // want to return [ ######## ######## 110yyyyy 10xxxxxx ] + + uint temp = (value >> 16) & 0x3Fu; // [ 00000000 00000000 00000000 00xxxxxx ] + // 'yyyyy' occupies bits 22..26 of value and must land in bits 8..12, so the shift is 14. + value = (value >> 14) & 0x1F00u; // [ 00000000 00000000 000yyyyy 00000000 ] + return value + temp + 0xC080u; // [ 00000000 00000000 110yyyyy 10xxxxxx ] + } + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character is ASCII. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharAscii(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0000..007F ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0000..007F ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (value & 0xFF80u) == 0) + || (!BitConverter.IsLittleEndian && value < 0x0080_0000u); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character requires *at least* 3 bytes to encode in UTF-8. 
+ /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ]. + // AAAA >= 0800 exactly when at least one of its top five bits (mask F800) is set. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0) + || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character is a surrogate character (either high or low). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharSurrogate(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ]. + // Subtracting D800 maps [ D800..DFFF ] onto [ 0000..07FF ], so AAAA is a surrogate + // iff the top five bits of the (wrapped) 16-bit difference are all clear. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0) + || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharTwoUtf8Bytes(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ]. 
+ + // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian + // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u) + || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)); + } + + /// <summary> + /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/> + /// is a UTF-8 continuation byte; i.e., the low byte is in the range [ 80..BF ]. + /// The upper three bytes of <paramref name="value"/> are ignored. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsLowByteUtf8ContinuationByte(uint value) + { + // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte), + // so the best we can do for now is the lea / cmp pair. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + // Subtracting 0x80 and truncating to a byte maps [ 80..BF ] onto [ 00..3F ]; + // every other low-byte value ends up above 3F. + return (byte)(value - 0x80u) <= 0x3Fu; + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character is ASCII. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharAscii(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && value < 0x0080_0000u) + || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8. + /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ]. + // BBBB >= 0800 exactly when at least one of its top five bits (mask F800) is set. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0) + || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character is a surrogate character (either high or low). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharSurrogate(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u) + || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharTwoUtf8Bytes(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ]. + // In the little-endian range check the upper bound is 07FF_FFFF so that the low WORD (####) may hold any value. + + // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian + // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)) + || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u); + } + + /// <summary> + /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte; + /// i.e., has binary representation 10xxxxxx, where x is any bit. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsUtf8ContinuationByte(in byte value) + { + // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements + // directly rather than bounce a temporary through a register. That is, we want the JIT to be + // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location + // to see if it's a continuation byte. 
Data that's already enregistered will go through the + // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions. + // + // The below check takes advantage of the two's complement representation of negative numbers. + // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ] + + return ((sbyte)value < -64); + } + + /// <summary> + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair + /// (a high surrogate as the first character followed by a low surrogate as the second). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsWellFormedUtf16SurrogatePair(uint value) + { + // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ]. + // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ]. + // + // We're essentially performing a range check on each component of the input in parallel. The allowed range + // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We + // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400 + // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can* + // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc". + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0) + || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0); + } + + /// <summary> + /// Converts a DWORD from machine-endian to little-endian. 
+ /// On a little-endian machine the value is returned unchanged; otherwise its bytes are reversed. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ToLittleEndian(uint value) + { + if (BitConverter.IsLittleEndian) + { + return value; + } + else + { + return BinaryPrimitives.ReverseEndianness(value); + } + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the first two bytes of the buffer are + /// an overlong representation of a sequence that should be represented as one byte. + /// This method *does not* validate that the sequence matches the appropriate + /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>). + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value) + { + // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. + Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value)); + + // Per Table 3-7 of the Unicode Standard, first byte of two-byte sequence must be within range C2 .. DF. + // Since we already validated it's C0 <= ?? <= DF (per the '110yyyyy' mask check earlier), now only need + // to check that it's < C2. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u)) + || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the first four bytes of the buffer match + /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This + /// method *does not* validate that the sequence is well-formed; the caller must + /// still perform overlong form or out-of-range checking. 
+ /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithUtf8FourByteMask(uint value) + { + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0xC0C0C0F8U; + // const uint comparand = 0x808080F0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xF8C0C0C0U; + // const uint comparand = 0xF0808080U; + // return ((value & mask) == comparand); + // } + // + // Note that the big-endian comparand must carry the 10xxxxxx marker (0x80) for all three + // continuation bytes, including the fourth byte of the buffer (the low byte of the DWORD). + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the first three bytes of the buffer match + /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form or surrogate checking. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value) + { + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0x00C0C0F0U; + // const uint comparand = 0x008080E0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xF0C0C000U; + // const uint comparand = 0xE0808000U; + // return ((value & mask) == comparand); + // } + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the first two bytes of the buffer match + /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form checking. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithUtf8TwoByteMask(uint value) + { + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0x0000C0E0U; + // const uint comparand = 0x000080C0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xE0C00000U; + // const uint comparand = 0xC0800000U; + // return ((value & mask) == comparand); + // } + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the last two bytes of the buffer are + /// an overlong representation of a sequence that should be represented as one byte. + /// This method *does not* validate that the sequence matches the appropriate + /// 2-byte sequence mask (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>). 
+ /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value) + { + // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. + Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value)); + + // Per Table 3-7 of the Unicode Standard, first byte of two-byte sequence must be within range C2 .. DF. + // We already validated that it's C0 .. DF (per the '110yyyyy' mask check earlier). + // C2 = 1100 0010 + // DF = 1101 1111 + // This means that we can AND the leading byte with the mask 0001 1110 (1E), + // and if the result is zero the sequence is overlong. + // The leading byte of the trailing sequence is the third byte of the buffer: + // bits 16..23 of the DWORD on little-endian, bits 8..15 on big-endian. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0)) + || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the last two bytes of the buffer match + /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form checking. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32EndsWithUtf8TwoByteMask(uint value) + { + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0xC0E00000U; + // const uint comparand = 0x80C00000U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0x0000E0C0U; + // const uint comparand = 0x0000C080U; + // return ((value & mask) == comparand); + // } + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0)); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, + /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed + /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a + /// single operation. Returns <see langword="false"/> if running on a big-endian machine. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value) + { + // Per Table 3-7 of the Unicode Standard, valid 2-byte sequences are [ C2..DF ] [ 80..BF ]. + // In little-endian, that would be represented as: + // [ ######## ######## 10xxxxxx 110yyyyy ]. + // Due to the little-endian representation we can perform a trick by ANDing the low + // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within + // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the + // 2-byte-sequence bitmask check and overlong form validation with one comparison. + + // Callers are expected to invoke this only on little-endian machines; debug builds assert it. + Debug.Assert(BitConverter.IsLittleEndian); + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu)) + || (!BitConverter.IsLittleEndian && false); + } + + /// <summary> + /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, + /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed + /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a + /// single operation. Returns <see langword="false"/> if running on a big-endian machine. 
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
{
    // Same single-comparison trick as UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian,
    // with the mask and the range bounds moved up into the high WORD, where a little-endian
    // read places the last two bytes of the buffer.
    const uint HighWordMask = 0xC0FF_0000u;
    const uint LowestValidSequence = 0x80C2_0000u;
    const uint HighestValidSequence = 0x80DF_0000u;

    Debug.Assert(BitConverter.IsLittleEndian);

    // The "(LE && x) || (!LE && y)" shape is deliberate: it works around
    // https://github.com/dotnet/coreclr/issues/914.

    return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & HighWordMask, LowestValidSequence, HighestValidSequence))
        || (!BitConverter.IsLittleEndian && false);
}

/// <summary>
/// Tests whether the first byte of a UTF-8 buffer, read into a DWORD in machine
/// endianness, is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool UInt32FirstByteIsAscii(uint value)
{
    // Little-endian: the first buffer byte is the low byte of the DWORD, so test bit 7.
    // Big-endian: the first buffer byte is the high byte, so its high bit is the DWORD's
    // sign bit and a signed ">= 0" comparison tests it.
    const uint LowByteHighBit = 0x80u;

    // The "(LE && x) || (!LE && y)" shape is deliberate: it works around
    // https://github.com/dotnet/coreclr/issues/914.

    return (BitConverter.IsLittleEndian && ((value & LowByteHighBit) == 0))
        || (!BitConverter.IsLittleEndian && ((int)value >= 0));
}

/// <summary>
/// Tests whether the fourth byte of a UTF-8 buffer, read into a DWORD in machine
/// endianness, is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool UInt32FourthByteIsAscii(uint value)
{
    // Mirror image of UInt32FirstByteIsAscii: on little-endian machines the fourth buffer
    // byte is the high byte of the DWORD (sign-bit test); on big-endian machines it is the
    // low byte (bit 7 test).
    const uint LowByteHighBit = 0x80u;

    // The "(LE && x) || (!LE && y)" shape is deliberate: it works around
    // https://github.com/dotnet/coreclr/issues/914.

    return (BitConverter.IsLittleEndian && ((int)value >= 0))
        || (!BitConverter.IsLittleEndian && ((value & LowByteHighBit) == 0));
}

/// <summary>
/// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
/// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool UInt32SecondByteIsAscii(uint value)
{
    // The second buffer byte occupies bits 8..15 of the DWORD on little-endian machines and
    // bits 16..23 on big-endian machines. It is ASCII when its high bit is clear.
    const uint HighBitLittleEndian = 0x8000u;
    const uint HighBitBigEndian = 0x0080_0000u;

    // The "(LE && x) || (!LE && y)" shape is deliberate: it works around
    // https://github.com/dotnet/coreclr/issues/914.

    return (BitConverter.IsLittleEndian && ((value & HighBitLittleEndian) == 0))
        || (!BitConverter.IsLittleEndian && ((value & HighBitBigEndian) == 0));
}

/// <summary>
/// Tests whether the third byte of a UTF-8 buffer, read into a DWORD in machine
/// endianness, is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool UInt32ThirdByteIsAscii(uint value)
{
    // The third buffer byte occupies bits 16..23 of the DWORD on little-endian machines and
    // bits 8..15 on big-endian machines.
    const uint HighBitLittleEndian = 0x0080_0000u;
    const uint HighBitBigEndian = 0x8000u;

    // The "(LE && x) || (!LE && y)" shape is deliberate: it works around
    // https://github.com/dotnet/coreclr/issues/914.

    return (BitConverter.IsLittleEndian && ((value & HighBitLittleEndian) == 0))
        || (!BitConverter.IsLittleEndian && ((value & HighBitBigEndian) == 0));
}

/// <summary>
/// Takes a DWORD holding a buffer of 4 ASCII bytes, widens each byte to a 16-bit WORD,
/// and writes the resulting 4 chars (one QWORD) to the destination in machine endianness.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
{
    if (Bmi2.X64.IsSupported)
    {
        // PDEP spreads the four source bytes into the low byte of each 16-bit lane.
        // BMI2 will work regardless of the processor's endianness.
        ulong widened = Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
        Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), widened);
    }
    else if (BitConverter.IsLittleEndian)
    {
        // Lowest byte of the DWORD is the first byte of the buffer.
        outputBuffer = (char)(byte)value;
        Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
        Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
        Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
    }
    else
    {
        // Lowest byte of the DWORD is the last byte of the buffer.
        Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
        Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
        Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
        outputBuffer = (char)(value >> 24);
    }
}

/// <summary>
/// Takes a DWORD holding 2 packed UTF-16 values in machine endianness, converts each of
/// them to its 3-byte UTF-8 representation, and writes the resulting 6 bytes to the
/// destination buffer.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
{
    Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
    Debug.Assert(IsSecondCharAtLeastThreeUtf8Bytes(value) && !IsSecondCharSurrogate(value), "Second half of value should've been 0800..D7FF or E000..FFFF");

    if (BitConverter.IsLittleEndian)
    {
        // value         = [ ZZZZYYYY YYXXXXXX zzzzyyyy yyxxxxxx ]
        // want to write   [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ] [ 10XXXXXX 10YYYYYY ]

        // The x and y payload bits of the first char, moved into output bytes 2 and 1.
        uint firstCharContinuationBits = ((value << 2) & 0x3F00u) | ((value & 0x3Fu) << 16); // = [ 00000000 00xxxxxx 00yyyyyy 00000000 ]

        // The top nibble of each char, moved into output bytes 3 and 0.
        uint leadNibbles = ((value >> 4) & 0x0F00_0000u) | ((value >> 12) & 0x0Fu); // = [ 0000ZZZZ 00000000 00000000 0000zzzz ]

        // Adding the UTF-8 marker bits yields the first four output bytes.
        uint firstFourBytes = firstCharContinuationBits + leadNibbles + 0xE080_80E0u; // = [ 1110ZZZZ 10xxxxxx 10yyyyyy 1110zzzz ]
        Unsafe.WriteUnaligned<uint>(ref outputBuffer, firstFourBytes);

        // The remaining two bytes are the continuation bytes of the second char.
        ushort lastTwoBytes = (ushort)(((value >> 22) & 0x3Fu) + ((value >> 8) & 0x3F00u) + 0x8080u); // = [ 10XXXXXX 10YYYYYY ]
        Unsafe.WriteUnaligned<ushort>(ref Unsafe.Add(ref outputBuffer, 4), lastTwoBytes);
    }
    else
    {
        // value         = [ zzzzyyyy yyxxxxxx ZZZZYYYY YYXXXXXX ]
        // want to write   [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ] [ 1110ZZZZ ] [ 10YYYYYY ] [ 10XXXXXX ]

        // Second char (low WORD) -> output bytes 3..5.
        Unsafe.Add(ref outputBuffer, 5) = (byte)((value & 0x3Fu) | 0x80u);
        Unsafe.Add(ref outputBuffer, 4) = (byte)(((value >> 6) & 0x3Fu) | 0x80u);
        Unsafe.Add(ref outputBuffer, 3) = (byte)(((value >> 12) & 0x0Fu) | 0xE0u);

        // First char (high WORD) -> output bytes 0..2.
        Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >> 16) & 0x3Fu) | 0x80u);
        Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >> 22) & 0x3Fu) | 0x80u);
        outputBuffer = (byte)((value >> 28) | 0xE0u);
    }
}


/// <summary>
/// Takes a DWORD holding 2 packed UTF-16 values in machine endianness, converts the
/// first UTF-16 value to its 3-byte UTF-8 representation, and writes the resulting
/// 3 bytes to the destination buffer. The second UTF-16 value is ignored.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
{
    Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");

    if (BitConverter.IsLittleEndian)
    {
        // value         = [ ######## ######## zzzzyyyy yyxxxxxx ]
        // want to write   [ 10yyyyyy 1110zzzz ] [ 10xxxxxx ]

        uint middlePayload = (value << 2) & 0x3F00u; // [ 00yyyyyy 00000000 ]
        uint leadNibble = ((uint)(ushort)value >> 12); // [ 00000000 0000zzzz ]
        ushort firstTwoBytes = (ushort)(middlePayload + leadNibble + 0x80E0u); // [ 10yyyyyy 1110zzzz ]
        Unsafe.WriteUnaligned<ushort>(ref outputBuffer, firstTwoBytes);

        // After truncation to a byte, "| ~0x7Fu" leaves only the 0x80 continuation marker set.
        Unsafe.Add(ref outputBuffer, 2) = (byte)((value & 0x3Fu) | ~0x7Fu); // [ 10xxxxxx ]
    }
    else
    {
        // value         = [ zzzzyyyy yyxxxxxx ######## ######## ]
        // want to write   [ 1110zzzz ] [ 10yyyyyy ] [ 10xxxxxx ]

        Unsafe.Add(ref outputBuffer, 2) = (byte)(((value >> 16) & 0x3Fu) | 0x80u);
        Unsafe.Add(ref outputBuffer, 1) = (byte)(((value >> 22) & 0x3Fu) | 0x80u);
        outputBuffer = (byte)((value >> 28) | 0xE0u);
    }
}
}
}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
new file mode 100644
index 0000000000..2baee48024
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -0,0 +1,1452 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Buffers.Binary;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using Internal.Runtime.CompilerServices;

#if BIT64
using nint = System.Int64;
using nuint = System.UInt64;
#else // BIT64
using nint = System.Int32;
using nuint = System.UInt32;
#endif // BIT64

namespace System.Text.Unicode
{
internal static unsafe partial class Utf8Utility
{
// On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
// the next byte would have been consumed from / the next char would have been written to.
// inputLength in bytes, outputCharsRemaining in chars.
// Transcodes UTF-8 bytes to UTF-16 chars, validating the input as it goes.
// Returns OperationStatus.Done when the whole input was transcoded, NeedMoreData when the
// input ends part-way through a multi-byte sequence that is valid so far, DestinationTooSmall
// when the output buffer cannot hold the next char(s), and InvalidData when an ill-formed
// sequence is encountered. In every case the two 'out' pointers report how far transcoding got.
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
    Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");

    // First, try vectorized conversion.

    {
        nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));

        pInputBuffer += numElementsConverted;
        pOutputBuffer += numElementsConverted;

        // Quick check - did we just end up consuming the entire input buffer?
        // If so, short-circuit the remainder of the method.

        if ((int)numElementsConverted == inputLength)
        {
            pInputBufferRemaining = pInputBuffer;
            pOutputBufferRemaining = pOutputBuffer;
            return OperationStatus.Done;
        }

        inputLength -= (int)numElementsConverted;
        outputCharsRemaining -= (int)numElementsConverted;
    }

    if (inputLength < sizeof(uint))
    {
        goto ProcessInputOfLessThanDWordSize;
    }

    byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;

    // Begin the main loop.

#if DEBUG
    byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
#endif

    while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
    {
        // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.

        uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);

    AfterReadDWord:

#if DEBUG
        Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
        pLastBufferPosProcessed = pInputBuffer;
#endif
        // First, check for the common case of all-ASCII bytes.

        if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
        {
            // We read an all-ASCII sequence.

            if (outputCharsRemaining < sizeof(uint))
            {
                goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
            }

            Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
            pInputBuffer += 4;
            pOutputBuffer += 4;
            outputCharsRemaining -= 4;

            // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
            // Below is basically unrolled loops with poor man's vectorization.

            uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
            uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
            uint secondDWord;
            int i;
            for (i = 0; (uint)i < maxIters; i++)
            {
                // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.

                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));

                if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
                {
                    goto LoopTerminatedEarlyDueToNonAsciiData;
                }

                pInputBuffer += 8;

                Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
                Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);

                pOutputBuffer += 8;
            }

            outputCharsRemaining -= 8 * i;

            continue; // need to perform a bounds check because we might be running out of data

        LoopTerminatedEarlyDueToNonAsciiData:

            if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
            {
                // The first DWORD contained all-ASCII bytes, so expand it.

                Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);

                // continue the outer loop from the second DWORD

                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
                thisDWord = secondDWord;

                pInputBuffer += 4;
                pOutputBuffer += 4;
                outputCharsRemaining -= 4;
            }

            outputCharsRemaining -= 8 * i;

            // We know that there's *at least* one DWORD of data remaining in the buffer.
            // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.

            goto AfterReadDWordSkipAllBytesAsciiCheck;
        }

    AfterReadDWordSkipAllBytesAsciiCheck:

        Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier

        // Next, try stripping off ASCII bytes one at a time.
        // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.

        if (UInt32FirstByteIsAscii(thisDWord))
        {
            if (outputCharsRemaining >= 3)
            {
                // Fast-track: we don't need to check the destination length for subsequent
                // ASCII bytes since we know we can write them all now.

                uint thisDWordLittleEndian = ToLittleEndian(thisDWord);

                nuint adjustment = 1;
                pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;

                if (UInt32SecondByteIsAscii(thisDWord))
                {
                    adjustment++;
                    thisDWordLittleEndian >>= 8;
                    pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;

                    if (UInt32ThirdByteIsAscii(thisDWord))
                    {
                        adjustment++;
                        thisDWordLittleEndian >>= 8;
                        pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
                    }
                }

                pInputBuffer += adjustment;
                pOutputBuffer += adjustment;
                outputCharsRemaining -= (int)adjustment;
            }
            else
            {
                // Slow-track: we need to make sure each individual write has enough
                // of a buffer so that we don't overrun the destination.

                if (outputCharsRemaining == 0)
                {
                    goto OutputBufferTooSmall;
                }

                uint thisDWordLittleEndian = ToLittleEndian(thisDWord);

                pInputBuffer++;
                *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
                outputCharsRemaining--;

                if (UInt32SecondByteIsAscii(thisDWord))
                {
                    if (outputCharsRemaining == 0)
                    {
                        goto OutputBufferTooSmall;
                    }

                    pInputBuffer++;
                    thisDWordLittleEndian >>= 8;
                    *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;

                    // We can perform a small optimization here. We know at this point that
                    // the output buffer is fully consumed (we read two ASCII bytes and wrote
                    // two ASCII chars, and we checked earlier that the destination buffer
                    // can't store a third byte). If the next byte is ASCII, we can jump straight
                    // to the return statement since the end-of-method logic only relies on the
                    // destination buffer pointer -- NOT the output chars remaining count -- being
                    // correct. If the next byte is not ASCII, we'll need to continue with the
                    // rest of the main loop, but we can set the buffer length directly to zero
                    // rather than decrementing it from 1 to 0.

                    Debug.Assert(outputCharsRemaining == 1);

                    if (UInt32ThirdByteIsAscii(thisDWord))
                    {
                        goto OutputBufferTooSmall;
                    }
                    else
                    {
                        outputCharsRemaining = 0;
                    }
                }
            }

            if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
            {
                goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
            }
            else
            {
                // The input buffer at the current offset contains a non-ASCII byte.
                // Read an entire DWORD and fall through to multi-byte consumption logic.
                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
            }
        }

    BeforeProcessTwoByteSequence:

        // At this point, we know we're working with a multi-byte code unit,
        // but we haven't yet validated it.

        // The masks and comparands are derived from the Unicode Standard, Table 3-6.
        // Additionally, we need to check for valid byte sequences per Table 3-7.

        // Check the 2-byte case.

        if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
        {
            // Per Table 3-7, valid sequences are:
            // [ C2..DF ] [ 80..BF ]

            if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
            {
                goto Error;
            }

        ProcessTwoByteSequenceSkipOverlongFormCheck:

            // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
            // there's a good chance that if we see one two-byte run then there's another two-byte
            // run immediately after. Let's check that now.

            // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
            // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
            // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.

            if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
            {
                // We have two runs of two bytes each.

                if (outputCharsRemaining < 2)
                {
                    goto ProcessRemainingBytesSlow; // running out of output buffer
                }

                Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));

                pInputBuffer += 4;
                pOutputBuffer += 2;
                outputCharsRemaining -= 2;

                if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                {
                    // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
                    // also two bytes. Check for that first before going back to the beginning of the loop.

                    thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);

                    if (BitConverter.IsLittleEndian)
                    {
                        if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                        {
                            // The next sequence is a valid two-byte sequence.
                            goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                        }
                    }
                    else
                    {
                        if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                        {
                            if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                            {
                                goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
                            }

                            goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                        }
                    }

                    // If we reached this point, the next sequence is something other than a valid
                    // two-byte sequence, so go back to the beginning of the loop.
                    goto AfterReadDWord;
                }
                else
                {
                    goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
                }
            }

            // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
            // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
            // bytes are ASCII?

            uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough

            if (UInt32ThirdByteIsAscii(thisDWord))
            {
                if (UInt32FourthByteIsAscii(thisDWord))
                {
                    if (outputCharsRemaining < 3)
                    {
                        goto ProcessRemainingBytesSlow; // running out of output buffer
                    }

                    pOutputBuffer[0] = (char)charToWrite;
                    if (BitConverter.IsLittleEndian)
                    {
                        thisDWord >>= 16;
                        pOutputBuffer[1] = (char)(byte)thisDWord;
                        thisDWord >>= 8;
                        pOutputBuffer[2] = (char)thisDWord;
                    }
                    else
                    {
                        pOutputBuffer[2] = (char)(byte)thisDWord;
                        pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
                    }
                    pInputBuffer += 4;
                    pOutputBuffer += 3;
                    outputCharsRemaining -= 3;

                    continue; // go back to original bounds check and check for ASCII
                }
                else
                {
                    if (outputCharsRemaining < 2)
                    {
                        goto ProcessRemainingBytesSlow; // running out of output buffer
                    }

                    pOutputBuffer[0] = (char)charToWrite;
                    pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
                    pInputBuffer += 3;
                    pOutputBuffer += 2;
                    outputCharsRemaining -= 2;

                    // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
                    // Read in the next DWORD and jump directly to the start of the multi-byte processing block.

                    if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
                    {
                        goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
                    }
                    else
                    {
                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                        goto BeforeProcessTwoByteSequence;
                    }
                }
            }
            else
            {
                if (outputCharsRemaining == 0)
                {
                    goto ProcessRemainingBytesSlow; // running out of output buffer
                }

                pOutputBuffer[0] = (char)charToWrite;
                pInputBuffer += 2;
                pOutputBuffer += 1;
                outputCharsRemaining--;

                if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
                {
                    goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
                }
                else
                {
                    thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                    goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
                }
            }
        }

        // Check the 3-byte case.

    BeforeProcessThreeByteSequence:

        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
        {
        ProcessThreeByteSequenceWithCheck:

            // We need to check for overlong or surrogate three-byte sequences.
            //
            // Per Table 3-7, valid sequences are:
            // [   E0   ] [ A0..BF ] [ 80..BF ]
            // [ E1..EC ] [ 80..BF ] [ 80..BF ]
            // [   ED   ] [ 80..9F ] [ 80..BF ]
            // [ EE..EF ] [ 80..BF ] [ 80..BF ]
            //
            // Big-endian examples of using the above validation table:
            // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
            // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
            // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
            // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
            // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).

            if (BitConverter.IsLittleEndian)
            {
                // The "overlong or surrogate" check can be implemented using a single jump, but there's
                // some overhead to moving the bits into the correct locations in order to perform the
                // correct comparison, and in practice the processor's branch prediction capability is
                // good enough that we shouldn't bother. So we'll use two jumps instead.

                // Can't extract this check into its own helper method because JITter produces suboptimal
                // assembly, even with aggressive inlining.

                // Code below becomes 5 instructions: test, jz, lea, test, jz

                if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
                {
                    goto Error; // overlong or surrogate
                }
            }
            else
            {
                if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
                {
                    goto Error; // overlong or surrogate
                }
            }

            // At this point, we know the incoming scalar is well-formed.

            if (outputCharsRemaining == 0)
            {
                goto OutputBufferTooSmall; // not enough space in the destination buffer to write
            }

            // As an optimization, on compatible platforms check if a second three-byte sequence immediately
            // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.

            if (BitConverter.IsLittleEndian && Bmi2.X64.IsSupported)
            {
                if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
                {
                    if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 7)
                    {
                        // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
                        //
                        // Only the lead byte of the second sequence has been inspected so far (it is the
                        // fourth byte of thisDWord). Its two continuation bytes must be validated too, so
                        // re-check the full [ 1110zzzz 10yyyyyy 10xxxxxx ] mask before the overlong and
                        // surrogate checks. Without the mask check, ill-formed input such as
                        // [ E1 80 80 E1 41 42 ] would be transcoded as two valid sequences.
                        //
                        // If any of the checks fail we fall through and transcode only the first (already
                        // validated) sequence; the next pass through the loop then validates the following
                        // bytes normally and reports the error at the correct position.

                        uint nextDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
                        if (UInt32BeginsWithUtf8ThreeByteMask(nextDWord)
                            && ((nextDWord & 0x0000_200Fu) != 0)
                            && (((nextDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
                        {
                            // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
                            ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(nextDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
                            thisDWord = nextDWord; // store this value in the correct local for the ASCII drain logic

                            // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
                            ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);

                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
                            pInputBuffer += 6;
                            pOutputBuffer += 2;
                            outputCharsRemaining -= 2;

                            // Drain any ASCII data following the second three-byte sequence.

                            goto CheckForAsciiByteAfterThreeByteSequence;
                        }
                    }
                }
            }

            // Couldn't extract 2x three-byte sequences together, just do this one by itself.

            *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
            pInputBuffer += 3;
            pOutputBuffer += 1;
            outputCharsRemaining -= 1;

        CheckForAsciiByteAfterThreeByteSequence:

            // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
            // in to the text. If this happens strip it off now before seeing if the next character
            // consists of three code units.

            if (UInt32FourthByteIsAscii(thisDWord))
            {
                if (outputCharsRemaining == 0)
                {
                    goto OutputBufferTooSmall;
                }

                if (BitConverter.IsLittleEndian)
                {
                    *pOutputBuffer = (char)(thisDWord >> 24);
                }
                else
                {
                    *pOutputBuffer = (char)(byte)thisDWord;
                }

                pInputBuffer += 1;
                pOutputBuffer += 1;
                outputCharsRemaining -= 1;
            }

            if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
            {
                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);

                // Optimization: A three-byte character could indicate CJK text, which makes it likely
                // that the character following this one is also CJK. We'll check for a three-byte sequence
                // marker now and jump directly to three-byte sequence processing if we see one, skipping
                // all of the logic at the beginning of the loop.

                if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                {
                    goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
                }
                else
                {
                    goto AfterReadDWord; // probably ASCII punctuation or whitespace
                }
            }
            else
            {
                goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
            }
        }

        // Assume the 4-byte case, but we need to validate.

        {
            // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
            //
            // Per Table 3-7, valid sequences are:
            // [   F0   ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
            // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
            // [   F4   ] [ 80..8F ] [ 80..BF ] [ 80..BF ]

            if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
            {
                goto Error;
            }

            // Now check for overlong / out-of-range sequences.

            if (BitConverter.IsLittleEndian)
            {
                // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
                // We want to get the 'w' byte in front of the 'z' byte so that we can perform
                // a single range comparison. We'll take advantage of the fact that the JITter
                // can detect a ROR / ROL operation, then we'll just zero out the bytes that
                // aren't involved in the range check.

                uint toCheck = thisDWord & 0x0000_FFFFu;

                // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].

                toCheck = BitOperations.RotateRight(toCheck, 8);

                // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].

                if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
                {
                    goto Error;
                }
            }
            else
            {
                if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
                {
                    goto Error;
                }
            }

            // Validation complete.

            if (outputCharsRemaining < 2)
            {
                // There's no point to falling back to the "drain the input buffer" logic, since we know
                // we can't write anything to the destination. So we'll just exit immediately.
                goto OutputBufferTooSmall;
            }

            Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));

            pInputBuffer += 4;
            pOutputBuffer += 2;
            outputCharsRemaining -= 2;

            continue; // go back to beginning of loop for processing
        }
    }

ProcessRemainingBytesSlow:
    inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;

ProcessInputOfLessThanDWordSize:
    while (inputLength > 0)
    {
        uint firstByte = pInputBuffer[0];
        if (firstByte <= 0x7Fu)
        {
            if (outputCharsRemaining == 0)
            {
                goto OutputBufferTooSmall; // we have no hope of writing anything to the output
            }

            // 1-byte (ASCII) case
            *pOutputBuffer = (char)firstByte;

            pInputBuffer += 1;
            pOutputBuffer += 1;
            inputLength -= 1;
            outputCharsRemaining -= 1;
            continue;
        }

        // Potentially the start of a multi-byte sequence?

        firstByte -= 0xC2u;
        if ((byte)firstByte <= (0xDFu - 0xC2u))
        {
            // Potentially a 2-byte sequence?
            if (inputLength < 2)
            {
                goto InputBufferTooSmall; // out of data
            }

            uint secondByte = pInputBuffer[1];
            if (!IsLowByteUtf8ContinuationByte(secondByte))
            {
                goto Error; // 2-byte marker not followed by continuation byte
            }

            if (outputCharsRemaining == 0)
            {
                goto OutputBufferTooSmall; // we have no hope of writing anything to the output
            }

            uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
            *pOutputBuffer = (char)asChar;

            pInputBuffer += 2;
            pOutputBuffer += 1;
            inputLength -= 2;
            outputCharsRemaining -= 1;
            continue;
        }
        else if ((byte)firstByte <= (0xEFu - 0xC2u))
        {
            // Potentially a 3-byte sequence?
            if (inputLength >= 3)
            {
                uint secondByte = pInputBuffer[1];
                uint thirdByte = pInputBuffer[2];
                if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
                {
                    goto Error; // 3-byte marker not followed by 2 continuation bytes
                }

                // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
                // We account for this in the comparisons below.

                uint partialChar = (firstByte << 12) + (secondByte << 6);
                if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
                {
                    goto Error; // this is an overlong encoding; fail
                }

                partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); //if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
                if (partialChar < (0x0800u /* number of code points in UTF-16 surrogate code point range */))
                {
                    goto Error; // attempted to encode a UTF-16 surrogate code point; fail
                }

                if (outputCharsRemaining == 0)
                {
                    goto OutputBufferTooSmall; // we have no hope of writing anything to the output
                }

                // Now restore the full scalar value.

                partialChar += thirdByte;
                partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
                partialChar -= 0x80u; // remove third byte continuation marker

                *pOutputBuffer = (char)partialChar;

                pInputBuffer += 3;
                pOutputBuffer += 1;
                inputLength -= 3;
                outputCharsRemaining -= 1;
                continue;
            }
            else if (inputLength >= 2)
            {
                uint secondByte = pInputBuffer[1];
                if (!IsLowByteUtf8ContinuationByte(secondByte))
                {
                    goto Error; // 3-byte marker not followed by continuation byte
                }

                // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
                // from just the first two bytes.

                uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
                if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
                {
                    goto Error; // failed overlong check
                }
                if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
                {
                    goto Error; // failed surrogate check
                }
            }

            goto InputBufferTooSmall; // out of data
        }
        else if ((byte)firstByte <= (0xF4u - 0xC2u))
        {
            // Potentially a 4-byte sequence?

            if (inputLength < 2)
            {
                goto InputBufferTooSmall; // ran out of data
            }

            uint nextByte = pInputBuffer[1];
            if (!IsLowByteUtf8ContinuationByte(nextByte))
            {
                goto Error; // 4-byte marker not followed by a continuation byte
            }

            uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
            if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
            {
                goto Error; // failed overlong / out-of-range check
            }

            if (inputLength < 3)
            {
                goto InputBufferTooSmall; // ran out of data
            }

            if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
            {
                goto Error; // third byte in 4-byte sequence not a continuation byte
            }

            if (inputLength < 4)
            {
                goto InputBufferTooSmall; // ran out of data
            }

            if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
            {
                goto Error; // fourth byte in 4-byte sequence not a continuation byte
            }

            // If we read a valid astral scalar value, the only way we could've fallen down this code path
            // is that we didn't have enough output buffer to write the result.

            goto OutputBufferTooSmall;
        }
        else
        {
            goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
        }
    }

    OperationStatus retVal = OperationStatus.Done;
    goto ReturnCommon;

InputBufferTooSmall:
    retVal = OperationStatus.NeedMoreData;
    goto ReturnCommon;

OutputBufferTooSmall:
    retVal = OperationStatus.DestinationTooSmall;
    goto ReturnCommon;

Error:
    retVal = OperationStatus.InvalidData;
    goto ReturnCommon;

ReturnCommon:
    pInputBufferRemaining = pInputBuffer;
    pOutputBufferRemaining = pOutputBuffer;
    return retVal;
}

// On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
// the next char would have been consumed from / the next byte would have been written to.
+        // inputLength in chars, outputBytesRemaining in bytes.
+        //
+        // Return value (see the exit labels at the bottom of this method):
+        //   Done                - the entire input was consumed and transcoded.
+        //   NeedMoreData        - the input ends with a UTF-16 high surrogate that has no trailing data.
+        //   DestinationTooSmall - the next char(s) could not be written because the output buffer ran out of space.
+        //   InvalidData         - an ill-formed surrogate sequence was encountered.
+        // In every case the two 'out' pointers are set to the current read / write positions.
+        public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
+        {
+            const int CharsPerDWord = sizeof(uint) / sizeof(char);
+
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            // First, try vectorized conversion.
+
+            {
+                nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
+
+                pInputBuffer += numElementsConverted;
+                pOutputBuffer += numElementsConverted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                if ((int)numElementsConverted == inputLength)
+                {
+                    pInputBufferRemaining = pInputBuffer;
+                    pOutputBufferRemaining = pOutputBuffer;
+                    return OperationStatus.Done;
+                }
+
+                inputLength -= (int)numElementsConverted;
+                outputBytesRemaining -= (int)numElementsConverted;
+            }
+
+            if (inputLength < CharsPerDWord)
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
+
+            // Begin the main loop.
+
+#if DEBUG
+            char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            uint thisDWord;
+
+            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
+
+                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+                // First, check for the common case of all-ASCII chars.
+
+                if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence (2 chars).
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
+                    }
+
+                    // The high WORD of the local declared below might be populated with garbage
+                    // as a result of our shifts below, but that's ok since we're only going to
+                    // write the low WORD.
+                    //
+                    // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                    // (Same logic works regardless of endianness.)
+                    uint valueToWrite = thisDWord | (thisDWord >> 8);
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 2;
+                    outputBytesRemaining -= 2;
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    // 'pFinalPos...' sits CharsPerDWord (2) chars before the end of the input, hence the "+ 2" below.
+                    uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
+                    uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
+
+                    if (BitConverter.IsLittleEndian && Bmi2.X64.IsSupported)
+                    {
+                        const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+
+                        // Try reading and writing 8 elements per iteration.
+                        uint maxIters = minElementsRemaining / 8;
+                        ulong firstQWord, secondQWord;
+                        int i;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
+
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiData;
+                            }
+
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+                            pInputBuffer += 8;
+                            pOutputBuffer += 8;
+                        }
+
+                        outputBytesRemaining -= 8 * i;
+
+                        // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
+
+                        if ((minElementsRemaining & 4) != 0)
+                        {
+                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+                            }
+
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                        }
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiData:
+
+                        // 'i' is the number of iterations that fully completed; account for the bytes they wrote.
+                        outputBytesRemaining -= 8 * i;
+
+                        // First, see if we can drain any ASCII data from the first QWORD.
+
+                        if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+                        {
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                        }
+                        else
+                        {
+                            secondQWord = firstQWord;
+                        }
+
+                    LoopTerminatedDueToNonAsciiDataInSecondQWord:
+
+                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+
+                        thisDWord = (uint)secondQWord;
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = (uint)(secondQWord >> 32);
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                    else
+                    {
+                        // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+                        uint maxIters = minElementsRemaining / 4;
+                        uint secondDWord;
+                        int i;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
+
+                            if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiData;
+                            }
+
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                        }
+
+                        outputBytesRemaining -= 4 * i;
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiData:
+
+                        // 'i' is the number of iterations that fully completed; account for the bytes they wrote.
+                        outputBytesRemaining -= 4 * i;
+
+                        // First, see if we can drain any ASCII data from the first DWORD.
+
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = secondDWord;
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                }
+
+            AfterReadDWordSkipAllCharsAsciiCheck:
+
+                Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off the first ASCII char if it exists.
+                // We don't check for a second ASCII char since that should have been handled above.
+
+                if (IsFirstCharAscii(thisDWord))
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
+                    }
+                    else
+                    {
+                        // On big-endian platforms the first char occupies the high 16 bits of the DWORD
+                        // (compare ProcessOneCharFromCurrentDWordAndFinish below, which uses 'thisDWord >> 16'),
+                        // so the ASCII byte lives in bits 16..23. Shifting by 24 would always yield 0x00.
+                        pOutputBuffer[0] = (byte)(thisDWord >> 16); // extract [ 00 AA ## ## ]
+                    }
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 1;
+                    outputBytesRemaining -= 1;
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII char.
+                        // Read an entire DWORD and fall through to non-ASCII consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+                // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
+
+                if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                {
+                TryConsumeMultipleTwoByteSequences:
+
+                    // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
+                    // a tight loop without falling back to the main loop.
+
+                    if (IsSecondCharTwoUtf8Bytes(thisDWord))
+                    {
+                        // We have two runs of two bytes each.
+
+                        if (outputBytesRemaining < 4)
+                        {
+                            goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
+                        }
+
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
+
+                        pInputBuffer += 2;
+                        pOutputBuffer += 4;
+                        outputBytesRemaining -= 4;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharTwoUtf8Bytes(thisDWord))
+                            {
+                                // Validated we have a two-byte sequence coming up
+                                goto TryConsumeMultipleTwoByteSequences;
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                    }
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // char is ASCII?
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining >= 3)
+                        {
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                thisDWord >>= 16;
+                            }
+                            pOutputBuffer[2] = (byte)thisDWord;
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 3;
+                            outputBytesRemaining -= 3;
+
+                            continue; // go back to original bounds check and check for ASCII
+                        }
+                        else
+                        {
+                            // Only the 2-byte sequence written above fits; report it as consumed and stop.
+                            pInputBuffer += 1;
+                            pOutputBuffer += 2;
+                            goto OutputBufferTooSmall;
+                        }
+                    }
+                    else
+                    {
+                        pInputBuffer += 1;
+                        pOutputBuffer += 2;
+                        outputBytesRemaining -= 2;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            goto BeforeProcessThreeByteSequence; // we know the next char isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+                        }
+                    }
+                }
+
+                // Check the 3-byte case.
+
+            BeforeProcessThreeByteSequence:
+
+                if (!IsFirstCharSurrogate(thisDWord))
+                {
+                    // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                    // that the character following this one is also CJK. We'll perform the check now
+                    // rather than jumping to the beginning of the main loop.
+
+                    if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
+                    {
+                        if (!IsSecondCharSurrogate(thisDWord))
+                        {
+                            if (outputBytesRemaining < 6)
+                            {
+                                goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
+                            }
+
+                            WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 6;
+                            outputBytesRemaining -= 6;
+
+                            // Try to remain in the 3-byte processing loop if at all possible.
+
+                            if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                            {
+                                goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                            }
+                            else
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                                if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                                {
+                                    goto BeforeProcessThreeByteSequence;
+                                }
+                                else
+                                {
+                                    // Fall back to standard processing loop since we don't know how to optimize this.
+                                    goto AfterReadDWord;
+                                }
+                            }
+                        }
+                    }
+
+                ConsumeSingleThreeByteRun:
+
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 3;
+                    outputBytesRemaining -= 3;
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // in to the text. If this happens strip it off now before seeing if the next character
+                    // consists of three code units.
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        if (BitConverter.IsLittleEndian)
+                        {
+                            *pOutputBuffer = (byte)(thisDWord >> 16);
+                        }
+                        else
+                        {
+                            *pOutputBuffer = (byte)(thisDWord);
+                        }
+
+                        pInputBuffer += 1;
+                        pOutputBuffer += 1;
+                        outputBytesRemaining -= 1;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                            {
+                                goto BeforeProcessThreeByteSequence;
+                            }
+                            else
+                            {
+                                // Fall back to standard processing loop since we don't know how to optimize this.
+                                goto AfterReadDWord;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                    }
+                    else
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
+                    }
+                }
+
+                // Four byte sequence processing
+
+                if (IsWellFormedUtf16SurrogatePair(thisDWord))
+                {
+                    if (outputBytesRemaining < 4)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 4;
+                    outputBytesRemaining -= 4;
+
+                    continue; // go back to beginning of loop for processing
+                }
+
+                goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
+            }
+
+        ProcessNextCharAndFinish:
+            // Recompute how many chars are left (0 or 1) now that a full DWORD can no longer be read.
+            inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
+
+        ProcessInputOfLessThanDWordSize:
+            Debug.Assert(inputLength < CharsPerDWord);
+
+            if (inputLength == 0)
+            {
+                goto InputBufferFullyConsumed;
+            }
+
+            uint thisChar = *pInputBuffer;
+            goto ProcessFinalChar;
+
+        ProcessOneCharFromCurrentDWordAndFinish:
+            if (BitConverter.IsLittleEndian)
+            {
+                thisChar = thisDWord & 0xFFFFu; // preserve only the first char
+            }
+            else
+            {
+                thisChar = thisDWord >> 16; // preserve only the first char
+            }
+
+        ProcessFinalChar:
+            {
+                if (thisChar <= 0x7Fu)
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 1-byte (ASCII) case
+                    *pOutputBuffer = (byte)thisChar;
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 1;
+                }
+                else if (thisChar < 0x0800u)
+                {
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 2-byte case
+                    pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 2;
+                }
+                else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 3-byte case
+                    pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
+
+                    pInputBuffer += 1;
+                    pOutputBuffer += 3;
+                }
+                else if (thisChar <= 0xDBFFu)
+                {
+                    // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
+                    goto InputBufferTooSmall;
+                }
+                else
+                {
+                    // UTF-16 low surrogate code point with no leading data, report error
+                    goto Error;
+                }
+            }
+
+            // There are two ways we can end up here. Either we were running low on input data,
+            // or we were running low on space in the destination buffer. If we're running low on
+            // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
+            // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
+            // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
+            // then we didn't modify inputLength since entering the main loop, which means it should
+            // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
+            // which of the two scenarios we're in.
+
+            if (inputLength > 1)
+            {
+                goto OutputBufferTooSmall;
+            }
+
+        InputBufferFullyConsumed:
+            OperationStatus retVal = OperationStatus.Done;
+            goto ReturnCommon;
+
+        InputBufferTooSmall:
+            retVal = OperationStatus.NeedMoreData;
+            goto ReturnCommon;
+
+        OutputBufferTooSmall:
+            retVal = OperationStatus.DestinationTooSmall;
+            goto ReturnCommon;
+
+        Error:
+            retVal = OperationStatus.InvalidData;
+            goto ReturnCommon;
+
+        ReturnCommon:
+            pInputBufferRemaining = pInputBuffer;
+            pOutputBufferRemaining = pOutputBuffer;
+            return retVal;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
new file mode 100644
index 0000000000..671bf1fc60
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -0,0 +1,729 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics.X86;
+using Internal.Runtime.CompilerServices;
+
+#if BIT64
+using nint = System.Int64;
+using nuint = System.UInt64;
+#else // BIT64
+using nint = System.Int32;
+using nuint = System.UInt32;
+#endif // BIT64
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf8Utility
+    {
+        // Returns &inputBuffer[inputLength] if the input buffer is valid.
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+ /// </remarks> + public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + { + Debug.Assert(inputLength >= 0, "Input length must not be negative."); + Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); + + // First, try to drain off as many ASCII bytes as we can from the beginning. + + { + nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength); + pInputBuffer += numAsciiBytesCounted; + + // Quick check - did we just end up consuming the entire input buffer? + // If so, short-circuit the remainder of the method. + + inputLength -= (int)numAsciiBytesCounted; + if (inputLength == 0) + { + utf16CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; + return pInputBuffer; + } + } + +#if DEBUG + // Keep these around for final validation at the end of the method. + byte* pOriginalInputBuffer = pInputBuffer; + int originalInputLength = inputLength; +#endif + + // Enregistered locals that we'll eventually out to our caller. + + int tempUtf16CodeUnitCountAdjustment = 0; + int tempScalarCountAdjustment = 0; + + if (inputLength < sizeof(uint)) + { + goto ProcessInputOfLessThanDWordSize; + } + + byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint); + + // Begin the main loop. + +#if DEBUG + byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds +#endif + + while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + { + // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar. 
+ + uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + + AfterReadDWord: + +#if DEBUG + Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read."); + pLastBufferPosProcessed = pInputBuffer; +#endif + + // First, check for the common case of all-ASCII bytes. + + if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + { + // We read an all-ASCII sequence. + + pInputBuffer += sizeof(uint); + + // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII. + // Below is basically unrolled loops with poor man's vectorization. + + // Below check is "can I read at least five DWORDs from the input stream?" + // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value, + // hence using nint instead of nuint. + + if ((nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint)) + { + // We want reads in the inner loop to be aligned. So let's perform a quick + // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump + // the read pointer up to the next aligned address. + + thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + { + goto AfterReadDWordSkipAllBytesAsciiCheck; + } + + pInputBuffer = (byte*)((nuint)(pInputBuffer + 4) & ~(nuint)3); + + // At this point, the input buffer offset points to an aligned DWORD. We also know that there's + // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above: + // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and + // the alignment check consumes at most a single DWORD.) 
+ + byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here + uint mask; + + do + { + if (Sse2.IsSupported && Bmi1.IsSupported) + { + // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're + // going to perform an unaligned load. We don't necessarily care about aligning + // this because we pessimistically assume we'll encounter non-ASCII data at some + // point in the not-too-distant future (otherwise we would've stayed entirely + // within the all-ASCII vectorized code at the entry to this method). + + mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer)); + if (mask != 0) + { + goto Sse2LoopTerminatedEarlyDueToNonAsciiData; + } + } + else + { + if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1])) + { + goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair; + } + + if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3])) + { + goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair; + } + } + + pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs + } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); + + continue; // need to perform a bounds check because we might be running out of data + + Sse2LoopTerminatedEarlyDueToNonAsciiData: + + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(Sse2.IsSupported); + Debug.Assert(Bmi1.IsSupported); + + // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit + // for each non-ASCII byte we saw. We can count the number of ASCII bytes, + // bump our input counter by that amount, and resume processing from the + // "the first byte is no longer ASCII" portion of the main loop. 
+ + Debug.Assert(mask != 0); + + pInputBuffer += Bmi1.TrailingZeroCount(mask); + if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer) + { + goto ProcessRemainingBytesSlow; + } + + thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); // no longer guaranteed to be aligned + goto BeforeProcessTwoByteSequence; + + LoopTerminatedEarlyDueToNonAsciiDataInSecondPair: + + pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs + + LoopTerminatedEarlyDueToNonAsciiDataInFirstPair: + + // We know that there's *at least* two DWORDs of data remaining in the buffer. + // We also know that one of them (or both of them) contains non-ASCII data somewhere. + // Let's perform a quick check here to bypass the logic at the beginning of the main loop. + + thisDWord = *(uint*)pInputBuffer; // still aligned here + if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + { + pInputBuffer += sizeof(uint); // consumed 1 more DWORD + thisDWord = *(uint*)pInputBuffer; // still aligned here + } + + goto AfterReadDWordSkipAllBytesAsciiCheck; + } + + continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks + } + + AfterReadDWordSkipAllBytesAsciiCheck: + + Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier + + // Next, try stripping off ASCII bytes one at a time. + // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above. + + { + uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFrom24BitInteger(thisDWord); + pInputBuffer += numLeadingAsciiBytes; + + if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer) + { + goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD + } + else + { + // The input buffer at the current offset contains a non-ASCII byte. + // Read an entire DWORD and fall through to multi-byte consumption logic. 
+ thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + } + } + + BeforeProcessTwoByteSequence: + + // At this point, we suspect we're working with a multi-byte code unit sequence, + // but we haven't yet validated it for well-formedness. + + // The masks and comparands are derived from the Unicode Standard, Table 3-6. + // Additionally, we need to check for valid byte sequences per Table 3-7. + + // Check the 2-byte case. + + thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u; + if ((thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)) == 0) + { + // Per Table 3-7, valid sequences are: + // [ C2..DF ] [ 80..BF ] + // + // Due to our modification of 'thisDWord' above, this becomes: + // [ 02..1F ] [ 00..3F ] + // + // We've already checked that the leading byte was originally in the range [ C0..DF ] + // and that the trailing byte was originally in the range [ 80..BF ], so now we only need + // to check that the modified leading byte is >= [ 02 ]. + + if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u) + || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u)) + { + goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ] + } + + ProcessTwoByteSequenceSkipOverlongFormCheck: + + // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew, + // there's a good chance that if we see one two-byte run then there's another two-byte + // run immediately after. Let's check that now. + + // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that + // the value isn't overlong using a single comparison. On big-endian platforms, we'll need + // to validate the mask and validate that the sequence isn't overlong as two separate comparisons. 
+ + if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) + || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord)))) + { + // We have two runs of two bytes each. + pInputBuffer += 4; + tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars) + + if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + { + // Optimization: If we read a long run of two-byte sequences, the next sequence is probably + // also two bytes. Check for that first before going back to the beginning of the loop. + + thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + + if (BitConverter.IsLittleEndian) + { + if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) + { + // The next sequence is a valid two-byte sequence. + goto ProcessTwoByteSequenceSkipOverlongFormCheck; + } + } + else + { + if (UInt32BeginsWithUtf8TwoByteMask(thisDWord)) + { + if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord)) + { + goto Error; // The next sequence purports to be a 2-byte sequence but is overlong. + } + + goto ProcessTwoByteSequenceSkipOverlongFormCheck; + } + } + + // If we reached this point, the next sequence is something other than a valid + // two-byte sequence, so go back to the beginning of the loop. + goto AfterReadDWord; + } + else + { + goto ProcessRemainingBytesSlow; // Running out of data - go down slow path + } + } + + // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence. + // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining + // bytes are ASCII? 
+ + tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing] + + if (UInt32ThirdByteIsAscii(thisDWord)) + { + if (UInt32FourthByteIsAscii(thisDWord)) + { + pInputBuffer += 4; + } + else + { + pInputBuffer += 3; + + // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte. + // Read in the next DWORD and jump directly to the start of the multi-byte processing block. + + if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + { + thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + goto BeforeProcessTwoByteSequence; + } + } + } + else + { + pInputBuffer += 2; + } + + continue; + } + + // Check the 3-byte case. + // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte. + + thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u); + if ((thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)) == 0) + { + ProcessThreeByteSequenceWithCheck: + + // We assume the caller has confirmed that the bit pattern is representative of a three-byte + // sequence, but it may still be overlong or surrogate. We need to check for these possibilities. + // + // Per Table 3-7, valid sequences are: + // [ E0 ] [ A0..BF ] [ 80..BF ] + // [ E1..EC ] [ 80..BF ] [ 80..BF ] + // [ ED ] [ 80..9F ] [ 80..BF ] + // [ EE..EF ] [ 80..BF ] [ 80..BF ] + // + // Big-endian examples of using the above validation table: + // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# #### + // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# #### + // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20), + // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000), + // And invalid (surrogate) patterns match the comparand ......... 
0000 1101 0010 0000 (=0D20). + // + // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80) + // as long as they haven't touched the bits we're about to use in our mask checking below. + + if (BitConverter.IsLittleEndian) + { + // The "overlong or surrogate" check can be implemented using a single jump, but there's + // some overhead to moving the bits into the correct locations in order to perform the + // correct comparison, and in practice the processor's branch prediction capability is + // good enough that we shouldn't bother. So we'll use two jumps instead. + + // Can't extract this check into its own helper method because JITter produces suboptimal + // assembly, even with aggressive inlining. + + // Code below becomes 5 instructions: test, jz, add, test, jz + + if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord -= 0x0000_200Du) & 0x0000_200Fu) == 0)) + { + goto Error; // overlong or surrogate + } + } + else + { + if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord -= 0x0D20_0000u) & 0x0F20_0000u) == 0)) + { + goto Error; // overlong or surrogate + } + } + + ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks: + + // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way + // in to the text. If this happens strip it off now before seeing if the next character + // consists of three code units. + + // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end + + nint asciiAdjustment; + if (BitConverter.IsLittleEndian) + { + asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value + } + else + { + asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value + } + + // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise + + // Please *DO NOT* reorder the below two lines. 
It provides extra defense in depth in case this method + // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid + // to add 4 before backing up since we already checked previously that the input buffer contains at + // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can + // no longer track the reference. However, we can't back up before adding 4, since we might back up to + // before the start of the buffer, and the GC isn't guaranteed to be able to track this. + + pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte + pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte + + tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar) + + SuccessfullyProcessedThreeByteSequence: + + if (IntPtr.Size >= 8 && BitConverter.IsLittleEndian) + { + // x64 little-endian optimization: A three-byte character could indicate CJK text, + // which makes it likely that the character following this one is also CJK. + // We'll try to process several three-byte sequences at a time. + + // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset + // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so + // use nint instead of nuint. + + if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5) + { + ulong thisQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); + + // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward + // to a previous location in the loop. This offers defense against reading main memory again (which may + // have been modified and could lead to a race condition). + + thisDWord = (uint)thisQWord; + + // Is this three 3-byte sequences in a row? 
+ // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ] + // ---- CHAR 3 ---- --------- CHAR 2 --------- --------- CHAR 1 --------- -CHAR 3- + if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8])) + { + // Saw a proper bitmask for three incoming 3-byte sequences, perform the + // overlong and surrogate sequence checking now. + + // Check the first character. + // If the first character is overlong or a surrogate, fail immediately. + + if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0)) + { + goto Error; + } + + // Check the second character. + // At this point, we now know the first three bytes represent a well-formed sequence. + // If there's an error beyond here, we'll jump back to the "process three known good bytes" + // logic. + + thisQWord >>= 24; + if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0)) + { + goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; + } + + // Check the third character (we already checked that it's followed by a continuation byte). + + thisQWord >>= 24; + if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0)) + { + goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; + } + + pInputBuffer += 9; + tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars) + + goto SuccessfullyProcessedThreeByteSequence; + } + + // Is this two 3-byte sequences in a row? + // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] + // --------- CHAR 2 --------- --------- CHAR 1 --------- + if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul) + { + // Saw a proper bitmask for two incoming 3-byte sequences, perform the + // overlong and surrogate sequence checking now. + + // Check the first character. 
+ // If the first character is overlong or a surrogate, fail immediately. + + if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0)) + { + goto Error; + } + + // Check the second character. + // At this point, we now know the first three bytes represent a well-formed sequence. + // If there's an error beyond here, we'll jump back to the "process three known good bytes" + // logic. + + thisQWord >>= 24; + if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0)) + { + goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; + } + + pInputBuffer += 6; + tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars) + + // The next byte in the sequence didn't have a 3-byte marker, so it's probably + // an ASCII character. Jump back to the beginning of loop processing. + + continue; + } + + if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord)) + { + // A single three-byte sequence. + goto ProcessThreeByteSequenceWithCheck; + } + else + { + // Not a three-byte sequence; perhaps ASCII? + goto AfterReadDWord; + } + } + } + + if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + { + thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); + + // Optimization: A three-byte character could indicate CJK text, which makes it likely + // that the character following this one is also CJK. We'll check for a three-byte sequence + // marker now and jump directly to three-byte sequence processing if we see one, skipping + // all of the logic at the beginning of the loop. 
+ + if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord)) + { + goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process + } + else + { + goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop + } + } + else + { + goto ProcessRemainingBytesSlow; // Running out of data + } + } + + // Assume the 4-byte case, but we need to validate. + + if (BitConverter.IsLittleEndian) + { + thisDWord &= 0xC0C0_FFFFu; + + // After the above modifications earlier in this method, we expect 'thisDWord' + // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now + // perform two checks to confirm this. The first will verify the + // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's + // complement representation to perform a single *signed* integer check. + + if ((int)thisDWord > unchecked((int)0x8000_3FFF)) + { + goto Error; // didn't have three trailing bytes + } + + // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) + // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). + + thisDWord = BitOperations.RotateRight(thisDWord, 8); + + // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. + // The check is now a simple add / cmp / jcc combo. + + if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu)) + { + goto Error; // overlong or out-of-range + } + } + else + { + thisDWord -= 0x80u; + + // After the above modifications earlier in this method, we expect 'thisDWord' + // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now + // perform two checks to confirm this. The first will verify the + // [ ######## 00###### 00###### 00###### ] structure. 
+ + if ((thisDWord & 0x00C0_C0C0u) != 0) + { + goto Error; // didn't have three trailing bytes + } + + // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) + // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). + // This is a simple range check. (We don't care about the low two bytes.) + + if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu)) + { + goto Error; // overlong or out-of-range + } + } + + // Validation of 4-byte case complete. + + pInputBuffer += 4; + tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units + tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar + + continue; // go back to beginning of loop for processing + } + + goto ProcessRemainingBytesSlow; + + ProcessInputOfLessThanDWordSize: + + Debug.Assert(inputLength < 4); + nuint inputBufferRemainingBytes = (uint)inputLength; + goto ProcessSmallBufferCommon; + + ProcessRemainingBytesSlow: + + inputBufferRemainingBytes = (nuint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4; + + ProcessSmallBufferCommon: + + Debug.Assert(inputBufferRemainingBytes < 4); + while (inputBufferRemainingBytes > 0) + { + uint firstByte = pInputBuffer[0]; + + if ((byte)firstByte < 0x80u) + { + // 1-byte (ASCII) case + pInputBuffer++; + inputBufferRemainingBytes--; + continue; + } + else if (inputBufferRemainingBytes >= 2) + { + uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value + if ((byte)firstByte < 0xE0u) + { + // 2-byte case + if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte)) + { + pInputBuffer += 2; + tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar) + inputBufferRemainingBytes -= 2; + continue; + } + } + else if (inputBufferRemainingBytes >= 3) + { + if ((byte)firstByte < 0xF0u) + { + if ((byte)firstByte == 0xE0u) + { + if 
(!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu)) + { + goto Error; // overlong encoding + } + } + else if ((byte)firstByte == 0xEDu) + { + if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu)) + { + goto Error; // would be a UTF-16 surrogate code point + } + } + else + { + if (!IsLowByteUtf8ContinuationByte(secondByte)) + { + goto Error; // first trailing byte doesn't have proper continuation marker + } + } + + if (IsUtf8ContinuationByte(in pInputBuffer[2])) + { + pInputBuffer += 3; + tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar) + inputBufferRemainingBytes -= 3; + continue; + } + } + } + } + + // Error - no match. + + goto Error; + } + + // If we reached this point, we're out of data, and we saw no bad UTF8 sequence. + +#if DEBUG + // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength]. + Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value."); +#endif + + Error: + + // Report back to our caller how far we got before seeing invalid data. + // (Also used for normal termination when falling out of the loop above.) 
+ + utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = tempScalarCountAdjustment; + return pInputBuffer; + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs index 6ee9ca05a6..d24f766474 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs @@ -6,10 +6,12 @@ using System.Buffers; using System.Diagnostics; using System.IO; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Internal.Runtime.CompilerServices; namespace System.Text.Unicode { - internal static class Utf8Utility + internal static partial class Utf8Utility { /// <summary> /// The maximum number of bytes that can result from UTF-8 transcoding @@ -29,26 +31,16 @@ namespace System.Text.Unicode /// comes first) is ASCII. /// </summary> [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii) + public unsafe static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii) { - // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948). 
- - bool tempIsAscii = true; - int originalDataLength = utf8Data.Length; - - while (!utf8Data.IsEmpty) + fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data)) { - if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done) - { - break; - } + byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _); + int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte); - tempIsAscii &= result.IsAscii; - utf8Data = utf8Data.Slice(bytesConsumed); + isAscii = (utf16CodeUnitCountAdjustment == 0); // If UTF-16 char count == UTF-8 byte count, it's ASCII. + return (index < utf8Data.Length) ? index : -1; } - - isAscii = tempIsAscii; - return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length); } #if FEATURE_UTF8STRING diff --git a/tests/CoreFX/CoreFX.issues.json b/tests/CoreFX/CoreFX.issues.json index cfd18acfd0..6d613d96b9 100644 --- a/tests/CoreFX/CoreFX.issues.json +++ b/tests/CoreFX/CoreFX.issues.json @@ -896,15 +896,23 @@ "methods": [ { "name": "System.Text.Tests.EncoderConvert2.EncoderASCIIConvertMixedASCIIUnicodeCharArrayPartial", - "reason": "https://github.com/dotnet/coreclr/issues/23020" + "reason": "https://github.com/dotnet/coreclr/issues/23864" }, { "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertMixedASCIIUnicodeCharArrayPartial", - "reason": "https://github.com/dotnet/coreclr/issues/23020" + "reason": "https://github.com/dotnet/coreclr/issues/23864" }, { "name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertUnicodeCharArrayPartial", - "reason": "https://github.com/dotnet/coreclr/issues/23020" + "reason": "https://github.com/dotnet/coreclr/issues/23864" + }, + { + "name": "System.Text.Tests.NegativeEncodingTests.GetByteCount_Invalid", + "reason": "https://github.com/dotnet/coreclr/issues/23864" + }, + { + "name": "System.Text.Tests.UTF8EncodingDecode.Decode_InvalidBytes", + "reason": 
"https://github.com/dotnet/coreclr/issues/23864" } ] } |