diff options
author | Levi Broderick <GrabYourPitchforks@users.noreply.github.com> | 2020-02-18 10:06:38 -0800 |
---|---|---|
committer | Hyungju Lee <leee.lee@samsung.com> | 2020-10-30 18:20:49 +0900 |
commit | 26f4d5ce68467aca9a9e9e2073a3a56e4bed3071 (patch) | |
tree | 8d09563498d72d61daada4ffb86daf80b86e38ef /src | |
parent | 8e0799ded6b0dc6f1d35c605ef823b093622ab4c (diff) | |
download | coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.tar.gz coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.tar.bz2 coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.zip |
Port dotnet/runtime#31904 to release/3.1 (#28013)
Remove BMI2 from the ASCII and UTF-16 processing hot paths, because not all processors have optimized implementations of the pext/pdep instructions.
Diffstat (limited to 'src')
3 files changed, 92 insertions, 156 deletions
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs index b37b01779d..bf2bccf879 100644 --- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs @@ -1009,10 +1009,14 @@ namespace System.Text { Debug.Assert(AllCharsInUInt64AreAscii(value)); - if (Bmi2.X64.IsSupported) + if (Sse2.X64.IsSupported) { - // BMI2 will work regardless of the processor's endianness. - Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul)); + // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes + // [ b0 b1 b2 b3 b0 b1 b2 b3 ], then writes 4 bytes (32 bits) to the destination. + + Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16(); + Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32(); + Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow)); } else { @@ -1694,14 +1698,16 @@ namespace System.Text /// writes them to the output buffer with machine endianness. /// </summary> [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value) + internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value) { Debug.Assert(AllBytesInUInt32AreAscii(value)); - if (Bmi2.X64.IsSupported) + if (Sse2.X64.IsSupported) { - // BMI2 will work regardless of the processor's endianness. 
- Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul)); + Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian."); + Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); + Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64(); + Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide)); } else { diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs index 54940f9816..9885a30689 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -6,7 +6,6 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; namespace System.Text.Unicode @@ -61,7 +60,7 @@ namespace System.Text.Unicode } /// <summary> - /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a + /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation. 
/// </summary> [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -69,39 +68,19 @@ namespace System.Text.Unicode { if (BitConverter.IsLittleEndian) { - if (Bmi2.IsSupported) - { - // need to reverse endianness for bit manipulation to work correctly - value = BinaryPrimitives.ReverseEndianness(value); - - // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - // want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ] - // where wwww = uuuuu - 1 - - uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u); - uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u); - - uint combined = (lowSurrogateChar << 16) + highSurrogateChar; - combined -= 0x40u; // wwww = uuuuu - 1 - combined += 0xDC00_D800u; // add surrogate markers - return combined; - } - else - { - // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx - // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] - // where wwww = uuuuu - 1 - uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ] - retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ] - retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ] - retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ] - retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ] - retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ] - retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ] - retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ] - retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] - return retVal; - } + // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx + // want to return UTF16 scalar 
000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + // where wwww = uuuuu - 1 + uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ] + retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ] + retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ] + retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ] + retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ] + retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ] + retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ] + retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ] + retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + return retVal; } else { @@ -135,37 +114,19 @@ namespace System.Text.Unicode // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx) // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1 - if (Bmi2.IsSupported) - { - // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want - // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple - // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across - // all four output bytes. - - uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */; - - // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning - // that should normally be masked out via an and, but we'll just direct pdep to ignore it. 
+ value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] - uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ] - return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] - } - else - { - value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] + uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] + tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] - uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] - tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] + uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] + uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] + tempC |= tempB; - uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] - uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] - tempC |= tempB; + uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ] + tempD |= 0x8080_80F0u; - uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ] - tempD |= 0x8080_80F0u; - - return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] - } + return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] } else { @@ -757,43 +718,6 @@ namespace System.Text.Unicode } /// <summary> - /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD - /// and writes the resulting QWORD into the destination with machine endianness. 
- /// </summary> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value) - { - if (Bmi2.X64.IsSupported) - { - // BMI2 will work regardless of the processor's endianness. - Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul)); - } - else - { - if (BitConverter.IsLittleEndian) - { - outputBuffer = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 3) = (char)value; - } - else - { - Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value; - value >>= 8; - outputBuffer = (char)value; - } - } - } - - /// <summary> /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianess, /// converts those scalar values to their 3-byte UTF-8 representation and writes the /// resulting 6 bytes to the destination buffer. 
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs index 126974c892..f050248601 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -7,6 +7,7 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; @@ -78,7 +79,8 @@ namespace System.Text.Unicode byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds #endif - while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); + do { // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar. @@ -101,7 +103,7 @@ namespace System.Text.Unicode goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data } - Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); pInputBuffer += 4; pOutputBuffer += 4; outputCharsRemaining -= 4; @@ -127,8 +129,8 @@ namespace System.Text.Unicode pInputBuffer += 8; - Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord); - Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord); pOutputBuffer += 8; } @@ -143,7 +145,7 @@ namespace System.Text.Unicode { // The first DWORD contained all-ASCII bytes, so expand it. 
- Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); // continue the outer loop from the second DWORD @@ -487,12 +489,10 @@ namespace System.Text.Unicode } // As an optimization, on compatible platforms check if a second three-byte sequence immediately - // follows the one we just read, and if so use BSWAP and BMI2 to extract them together. + // follows the one we just read, and if so extract them together. - if (Bmi2.X64.IsSupported) + if (BitConverter.IsLittleEndian) { - Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian."); - // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which // would indicate the potential start of a second three-byte sequence. @@ -504,7 +504,7 @@ namespace System.Text.Unicode if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3) { - // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT. + // We're going to attempt to read a second 3-byte sequence and write them both out one after the other. // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading // byte mask again since it's free), then perform overlong + surrogate checks. 
If the overlong or surrogate // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid @@ -517,14 +517,8 @@ namespace System.Text.Unicode && ((secondDWord & 0x0000_200Fu) != 0) && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0)) { - // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD - ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord); - thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic - - // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ] - ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul); - - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord); + pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord); + pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord); pInputBuffer += 6; pOutputBuffer += 2; outputCharsRemaining -= 2; @@ -658,7 +652,7 @@ namespace System.Text.Unicode continue; // go back to beginning of loop for processing } - } + } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); ProcessRemainingBytesSlow: inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4; @@ -900,6 +894,16 @@ namespace System.Text.Unicode char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord; + // We have paths for SSE4.1 vectorization inside the inner loop. Since the below + // vector is only used in those code paths, we leave it uninitialized if SSE4.1 + // is not enabled. 
+ + Vector128<short> nonAsciiUtf16DataMask = default; + if (Sse41.X64.IsSupported) + { + nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char + } + // Begin the main loop. #if DEBUG @@ -908,7 +912,8 @@ namespace System.Text.Unicode uint thisDWord; - while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); + do { // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar. @@ -952,27 +957,26 @@ namespace System.Text.Unicode uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); - if (Bmi2.X64.IsSupported) + if (Sse41.X64.IsSupported) { - Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian."); - const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul; + Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian."); // Try reading and writing 8 elements per iteration. 
uint maxIters = minElementsRemaining / 8; - ulong firstQWord, secondQWord; + ulong possibleNonAsciiQWord; int i; + Vector128<short> utf16Data; for (i = 0; (uint)i < maxIters; i++) { - firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); - secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4); - - if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord)) + utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer); + if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask)) { - goto LoopTerminatedDueToNonAsciiData; + goto LoopTerminatedDueToNonAsciiDataInVectorLocal; } - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK)); - Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK)); + // narrow and write + + Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64()); pInputBuffer += 8; pOutputBuffer += 8; @@ -984,14 +988,14 @@ namespace System.Text.Unicode if ((minElementsRemaining & 4) != 0) { - secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); - - if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)) + possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); + if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) { - goto LoopTerminatedDueToNonAsciiDataInSecondQWord; + goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal; } - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK)); + utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16(); + Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); pInputBuffer += 4; pOutputBuffer += 4; @@ -1000,29 +1004,31 @@ namespace System.Text.Unicode continue; // Go back to beginning of main loop, read data, check for ASCII - LoopTerminatedDueToNonAsciiData: + 
LoopTerminatedDueToNonAsciiDataInVectorLocal: outputBytesRemaining -= 8 * i; + possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64()); - // First, see if we can drain any ASCII data from the first QWORD. + // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector, + // then check whether it's all-ASCII. If so, narrow and write to the destination + // buffer. Since we know that either the high 64 bits or the low 64 bits of the + // vector contains non-ASCII data, by the end of the following block the + // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment. - if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord)) + if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII { - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK)); + Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); pInputBuffer += 4; pOutputBuffer += 4; outputBytesRemaining -= 4; - } - else - { - secondQWord = firstQWord; + possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1); } - LoopTerminatedDueToNonAsciiDataInSecondQWord: + LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal: - Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier + Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier - thisDWord = (uint)secondQWord; + thisDWord = (uint)possibleNonAsciiQWord; if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)) { // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ] @@ -1030,14 +1036,14 @@ namespace System.Text.Unicode pInputBuffer += 2; pOutputBuffer += 2; outputBytesRemaining -= 2; - thisDWord = (uint)(secondQWord >> 32); + thisDWord = (uint)(possibleNonAsciiQWord >> 32); } goto 
AfterReadDWordSkipAllCharsAsciiCheck; } else { - // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration. + // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration. uint maxIters = minElementsRemaining / 4; uint secondDWord; int i; @@ -1358,7 +1364,7 @@ namespace System.Text.Unicode } goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high - } + } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); ProcessNextCharAndFinish: inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord; |