diff options
author | Levi Broderick <GrabYourPitchforks@users.noreply.github.com> | 2020-02-18 10:06:38 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-18 10:06:38 -0800 |
commit | a74f1dbf5a3aed3c4f5ba723641598aac07c31d9 (patch) | |
tree | b24b442d57855329794e03038908eac7081080f3 /src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs | |
parent | 6689dd74ef07963fb7bcc8072e80d7e6fd2cd1de (diff) | |
download | coreclr-a74f1dbf5a3aed3c4f5ba723641598aac07c31d9.tar.gz coreclr-a74f1dbf5a3aed3c4f5ba723641598aac07c31d9.tar.bz2 coreclr-a74f1dbf5a3aed3c4f5ba723641598aac07c31d9.zip |
Port dotnet/runtime#31904 to release/3.1 (#28013)
Remove BMI2 from ASCII and UTF-16 processing hot paths, as not all processors have optimized implementations of pext/pdep
Diffstat (limited to 'src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs')
-rw-r--r-- | src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs | 106 |
1 file changed, 56 insertions, 50 deletions
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs index 126974c892..f050248601 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -7,6 +7,7 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; @@ -78,7 +79,8 @@ namespace System.Text.Unicode byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds #endif - while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); + do { // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar. @@ -101,7 +103,7 @@ namespace System.Text.Unicode goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data } - Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); pInputBuffer += 4; pOutputBuffer += 4; outputCharsRemaining -= 4; @@ -127,8 +129,8 @@ namespace System.Text.Unicode pInputBuffer += 8; - Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord); - Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord); pOutputBuffer += 8; } @@ -143,7 +145,7 @@ namespace System.Text.Unicode { // The first DWORD contained all-ASCII bytes, so expand it. 
- Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord); + ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); // continue the outer loop from the second DWORD @@ -487,12 +489,10 @@ namespace System.Text.Unicode } // As an optimization, on compatible platforms check if a second three-byte sequence immediately - // follows the one we just read, and if so use BSWAP and BMI2 to extract them together. + // follows the one we just read, and if so extract them together. - if (Bmi2.X64.IsSupported) + if (BitConverter.IsLittleEndian) { - Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian."); - // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which // would indicate the potential start of a second three-byte sequence. @@ -504,7 +504,7 @@ namespace System.Text.Unicode if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3) { - // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT. + // We're going to attempt to read a second 3-byte sequence and write them both out one after the other. // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading // byte mask again since it's free), then perform overlong + surrogate checks. 
If the overlong or surrogate // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid @@ -517,14 +517,8 @@ namespace System.Text.Unicode && ((secondDWord & 0x0000_200Fu) != 0) && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0)) { - // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD - ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord); - thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic - - // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ] - ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul); - - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord); + pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord); + pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord); pInputBuffer += 6; pOutputBuffer += 2; outputCharsRemaining -= 2; @@ -658,7 +652,7 @@ namespace System.Text.Unicode continue; // go back to beginning of loop for processing } - } + } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); ProcessRemainingBytesSlow: inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4; @@ -900,6 +894,16 @@ namespace System.Text.Unicode char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord; + // We have paths for SSE4.1 vectorization inside the inner loop. Since the below + // vector is only used in those code paths, we leave it uninitialized if SSE4.1 + // is not enabled. 
+ + Vector128<short> nonAsciiUtf16DataMask = default; + if (Sse41.X64.IsSupported) + { + nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char + } + // Begin the main loop. #if DEBUG @@ -908,7 +912,8 @@ namespace System.Text.Unicode uint thisDWord; - while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) + Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); + do { // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar. @@ -952,27 +957,26 @@ namespace System.Text.Unicode uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); - if (Bmi2.X64.IsSupported) + if (Sse41.X64.IsSupported) { - Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian."); - const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul; + Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian."); // Try reading and writing 8 elements per iteration. 
uint maxIters = minElementsRemaining / 8; - ulong firstQWord, secondQWord; + ulong possibleNonAsciiQWord; int i; + Vector128<short> utf16Data; for (i = 0; (uint)i < maxIters; i++) { - firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); - secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4); - - if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord)) + utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer); + if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask)) { - goto LoopTerminatedDueToNonAsciiData; + goto LoopTerminatedDueToNonAsciiDataInVectorLocal; } - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK)); - Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK)); + // narrow and write + + Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64()); pInputBuffer += 8; pOutputBuffer += 8; @@ -984,14 +988,14 @@ namespace System.Text.Unicode if ((minElementsRemaining & 4) != 0) { - secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); - - if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)) + possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer); + if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) { - goto LoopTerminatedDueToNonAsciiDataInSecondQWord; + goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal; } - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK)); + utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16(); + Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); pInputBuffer += 4; pOutputBuffer += 4; @@ -1000,29 +1004,31 @@ namespace System.Text.Unicode continue; // Go back to beginning of main loop, read data, check for ASCII - LoopTerminatedDueToNonAsciiData: + 
LoopTerminatedDueToNonAsciiDataInVectorLocal: outputBytesRemaining -= 8 * i; + possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64()); - // First, see if we can drain any ASCII data from the first QWORD. + // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector, + // then check whether it's all-ASCII. If so, narrow and write to the destination + // buffer. Since we know that either the high 64 bits or the low 64 bits of the + // vector contains non-ASCII data, by the end of the following block the + // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment. - if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord)) + if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII { - Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK)); + Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32())); pInputBuffer += 4; pOutputBuffer += 4; outputBytesRemaining -= 4; - } - else - { - secondQWord = firstQWord; + possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1); } - LoopTerminatedDueToNonAsciiDataInSecondQWord: + LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal: - Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier + Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier - thisDWord = (uint)secondQWord; + thisDWord = (uint)possibleNonAsciiQWord; if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)) { // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ] @@ -1030,14 +1036,14 @@ namespace System.Text.Unicode pInputBuffer += 2; pOutputBuffer += 2; outputBytesRemaining -= 2; - thisDWord = (uint)(secondQWord >> 32); + thisDWord = (uint)(possibleNonAsciiQWord >> 32); } goto 
AfterReadDWordSkipAllCharsAsciiCheck; } else { - // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration. + // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration. uint maxIters = minElementsRemaining / 4; uint secondDWord; int i; @@ -1358,7 +1364,7 @@ namespace System.Text.Unicode } goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high - } + } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer); ProcessNextCharAndFinish: inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord; |