summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Levi Broderick <GrabYourPitchforks@users.noreply.github.com> 2020-02-18 10:06:38 -0800
committer: Hyungju Lee <leee.lee@samsung.com> 2020-10-30 18:20:49 +0900
commit: 26f4d5ce68467aca9a9e9e2073a3a56e4bed3071 (patch)
tree: 8d09563498d72d61daada4ffb86daf80b86e38ef
parent: 8e0799ded6b0dc6f1d35c605ef823b093622ab4c (diff)
download: coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.tar.gz
coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.tar.bz2
coreclr-26f4d5ce68467aca9a9e9e2073a3a56e4bed3071.zip
Port dotnet/runtime#31904 to release/3.1 (#28013)
Remove BMI2 from ASCII and UTF-16 processing hot paths, as not all processors have optimized implementations of pext/pdep
-rw-r--r-- src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs | 20
-rw-r--r-- src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs | 122
-rw-r--r-- src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs | 106
3 files changed, 92 insertions, 156 deletions
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
index b37b01779d..bf2bccf879 100644
--- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
@@ -1009,10 +1009,14 @@ namespace System.Text
{
Debug.Assert(AllCharsInUInt64AreAscii(value));
- if (Bmi2.X64.IsSupported)
+ if (Sse2.X64.IsSupported)
{
- // BMI2 will work regardless of the processor's endianness.
- Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
+ // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
+ // [ b0 b1 b2 b3 b0 b1 b2 b3 ], then writes 4 bytes (32 bits) to the destination.
+
+ Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
+ Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
+ Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
}
else
{
@@ -1694,14 +1698,16 @@ namespace System.Text
/// writes them to the output buffer with machine endianness.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
+ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
{
Debug.Assert(AllBytesInUInt32AreAscii(value));
- if (Bmi2.X64.IsSupported)
+ if (Sse2.X64.IsSupported)
{
- // BMI2 will work regardless of the processor's endianness.
- Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
+ Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
+ Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+ Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+ Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
}
else
{
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
index 54940f9816..9885a30689 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -6,7 +6,6 @@ using System.Buffers.Binary;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics.X86;
using Internal.Runtime.CompilerServices;
namespace System.Text.Unicode
@@ -61,7 +60,7 @@ namespace System.Text.Unicode
}
/// <summary>
- /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a
+ /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
/// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -69,39 +68,19 @@ namespace System.Text.Unicode
{
if (BitConverter.IsLittleEndian)
{
- if (Bmi2.IsSupported)
- {
- // need to reverse endianness for bit manipulation to work correctly
- value = BinaryPrimitives.ReverseEndianness(value);
-
- // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
- // want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ]
- // where wwww = uuuuu - 1
-
- uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
- uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
-
- uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
- combined -= 0x40u; // wwww = uuuuu - 1
- combined += 0xDC00_D800u; // add surrogate markers
- return combined;
- }
- else
- {
- // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
- // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
- // where wwww = uuuuu - 1
- uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
- retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
- retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
- retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
- retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
- retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
- retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
- retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
- retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
- return retVal;
- }
+ // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+ // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+ // where wwww = uuuuu - 1
+ uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
+ retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
+ retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
+ retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
+ retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
+ retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
+ retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
+ retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
+ retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+ return retVal;
}
else
{
@@ -135,37 +114,19 @@ namespace System.Text.Unicode
// input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
// must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
- if (Bmi2.IsSupported)
- {
- // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
- // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
- // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
- // all four output bytes.
-
- uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
-
- // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
- // that should normally be masked out via an and, but we'll just direct pdep to ignore it.
+ value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
- uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
- return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
- }
- else
- {
- value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
+ uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
+ tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
- uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
- tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
+ uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
+ uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
+ tempC |= tempB;
- uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
- uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
- tempC |= tempB;
+ uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
+ tempD |= 0x8080_80F0u;
- uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
- tempD |= 0x8080_80F0u;
-
- return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
- }
+ return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
}
else
{
@@ -757,43 +718,6 @@ namespace System.Text.Unicode
}
/// <summary>
- /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD
- /// and writes the resulting QWORD into the destination with machine endianness.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
- {
- if (Bmi2.X64.IsSupported)
- {
- // BMI2 will work regardless of the processor's endianness.
- Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
- }
- else
- {
- if (BitConverter.IsLittleEndian)
- {
- outputBuffer = (char)(byte)value;
- value >>= 8;
- Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
- value >>= 8;
- Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
- value >>= 8;
- Unsafe.Add(ref outputBuffer, 3) = (char)value;
- }
- else
- {
- Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
- value >>= 8;
- Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
- value >>= 8;
- Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
- value >>= 8;
- outputBuffer = (char)value;
- }
- }
- }
-
- /// <summary>
/// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianess,
/// converts those scalar values to their 3-byte UTF-8 representation and writes the
/// resulting 6 bytes to the destination buffer.
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
index 126974c892..f050248601 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -7,6 +7,7 @@ using System.Buffers.Binary;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using Internal.Runtime.CompilerServices;
@@ -78,7 +79,8 @@ namespace System.Text.Unicode
byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
#endif
- while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+ do
{
// Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
@@ -101,7 +103,7 @@ namespace System.Text.Unicode
goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
}
- Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+ ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
pInputBuffer += 4;
pOutputBuffer += 4;
outputCharsRemaining -= 4;
@@ -127,8 +129,8 @@ namespace System.Text.Unicode
pInputBuffer += 8;
- Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
- Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+ ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord);
+ ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord);
pOutputBuffer += 8;
}
@@ -143,7 +145,7 @@ namespace System.Text.Unicode
{
// The first DWORD contained all-ASCII bytes, so expand it.
- Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+ ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
// continue the outer loop from the second DWORD
@@ -487,12 +489,10 @@ namespace System.Text.Unicode
}
// As an optimization, on compatible platforms check if a second three-byte sequence immediately
- // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
+ // follows the one we just read, and if so extract them together.
- if (Bmi2.X64.IsSupported)
+ if (BitConverter.IsLittleEndian)
{
- Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-
// First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
// would indicate the potential start of a second three-byte sequence.
@@ -504,7 +504,7 @@ namespace System.Text.Unicode
if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
{
- // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
+ // We're going to attempt to read a second 3-byte sequence and write them both out one after the other.
// We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
// byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
// checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
@@ -517,14 +517,8 @@ namespace System.Text.Unicode
&& ((secondDWord & 0x0000_200Fu) != 0)
&& (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
{
- // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
- ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
- thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
-
- // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
- ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
-
- Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
+ pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+ pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord);
pInputBuffer += 6;
pOutputBuffer += 2;
outputCharsRemaining -= 2;
@@ -658,7 +652,7 @@ namespace System.Text.Unicode
continue; // go back to beginning of loop for processing
}
- }
+ } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
ProcessRemainingBytesSlow:
inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
@@ -900,6 +894,16 @@ namespace System.Text.Unicode
char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
+ // We have paths for SSE4.1 vectorization inside the inner loop. Since the below
+ // vector is only used in those code paths, we leave it uninitialized if SSE4.1
+ // is not enabled.
+
+ Vector128<short> nonAsciiUtf16DataMask = default;
+ if (Sse41.X64.IsSupported)
+ {
+ nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
+ }
+
// Begin the main loop.
#if DEBUG
@@ -908,7 +912,8 @@ namespace System.Text.Unicode
uint thisDWord;
- while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+ Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+ do
{
// Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
@@ -952,27 +957,26 @@ namespace System.Text.Unicode
uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
- if (Bmi2.X64.IsSupported)
+ if (Sse41.X64.IsSupported)
{
- Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
- const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+ Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian.");
// Try reading and writing 8 elements per iteration.
uint maxIters = minElementsRemaining / 8;
- ulong firstQWord, secondQWord;
+ ulong possibleNonAsciiQWord;
int i;
+ Vector128<short> utf16Data;
for (i = 0; (uint)i < maxIters; i++)
{
- firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
- secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
-
- if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+ utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer);
+ if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))
{
- goto LoopTerminatedDueToNonAsciiData;
+ goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
}
- Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
- Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+ // narrow and write
+
+ Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
pInputBuffer += 8;
pOutputBuffer += 8;
@@ -984,14 +988,14 @@ namespace System.Text.Unicode
if ((minElementsRemaining & 4) != 0)
{
- secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-
- if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+ possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+ if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord))
{
- goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+ goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal;
}
- Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+ utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16();
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
pInputBuffer += 4;
pOutputBuffer += 4;
@@ -1000,29 +1004,31 @@ namespace System.Text.Unicode
continue; // Go back to beginning of main loop, read data, check for ASCII
- LoopTerminatedDueToNonAsciiData:
+ LoopTerminatedDueToNonAsciiDataInVectorLocal:
outputBytesRemaining -= 8 * i;
+ possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());
- // First, see if we can drain any ASCII data from the first QWORD.
+ // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector,
+ // then check whether it's all-ASCII. If so, narrow and write to the destination
+ // buffer. Since we know that either the high 64 bits or the low 64 bits of the
+ // vector contains non-ASCII data, by the end of the following block the
+ // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment.
- if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+ if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII
{
- Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+ Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
pInputBuffer += 4;
pOutputBuffer += 4;
outputBytesRemaining -= 4;
- }
- else
- {
- secondQWord = firstQWord;
+ possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1);
}
- LoopTerminatedDueToNonAsciiDataInSecondQWord:
+ LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal:
- Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+ Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier
- thisDWord = (uint)secondQWord;
+ thisDWord = (uint)possibleNonAsciiQWord;
if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
{
// [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
@@ -1030,14 +1036,14 @@ namespace System.Text.Unicode
pInputBuffer += 2;
pOutputBuffer += 2;
outputBytesRemaining -= 2;
- thisDWord = (uint)(secondQWord >> 32);
+ thisDWord = (uint)(possibleNonAsciiQWord >> 32);
}
goto AfterReadDWordSkipAllCharsAsciiCheck;
}
else
{
- // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+ // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration.
uint maxIters = minElementsRemaining / 4;
uint secondDWord;
int i;
@@ -1358,7 +1364,7 @@ namespace System.Text.Unicode
}
goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
- }
+ } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
ProcessNextCharAndFinish:
inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;