diff options
Diffstat (limited to 'src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs')
-rw-r--r-- | src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs | 122 |
1 files changed, 23 insertions, 99 deletions
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs index 54940f9816..9885a30689 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -6,7 +6,6 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; namespace System.Text.Unicode @@ -61,7 +60,7 @@ namespace System.Text.Unicode } /// <summary> - /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a + /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation. /// </summary> [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -69,39 +68,19 @@ namespace System.Text.Unicode { if (BitConverter.IsLittleEndian) { - if (Bmi2.IsSupported) - { - // need to reverse endianness for bit manipulation to work correctly - value = BinaryPrimitives.ReverseEndianness(value); - - // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - // want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ] - // where wwww = uuuuu - 1 - - uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u); - uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u); - - uint combined = (lowSurrogateChar << 16) + highSurrogateChar; - combined -= 0x40u; // wwww = uuuuu - 1 - combined += 0xDC00_D800u; // add surrogate markers - return combined; - } - else - { - // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx - // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] - // where wwww = uuuuu - 1 - uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ] - retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ] - retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ] - retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ] - retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ] - retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ] - retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ] - retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ] - retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] - return retVal; - } + // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx + // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + // where wwww = uuuuu - 1 + uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ] + retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ] + retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ] + retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ] + retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ] + retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ] + retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ] + retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ] + retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ] + return retVal; } else { @@ -135,37 +114,19 @@ namespace System.Text.Unicode // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx) // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1 - if (Bmi2.IsSupported) - { - // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want - // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple - // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across - // all four output bytes. - - uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */; - - // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning - // that should normally be masked out via an and, but we'll just direct pdep to ignore it. + value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] - uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ] - return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] - } - else - { - value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] + uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] + tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] - uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] - tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] + uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] + uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] + tempC |= tempB; - uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] - uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] - tempC |= tempB; + uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ] + tempD |= 0x8080_80F0u; - uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ] - tempD |= 0x8080_80F0u; - - return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] - } + return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] } else { @@ -757,43 +718,6 @@ namespace System.Text.Unicode } /// <summary> - /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD - /// and writes the resulting QWORD into the destination with machine endianness. - /// </summary> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value) - { - if (Bmi2.X64.IsSupported) - { - // BMI2 will work regardless of the processor's endianness. - Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul)); - } - else - { - if (BitConverter.IsLittleEndian) - { - outputBuffer = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 3) = (char)value; - } - else - { - Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value; - value >>= 8; - Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value; - value >>= 8; - outputBuffer = (char)value; - } - } - } - - /// <summary> /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianess, /// converts those scalar values to their 3-byte UTF-8 representation and writes the /// resulting 6 bytes to the destination buffer. |