summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/System.Private.CoreLib/Resources/Strings.resx3
-rw-r--r--src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems3
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Rune.cs731
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs53
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs180
-rw-r--r--src/System.Private.CoreLib/src/System/ThrowHelper.cs7
6 files changed, 977 insertions, 0 deletions
diff --git a/src/System.Private.CoreLib/Resources/Strings.resx b/src/System.Private.CoreLib/Resources/Strings.resx
index 42dcde3e2f..4f17a289c8 100644
--- a/src/System.Private.CoreLib/Resources/Strings.resx
+++ b/src/System.Private.CoreLib/Resources/Strings.resx
@@ -3652,6 +3652,9 @@
<data name="Argument_MethodRedefined" xml:space="preserve">
<value>Method has been already defined.</value>
</data>
+ <data name="Argument_CannotExtractScalar" xml:space="preserve">
+ <value>Cannot extract a Unicode scalar value from the specified index in the input.</value>
+ </data>
<data name="Argument_CannotParsePrecision" xml:space="preserve">
<value>Characters following the format symbol must be a number of {0} or less.</value>
</data>
diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
index 79956c4342..8768d19223 100644
--- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
+++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
@@ -651,9 +651,12 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingProvider.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Latin1Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\NormalizationForm.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\Rune.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.Debug.cs" Condition="'$(Configuration)' == 'Debug'" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
+ <Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
new file mode 100644
index 0000000000..a4ef3a37b7
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -0,0 +1,731 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Globalization;
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+ /// <summary>
+ /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
+ /// </summary>
+ /// <remarks>
+ /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
+ /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
+ /// </remarks>
+ [DebuggerDisplay("{DebuggerDisplay,nq}")]
+ public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
+ {
+ private const byte IsWhiteSpaceFlag = 0x80;
+ private const byte IsLetterOrDigitFlag = 0x40;
+ private const byte UnicodeCategoryMask = 0x1F;
+
+ // Lookup table for the ASCII character range [ U+0000..U+007F ]. The table is indexed by the
+ // code point's numeric value, and each entry packs the following information:
+ // - 0x80 bit if set means 'is whitespace' (see IsWhiteSpaceFlag)
+ // - 0x40 bit if set means 'is letter or digit' (see IsLetterOrDigitFlag)
+ // - 0x20 bit is reserved for future use
+ // - bottom 5 bits are the UnicodeCategory of the character (see UnicodeCategoryMask)
+ private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
+ {
+ 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E,
+ 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E,
+ 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18,
+ 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18,
+ 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12,
+ 0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
+ 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E
+ };
+
+ private readonly uint _value;
+
+ /// <summary>
+ /// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
+ /// </summary>
+ /// <param name="ch">The UTF-16 code unit to wrap; it must not be a surrogate.</param>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// If <paramref name="ch"/> is a UTF-16 surrogate code point
+ /// (U+D800..U+DFFF, inclusive).
+ /// </exception>
+ public Rune(char ch)
+ {
+ uint codeUnit = ch;
+
+ if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
+ }
+
+ _value = codeUnit;
+ }
+
+ /// <summary>
+ /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
+ /// </summary>
+ /// <param name="value">The candidate scalar value.</param>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
+ /// </exception>
+ /// <remarks>
+ /// The argument is cast to <see cref="uint"/> and forwarded to the unsigned overload,
+ /// which performs the validation.
+ /// </remarks>
+ public Rune(int value)
+ : this((uint)value)
+ {
+ }
+
+ /// <summary>
+ /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
+ /// </summary>
+ /// <param name="value">The candidate scalar value.</param>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
+ /// </exception>
+ [CLSCompliant(false)]
+ public Rune(uint value)
+ {
+ // Reject anything UnicodeUtility does not consider a scalar value before storing it.
+ if (!UnicodeUtility.IsValidUnicodeScalar(value))
+ {
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
+ }
+ _value = value;
+ }
+
+ // Non-validating ctor. The 'unused' parameter is never read; it exists only to give this
+ // overload a signature distinct from the validating Rune(uint) ctor. The validity check below
+ // is a debug-only assertion (UnicodeDebug methods are [Conditional("DEBUG")]), so callers
+ // must have validated 'scalarValue' themselves.
+ private Rune(uint scalarValue, bool unused)
+ {
+ UnicodeDebug.AssertIsValidScalar(scalarValue);
+ _value = scalarValue;
+ }
+
+ // Equality and ordering are defined directly on the underlying scalar values.
+
+ public static bool operator ==(Rune left, Rune right) => left._value == right._value;
+
+ public static bool operator !=(Rune left, Rune right) => left._value != right._value;
+
+ public static bool operator <(Rune left, Rune right) => left._value < right._value;
+
+ public static bool operator <=(Rune left, Rune right) => left._value <= right._value;
+
+ public static bool operator >(Rune left, Rune right) => left._value > right._value;
+
+ public static bool operator >=(Rune left, Rune right) => left._value >= right._value;
+
+ // The conversions below go through the validating constructors and may therefore throw,
+ // which is why they are explicit rather than implicit.
+
+ public static explicit operator Rune(char ch) => new Rune(ch);
+
+ [CLSCompliant(false)]
+ public static explicit operator Rune(uint value) => new Rune(value);
+
+ public static explicit operator Rune(int value) => new Rune(value);
+
+ // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'". If the stored value is not a valid
+ // scalar, the replacement character U+FFFD is shown in place of the char.
+ private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
+
+ /// <summary>
+ /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
+ /// and therefore representable by a single UTF-8 code unit.
+ /// </summary>
+ public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value);
+
+ /// <summary>
+ /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ])
+ /// and therefore representable by a single UTF-16 code unit.
+ /// </summary>
+ public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value);
+
+ /// <summary>
+ /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar.
+ /// </summary>
+ public int Plane => UnicodeUtility.GetPlane(_value);
+
+ /// <summary>
+ /// A <see cref="Rune"/> instance that represents the Unicode replacement character U+FFFD.
+ /// </summary>
+ /// <remarks>
+ /// The instance is produced through the non-validating <see cref="UnsafeCreate"/> path.
+ /// </remarks>
+ public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar);
+
+ /// <summary>
+ /// Returns the length in code units (<see cref="Char"/>) of the
+ /// UTF-16 sequence required to represent this scalar value.
+ /// </summary>
+ /// <remarks>
+ /// The return value will be 1 or 2.
+ /// </remarks>
+ public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value);
+
+ /// <summary>
+ /// Returns the length in code units (<see cref="Utf8Char"/>) of the
+ /// UTF-8 sequence required to represent this scalar value.
+ /// </summary>
+ /// <remarks>
+ /// The return value will be 1 through 4, inclusive.
+ /// </remarks>
+ public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value);
+
+ /// <summary>
+ /// Returns the Unicode scalar value as an integer.
+ /// </summary>
+ /// <remarks>
+ /// This is the stored unsigned value cast to <see cref="int"/>.
+ /// </remarks>
+ public int Value => (int)_value;
+
+ // Converts 'rune' to upper case (toUpper == true) or lower case (toUpper == false) using the
+ // TextInfo of 'culture'. The rune is round-tripped through a UTF-16 scratch buffer because
+ // the TextInfo helpers operate on spans of chars.
+ private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper)
+ {
+ if (culture == null)
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
+ }
+
+ TextInfo casingRules = culture.TextInfo;
+
+ // Two code units is the worst case (a surrogate pair), and a case change is expected to
+ // preserve the UTF-16 code unit count, so both buffers get the same size.
+ Span<char> source = stackalloc char[2];
+ Span<char> converted = stackalloc char[2];
+
+ int codeUnitCount = rune.EncodeToUtf16(source);
+ source = source.Slice(0, codeUnitCount);
+ converted = converted.Slice(0, codeUnitCount);
+
+ if (toUpper)
+ {
+ casingRules.ChangeCaseToUpper(source, converted);
+ }
+ else
+ {
+ casingRules.ChangeCaseToLower(source, converted);
+ }
+
+ // Simple case folding rules are used, which disallow moving between the BMP and the
+ // supplementary planes during a case conversion. The helpers that rebuild the Rune
+ // contain debug asserts for this condition.
+
+ uint resultScalar = rune.IsBmp
+ ? converted[0]
+ : UnicodeUtility.GetScalarFromUtf16SurrogatePair(converted[0], converted[1]);
+
+ return UnsafeCreate(resultScalar);
+ }
+
+ // Ordering follows the numeric order of the underlying scalar values.
+ public int CompareTo(Rune other) => _value.CompareTo(other._value);
+
+ // Writes this rune as UTF-16 into 'destination' and returns the number of chars written.
+ // The caller must supply a buffer of at least Utf16SequenceLength chars; this is only
+ // checked by debug assertions.
+ private int EncodeToUtf16(Span<char> destination)
+ {
+ Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer.");
+ bool encoded = TryEncode(destination, out int written);
+ Debug.Assert(encoded, "TryEncode should never fail given a large enough buffer.");
+ return written;
+ }
+
+ public override bool Equals(object obj) => obj is Rune other && Equals(other);
+
+ public bool Equals(Rune other) => _value == other._value;
+
+ // The scalar value itself serves as the hash code.
+ public override int GetHashCode() => (int)_value;
+
+ /// <summary>
+ /// Reads the <see cref="Rune"/> that starts at position <paramref name="index"/> of
+ /// <paramref name="input"/>.
+ /// </summary>
+ /// <remarks>
+ /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
+ /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
+ /// </remarks>
+ public static Rune GetRuneAt(string input, int index)
+ {
+ // A negative result is the reader's signal for malformed data at 'index'.
+ int scalarOrError = ReadRuneFromString(input, index);
+
+ if (scalarOrError < 0)
+ {
+ ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
+ }
+
+ return UnsafeCreate((uint)scalarOrError);
+ }
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
+ /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+ /// </summary>
+ /// <remarks>
+ /// The argument is cast to <see cref="uint"/> and checked by the unsigned overload.
+ /// </remarks>
+ public static bool IsValid(int value) => IsValid((uint)value);
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
+ /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+ /// </summary>
+ [CLSCompliant(false)]
+ public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
+
+ // Reads the scalar value that begins at input[index]. Throws for a null string or an
+ // out-of-range index; returns -1 (rather than throwing) when the data at that position is
+ // not the start of a well-formed scalar value.
+ private static int ReadRuneFromString(string input, int index)
+ {
+ if (input is null)
+ {
+ ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
+ }
+
+ if ((uint)index >= (uint)input.Length)
+ {
+ ThrowHelper.ThrowArgumentOutOfRange_IndexException();
+ }
+
+ uint firstCodeUnit = input[index];
+
+ // Common case: a standalone BMP char which is itself the scalar value.
+ if (!UnicodeUtility.IsSurrogateCodePoint(firstCodeUnit))
+ {
+ return (int)firstCodeUnit;
+ }
+
+ // A surrogate which isn't a high surrogate cannot begin a scalar value.
+ if (!UnicodeUtility.IsHighSurrogateCodePoint(firstCodeUnit))
+ {
+ return -1;
+ }
+
+ // 'firstCodeUnit' is a high surrogate, so a low surrogate must follow it.
+ //
+ // If this becomes a hot code path, the bounds check below could be skipped by reading
+ // off the end of the string using unsafe code. Since strings are null-terminated,
+ // such a read can never produce a valid low surrogate, so the failure would still be
+ // detected if the string terminates unexpectedly.
+
+ int nextIndex = index + 1;
+ if ((uint)nextIndex >= (uint)input.Length)
+ {
+ return -1; // not an argument exception - just a "bad data" failure
+ }
+
+ uint secondCodeUnit = input[nextIndex];
+ if (!UnicodeUtility.IsLowSurrogateCodePoint(secondCodeUnit))
+ {
+ return -1;
+ }
+
+ return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstCodeUnit, secondCodeUnit);
+ }
+
+ /// <summary>
+ /// Returns the UTF-16 <see cref="string"/> form of this <see cref="Rune"/> instance.
+ /// </summary>
+ public override string ToString()
+ {
+ // Two chars is the worst case (a surrogate pair).
+ Span<char> buffer = stackalloc char[2];
+ int length = EncodeToUtf16(buffer);
+ return new string(buffer.Slice(0, length));
+ }
+
+ /// <summary>
+ /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 code unit.
+ /// </summary>
+ /// <returns>
+ /// <see langword="true"/> if <paramref name="ch"/> is not a surrogate and <paramref name="result"/>
+ /// was populated; otherwise <see langword="false"/>, with <paramref name="result"/> set to its default value.
+ /// </returns>
+ public static bool TryCreate(char ch, out Rune result)
+ {
+ uint codeUnit = ch;
+
+ if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
+ {
+ result = default;
+ return false;
+ }
+
+ result = UnsafeCreate(codeUnit);
+ return true;
+ }
+
+ /// <summary>
+ /// Attempts to create a <see cref="Rune"/> from the provided input value.
+ /// </summary>
+ /// <remarks>
+ /// The argument is cast to <see cref="uint"/> and handled by the unsigned overload.
+ /// </remarks>
+ public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result);
+
+ /// <summary>
+ /// Attempts to create a <see cref="Rune"/> from the provided input value.
+ /// </summary>
+ /// <returns>
+ /// <see langword="true"/> if <paramref name="value"/> is a valid Unicode scalar value and
+ /// <paramref name="result"/> was populated; otherwise <see langword="false"/>, with
+ /// <paramref name="result"/> set to its default value.
+ /// </returns>
+ [CLSCompliant(false)]
+ public static bool TryCreate(uint value, out Rune result)
+ {
+ if (!UnicodeUtility.IsValidUnicodeScalar(value))
+ {
+ result = default;
+ return false;
+ }
+
+ result = UnsafeCreate(value);
+ return true;
+ }
+
+ /// <summary>
+ /// Writes the UTF-16 representation of this <see cref="Rune"/> to a destination buffer.
+ /// </summary>
+ /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
+ /// <param name="charsWritten">
+ /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
+ /// or 0 if the destination buffer is not large enough to contain the output.</param>
+ /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
+ /// <remarks>
+ /// Query <see cref="Utf16SequenceLength"/> ahead of time to determine the required size
+ /// of the <paramref name="destination"/> buffer.
+ /// </remarks>
+ public bool TryEncode(Span<char> destination, out int charsWritten)
+ {
+ if (IsBmp)
+ {
+ // A BMP scalar is exactly one UTF-16 code unit.
+ if (destination.Length >= 1)
+ {
+ destination[0] = (char)_value;
+ charsWritten = 1;
+ return true;
+ }
+ }
+ else if (destination.Length >= 2)
+ {
+ // A supplementary plane scalar needs a surrogate pair.
+ UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
+ charsWritten = 2;
+ return true;
+ }
+
+ // Destination buffer not large enough
+
+ charsWritten = default;
+ return false;
+ }
+
+ /// <summary>
+ /// Encodes this <see cref="Rune"/> to a destination buffer as UTF-8 bytes.
+ /// </summary>
+ /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
+ /// <param name="bytesWritten">
+ /// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
+ /// or 0 if the destination buffer is not large enough to contain the output.</param>
+ /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
+ /// <remarks>
+ /// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
+ /// the required size of the <paramref name="destination"/> buffer. Each length check precedes
+ /// the corresponding writes, so nothing is written when the buffer is too small.
+ /// </remarks>
+ // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. **
+ public bool TryEncodeToUtf8Bytes(Span<byte> destination, out int bytesWritten)
+ {
+ // TODO: Optimize some of these writes by using BMI2 instructions.
+
+ // The bit patterns below come from the Unicode Standard, Table 3-6.
+ //
+ // For each multi-byte form, the leading byte is produced by adding the marker bits
+ // (pre-shifted to sit just above the scalar's payload) and then shifting right, which
+ // leaves "marker | top payload bits" in the low byte. Continuation bytes take six payload
+ // bits each and add the 0x80 ("10xxxxxx") marker.
+
+ if (destination.Length >= 1)
+ {
+ if (IsAscii)
+ {
+ destination[0] = (byte)_value;
+ bytesWritten = 1;
+ return true;
+ }
+
+ if (destination.Length >= 2)
+ {
+ if (_value <= 0x7FFu)
+ {
+ // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
+ destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
+ destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
+ bytesWritten = 2;
+ return true;
+ }
+
+ if (destination.Length >= 3)
+ {
+ if (_value <= 0xFFFFu)
+ {
+ // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
+ destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
+ destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
+ destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
+ bytesWritten = 3;
+ return true;
+ }
+
+ if (destination.Length >= 4)
+ {
+ // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+ destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
+ destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
+ destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
+ destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
+ bytesWritten = 4;
+ return true;
+ }
+ }
+ }
+ }
+
+ // Destination buffer not large enough
+
+ bytesWritten = default;
+ return false;
+ }
+
+ /// <summary>
+ /// Attempts to read the <see cref="Rune"/> that starts at position <paramref name="index"/> of
+ /// <paramref name="input"/>.
+ /// </summary>
+ /// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
+ /// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
+ /// <remarks>
+ /// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
+ /// </remarks>
+ public static bool TryGetRuneAt(string input, int index, out Rune value)
+ {
+ // A negative result is the reader's signal for malformed data at 'index'.
+ int scalarOrError = ReadRuneFromString(input, index);
+
+ if (scalarOrError < 0)
+ {
+ value = default;
+ return false;
+ }
+
+ value = UnsafeCreate((uint)scalarOrError);
+ return true;
+ }
+
+ // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without
+ // validation. It is the caller's responsibility to have performed manual validation
+ // before calling this method. If a Rune instance is forcibly constructed
+ // from invalid input, the APIs on this type have undefined behavior, potentially including
+ // introducing a security hole in the consuming application.
+ //
+ // An example of a security hole resulting from an invalid Rune value, which could result
+ // in a stack overflow.
+ //
+ // public int GetMarvin32HashCode(Rune r) {
+ // Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
+ // r.TryEncode(buffer, ...);
+ // return Marvin32.ComputeHash(buffer.AsBytes());
+ // }
+
+ /// <summary>
+ /// Creates a <see cref="Rune"/> without performing validation on the input.
+ /// </summary>
+ /// <remarks>
+ /// Forwards to the private non-validating constructor; the <see langword="false"/> argument
+ /// only selects that overload and carries no meaning.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false);
+
+ // These are analogs of APIs on System.Char
+
+ // Returns the numeric value associated with 'value': 0 through 9 for the ASCII digits, -1 for
+ // every other ASCII character, and the globalization table's answer for non-ASCII input.
+ public static double GetNumericValue(Rune value)
+ {
+ if (!value.IsAscii)
+ {
+ // not an ASCII char; fall back to globalization table
+ return CharUnicodeInfo.InternalGetNumericValue(value.Value);
+ }
+
+ // Unsigned subtraction wraps for anything below '0', so one comparison covers both bounds.
+ uint digit = value._value - '0';
+ return (digit <= 9) ? (double)digit : -1;
+ }
+
+ // Returns the Unicode category of 'value', using the ASCII lookup table when possible.
+ public static UnicodeCategory GetUnicodeCategory(Rune value)
+ {
+ if (!value.IsAscii)
+ {
+ return GetUnicodeCategoryNonAscii(value);
+ }
+
+ // The low five bits of the table entry hold the category.
+ int packedInfo = AsciiCharInfo[value.Value];
+ return (UnicodeCategory)(packedInfo & UnicodeCategoryMask);
+ }
+
+ // Slow path which consults the globalization data; callers handle ASCII input themselves.
+ private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
+ {
+ Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");
+ return CharUnicodeInfo.GetUnicodeCategory(value.Value);
+ }
+
+ // The helpers below classify a UnicodeCategory by testing whether its numeric value falls
+ // within a contiguous range of enum members.
+
+ // Letter: UppercaseLetter through OtherLetter
+ private static bool IsCategoryLetter(UnicodeCategory category)
+ => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter);
+
+ // Letter (as above) or DecimalDigitNumber
+ private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
+ => IsCategoryLetter(category) || (category == UnicodeCategory.DecimalDigitNumber);
+
+ // Number: DecimalDigitNumber through OtherNumber
+ private static bool IsCategoryNumber(UnicodeCategory category)
+ => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber);
+
+ // Punctuation: ConnectorPunctuation through OtherPunctuation
+ private static bool IsCategoryPunctuation(UnicodeCategory category)
+ => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation);
+
+ // Separator: SpaceSeparator through ParagraphSeparator
+ private static bool IsCategorySeparator(UnicodeCategory category)
+ => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator);
+
+ // Symbol: MathSymbol through OtherSymbol
+ private static bool IsCategorySymbol(UnicodeCategory category)
+ => UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol);
+
+ // Returns true iff 'value' is a control character, i.e., lies in
+ // [ U+0000..U+001F ] or [ U+007F..U+009F ].
+ public static bool IsControl(Rune value)
+ {
+ // Per the Unicode stability policy, the set of control characters
+ // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
+ // characters will ever be added to the "control characters" group.
+ // See http://www.unicode.org/policies/stability_policy.html.
+
+ // Adding 1 and then clearing the 0x80 bit folds both ranges onto [ 00..20 ], so a single
+ // unsigned comparison suffices.
+ // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
+ // 00..1F (+1) => 01..20 (&~80) => 01..20
+ // 7F..9F (+1) => 80..A0 (&~80) => 00..20
+
+ return (((value._value + 1) & ~0x80u) <= 0x20u);
+ }
+
+ // Each predicate below answers ASCII input with a cheap range/table check and defers to the
+ // Unicode category data for everything else.
+
+ // True for decimal digits: ASCII [0-9], or category DecimalDigitNumber.
+ public static bool IsDigit(Rune value)
+ {
+ return value.IsAscii
+ ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
+ : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber);
+ }
+
+ // True for letters: ASCII [A-Za-z], or any letter category.
+ public static bool IsLetter(Rune value)
+ {
+ if (!value.IsAscii)
+ {
+ return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
+ }
+
+ // Clearing the 0x20 bit of the offset from 'A' folds lowercase onto uppercase. [A-Za-z]
+ return (((value._value - 'A') & ~0x20u) <= (uint)('Z' - 'A'));
+ }
+
+ // True for letters and decimal digits.
+ public static bool IsLetterOrDigit(Rune value)
+ {
+ return value.IsAscii
+ ? ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0)
+ : IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
+ }
+
+ // True for lowercase letters: ASCII [a-z], or category LowercaseLetter.
+ public static bool IsLower(Rune value)
+ {
+ return value.IsAscii
+ ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
+ : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter);
+ }
+
+ // True for numbers: ASCII [0-9], or any number category.
+ public static bool IsNumber(Rune value)
+ {
+ return value.IsAscii
+ ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
+ : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
+ }
+
+ // True if the category of 'value' is one of the punctuation categories.
+ public static bool IsPunctuation(Rune value) => IsCategoryPunctuation(GetUnicodeCategory(value));
+
+ // True if the category of 'value' is one of the separator categories.
+ public static bool IsSeparator(Rune value) => IsCategorySeparator(GetUnicodeCategory(value));
+
+ // True if the category of 'value' is one of the symbol categories.
+ public static bool IsSymbol(Rune value) => IsCategorySymbol(GetUnicodeCategory(value));
+
+ // True for uppercase letters: ASCII [A-Z], or category UppercaseLetter.
+ public static bool IsUpper(Rune value)
+ {
+ return value.IsAscii
+ ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
+ : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter);
+ }
+
+ // True for whitespace: the flagged entries of the ASCII table, U+0085, or any separator category.
+ public static bool IsWhiteSpace(Rune value)
+ {
+ if (value.IsAscii)
+ {
+ return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
+ }
+
+ // U+0085 is special since it's a whitespace character but is in the Control category
+ // instead of a normal separator category. No other code point outside the ASCII range
+ // has this mismatch.
+
+ return (value._value == 0x0085u)
+ || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
+ }
+
+ /// <summary>
+ /// Returns <paramref name="value"/> converted to lowercase using the casing rules of
+ /// <paramref name="culture"/>.
+ /// </summary>
+ public static Rune ToLower(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: false);
+
+ /// <summary>
+ /// Returns <paramref name="value"/> converted to lowercase using the casing rules of
+ /// the invariant culture.
+ /// </summary>
+ /// <remarks>
+ /// When globalization data is unavailable (invariant globalization mode), only ASCII
+ /// characters are case-mapped; any other value is returned unchanged.
+ /// </remarks>
+ public static Rune ToLowerInvariant(Rune value)
+ {
+ // Handle the most common case (ASCII data) first. Within the common case, we expect
+ // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
+
+ if (value.IsAscii)
+ {
+ // It's ok for us to use the UTF-16 conversion utility for this since the value is ASCII,
+ // so the high 16 bits are never set and will be left unchanged.
+ return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
+ }
+
+ // The utility above is only meant for input whose packed UTF-16 code units are all ASCII.
+ // That doesn't hold for a non-ASCII scalar (a supplementary plane scalar even has non-zero
+ // high 16 bits), so it must not be used here. If the globalization tables aren't
+ // available, there is no case mapping to apply and the value is returned as-is.
+
+ if (GlobalizationMode.Invariant)
+ {
+ return value;
+ }
+
+ // Non-ASCII data requires going through the case folding tables.
+
+ return ToLower(value, CultureInfo.InvariantCulture);
+ }
+
+ /// <summary>
+ /// Returns <paramref name="value"/> converted to uppercase using the casing rules of
+ /// <paramref name="culture"/>.
+ /// </summary>
+ public static Rune ToUpper(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: true);
+
+ /// <summary>
+ /// Returns <paramref name="value"/> converted to uppercase using the casing rules of
+ /// the invariant culture.
+ /// </summary>
+ /// <remarks>
+ /// When globalization data is unavailable (invariant globalization mode), only ASCII
+ /// characters are case-mapped; any other value is returned unchanged.
+ /// </remarks>
+ public static Rune ToUpperInvariant(Rune value)
+ {
+ // Handle the most common case (ASCII data) first. Within the common case, we expect
+ // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
+
+ if (value.IsAscii)
+ {
+ // It's ok for us to use the UTF-16 conversion utility for this since the value is ASCII,
+ // so the high 16 bits are never set and will be left unchanged.
+ return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
+ }
+
+ // The utility above is only meant for input whose packed UTF-16 code units are all ASCII.
+ // That doesn't hold for a non-ASCII scalar (a supplementary plane scalar even has non-zero
+ // high 16 bits), so it must not be used here. If the globalization tables aren't
+ // available, there is no case mapping to apply and the value is returned as-is.
+
+ if (GlobalizationMode.Invariant)
+ {
+ return value;
+ }
+
+ // Non-ASCII data requires going through the case folding tables.
+
+ return ToUpper(value, CultureInfo.InvariantCulture);
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs
new file mode 100644
index 0000000000..dedfbe2254
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs
@@ -0,0 +1,53 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace System.Text
+{
+ /// <summary>
+ /// Debug-only assertion helpers for Unicode code point and scalar value invariants.
+ /// Every assertion method is marked [Conditional("DEBUG")].
+ /// </summary>
+ internal static class UnicodeDebug
+ {
+ // Asserts that 'codePoint' satisfies UnicodeUtility.IsHighSurrogateCodePoint.
+ [Conditional("DEBUG")]
+ internal static void AssertIsHighSurrogateCodePoint(uint codePoint)
+ {
+ Debug.Assert(UnicodeUtility.IsHighSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point.");
+ }
+
+ // Asserts that 'codePoint' satisfies UnicodeUtility.IsLowSurrogateCodePoint.
+ [Conditional("DEBUG")]
+ internal static void AssertIsLowSurrogateCodePoint(uint codePoint)
+ {
+ Debug.Assert(UnicodeUtility.IsLowSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point.");
+ }
+
+ // Asserts that 'codePoint' satisfies UnicodeUtility.IsValidCodePoint.
+ [Conditional("DEBUG")]
+ internal static void AssertIsValidCodePoint(uint codePoint)
+ {
+ Debug.Assert(UnicodeUtility.IsValidCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid Unicode code point.");
+ }
+
+ // Asserts that 'scalarValue' satisfies UnicodeUtility.IsValidUnicodeScalar.
+ [Conditional("DEBUG")]
+ internal static void AssertIsValidScalar(uint scalarValue)
+ {
+ Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value.");
+ }
+
+ // Asserts that 'scalarValue' is a valid scalar value which is not a BMP code point.
+ [Conditional("DEBUG")]
+ internal static void AssertIsValidSupplementaryPlaneScalar(uint scalarValue)
+ {
+ Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue) && !UnicodeUtility.IsBmpCodePoint(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode scalar value.");
+ }
+
+ /// <summary>
+ /// Formats a code point as the hex string "U+XXXX".
+ /// </summary>
+ /// <remarks>
+ /// The input value doesn't have to be a real code point in the Unicode codespace. It can be any integer.
+ /// The "X4" format pads to at least four hex digits.
+ /// </remarks>
+ private static string ToHexString(uint codePoint)
+ {
+ return FormattableString.Invariant($"U+{codePoint:X4}");
+ }
+ }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
new file mode 100644
index 0000000000..c1dcefd74b
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
@@ -0,0 +1,180 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+
+namespace System.Text
+{
+    /// <summary>
+    /// Arithmetic helpers for classifying Unicode code points and scalar values and for
+    /// converting between scalar values and UTF-8 / UTF-16 code unit representations.
+    /// </summary>
+    internal static class UnicodeUtility
+    {
+        /// <summary>
+        /// Code point of the Unicode replacement character (U+FFFD).
+        /// </summary>
+        public const uint ReplacementChar = 0xFFFDU;
+
+        /// <summary>
+        /// Computes which Unicode plane (0 through 16, inclusive) contains a code point.
+        /// </summary>
+        /// <param name="codePoint">The code point to examine. Debug builds assert that it is valid.</param>
+        /// <returns><paramref name="codePoint"/> shifted right by 16 bits.</returns>
+        public static int GetPlane(uint codePoint)
+        {
+            UnicodeDebug.AssertIsValidCodePoint(codePoint);
+
+            uint plane = codePoint >> 16;
+            return (int)plane;
+        }
+
+        /// <summary>
+        /// Combines the two code points of a UTF-16 surrogate pair into the Unicode scalar value they encode.
+        /// </summary>
+        /// <param name="highSurrogateCodePoint">The high (leading) surrogate. Debug builds assert its range.</param>
+        /// <param name="lowSurrogateCodePoint">The low (trailing) surrogate. Debug builds assert its range.</param>
+        /// <returns>The decoded scalar value.</returns>
+        public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint)
+        {
+            UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint);
+            UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint);
+
+            // Bit distribution per the Unicode specification, Table 3-5. The D800 marker (shifted
+            // into position), the DC00 marker, and the "wwww = uuuuu - 1" fix-up are folded into
+            // one compile-time constant, leaving only a shift and an add/subtract at run time
+            // (intended to compile to shl + lea).
+            const uint SurrogateMarkerBias = (0xD800U << 10) + 0xDC00U - (1U << 16);
+
+            return (highSurrogateCodePoint << 10) + lowSurrogateCodePoint - SurrogateMarkerBias;
+        }
+
+        /// <summary>
+        /// Gets the number of UTF-16 code units needed to represent a Unicode scalar value.
+        /// </summary>
+        /// <param name="value">The scalar value. Debug builds assert that it is valid.</param>
+        /// <returns>1 for values below 0x10000; otherwise 2.</returns>
+        public static int GetUtf16SequenceLength(uint value)
+        {
+            UnicodeDebug.AssertIsValidScalar(value);
+
+            // Subtracting 0x10000 wraps around for values below 0x10000, leaving 0xFF in the high
+            // byte; for larger scalars the high byte is 0x00. Adding (2 << 24) turns those high
+            // bytes into 0x01 and 0x02 respectively, and the final shift extracts that byte.
+            uint biased = value - 0x10000u + (2u << 24);
+            return (int)(biased >> 24);
+        }
+
+        /// <summary>
+        /// Splits a supplementary-plane (astral) Unicode scalar value into its UTF-16 high and low surrogates.
+        /// </summary>
+        /// <param name="value">The scalar value. Debug builds assert that it is a valid supplementary plane scalar.</param>
+        /// <param name="highSurrogateCodePoint">Receives the high (leading) surrogate code unit.</param>
+        /// <param name="lowSurrogateCodePoint">Receives the low (trailing) surrogate code unit.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint)
+        {
+            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value);
+
+            // Bit distribution per the Unicode specification, Table 3-5.
+            // High surrogate: the bits above the low ten, rebased so that U+10000 maps to 0xD800.
+            // Low surrogate: the low ten bits, offset by the 0xDC00 marker.
+            uint leading = (value + ((0xD800u - 0x40u) << 10)) >> 10;
+            uint trailing = (value & 0x3FFu) + 0xDC00u;
+
+            highSurrogateCodePoint = (char)leading;
+            lowSurrogateCodePoint = (char)trailing;
+        }
+
+        /// <summary>
+        /// Gets the number of UTF-8 code units needed to represent a Unicode scalar value.
+        /// </summary>
+        /// <param name="value">The scalar value. Debug builds assert that it is valid.</param>
+        /// <returns>A count from 1 to 4, computed without branching.</returns>
+        public static int GetUtf8SequenceLength(uint value)
+        {
+            UnicodeDebug.AssertIsValidScalar(value);
+
+            // Branchless for every valid scalar value; the original author notes the x86
+            // code is six instructions: lea, sar, xor, add, shr, lea.
+
+            // All bits set (-1) when the input is below 0x800, i.e. when it needs 1 or 2 UTF-8
+            // code units; otherwise 0. The arithmetic shift smears the sign bit of the difference.
+            int shortFormMask = ((int)value - 0x0800) >> 31;
+
+            // UTF-8 code unit counts by range:
+            //   U+0000..U+007F => 1
+            //   U+0080..U+07FF => 2
+            //   U+0800..U+FFFF => 3
+            //   U+10000+       => 4
+            //
+            // After XORing the scalar with 0xF800 the ranges become:
+            //   U+0000..U+F7FF => 3
+            //   U+F800..U+F87F => 1
+            //   U+F880..U+FFFF => 2
+            //   U+10000+       => 4
+            //
+            // The 1- and 3-unit cases are now adjacent, so subtracting 0xF880 wraps for exactly
+            // those two cases (high byte 0xFF) and not for the others (high byte 0x00).
+            uint clustered = (value ^ 0xF800u) - 0xF880u;
+
+            // Adding (4 << 24) maps high byte 0xFF -> 0x03 and 0x00 -> 0x04; shift it down.
+            uint unadjustedLength = (clustered + (4u << 24)) >> 24;
+
+            // Final result:
+            //   U+0000..U+007F => 3 + (-1) * 2 = 1
+            //   U+0080..U+07FF => 4 + (-1) * 2 = 2
+            //   U+0800..U+FFFF => 3 + ( 0) * 2 = 3
+            //   U+10000+       => 4 + ( 0) * 2 = 4
+            return (int)unadjustedLength + (shortFormMask * 2);
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> is an ASCII character, i.e. lies in [ U+0000..U+007F ].
+        /// </summary>
+        /// <param name="value">The code point to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is at most 0x7F.</returns>
+        /// <remarks>
+        /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsAsciiCodePoint(uint value)
+        {
+            return value <= 0x7Fu;
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> lies in the Basic Multilingual Plane (BMP).
+        /// </summary>
+        /// <param name="value">The code point to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is at most 0xFFFF.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsBmpCodePoint(uint value)
+        {
+            return value <= 0xFFFFu;
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> is a UTF-16 high surrogate code point,
+        /// i.e. lies in [ U+D800..U+DBFF ], inclusive.
+        /// </summary>
+        /// <param name="value">The code point to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is in that range.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsHighSurrogateCodePoint(uint value)
+        {
+            return IsInRangeInclusive(value, 0xD800U, 0xDBFFU);
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> lies between <paramref name="lowerBound"/>
+        /// and <paramref name="upperBound"/>, inclusive.
+        /// </summary>
+        /// <param name="value">The value to test.</param>
+        /// <param name="lowerBound">The smallest accepted value.</param>
+        /// <param name="upperBound">The largest accepted value.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is within the range.</returns>
+        /// <remarks>
+        /// Uses a single unsigned comparison: values below <paramref name="lowerBound"/> wrap around
+        /// to a large distance. The result is only meaningful when <paramref name="lowerBound"/>
+        /// does not exceed <paramref name="upperBound"/>.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
+        {
+            uint distanceFromLower = value - lowerBound;
+            uint rangeWidth = upperBound - lowerBound;
+            return distanceFromLower <= rangeWidth;
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> is a UTF-16 low surrogate code point,
+        /// i.e. lies in [ U+DC00..U+DFFF ], inclusive.
+        /// </summary>
+        /// <param name="value">The code point to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is in that range.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsLowSurrogateCodePoint(uint value)
+        {
+            return IsInRangeInclusive(value, 0xDC00U, 0xDFFFU);
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> is a UTF-16 surrogate code point (high or low),
+        /// i.e. lies in [ U+D800..U+DFFF ], inclusive.
+        /// </summary>
+        /// <param name="value">The code point to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is in that range.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsSurrogateCodePoint(uint value)
+        {
+            return IsInRangeInclusive(value, 0xD800U, 0xDFFFU);
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="codePoint"/> is a valid Unicode code point,
+        /// i.e. lies in [ U+0000..U+10FFFF ], inclusive.
+        /// </summary>
+        /// <param name="codePoint">The value to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="codePoint"/> is at most 0x10FFFF.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsValidCodePoint(uint codePoint)
+        {
+            return codePoint <= 0x10FFFFU;
+        }
+
+        /// <summary>
+        /// Tests whether <paramref name="value"/> is a valid Unicode scalar value, i.e. lies in
+        /// [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
+        /// </summary>
+        /// <param name="value">The value to test.</param>
+        /// <returns><see langword="true"/> iff <paramref name="value"/> is a scalar value.</returns>
+        /// <remarks>
+        /// XORing with 0xD800 relocates the surrogate code points to [ U+0000..U+07FF ] and leaves
+        /// every valid scalar value inside [ U+0800..U+10FFFF ], so one range check suffices.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool IsValidUnicodeScalar(uint value) => IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
+    }
+}
diff --git a/src/System.Private.CoreLib/src/System/ThrowHelper.cs b/src/System.Private.CoreLib/src/System/ThrowHelper.cs
index 41745e7801..551f87e7c5 100644
--- a/src/System.Private.CoreLib/src/System/ThrowHelper.cs
+++ b/src/System.Private.CoreLib/src/System/ThrowHelper.cs
@@ -76,6 +76,11 @@ namespace System
throw new ArgumentException(SR.Argument_OverlapAlignmentMismatch);
}
+ internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument) // Throw helper: its only statement is a throw, so it never returns normally.
+ {
+ throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument); // Exception is built by GetArgumentException from the Argument_CannotExtractScalar resource and the caller-supplied argument id.
+ }
+
internal static void ThrowArgumentOutOfRange_IndexException()
{
throw GetArgumentOutOfRangeException(ExceptionArgument.index,
@@ -490,6 +495,7 @@ namespace System
pHandle,
values,
task,
+ ch,
s,
input,
pointer,
@@ -528,6 +534,7 @@ namespace System
ArgumentOutOfRange_Index,
Argument_InvalidOffLen,
Argument_ItemNotExist,
+ Argument_CannotExtractScalar,
ArgumentOutOfRange_Count,
ArgumentOutOfRange_InvalidThreshold,
ArgumentOutOfRange_ListInsert,