summaryrefslogtreecommitdiff
path: root/src/System.Private.CoreLib
diff options
context:
space:
mode:
authorLevi Broderick <GrabYourPitchforks@users.noreply.github.com>2019-02-14 11:00:02 -0800
committerGitHub <noreply@github.com>2019-02-14 11:00:02 -0800
commit84eaa7ac079e625f2fbe36ba976f735dbdacdc6b (patch)
treee392054ae46786a81350196aa007fd4f61a8922e /src/System.Private.CoreLib
parent94b0faee9f93247bb32c89ab7ed545a07540ba97 (diff)
downloadcoreclr-84eaa7ac079e625f2fbe36ba976f735dbdacdc6b.tar.gz
coreclr-84eaa7ac079e625f2fbe36ba976f735dbdacdc6b.tar.bz2
coreclr-84eaa7ac079e625f2fbe36ba976f735dbdacdc6b.zip
Add Rune creation API from UTF-16 surrogate pair (#22590)
Also brings in some perf improvements to existing char and UnicodeUtility APIs
Diffstat (limited to 'src/System.Private.CoreLib')
-rw-r--r--src/System.Private.CoreLib/shared/System/Char.cs48
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/Rune.cs42
-rw-r--r--src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs16
3 files changed, 95 insertions, 11 deletions
diff --git a/src/System.Private.CoreLib/shared/System/Char.cs b/src/System.Private.CoreLib/shared/System/Char.cs
index 1312380296..50dd092671 100644
--- a/src/System.Private.CoreLib/shared/System/Char.cs
+++ b/src/System.Private.CoreLib/shared/System/Char.cs
@@ -904,7 +904,14 @@ namespace System
public static bool IsSurrogatePair(char highSurrogate, char lowSurrogate)
{
- return IsHighSurrogate(highSurrogate) && IsLowSurrogate(lowSurrogate);
+ // Since both the high and low surrogate ranges are exactly 0x400 elements
+ // wide, and since this is a power of two, we can perform a single comparison
+ // by baselining each value to the start of its respective range and taking
+ // the bitwise OR of them.
+
+ uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+ uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+ return (highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
}
internal const int UNICODE_PLANE00_END = 0x00ffff;
@@ -937,15 +944,44 @@ namespace System
public static int ConvertToUtf32(char highSurrogate, char lowSurrogate)
{
- if (!IsHighSurrogate(highSurrogate))
+ // First, extend both to 32 bits, then calculate the offset of
+ // each candidate surrogate char from the start of its range.
+
+ uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+ uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+
+ // This is a single comparison which allows us to check both for validity at once since
+ // both the high surrogate range and the low surrogate range are the same length.
+ // If the comparison fails, we call to a helper method to throw the correct exception message.
+
+ if ((highSurrogateOffset | lowSurrogateOffset) > CharUnicodeInfo.HIGH_SURROGATE_RANGE)
+ {
+ ConvertToUtf32_ThrowInvalidArgs(highSurrogateOffset);
+ }
+
+ // The 0x40 << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
+ return ((int)highSurrogateOffset << 10) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40 << 10);
+ }
+
+ [StackTraceHidden]
+ private static void ConvertToUtf32_ThrowInvalidArgs(uint highSurrogateOffset)
+ {
+ // If the high surrogate is not within its expected range, throw an exception
+ // whose message fingers it as invalid. If it's within the expected range,
+ // change the message to read that the low surrogate was the problem.
+
+ if (highSurrogateOffset > CharUnicodeInfo.HIGH_SURROGATE_RANGE)
{
- throw new ArgumentOutOfRangeException(nameof(highSurrogate), SR.ArgumentOutOfRange_InvalidHighSurrogate);
+ throw new ArgumentOutOfRangeException(
+ paramName: "highSurrogate",
+ message: SR.ArgumentOutOfRange_InvalidHighSurrogate);
}
- if (!IsLowSurrogate(lowSurrogate))
+ else
{
- throw new ArgumentOutOfRangeException(nameof(lowSurrogate), SR.ArgumentOutOfRange_InvalidLowSurrogate);
+ throw new ArgumentOutOfRangeException(
+ paramName: "lowSurrogate",
+ message: SR.ArgumentOutOfRange_InvalidLowSurrogate);
}
- return (((highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START) * 0x400) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + UNICODE_PLANE01_START);
}
/*=============================ConvertToUtf32===================================
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
index 74aecbe5db..35733dc7af 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -59,6 +59,18 @@ namespace System.Text
}
/// <summary>
+ /// Creates a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
+ /// </summary>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// If <paramref name="highSurrogate"/> does not represent a UTF-16 high surrogate code point
+ /// or <paramref name="lowSurrogate"/> does not represent a UTF-16 low surrogate code point.
+ /// </exception>
+ public Rune(char highSurrogate, char lowSurrogate)
+ : this((uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), false)
+ {
+ }
+
+ /// <summary>
/// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">
@@ -365,6 +377,36 @@ namespace System.Text
}
/// <summary>
+ /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
+ /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
+ /// </summary>
+ public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
+ {
+ // First, extend both to 32 bits, then calculate the offset of
+ // each candidate surrogate char from the start of its range.
+
+ uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+ uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+
+ // This is a single comparison which allows us to check both for validity at once since
+ // both the high surrogate range and the low surrogate range are the same length.
+ // If the comparison fails, we report failure to the caller via the return value instead of throwing.
+
+ if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
+ {
+ // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
+ result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40u << 10));
+ return true;
+ }
+ else
+ {
+ // Didn't have a high surrogate followed by a low surrogate.
+ result = default;
+ return false;
+ }
+ }
+
+ /// <summary>
/// Attempts to create a <see cref="Rune"/> from the provided input value.
/// </summary>
public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result);
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
index e607acd6a1..3aad29679d 100644
--- a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
@@ -169,12 +169,18 @@ namespace System.Text
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsValidUnicodeScalar(uint value)
{
- // By XORing the incoming value with 0xD800, surrogate code points
- // are moved to the range [ U+0000..U+07FF ], and all valid scalar
- // values are clustered into the single range [ U+0800..U+10FFFF ],
- // which allows performing a single fast range check.
+ // This is an optimized check that on x86 is just three instructions: lea, xor, cmp.
+ //
+ // After the subtraction operation, the input value is modified as such:
+ // [ 00000000..0010FFFF ] -> [ FFEF0000..FFFFFFFF ]
+ //
+ // We now want to _exclude_ the range [ FFEFD800..FFEFDFFF ] (surrogates) from being valid.
+ // After the xor, this particular exclusion range becomes [ FFEF0000..FFEF07FF ].
+ //
+ // So now the range [ FFEF0800..FFFFFFFF ] contains all valid code points,
+ // excluding surrogates. This allows us to perform a single comparison.
- return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
+ return ((value - 0x110000u) ^ 0xD800u) >= 0xFFEF0800u;
}
}
}