summaryrefslogtreecommitdiff
path: root/src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs
diff options
context:
space:
mode:
Diffstat (limited to 'src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs')
-rw-r--r--src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs339
1 files changed, 339 insertions, 0 deletions
diff --git a/src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs b/src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs
new file mode 100644
index 0000000000..4cb95fb8f1
--- /dev/null
+++ b/src/mscorlib/corefx/System/Globalization/CharUnicodeInfo.cs
@@ -0,0 +1,339 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+////////////////////////////////////////////////////////////////////////////
+//
+//
+// Purpose: This class implements a set of methods for retrieving
+// character type information. Character type information is
+// independent of culture and region.
+//
+//
+////////////////////////////////////////////////////////////////////////////
+
+using System;
+using System.Threading;
+using System.Runtime.InteropServices;
+using System.Runtime.CompilerServices;
+using System.Reflection;
+using System.Security;
+using System.Diagnostics.Contracts;
+
+namespace System.Globalization
+{
+ public static partial class CharUnicodeInfo
+ {
+ //--------------------------------------------------------------------//
+ // Internal Information //
+ //--------------------------------------------------------------------//
+
+ //
+ // Native methods to access the Unicode category data tables in charinfo.nlp.
+ //
+ internal const char HIGH_SURROGATE_START = '\ud800';
+ internal const char HIGH_SURROGATE_END = '\udbff';
+ internal const char LOW_SURROGATE_START = '\udc00';
+ internal const char LOW_SURROGATE_END = '\udfff';
+
+ internal const int UNICODE_CATEGORY_OFFSET = 0;
+ internal const int BIDI_CATEGORY_OFFSET = 1;
+
+
+
+ // The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
+ internal const int UNICODE_PLANE01_START = 0x10000;
+
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ // Actions:
+ // Convert the BMP character or surrogate pointed by index to a UTF32 value.
+ // This is similar to Char.ConvertToUTF32, but the difference is that
+ // it does not throw exceptions when invalid surrogate characters are passed in.
+ //
+ // WARNING: since it doesn't throw an exception it CAN return a value
+ // in the surrogate range D800-DFFF, which are not legal unicode values.
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal static int InternalConvertToUtf32(String s, int index)
+ {
+ Contract.Assert(s != null, "s != null");
+ Contract.Assert(index >= 0 && index < s.Length, "index < s.Length");
+ if (index < s.Length - 1)
+ {
+ int temp1 = (int)s[index] - HIGH_SURROGATE_START;
+ if (temp1 >= 0 && temp1 <= 0x3ff)
+ {
+ int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
+ if (temp2 >= 0 && temp2 <= 0x3ff)
+ {
+ // Convert the surrogate to UTF32 and get the result.
+ return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
+ }
+ }
+ }
+ return ((int)s[index]);
+ }
+ ////////////////////////////////////////////////////////////////////////
+ //
+ // Convert a character or a surrogate pair starting at index of string s
+ // to UTF32 value.
+ //
+ // Parameters:
+ // s The string
+ // index The starting index. It can point to a BMP character or
+ // a surrogate pair.
+ // len The length of the string.
+ // charLength [out] If the index points to a BMP char, charLength
+ // will be 1. If the index points to a surrogate pair,
+ // charLength will be 2.
+ //
+ // WARNING: since it doesn't throw an exception it CAN return a value
+ // in the surrogate range D800-DFFF, which are not legal unicode values.
+ //
+ // Returns:
+ // The UTF32 value
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal static int InternalConvertToUtf32(String s, int index, out int charLength)
+ {
+ Contract.Assert(s != null, "s != null");
+ Contract.Assert(s.Length > 0, "s.Length > 0");
+ Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
+ charLength = 1;
+ if (index < s.Length - 1)
+ {
+ int temp1 = (int)s[index] - HIGH_SURROGATE_START;
+ if (temp1 >= 0 && temp1 <= 0x3ff)
+ {
+ int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
+ if (temp2 >= 0 && temp2 <= 0x3ff)
+ {
+ // Convert the surrogate to UTF32 and get the result.
+ charLength++;
+ return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
+ }
+ }
+ }
+ return ((int)s[index]);
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ // IsWhiteSpace
+ //
+ // Determines if the given character is a white space character.
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal static bool IsWhiteSpace(String s, int index)
+ {
+ Contract.Assert(s != null, "s!=null");
+ Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
+
+ UnicodeCategory uc = GetUnicodeCategory(s, index);
+ // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
+ // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
+ switch (uc)
+ {
+ case (UnicodeCategory.SpaceSeparator):
+ case (UnicodeCategory.LineSeparator):
+ case (UnicodeCategory.ParagraphSeparator):
+ return (true);
+ }
+ return (false);
+ }
+
+
+ internal static bool IsWhiteSpace(char c)
+ {
+ UnicodeCategory uc = GetUnicodeCategory(c);
+ // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
+ // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
+ switch (uc)
+ {
+ case (UnicodeCategory.SpaceSeparator):
+ case (UnicodeCategory.LineSeparator):
+ case (UnicodeCategory.ParagraphSeparator):
+ return (true);
+ }
+
+ return (false);
+ }
+
+
+ //
+ // This is called by the public char and string, index versions
+ //
+ // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
+ //
+ internal unsafe static double InternalGetNumericValue(int ch)
+ {
+ Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
+ // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
+ ushort index = s_pNumericLevel1Index[ch >> 8];
+ // Get the level 2 WORD offset from the 4 - 7 bit of ch. This provides the base offset of the level 3 table.
+ // The offset is referred to an float item in m_pNumericFloatData.
+ // Note that & has the lower precedence than addition, so don't forget the parathesis.
+ index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
+
+ fixed (ushort* pUshortPtr = &(s_pNumericLevel1Index[index]))
+ {
+ byte* pBytePtr = (byte*)pUshortPtr;
+ fixed (byte* pByteNum = s_pNumericValues)
+ {
+ double* pDouble = (double*)pByteNum;
+ return pDouble[pBytePtr[(ch & 0x000f)]];
+ }
+ }
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ //Returns the numeric value associated with the character c. If the character is a fraction,
+ // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
+ //
+ //Returns:
+ // the numeric value for the specified Unicode character. If the character does not have a numeric value, the return value is -1.
+ //Arguments:
+ // ch a Unicode character
+ //Exceptions:
+ // ArgumentNullException
+ // ArgumentOutOfRangeException
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+
+ public static double GetNumericValue(char ch)
+ {
+ return (InternalGetNumericValue(ch));
+ }
+
+
+ public static double GetNumericValue(String s, int index)
+ {
+ if (s == null)
+ {
+ throw new ArgumentNullException("s");
+ }
+ if (index < 0 || index >= s.Length)
+ {
+ throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_Index);
+ }
+ Contract.EndContractBlock();
+ return (InternalGetNumericValue(InternalConvertToUtf32(s, index)));
+ }
+
+ public static UnicodeCategory GetUnicodeCategory(char ch)
+ {
+ return (InternalGetUnicodeCategory(ch));
+ }
+
+ public static UnicodeCategory GetUnicodeCategory(String s, int index)
+ {
+ if (s == null)
+ throw new ArgumentNullException("s");
+ if (((uint)index) >= ((uint)s.Length))
+ {
+ throw new ArgumentOutOfRangeException("index");
+ }
+ Contract.EndContractBlock();
+ return InternalGetUnicodeCategory(s, index);
+ }
+
+ internal unsafe static UnicodeCategory InternalGetUnicodeCategory(int ch)
+ {
+ return ((UnicodeCategory)InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET));
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ //Action: Returns the Unicode Category property for the character c.
+ //Returns:
+ // an value in UnicodeCategory enum
+ //Arguments:
+ // ch a Unicode character
+ //Exceptions:
+ // None
+ //
+ //Note that this API will return values for D800-DF00 surrogate halves.
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal unsafe static byte InternalGetCategoryValue(int ch, int offset)
+ {
+ Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
+ // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
+ ushort index = s_pCategoryLevel1Index[ch >> 8];
+ // Get the level 2 WORD offset from the 4 - 7 bit of ch. This provides the base offset of the level 3 table.
+ // Note that & has the lower precedence than addition, so don't forget the parathesis.
+ index = s_pCategoryLevel1Index[index + ((ch >> 4) & 0x000f)];
+
+ fixed (ushort* pUshortPtr = &(s_pCategoryLevel1Index[index]))
+ {
+ byte* pBytePtr = (byte*)pUshortPtr;
+ // Get the result from the 0 -3 bit of ch.
+ byte valueIndex = pBytePtr[(ch & 0x000f)];
+ byte uc = s_pCategoriesValue[valueIndex * 2 + offset];
+ //
+ // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
+ // If that changes, change the following assertion as well.
+ //
+ //Contract.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
+ return (uc);
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ //Action: Returns the Unicode Category property for the character c.
+ //Returns:
+ // an value in UnicodeCategory enum
+ //Arguments:
+ // value a Unicode String
+ // index Index for the specified string.
+ //Exceptions:
+ // None
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index)
+ {
+ Contract.Assert(value != null, "value can not be null");
+ Contract.Assert(index < value.Length, "index < value.Length");
+
+ return (InternalGetUnicodeCategory(InternalConvertToUtf32(value, index)));
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ //
+ // Get the Unicode category of the character starting at index. If the character is in BMP, charLength will return 1.
+ // If the character is a valid surrogate pair, charLength will return 2.
+ //
+ ////////////////////////////////////////////////////////////////////////
+
+ internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength)
+ {
+ Contract.Assert(str != null, "str can not be null");
+ Contract.Assert(str.Length > 0, "str.Length > 0"); ;
+ Contract.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");
+
+ return (InternalGetUnicodeCategory(InternalConvertToUtf32(str, index, out charLength)));
+ }
+
+ internal static bool IsCombiningCategory(UnicodeCategory uc)
+ {
+ Contract.Assert(uc >= 0, "uc >= 0");
+ return (
+ uc == UnicodeCategory.NonSpacingMark ||
+ uc == UnicodeCategory.SpacingCombiningMark ||
+ uc == UnicodeCategory.EnclosingMark
+ );
+ }
+ }
+}