summary | refs | log | tree | commit | diff
path: root/src/mscorlib/src/System/Text/Normalization.Unix.cs
diff options
context:
space:
mode:
Diffstat (limited to 'src/mscorlib/src/System/Text/Normalization.Unix.cs')
-rw-r--r-- src/mscorlib/src/System/Text/Normalization.Unix.cs 123
1 files changed, 123 insertions, 0 deletions
diff --git a/src/mscorlib/src/System/Text/Normalization.Unix.cs b/src/mscorlib/src/System/Text/Normalization.Unix.cs
new file mode 100644
index 0000000000..d49bdc6c21
--- /dev/null
+++ b/src/mscorlib/src/System/Text/Normalization.Unix.cs
@@ -0,0 +1,123 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Security;
+using System.Text;
+
+namespace System.Text
+{
+ /// <summary>
+ /// Part of the partial <c>Normalization</c> class that implements string normalization
+ /// on top of the <c>Interop.GlobalizationInterop</c> entry points.
+ /// </summary>
+ static partial class Normalization
+ {
+     /// <summary>
+     /// Reports whether <paramref name="strInput"/> is already in the requested normalization form.
+     /// </summary>
+     /// <param name="strInput">The string to examine; must not be null.</param>
+     /// <param name="normalizationForm">One of FormC, FormD, FormKC or FormKD.</param>
+     /// <returns>true when the interop call answers 1; false for any other non-error answer.</returns>
+     /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
+     /// <exception cref="ArgumentException">
+     /// The form is not one of the four accepted values, the input fails the surrogate/U+FFFE
+     /// validation, or the interop call answers -1.
+     /// </exception>
+     public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm)
+     {
+         ValidateArguments(strInput, normalizationForm);
+
+         int status = Interop.GlobalizationInterop.IsNormalized(normalizationForm, strInput, strInput.Length);
+
+         if (status != -1)
+         {
+             return status == 1;
+         }
+
+         // -1 is the interop layer's failure answer; surface it as an invalid-sequence error.
+         throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
+     }
+
+     /// <summary>
+     /// Produces a copy of <paramref name="strInput"/> converted to the requested normalization form.
+     /// </summary>
+     /// <param name="strInput">The string to normalize; must not be null.</param>
+     /// <param name="normalizationForm">One of FormC, FormD, FormKC or FormKD.</param>
+     /// <returns>A new string built from the characters written by the interop call.</returns>
+     /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
+     /// <exception cref="ArgumentException">
+     /// The form is not one of the four accepted values, the input fails the surrogate/U+FFFE
+     /// validation, the interop call answers -1, or the output still does not fit after the
+     /// second attempt.
+     /// </exception>
+     public static string Normalize(this string strInput, NormalizationForm normalizationForm)
+     {
+         ValidateArguments(strInput, normalizationForm);
+
+         // First guess: the output is no longer than the input.
+         char[] buffer = new char[strInput.Length];
+         int remainingAttempts = 2;
+
+         while (remainingAttempts > 0)
+         {
+             int requiredLength = Interop.GlobalizationInterop.NormalizeString(normalizationForm, strInput, strInput.Length, buffer, buffer.Length);
+
+             if (requiredLength == -1)
+             {
+                 throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
+             }
+
+             if (requiredLength > buffer.Length)
+             {
+                 // The answer is larger than the buffer we supplied: treat it as the size
+                 // to allocate and go around again.
+                 buffer = new char[requiredLength];
+                 remainingAttempts--;
+                 continue;
+             }
+
+             return new string(buffer, 0, requiredLength);
+         }
+
+         // Both attempts ended with a buffer that was too small.
+         throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
+     }
+
+     // -----------------------------
+     // ---- PAL layer ends here ----
+     // -----------------------------
+
+     /// <summary>
+     /// Shared argument checks for <c>IsNormalized</c> and <c>Normalize</c>. The checks run in
+     /// this order: null input, unsupported form, invalid UTF-16 sequence.
+     /// </summary>
+     /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
+     /// <exception cref="ArgumentException">
+     /// <paramref name="normalizationForm"/> is not FormC, FormD, FormKC or FormKD, or
+     /// <paramref name="strInput"/> contains a sequence rejected by HasInvalidUnicodeSequence.
+     /// </exception>
+     private static void ValidateArguments(string strInput, NormalizationForm normalizationForm)
+     {
+         if (strInput == null)
+         {
+             throw new ArgumentNullException(nameof(strInput));
+         }
+
+         bool isSupportedForm =
+             normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD ||
+             normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD;
+
+         if (!isSupportedForm)
+         {
+             throw new ArgumentException(Environment.GetResourceString("Argument_InvalidNormalizationForm"), nameof(normalizationForm));
+         }
+
+         if (HasInvalidUnicodeSequence(strInput))
+         {
+             throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
+         }
+     }
+
+     /// <summary>
+     /// Scans <paramref name="s"/> for UTF-16 sequences that must be rejected before normalizing.
+     ///
+     /// ICU does not report an error when asked to normalize a string containing invalid unicode,
+     /// whereas Windows does (via the ERROR_NO_UNICODE_TRANSLATION error value). To keep throwing
+     /// ArgumentException for such input, the string is inspected here instead.
+     /// </summary>
+     /// <returns>
+     /// true if the string contains U+FFFE, a low surrogate that is not preceded by a high
+     /// surrogate, or a high surrogate that is not immediately followed by a low surrogate;
+     /// otherwise false.
+     /// </returns>
+     private static bool HasInvalidUnicodeSequence(string s)
+     {
+         int index = 0;
+
+         while (index < s.Length)
+         {
+             char current = s[index];
+
+             // Everything below the surrogate range needs no inspection.
+             if (current >= '\ud800')
+             {
+                 // U+FFFE, or a low surrogate with no high surrogate in front of it.
+                 if (current == '\uFFFE' || char.IsLowSurrogate(current))
+                 {
+                     return true;
+                 }
+
+                 if (char.IsHighSurrogate(current))
+                 {
+                     bool followedByLowSurrogate = index + 1 < s.Length && char.IsLowSurrogate(s[index + 1]);
+
+                     if (!followedByLowSurrogate)
+                     {
+                         // Unpaired high surrogate: either last in the string or followed
+                         // by something other than a low surrogate.
+                         return true;
+                     }
+
+                     index++; // step over the low surrogate that completes the pair.
+                 }
+             }
+
+             index++;
+         }
+
+         return false;
+     }
+ }
+}