diff options
Diffstat (limited to 'src/mscorlib/src/System/Text/Normalization.Unix.cs')
-rw-r--r-- | src/mscorlib/src/System/Text/Normalization.Unix.cs | 123 |
1 files changed, 123 insertions, 0 deletions
diff --git a/src/mscorlib/src/System/Text/Normalization.Unix.cs b/src/mscorlib/src/System/Text/Normalization.Unix.cs new file mode 100644 index 0000000000..d49bdc6c21 --- /dev/null +++ b/src/mscorlib/src/System/Text/Normalization.Unix.cs @@ -0,0 +1,123 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Security; +using System.Text; + +namespace System.Text +{ + static partial class Normalization + { + public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm) + { + ValidateArguments(strInput, normalizationForm); + + int ret = Interop.GlobalizationInterop.IsNormalized(normalizationForm, strInput, strInput.Length); + + if (ret == -1) + { + throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput)); + } + + return ret == 1; + } + + public static string Normalize(this string strInput, NormalizationForm normalizationForm) + { + ValidateArguments(strInput, normalizationForm); + + char[] buf = new char[strInput.Length]; + + for (int attempts = 2; attempts > 0; attempts--) + { + int realLen = Interop.GlobalizationInterop.NormalizeString(normalizationForm, strInput, strInput.Length, buf, buf.Length); + + if (realLen == -1) + { + throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput)); + } + + if (realLen <= buf.Length) + { + return new string(buf, 0, realLen); + } + + buf = new char[realLen]; + } + + throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput)); + } + + // ----------------------------- + // ---- PAL layer ends here ---- + // ----------------------------- + + private static void ValidateArguments(string strInput, NormalizationForm normalizationForm) + { + if (strInput == null) + { + throw new ArgumentNullException(nameof(strInput)); + } + + if (normalizationForm != NormalizationForm.FormC && normalizationForm != NormalizationForm.FormD && + normalizationForm != NormalizationForm.FormKC && normalizationForm != NormalizationForm.FormKD) + { + throw new ArgumentException(Environment.GetResourceString("Argument_InvalidNormalizationForm"), nameof(normalizationForm)); + } + + if (HasInvalidUnicodeSequence(strInput)) + { + throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput)); + } + } + + /// <summary> + /// ICU does not signal an error during normalization if the input string has invalid unicode, + /// unlike Windows (which uses the ERROR_NO_UNICODE_TRANSLATION error value to signal an error). + /// + /// We walk the string ourselves looking for these bad sequences so we can continue to throw + /// ArgumentException in these cases. + /// </summary> + private static bool HasInvalidUnicodeSequence(string s) + { + for (int i = 0; i < s.Length; i++) + { + char c = s[i]; + + if (c < '\ud800') + { + continue; + } + + if (c == '\uFFFE') + { + return true; + } + + // If we see low surrogate before a high one, the string is invalid. + if (char.IsLowSurrogate(c)) + { + return true; + } + + if (char.IsHighSurrogate(c)) + { + if (i + 1 >= s.Length || !char.IsLowSurrogate(s[i + 1])) + { + // A high surrogate at the end of the string or a high surrogate + // not followed by a low surrogate + return true; + } + else + { + i++; // consume the low surrogate. + continue; + } + } + } + + return false; + } + } +} |