src/mscorlib/src/System/Text/Normalization.Unix.cs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Security;
using System.Text;
using System.Globalization;

namespace System.Text
{
    static partial class Normalization
    {
        public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm)
        {
            ValidateArguments(strInput, normalizationForm);

            if (GlobalizationMode.Invariant)
            {
                // In Invariant mode we assume all characters are normalized. 
                // This is because we don't support any linguistic operation on the strings
                return true;
            }

            int ret = Interop.GlobalizationInterop.IsNormalized(normalizationForm, strInput, strInput.Length);

            if (ret == -1)
            {
                throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
            }

            return ret == 1;
        }

        public static string Normalize(this string strInput, NormalizationForm normalizationForm)
        {
            ValidateArguments(strInput, normalizationForm);

            if (GlobalizationMode.Invariant)
            {
                // In Invariant mode we assume all characters are normalized. 
                // This is because we don't support any linguistic operation on the strings
                return strInput;
            }

            char[] buf = new char[strInput.Length];

            for (int attempts = 2; attempts > 0; attempts--)
            {
                int realLen = Interop.GlobalizationInterop.NormalizeString(normalizationForm, strInput, strInput.Length, buf, buf.Length);

                if (realLen == -1)
                {
                    throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
                }

                if (realLen <= buf.Length)
                {
                    return new string(buf, 0, realLen);
                }

                buf = new char[realLen];
            }

            throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
        }

        // -----------------------------
        // ---- PAL layer ends here ----
        // -----------------------------

        private static void ValidateArguments(string strInput, NormalizationForm normalizationForm)
        {
            if (strInput == null)
            {
                throw new ArgumentNullException(nameof(strInput));
            }

            if (normalizationForm != NormalizationForm.FormC && normalizationForm != NormalizationForm.FormD &&
                normalizationForm != NormalizationForm.FormKC && normalizationForm != NormalizationForm.FormKD)
            {
                throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
            }

            if (HasInvalidUnicodeSequence(strInput))
            {
                throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
            }
        }

        /// <summary>
        /// ICU does not signal an error during normalization if the input string has invalid unicode,
        /// unlike Windows (which uses the ERROR_NO_UNICODE_TRANSLATION error value to signal an error).
        ///
        /// We walk the string ourselves looking for these bad sequences so we can continue to throw
        /// ArgumentException in these cases.
        /// </summary>
        private static bool HasInvalidUnicodeSequence(string s)
        {
            for (int i = 0; i < s.Length; i++)
            {
                char c = s[i];
                
                if (c < '\ud800')
                {
                    continue;
                }

                if (c == '\uFFFE')
                {
                    return true;
                }

                // If we see low surrogate before a high one, the string is invalid.
                if (char.IsLowSurrogate(c))
                {
                    return true;
                }

                if (char.IsHighSurrogate(c))
                {
                    if (i + 1 >= s.Length || !char.IsLowSurrogate(s[i + 1]))
                    {
                        // A high surrogate at the end of the string or a high surrogate
                        // not followed by a low surrogate
                        return true;
                    }
                    else
                    {
                        i++; // consume the low surrogate.
                        continue;
                    }
                }
            }

            return false;
        }
    }
}