1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Security;
using System.Text;
namespace System.Text
{
static partial class Normalization
{
/// <summary>
/// Determines whether <paramref name="strInput"/> is already in the requested
/// Unicode normalization form.
/// </summary>
/// <param name="strInput">The string to inspect.</param>
/// <param name="normalizationForm">The normalization form to test against.</param>
/// <returns>
/// <c>true</c> when the globalization interop call reports 1; <c>false</c> for any
/// other non-error result.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="normalizationForm"/> is not a supported form, the input contains an
/// invalid Unicode sequence, or the interop call reports failure (-1).
/// </exception>
public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm)
{
    // Argument checks run first so that bad input never reaches the interop layer.
    ValidateArguments(strInput, normalizationForm);

    int status = Interop.GlobalizationInterop.IsNormalized(normalizationForm, strInput, strInput.Length);

    switch (status)
    {
        case -1:
            // -1 is the interop layer's error value.
            throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));

        case 1:
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Returns a new string holding <paramref name="strInput"/> converted to the requested
/// Unicode normalization form.
/// </summary>
/// <param name="strInput">The string to normalize.</param>
/// <param name="normalizationForm">The normalization form to produce.</param>
/// <returns>The normalized text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="normalizationForm"/> is not a supported form, the input contains an
/// invalid Unicode sequence, the interop call reports failure (-1), or the output still
/// does not fit after the buffer has been resized once.
/// </exception>
public static string Normalize(this string strInput, NormalizationForm normalizationForm)
{
    ValidateArguments(strInput, normalizationForm);

    // First pass uses a buffer the size of the input; if the interop call asks for more
    // room, a second pass is made with exactly the size it requested.
    const int MaxAttempts = 2;
    char[] buffer = new char[strInput.Length];
    int attempt = 0;

    while (attempt < MaxAttempts)
    {
        int requiredLength = Interop.GlobalizationInterop.NormalizeString(normalizationForm, strInput, strInput.Length, buffer, buffer.Length);

        if (requiredLength == -1)
        {
            // Error value from the interop layer; reported with the same exception
            // as running out of attempts.
            break;
        }

        if (requiredLength <= buffer.Length)
        {
            return new string(buffer, 0, requiredLength);
        }

        // Output did not fit: grow to the reported size and go around again.
        buffer = new char[requiredLength];
        attempt++;
    }

    throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
}
// -----------------------------
// ---- PAL layer ends here ----
// -----------------------------
/// <summary>
/// Shared argument validation for the normalization entry points. Checks are performed
/// in this order: null input, unsupported normalization form, malformed Unicode.
/// </summary>
/// <param name="strInput">The string about to be inspected or normalized.</param>
/// <param name="normalizationForm">The normalization form requested by the caller.</param>
/// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="normalizationForm"/> is not FormC, FormD, FormKC or FormKD, or
/// <paramref name="strInput"/> contains an invalid Unicode sequence.
/// </exception>
private static void ValidateArguments(string strInput, NormalizationForm normalizationForm)
{
    if (strInput == null)
    {
        throw new ArgumentNullException(nameof(strInput));
    }

    // Whitelist of accepted forms; every other enum value is rejected.
    switch (normalizationForm)
    {
        case NormalizationForm.FormC:
        case NormalizationForm.FormD:
        case NormalizationForm.FormKC:
        case NormalizationForm.FormKD:
            break;

        default:
            throw new ArgumentException(Environment.GetResourceString("Argument_InvalidNormalizationForm"), nameof(normalizationForm));
    }

    // The string is scanned here so that malformed input is rejected up front.
    bool malformed = HasInvalidUnicodeSequence(strInput);
    if (malformed)
    {
        throw new ArgumentException(Environment.GetResourceString("Argument_InvalidCharSequenceNoIndex"), nameof(strInput));
    }
}
/// <summary>
/// Scans <paramref name="s"/> for UTF-16 content that must be rejected before normalizing.
///
/// ICU does not report an error when asked to normalize a string containing invalid
/// Unicode, whereas Windows does (via the ERROR_NO_UNICODE_TRANSLATION error value).
/// Walking the string here lets callers keep throwing ArgumentException for such input.
/// </summary>
/// <param name="s">The string to scan; must not be null.</param>
/// <returns>
/// <c>true</c> if the string contains U+FFFE, a low surrogate that is not preceded by a
/// high surrogate, or a high surrogate that is not immediately followed by a low
/// surrogate; otherwise <c>false</c>.
/// </returns>
private static bool HasInvalidUnicodeSequence(string s)
{
    int index = 0;

    while (index < s.Length)
    {
        char current = s[index];

        // Everything below the surrogate range is always acceptable.
        if (current >= '\ud800')
        {
            // U+FFFE is rejected outright; a low surrogate reached here has no
            // high surrogate in front of it.
            if (current == '\uFFFE' || char.IsLowSurrogate(current))
            {
                return true;
            }

            if (char.IsHighSurrogate(current))
            {
                int next = index + 1;
                bool hasPartner = next < s.Length && char.IsLowSurrogate(s[next]);

                if (!hasPartner)
                {
                    // High surrogate at the end of the string, or followed by
                    // something other than a low surrogate.
                    return true;
                }

                // Step onto the low surrogate so the increment below moves past the pair.
                index = next;
            }
        }

        index++;
    }

    return false;
}
}
}
|