summaryrefslogtreecommitdiff
path: root/src/corefx
diff options
context:
space:
mode:
authorEric Erhardt <eric.erhardt@microsoft.com>2015-12-07 11:58:09 -0600
committerEric Erhardt <eric.erhardt@microsoft.com>2015-12-09 14:00:38 -0600
commitd966f66753d9dfcaa57689ffcc5c8b3041c75803 (patch)
tree98943eca2805afeaf3405ac6c684855043a6243c /src/corefx
parent8621fe0198f2030380c6fca23b9a613f2ea0b029 (diff)
downloadcoreclr-d966f66753d9dfcaa57689ffcc5c8b3041c75803.tar.gz
coreclr-d966f66753d9dfcaa57689ffcc5c8b3041c75803.tar.bz2
coreclr-d966f66753d9dfcaa57689ffcc5c8b3041c75803.zip
Fixing collation for the following scenarios:
1. When IgnoreSymbols is true, ensure we still ignore halfwidth and fullwidth characters that are symbols. 2. Hiragana and Katakana characters differ at the tertiary strength (not the quaternary strength); fix the rule accordingly. 3. Fix collation on OSX, which uses ICU 55.1. ICU 55 doesn't support certain Unicode characters in primary '<' rules. These characters are not necessary in the rules, since Windows always treats them the same. Remove 0x3099 and 0x309A from the half/full width rules.
Diffstat (limited to 'src/corefx')
-rw-r--r--src/corefx/System.Globalization.Native/collation.cpp85
1 files changed, 56 insertions, 29 deletions
diff --git a/src/corefx/System.Globalization.Native/collation.cpp b/src/corefx/System.Globalization.Native/collation.cpp
index a4924a03fb..228dbfc79b 100644
--- a/src/corefx/System.Globalization.Native/collation.cpp
+++ b/src/corefx/System.Globalization.Native/collation.cpp
@@ -65,13 +65,14 @@ const UChar g_HalfFullLowerChars[] = {
// fullwidth characters
0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3,
- 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb,
- 0x30bd, 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8,
- 0x30db, 0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef,
- 0x30f3, 0x3099, 0x309a, 0x3164, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139, 0x313a, 0x313b,
- 0x313c, 0x313d, 0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3143, 0x3144, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a,
- 0x314b, 0x314c, 0x314d, 0x314e, 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154, 0x3155, 0x3156, 0x3157, 0x3158, 0x3159,
- 0x315a, 0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160, 0x3161, 0x3162, 0x3163
+ 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+ 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db,
+ 0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3,
+ 0x3164, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d, 0x313e,
+ 0x313f, 0x3140, 0x3141, 0x3142, 0x3143, 0x3144, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
+ 0x314e, 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154, 0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a, 0x315b, 0x315c,
+ 0x315d, 0x315e, 0x315f, 0x3160, 0x3161, 0x3162, 0x3163
+
};
const UChar g_HalfFullHigherChars[] = {
// fullwidth characters
@@ -85,13 +86,13 @@ const UChar g_HalfFullHigherChars[] = {
// halfwidth characters
0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
- 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e,
- 0xff7f, 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d,
- 0xff8e, 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c,
- 0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab,
- 0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba,
- 0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce,
- 0xffcf, 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffda, 0xffdb, 0xffdc
+ 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
+ 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e,
+ 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d,
+ 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae,
+ 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba, 0xffbb, 0xffbc, 0xffbd,
+ 0xffbe, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd2, 0xffd3,
+ 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffda, 0xffdb, 0xffdc
};
const int32_t g_HalfFullCharsLength = (sizeof(g_HalfFullHigherChars) / sizeof(UChar));
@@ -110,19 +111,35 @@ bool NeedsEscape(UChar character)
}
/*
+Gets a value indicating whether the HalfFullHigher character is considered a symbol character.
+
+The ranges specified here only cover characters in the g_HalfFullHigherChars list, and need
+to be combined with NeedsEscape above (applied to the g_HalfFullLowerChars) to cover all the IgnoreSymbols characters.
+This is done so we can use range checks instead of comparing individual characters.
+
+These ranges were obtained by running the above characters through .NET CompareInfo.Compare
+with CompareOptions.IgnoreSymbols on Windows.
+*/
+bool IsHalfFullHigherSymbol(UChar character)
+{
+ return (0xffe0 <= character && character <= 0xffe6)
+ || (0xff61 <= character && character <= 0xff65);
+}
+
+/*
Gets a string of custom collation rules, if necessary.
Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
*/
-std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength)
+std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength, bool isIgnoreSymbols)
{
bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;
- // kana differs at the quaternary level
- bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_QUATERNARY;
- bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_QUATERNARY;
+ // kana differs at the tertiary level
+ bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY;
+ bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY;
// character width differs at the tertiary level
bool needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY;
@@ -157,22 +174,32 @@ std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength)
if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
{
UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<';
-
+
UChar lowerChar;
+ UChar higherChar;
+ bool needsEscape;
for (int i = 0; i < g_HalfFullCharsLength; i++)
{
- customRules.push_back('&');
-
- // the lower chars need to be checked for escaping since they contain ASCII punctuation
lowerChar = g_HalfFullLowerChars[i];
- if (NeedsEscape(lowerChar))
+ higherChar = g_HalfFullHigherChars[i];
+ // the lower chars need to be checked for escaping since they contain ASCII punctuation
+ needsEscape = NeedsEscape(lowerChar);
+
+ // when isIgnoreSymbols is true and we are not ignoring width, check to see if
+ // this character is a symbol, and if so skip it
+ if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar))))
{
- customRules.push_back('\\');
- }
- customRules.push_back(lowerChar);
+ customRules.push_back('&');
- customRules.push_back(compareChar);
- customRules.push_back(g_HalfFullHigherChars[i]);
+ if (needsEscape)
+ {
+ customRules.push_back('\\');
+ }
+ customRules.push_back(lowerChar);
+
+ customRules.push_back(compareChar);
+ customRules.push_back(higherChar);
+ }
}
}
}
@@ -205,7 +232,7 @@ UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options,
}
UCollator* pClonedCollator;
- std::vector<UChar> customRules = GetCustomRules(options, strength);
+ std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);
if (customRules.empty())
{
pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);