summaryrefslogtreecommitdiff
path: root/src/corefx
diff options
context:
space:
mode:
authorstephentoub <stoub@microsoft.com>2015-10-20 15:34:38 -0400
committerstephentoub <stoub@microsoft.com>2015-10-22 14:31:58 -0400
commit80800eb0ead17dc724275d426904755b3bc64139 (patch)
tree319fdadf2945c08ff72f3bd0b43955abb64e8455 /src/corefx
parent3ddea17fc4e5b0c9ff2cb0651be82594a0eddffb (diff)
downloadcoreclr-80800eb0ead17dc724275d426904755b3bc64139.tar.gz
coreclr-80800eb0ead17dc724275d426904755b3bc64139.tar.bz2
coreclr-80800eb0ead17dc724275d426904755b3bc64139.zip
Improve string.{Last}IndexOf perf on Unix for Ordinal/OrdinalIgnoreCase
Our current implementation of IndexOfOrdinal for strings on Unix uses Substring to get the piece of the source string we care about; this results in an unnecessary allocation / string copy. When using OrdinalIgnoreCase, we also convert both the source and search strings to upper-case using ToUpperInvariant, resulting in more allocations. And our LastIndexOfOrdinal implementation delegates to IndexOfOrdinal repeatedly, incurring such allocations potentially multiple times. This change reimplements Ordinal searching in managed code to not use Substring, and it implements OrdinalIgnoreCase searching via new functions exposed in the native globalization shim, so as to use ICU without having to make managed/native transitions for each character. With the changes, {Last}IndexOf with Ordinal/OrdinalIgnoreCase are now allocation-free (as you'd expect), and throughput when startIndex/count and/or OrdinalIgnoreCase are used is increased significantly, on my machine anywhere from 20% to 3x, depending on the inputs.
Diffstat (limited to 'src/corefx')
-rw-r--r--src/corefx/System.Globalization.Native/collation.cpp78
1 files changed, 78 insertions, 0 deletions
diff --git a/src/corefx/System.Globalization.Native/collation.cpp b/src/corefx/System.Globalization.Native/collation.cpp
index 7cf32b9419..d82d8d78a8 100644
--- a/src/corefx/System.Globalization.Native/collation.cpp
+++ b/src/corefx/System.Globalization.Native/collation.cpp
@@ -126,6 +126,84 @@ extern "C" int32_t LastIndexOf(
}
/*
+Static Function:
+AreEqualOrdinalIgnoreCase - reports whether two UChar values are equal, either exactly or after upper-casing both with u_toupper (U+0131 only ever equals itself).
+*/
+static bool AreEqualOrdinalIgnoreCase(UChar one, UChar two)
+{
+ // Return whether the two characters are identical or would be identical if they were upper-cased.
+
+ if (one == two) // exact match: no case mapping needed
+ {
+ return true;
+ }
+
+ if (one == 0x0131 || two == 0x0131) // reached only when one != two, so U+0131 never equals a different character
+ {
+ // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
+ // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
+ // We special case it to match the Windows invariant behavior.
+ return false;
+ }
+
+ return u_toupper(one) == u_toupper(two); // otherwise equal iff both upper-case to the same value
+}
+
+/*
+Function:
+IndexOfOrdinalIgnoreCase - returns the lowest index in lpSource at which all cwTargetLength elements of lpTarget match under AreEqualOrdinalIgnoreCase, or -1 if there is no such index.
+*/
+extern "C" int32_t
+IndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength)
+{
+ int32_t endIndex = cwSourceLength - cwTargetLength; // last start index at which the whole target still fits in the source
+ assert(endIndex >= 0); // precondition: the target must not be longer than the source
+
+ for (int32_t i = 0; i <= endIndex; i++) // candidate start positions, lowest first
+ {
+ int32_t targetIdx = 0; // number of target elements matched so far at this start
+ for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) {
+ if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) {
+ break; // mismatch: give up on this start position
+ }
+ }
+
+ if (targetIdx == cwTargetLength) { // inner loop finished without a mismatch (also true for an empty target)
+ return i;
+ }
+ }
+
+ return -1; // no start position matched
+}
+
+/*
+Function:
+LastIndexOfOrdinalIgnoreCase - returns the highest index in lpSource at which all cwTargetLength elements of lpTarget match under AreEqualOrdinalIgnoreCase, or -1 if there is no such index.
+*/
+extern "C" int32_t
+LastIndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength)
+{
+ int32_t endIndex = cwSourceLength - cwTargetLength; // last start index at which the whole target still fits in the source
+ assert(endIndex >= 0); // precondition: the target must not be longer than the source
+
+ for (int32_t i = endIndex; i >= 0; i--) // candidate start positions, highest first
+ {
+ int32_t targetIdx = 0; // number of target elements matched so far at this start
+ for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) {
+ if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) {
+ break; // mismatch: give up on this start position
+ }
+ }
+
+ if (targetIdx == cwTargetLength) { // inner loop finished without a mismatch (also true for an empty target)
+ return i;
+ }
+ }
+
+ return -1; // no start position matched
+}
+
+/*
Return value is a "Win32 BOOL" (1 = true, 0 = false)
*/
extern "C" int32_t StartsWith(