diff options
author | Levi Broderick <GrabYourPitchforks@users.noreply.github.com> | 2019-03-18 22:58:32 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-03-18 22:58:32 -0700 |
commit | 1f3f474a13bdde1c5fecdf8cd9ce525dbe5df000 (patch) | |
tree | 6dfa953c84f1b4d1a7af414c89bb69a025902b1b | |
parent | 31581af5fa816fb2ea94145823ec3bdd6c0b0327 (diff) | |
download | coreclr-1f3f474a13bdde1c5fecdf8cd9ce525dbe5df000.tar.gz coreclr-1f3f474a13bdde1c5fecdf8cd9ce525dbe5df000.tar.bz2 coreclr-1f3f474a13bdde1c5fecdf8cd9ce525dbe5df000.zip |
Add Utf8String skeleton (#23209)
Utf8String is an experimental type that is string-like (heap-allocated, immutable, variable-length, null-terminated) but whose inner representation is UTF-8, not UTF-16.
This is a skeleton implementation of the basic API shape. The ecosystem of APIs has not yet been built around it. All Utf8String-related code is currently surrounded by ifdefs to allow easy identification and removal from release branches.
43 files changed, 1797 insertions, 27 deletions
diff --git a/clr.defines.targets b/clr.defines.targets index 3fa0417f51..e2f10586f1 100644 --- a/clr.defines.targets +++ b/clr.defines.targets @@ -1,6 +1,7 @@ <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <!-- Features we're currently flighting, but don't intend to ship in officially supported releases --> <PropertyGroup Condition="'$(IsPrerelease)' == 'true'"> + <FeatureUtf8String>true</FeatureUtf8String> <!-- FeatureXXX>true</FeatureXXX --> </PropertyGroup> diff --git a/clrdefinitions.cmake b/clrdefinitions.cmake index 9e22da2033..a25d19d130 100644 --- a/clrdefinitions.cmake +++ b/clrdefinitions.cmake @@ -6,6 +6,7 @@ set(PRERELEASE 1) # Features we're currently flighting, but don't intend to ship in officially supported releases if (PRERELEASE) + add_definitions(-DFEATURE_UTF8STRING=1) # add_definitions(-DFEATURE_XXX=1) endif (PRERELEASE) diff --git a/src/System.Private.CoreLib/System.Private.CoreLib.csproj b/src/System.Private.CoreLib/System.Private.CoreLib.csproj index 6e73aeffa5..0cf6733ad9 100644 --- a/src/System.Private.CoreLib/System.Private.CoreLib.csproj +++ b/src/System.Private.CoreLib/System.Private.CoreLib.csproj @@ -112,6 +112,10 @@ <!-- CLR Features --> <Import Project="$(MSBuildThisFileDirectory)..\..\clr.coreclr.props" /> <Import Project="$(MSBuildThisFileDirectory)..\..\clr.defines.targets" /> + <!-- Experimental features --> + <PropertyGroup Condition="'$(FeatureUtf8String)' == 'true'"> + <DefineConstants>$(DefineConstants);FEATURE_UTF8STRING</DefineConstants> + </PropertyGroup> <!-- Sources --> <ItemGroup> <Compile Include="$(BclSourcesRoot)\Internal\Console.cs" /> @@ -274,6 +278,14 @@ <Compile Include="shared\Interop\Windows\Ole32\Interop.CoTaskMemAlloc.cs" /> <Compile Include="shared\Interop\Windows\OleAut32\Interop.SysAllocStringByteLen.cs" /> </ItemGroup> + <ItemGroup Condition="'$(FeatureUtf8String)' == 'true'"> + <Compile Include="$(BclSourcesRoot)\System\Char8.cs" /> + <Compile 
Include="$(BclSourcesRoot)\System\Utf8Extensions.cs" /> + <Compile Include="$(BclSourcesRoot)\System\Utf8String.cs" /> + <Compile Include="$(BclSourcesRoot)\System\Utf8String.Construction.cs" /> + <Compile Include="$(BclSourcesRoot)\System\Utf8String.Manipulation.cs" /> + <Compile Include="$(BclSourcesRoot)\System\Utf8String.Searching.cs" /> + </ItemGroup> <ItemGroup> <Compile Include="$(BclSourcesRoot)\System\Diagnostics\Eventing\XplatEventLogger.cs" Condition="'$(FeatureXplatEventSource)' == 'true'" /> <Compile Include="$(IntermediateOutputPath)..\Eventing\NativeRuntimeEventSource.cs" Condition="'$(FeaturePerfTracing)' == 'true'"/> diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index 85ba8b8cd4..b1c9da0bfa 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -801,6 +801,7 @@ <Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" /> + <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" /> <Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" /> diff --git a/src/System.Private.CoreLib/shared/System/Memory.cs b/src/System.Private.CoreLib/shared/System/Memory.cs index ba31a6aeae..2074404630 100644 --- a/src/System.Private.CoreLib/shared/System/Memory.cs +++ b/src/System.Private.CoreLib/shared/System/Memory.cs @@ -6,6 +6,7 @@ using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using 
System.Text; using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute; using EditorBrowsableState = System.ComponentModel.EditorBrowsableState; @@ -164,7 +165,13 @@ namespace System // No validation performed in release builds; caller must provide any necessary validation. // 'obj is T[]' below also handles things like int[] <-> uint[] being convertible - Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>)); + Debug.Assert((obj == null) + || (typeof(T) == typeof(char) && obj is string) +#if FEATURE_UTF8STRING + || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String) +#endif // FEATURE_UTF8STRING + || (obj is T[]) + || (obj is MemoryManager<T>)); _object = obj; _index = start; @@ -212,6 +219,14 @@ namespace System { return (_object is string str) ? str.Substring(_index, _length) : Span.ToString(); } +#if FEATURE_UTF8STRING + else if (typeof(T) == typeof(Char8)) + { + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. 
+ Span<T> span = Span; + return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length)); + } +#endif // FEATURE_UTF8STRING return string.Format("System.Memory<{0}>[{1}]", typeof(T).Name, _length); } @@ -317,6 +332,13 @@ namespace System refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData()); lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length; } +#if FEATURE_UTF8STRING + else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String)) + { + refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference()); + lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length; + } +#endif // FEATURE_UTF8STRING else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject)) { // We know the object is not null, it's not a string, and it is variable-length. The only @@ -427,6 +449,14 @@ namespace System ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index); return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle); } +#if FEATURE_UTF8STRING + else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String) + { + GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned); + ref byte stringData = ref utf8String.DangerousGetMutableReference(_index); + return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle); + } +#endif // FEATURE_UTF8STRING else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject)) { // 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible diff --git a/src/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs b/src/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs index 6c598430ad..bf90f0449d 100644 --- a/src/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs +++ b/src/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs @@ -6,6 +6,7 @@ using 
System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Text; using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute; using EditorBrowsableState = System.ComponentModel.EditorBrowsableState; @@ -99,7 +100,13 @@ namespace System // No validation performed in release builds; caller must provide any necessary validation. // 'obj is T[]' below also handles things like int[] <-> uint[] being convertible - Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>)); + Debug.Assert((obj == null) + || (typeof(T) == typeof(char) && obj is string) +#if FEATURE_UTF8STRING + || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String) +#endif // FEATURE_UTF8STRING + || (obj is T[]) + || (obj is MemoryManager<T>)); _object = obj; _index = start; @@ -141,6 +148,14 @@ namespace System { return (_object is string str) ? str.Substring(_index, _length) : Span.ToString(); } +#if FEATURE_UTF8STRING + else if (typeof(T) == typeof(Char8)) + { + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. 
+ ReadOnlySpan<T> span = Span; + return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length)); + } +#endif // FEATURE_UTF8STRING return string.Format("System.ReadOnlyMemory<{0}>[{1}]", typeof(T).Name, _length); } @@ -239,6 +254,13 @@ namespace System refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData()); lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length; } +#if FEATURE_UTF8STRING + else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String)) + { + refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference()); + lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length; + } +#endif // FEATURE_UTF8STRING else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject)) { // We know the object is not null, it's not a string, and it is variable-length. The only @@ -342,6 +364,14 @@ namespace System ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index); return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle); } +#if FEATURE_UTF8STRING + else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String) + { + GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned); + ref byte stringData = ref utf8String.DangerousGetMutableReference(_index); + return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle); + } +#endif // FEATURE_UTF8STRING else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject)) { // 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible diff --git a/src/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs b/src/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs index eb3fd1464d..00337a5fd7 100644 --- a/src/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs +++ 
b/src/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.Versioning; +using System.Text; using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute; using EditorBrowsableState = System.ComponentModel.EditorBrowsableState; using Internal.Runtime.CompilerServices; @@ -240,12 +241,15 @@ namespace System { if (typeof(T) == typeof(char)) { - unsafe - { - fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value)) - return new string(src, 0, _length); - } + return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length)); } +#if FEATURE_UTF8STRING + else if (typeof(T) == typeof(Char8)) + { + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. + return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length)); + } +#endif // FEATURE_UTF8STRING return string.Format("System.ReadOnlySpan<{0}>[{1}]", typeof(T).Name, _length); } diff --git a/src/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs b/src/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs index b1f5507122..225f434382 100644 --- a/src/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs +++ b/src/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs @@ -28,7 +28,12 @@ namespace System.Runtime.InteropServices // As an optimization, we skip the "is string?" check below if typeof(T) is not char, // as Memory<T> / ROM<T> can't possibly contain a string instance in this case. 
- if (obj != null && (typeof(T) != typeof(char) || obj.GetType() != typeof(string))) + if (obj != null && !( + (typeof(T) == typeof(char) && obj.GetType() == typeof(string)) +#if FEATURE_UTF8STRING + || ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj.GetType() == typeof(Utf8String)) +#endif // FEATURE_UTF8STRING + )) { if (RuntimeHelpers.ObjectHasComponentSize(obj)) { diff --git a/src/System.Private.CoreLib/shared/System/Span.Fast.cs b/src/System.Private.CoreLib/shared/System/Span.Fast.cs index 66de4fe3d3..adc1f3903d 100644 --- a/src/System.Private.CoreLib/shared/System/Span.Fast.cs +++ b/src/System.Private.CoreLib/shared/System/Span.Fast.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.Versioning; +using System.Text; using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute; using EditorBrowsableState = System.ComponentModel.EditorBrowsableState; using Internal.Runtime.CompilerServices; @@ -319,12 +320,15 @@ namespace System { if (typeof(T) == typeof(char)) { - unsafe - { - fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value)) - return new string(src, 0, _length); - } + return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length)); + } +#if FEATURE_UTF8STRING + else if (typeof(T) == typeof(Char8)) + { + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. 
+ return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length)); } +#endif // FEATURE_UTF8STRING return string.Format("System.Span<{0}>[{1}]", typeof(T).Name, _length); } diff --git a/src/System.Private.CoreLib/shared/System/String.cs b/src/System.Private.CoreLib/shared/System/String.cs index 49afbc8c8c..10f75225c0 100644 --- a/src/System.Private.CoreLib/shared/System/String.cs +++ b/src/System.Private.CoreLib/shared/System/String.cs @@ -24,9 +24,13 @@ namespace System [System.Runtime.CompilerServices.TypeForwardedFrom("mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089")] public sealed partial class String : IComparable, IEnumerable, IConvertible, IEnumerable<char>, IComparable<string>, IEquatable<string>, ICloneable { - // String constructors - // These are special. The implementation methods for these have a different signature from the - // declared constructors. + /* + * CONSTRUCTORS + * + * Defining a new constructor for string-like types (like String) requires changes both + * to the managed code below and to the native VM code. See the comment at the top of + * src/vm/ecall.cpp for instructions on how to add new overloads. 
+ */ [MethodImplAttribute(MethodImplOptions.InternalCall)] public extern String(char[] value); @@ -335,8 +339,7 @@ namespace System return Empty; string result = FastAllocateString(value.Length); - fixed (char* dest = &result._firstChar, src = &MemoryMarshal.GetReference(value)) - wstrcpy(dest, src, value.Length); + Buffer.Memmove(ref result._firstChar, ref MemoryMarshal.GetReference(value), (uint)value.Length); return result; } diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs new file mode 100644 index 0000000000..6ee9ca05a6 --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.cs @@ -0,0 +1,106 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; + +namespace System.Text.Unicode +{ + internal static class Utf8Utility + { + /// <summary> + /// The maximum number of bytes that can result from UTF-8 transcoding + /// any Unicode scalar value. + /// </summary> + internal const int MaxBytesPerScalar = 4; + + /// <summary> + /// The UTF-8 representation of <see cref="UnicodeUtility.ReplacementChar"/>. + /// </summary> + private static ReadOnlySpan<byte> ReplacementCharSequence => new byte[] { 0xEF, 0xBF, 0xBD }; + + /// <summary> + /// Returns the byte index in <paramref name="utf8Data"/> where the first invalid UTF-8 sequence begins, + /// or -1 if the buffer contains no invalid sequences. Also outs the <paramref name="isAscii"/> parameter + /// stating whether all data observed (up to the first invalid sequence or the end of the buffer, whichever + /// comes first) is ASCII. 
+ /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii) + { + // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948). + + bool tempIsAscii = true; + int originalDataLength = utf8Data.Length; + + while (!utf8Data.IsEmpty) + { + if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done) + { + break; + } + + tempIsAscii &= result.IsAscii; + utf8Data = utf8Data.Slice(bytesConsumed); + } + + isAscii = tempIsAscii; + return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length); + } + +#if FEATURE_UTF8STRING + /// <summary> + /// Returns <paramref name="value"/> if it is null or contains only well-formed UTF-8 data; + /// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as + /// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced + /// with U+FFFD. + /// </summary> + public static Utf8String ValidateAndFixupUtf8String(Utf8String value) + { + if (Utf8String.IsNullOrEmpty(value)) + { + return value; + } + + ReadOnlySpan<byte> valueAsBytes = value.AsBytes(); + + int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _); + if (idxOfFirstInvalidData < 0) + { + return value; + } + + // TODO_UTF8STRING: Replace this with the faster implementation once it's available. + // (The faster implementation is in the dev/utf8string_bak branch currently.) 
+ + MemoryStream memStream = new MemoryStream(); + memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData)); + + valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData); + do + { + if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done) + { + // Valid scalar value - copy data as-is to MemoryStream + memStream.Write(valueAsBytes.Slice(0, bytesConsumed)); + } + else + { + // Invalid scalar value - copy U+FFFD to MemoryStream + memStream.Write(ReplacementCharSequence); + } + + valueAsBytes = valueAsBytes.Slice(bytesConsumed); + } while (!valueAsBytes.IsEmpty); + + bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer); + Debug.Assert(success, "Couldn't get underlying MemoryStream buffer."); + + return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true); + } +#endif // FEATURE_UTF8STRING + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs index 3aad29679d..065c938d81 100644 --- a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs @@ -11,7 +11,7 @@ namespace System.Text /// <summary> /// The Unicode replacement character U+FFFD. /// </summary> - public const uint ReplacementChar = 0xFFFDU; + public const uint ReplacementChar = 0xFFFD; /// <summary> /// Returns the Unicode plane (0 through 16, inclusive) which contains this code point. diff --git a/src/System.Private.CoreLib/src/System/Char8.cs b/src/System.Private.CoreLib/src/System/Char8.cs new file mode 100644 index 0000000000..7a71e2faa0 --- /dev/null +++ b/src/System.Private.CoreLib/src/System/Char8.cs @@ -0,0 +1,69 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +namespace System +{ + /// <summary> + /// Represents a UTF-8 code unit, the elemental type of <see cref="Utf8String"/>. + /// </summary> + public readonly struct Char8 : IComparable<Char8>, IEquatable<Char8> + { + private readonly byte _value; + + private Char8(byte value) + { + _value = value; + } + + public static bool operator ==(Char8 left, Char8 right) => left._value == right._value; + public static bool operator !=(Char8 left, Char8 right) => left._value != right._value; + public static bool operator <(Char8 left, Char8 right) => left._value < right._value; + public static bool operator <=(Char8 left, Char8 right) => left._value <= right._value; + public static bool operator >(Char8 left, Char8 right) => left._value > right._value; + public static bool operator >=(Char8 left, Char8 right) => left._value >= right._value; + + // Operators from Char8 to <other primitives> + // TODO: Once C# gets support for checked operators, we should add those here. + + public static implicit operator byte(Char8 value) => value._value; + [CLSCompliant(false)] + public static explicit operator sbyte(Char8 value) => (sbyte)value._value; // explicit because the conversion can overflow + public static explicit operator char(Char8 value) => (char)value._value; // explicit because don't want to encourage char conversion + public static implicit operator short(Char8 value) => value._value; + [CLSCompliant(false)] + public static implicit operator ushort(Char8 value) => value._value; + public static implicit operator int(Char8 value) => value._value; + [CLSCompliant(false)] + public static implicit operator uint(Char8 value) => value._value; + public static implicit operator long(Char8 value) => value._value; + [CLSCompliant(false)] + public static implicit operator ulong(Char8 value) => value._value; + + // Operators from <other primitives> to Char8; most are explicit because narrowing conversions could be lossy + // TODO: Once C# gets support for checked operators, we should add 
those here. + + public static implicit operator Char8(byte value) => new Char8(value); + [CLSCompliant(false)] + public static explicit operator Char8(sbyte value) => new Char8((byte)value); + public static explicit operator Char8(char value) => new Char8((byte)value); + public static explicit operator Char8(short value) => new Char8((byte)value); + [CLSCompliant(false)] + public static explicit operator Char8(ushort value) => new Char8((byte)value); + public static explicit operator Char8(int value) => new Char8((byte)value); + [CLSCompliant(false)] + public static explicit operator Char8(uint value) => new Char8((byte)value); + public static explicit operator Char8(long value) => new Char8((byte)value); + [CLSCompliant(false)] + public static explicit operator Char8(ulong value) => new Char8((byte)value); + + public int CompareTo(Char8 other) => this._value.CompareTo(other._value); + + public override bool Equals(object obj) => (obj is Char8 other) && (this == other); + public bool Equals(Char8 other) => this == other; + + public override int GetHashCode() => _value; + + public override string ToString() => _value.ToString("X2"); + } +} diff --git a/src/System.Private.CoreLib/src/System/Utf8Extensions.cs b/src/System.Private.CoreLib/src/System/Utf8Extensions.cs new file mode 100644 index 0000000000..9fa2a54f16 --- /dev/null +++ b/src/System.Private.CoreLib/src/System/Utf8Extensions.cs @@ -0,0 +1,367 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using Internal.Runtime.CompilerServices; + +namespace System +{ + public static class Utf8Extensions + { + /// <summary> + /// Projects <paramref name="text"/> as a <see cref="ReadOnlySpan{Byte}"/>. 
+ /// </summary> + public static ReadOnlySpan<byte> AsBytes(this ReadOnlySpan<Char8> text) + { + return MemoryMarshal.Cast<Char8, byte>(text); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. + /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<byte> AsBytes(this Utf8String text) + { + if (text == null) + return default; + + return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(), text.Length); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. + /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <remarks>Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index is not in range (<0 or >text.Length). + /// </exception> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<byte> AsBytes(this Utf8String text, int start) + { + if (text == null) + { + if (start != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + + if ((uint)start > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + + return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), text.Length - start); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. 
+ /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <param name="length">The desired length for the slice (exclusive).</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range. + /// </exception> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<byte> AsBytes(this Utf8String text, int start, int length) + { + if (text == null) + { + if (start != 0 || length != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + +#if BIT64 + // See comment in Span<T>.Slice for how this works. + if ((ulong)(uint)start + (ulong)(uint)length > (ulong)(uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#else + if ((uint)start > (uint)text.Length || (uint)length > (uint)(text.Length - start)) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#endif + + return new ReadOnlySpan<byte>(ref text.DangerousGetMutableReference(start), length); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. + /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<Char8> AsSpan(this Utf8String text) + { + if (text == null) + return default; + + return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference()), text.Length); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. 
+ /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <remarks>Returns default when <paramref name="text"/> is null and <paramref name="start"/> is 0.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index is not in range (<0 or >text.Length). + /// </exception> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<Char8> AsSpan(this Utf8String text, int start) + { + if (text == null) + { + if (start != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + + if ((uint)start > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + + return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference(start)), text.Length - start); + } + + /// <summary> + /// Creates a new readonly span over the portion of the target <see cref="Utf8String"/>. + /// </summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <param name="length">The desired length for the slice (exclusive).</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range. + /// </exception> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan<Char8> AsSpan(this Utf8String text, int start, int length) + { + if (text == null) + { + if (start != 0 || length != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + +#if BIT64 + // See comment in Span<T>.Slice for how this works. 
+ if ((ulong)(uint)start + (ulong)(uint)length > (ulong)(uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#else + if ((uint)start > (uint)text.Length || (uint)length > (uint)(text.Length - start)) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#endif + + return new ReadOnlySpan<Char8>(ref Unsafe.As<byte, Char8>(ref text.DangerousGetMutableReference(start)), length); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text) + { + if (text == null) + return default; + + return new ReadOnlyMemory<Char8>(text, 0, text.Length); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index is not in range (<0 or >text.Length). 
+ /// </exception> + public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, int start) + { + if (text == null) + { + if (start != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + + if ((uint)start > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + + return new ReadOnlyMemory<Char8>(text, start, text.Length - start); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="startIndex">The index at which to begin this slice.</param> + public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, Index startIndex) + { + if (text == null) + { + if (!startIndex.Equals(Index.Start)) + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); + + return default; + } + + int actualIndex = startIndex.GetOffset(text.Length); + if ((uint)actualIndex > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(); + + return new ReadOnlyMemory<Char8>(text, actualIndex, text.Length - actualIndex); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <param name="length">The desired length for the slice (exclusive).</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range. 
+ /// </exception> + public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, int start, int length) + { + if (text == null) + { + if (start != 0 || length != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + +#if BIT64 + // See comment in Span<T>.Slice for how this works. + if ((ulong)(uint)start + (ulong)(uint)length > (ulong)(uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#else + if ((uint)start > (uint)text.Length || (uint)length > (uint)(text.Length - start)) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#endif + + return new ReadOnlyMemory<Char8>(text, start, length); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="range">The range used to indicate the start and length of the sliced string.</param> + public static ReadOnlyMemory<Char8> AsMemory(this Utf8String text, Range range) + { + if (text == null) + { + Index startIndex = range.Start; + Index endIndex = range.End; + + if (!startIndex.Equals(Index.Start) || !endIndex.Equals(Index.Start)) + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); + + return default; + } + + (int start, int length) = range.GetOffsetAndLength(text.Length); + return new ReadOnlyMemory<Char8>(text, start, length); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text) + { + if (text == null) + return default; + + return new ReadOnlyMemory<byte>(text, 0, text.Length); + } + + /// <summary>Creates a new <see 
cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index is not in range (<0 or >text.Length). + /// </exception> + public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start) + { + if (text == null) + { + if (start != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + + if ((uint)start > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + + return new ReadOnlyMemory<byte>(text, start, text.Length - start); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="startIndex">The index at which to begin this slice.</param> + public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, Index startIndex) + { + if (text == null) + { + if (!startIndex.Equals(Index.Start)) + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); + + return default; + } + + int actualIndex = startIndex.GetOffset(text.Length); + if ((uint)actualIndex > (uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(); + + return new ReadOnlyMemory<byte>(text, actualIndex, text.Length - actualIndex); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="start">The index at which to begin this slice.</param> + /// <param name="length">The desired length 
for the slice (exclusive).</param> + /// <remarks>Returns default when <paramref name="text"/> is null.</remarks> + /// <exception cref="System.ArgumentOutOfRangeException"> + /// Thrown when the specified <paramref name="start"/> index or <paramref name="length"/> is not in range. + /// </exception> + public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start, int length) + { + if (text == null) + { + if (start != 0 || length != 0) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); + return default; + } + +#if BIT64 + // See comment in Span<T>.Slice for how this works. + if ((ulong)(uint)start + (ulong)(uint)length > (ulong)(uint)text.Length) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#else + if ((uint)start > (uint)text.Length || (uint)length > (uint)(text.Length - start)) + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.start); +#endif + + return new ReadOnlyMemory<byte>(text, start, length); + } + + /// <summary>Creates a new <see cref="ReadOnlyMemory{T}"/> over the portion of the target <see cref="Utf8String"/>.</summary> + /// <param name="text">The target <see cref="Utf8String"/>.</param> + /// <param name="range">The range used to indicate the start and length of the sliced string.</param> + public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, Range range) + { + if (text == null) + { + Index startIndex = range.Start; + Index endIndex = range.End; + + if (!startIndex.Equals(Index.Start) || !endIndex.Equals(Index.Start)) + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.text); + + return default; + } + + (int start, int length) = range.GetOffsetAndLength(text.Length); + return new ReadOnlyMemory<byte>(text, start, length); + } + } +} diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs b/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs new file mode 100644 index 0000000000..9ecd44f3ae --- /dev/null +++ 
b/src/System.Private.CoreLib/src/System/Utf8String.Construction.cs @@ -0,0 +1,223 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Text.Unicode; + +namespace System +{ + public sealed partial class Utf8String + { + /* + * CONSTRUCTORS + * + * Defining a new constructor for string-like types (like Utf8String) requires changes both + * to the managed code below and to the native VM code. See the comment at the top of + * src/vm/ecall.cpp for instructions on how to add new overloads. + * + * The default behavior of each ctor is to validate the input, replacing invalid sequences with the + * Unicode replacement character U+FFFD. The resulting Utf8String instance will be well-formed but + * might not have full fidelity with the input data. This behavior can be controlled by calling + * any of the Create instances and specifying a different action. + */ + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing UTF-8 data. + /// </summary> + /// <remarks> + /// The UTF-8 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. 
+ /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + public extern Utf8String(ReadOnlySpan<byte> value); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private Utf8String Ctor(ReadOnlySpan<byte> value) + { + if (value.IsEmpty) + { + return Empty; + } + + Utf8String newString = FastAllocate(value.Length); + Buffer.Memmove(ref newString.DangerousGetMutableReference(), ref MemoryMarshal.GetReference(value), (uint)value.Length); + return Utf8Utility.ValidateAndFixupUtf8String(newString); + } + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing UTF-8 data. + /// </summary> + /// <remarks> + /// The UTF-8 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. + /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + public extern Utf8String(byte[] value, int startIndex, int length); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private Utf8String Ctor(byte[] value, int startIndex, int length) => Ctor(new ReadOnlySpan<byte>(value, startIndex, length)); + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing null-terminated UTF-8 data. + /// </summary> + /// <remarks> + /// The UTF-8 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. 
+ /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + [CLSCompliant(false)] + public unsafe extern Utf8String(byte* value); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private unsafe Utf8String Ctor(byte* value) + { + if (value == null) + { + return Empty; + } + + return Ctor(new ReadOnlySpan<byte>(value, string.strlen(value))); + } + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing UTF-16 data. + /// </summary> + /// <remarks> + /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. + /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + public extern Utf8String(ReadOnlySpan<char> value); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private Utf8String Ctor(ReadOnlySpan<char> value) + { + if (value.IsEmpty) + { + return Empty; + } + + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. + + Utf8String newString = FastAllocate(Encoding.UTF8.GetByteCount(value)); + Encoding.UTF8.GetBytes(value, new Span<byte>(ref newString.DangerousGetMutableReference(), newString.Length)); + return newString; + } + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing UTF-16 data. + /// </summary> + /// <remarks> + /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. 
+ /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + public extern Utf8String(char[] value, int startIndex, int length); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private Utf8String Ctor(char[] value, int startIndex, int length) => Ctor(new ReadOnlySpan<char>(value, startIndex, length)); + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing null-terminated UTF-16 data. + /// </summary> + /// <remarks> + /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. + /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + [CLSCompliant(false)] + public unsafe extern Utf8String(char* value); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private unsafe Utf8String Ctor(char* value) + { + if (value == null) + { + return Empty; + } + + return Ctor(new ReadOnlySpan<char>(value, string.wcslen(value))); + } + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing UTF-16 data. + /// </summary> + /// <remarks> + /// The UTF-16 data in <paramref name="value"/> is validated for well-formedness upon construction. + /// Invalid code unit sequences are replaced with U+FFFD in the resulting <see cref="Utf8String"/>. + /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + public extern Utf8String(string value); + +#if PROJECTN + [DependencyReductionRoot] +#endif +#if !CORECLR + static +#endif + private Utf8String Ctor(string value) => Ctor(value.AsSpan()); + + /* + * HELPER METHODS + */ + + /// <summary> + /// Creates a <see cref="Utf8String"/> instance from existing data, bypassing validation. + /// Also allows the caller to set flags dictating various attributes of the data. 
+ /// </summary> + /// <remarks> + /// TODO_UTF8STRING: <paramref name="assumeWellFormed"/> and <paramref name="assumeAscii"/> are accepted + /// but are not read by the body below; the input bytes are copied as-is. + /// </remarks> + internal static Utf8String DangerousCreateWithoutValidation(ReadOnlySpan<byte> utf8Data, bool assumeWellFormed = false, bool assumeAscii = false) + { + if (utf8Data.IsEmpty) + { + return Empty; + } + + // Raw byte copy: no validation or fixup is performed on the data. + Utf8String newString = FastAllocate(utf8Data.Length); + utf8Data.CopyTo(new Span<byte>(ref newString.DangerousGetMutableReference(), newString.Length)); + return newString; + } + + /// <summary> + /// Creates a new zero-initialized instance of the specified length. Actual storage allocated is "length + 1" bytes + /// because instances are null-terminated. + /// </summary> + /// <remarks> + /// The implementation of this method checks its input argument for overflow. + /// </remarks> + [MethodImpl(MethodImplOptions.InternalCall)] + private static extern Utf8String FastAllocate(int length); + } +} diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs b/src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs new file mode 100644 index 0000000000..6e5209962f --- /dev/null +++ b/src/System.Private.CoreLib/src/System/Utf8String.Manipulation.cs @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; + +namespace System +{ + public sealed partial class Utf8String + { + /// <summary> + /// Substrings this <see cref="Utf8String"/> without bounds checking.
+ /// </summary> + /// <remarks> + /// Callers must have validated the range and handled the empty-result and whole-string cases themselves; + /// this method always allocates a new instance of <paramref name="length"/> bytes and copies into it. + /// </remarks> + private Utf8String InternalSubstring(int startIndex, int length) + { + Debug.Assert(startIndex >= 0, "StartIndex cannot be negative."); + Debug.Assert(startIndex <= this.Length, "StartIndex cannot point beyond the end of the string (except to the null terminator)."); + Debug.Assert(length >= 0, "Length cannot be negative."); + Debug.Assert(startIndex + length <= this.Length, "StartIndex and Length cannot point beyond the end of the string."); + + // A startIndex of 0 is legitimate here: Substring(0, n) with 0 < n < Length forwards to this method, + // so only the length boundaries (empty result / whole string) are the caller's responsibility. + // (startIndex == Length is already excluded by length != 0 together with the bounds assert above.) + Debug.Assert(length != 0 && length != this.Length, "Caller should handle Length boundary conditions."); + + Utf8String newString = FastAllocate(length); + Buffer.Memmove(ref newString.DangerousGetMutableReference(), ref this.DangerousGetMutableReference(startIndex), (uint)length); + return newString; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Utf8String Substring(Index startIndex) + { + int actualIndex = startIndex.GetOffset(Length); + return Substring(actualIndex); + } + + public Utf8String Substring(int startIndex) + { + if ((uint)startIndex > (uint)this.Length) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startIndex); + } + + // Optimizations: since instances are immutable, we can return 'this' or the known + // Empty instance if the caller passed us a startIndex at the string boundary. + + if (startIndex == 0) + { + return this; + } + + if (startIndex == Length) + { + return Empty; + } + + return InternalSubstring(startIndex, Length - startIndex); + } + + public Utf8String Substring(int startIndex, int length) + { + ValidateStartIndexAndLength(startIndex, length); + + // Optimizations: since instances are immutable, we can return 'this' or the known + // Empty instance if the caller asked for the whole string or for zero bytes.
+ + if (length == 0) + { + return Empty; + } + + if (length == this.Length) + { + return this; + } + + return InternalSubstring(startIndex, length); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Utf8String Substring(Range range) + { + (int start, int length) = range.GetOffsetAndLength(Length); + return Substring(start, length); + } + + [StackTraceHidden] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ValidateStartIndexAndLength(int startIndex, int length) + { +#if BIT64 + // See comment in Span<T>.Slice for how this works. + if ((ulong)(uint)startIndex + (ulong)(uint)length > (ulong)(uint)this.Length) + ValidateStartIndexAndLength_Throw(startIndex, length); +#else + if ((uint)startIndex > (uint)this.Length || (uint)length > (uint)(this.Length - startIndex)) + ValidateStartIndexAndLength_Throw(startIndex, length); +#endif + } + + [StackTraceHidden] + private void ValidateStartIndexAndLength_Throw(int startIndex, int length) + { + throw new ArgumentOutOfRangeException(paramName: ((uint)startIndex > (uint)this.Length) ? nameof(startIndex) : nameof(length)); + } + } +} diff --git a/src/System.Private.CoreLib/src/System/Utf8String.Searching.cs b/src/System.Private.CoreLib/src/System/Utf8String.Searching.cs new file mode 100644 index 0000000000..0373cdd4fd --- /dev/null +++ b/src/System.Private.CoreLib/src/System/Utf8String.Searching.cs @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.InteropServices; +using System.Text; +using System.Text.Unicode; + +namespace System +{ + public sealed partial class Utf8String + { + // Ordinal search + public bool Contains(char value) + { + return Rune.TryCreate(value, out Rune result) && Contains(result); + } + + // Ordinal search + public bool Contains(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. + + Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return SpanHelpers.IndexOf( + ref DangerousGetMutableReference(), Length, + ref MemoryMarshal.GetReference(runeBytes), runeBytesWritten) >= 0; + } + + // Ordinal search + public bool EndsWith(char value) + { + return Rune.TryCreate(value, out Rune result) && EndsWith(result); + } + + // Ordinal search + public bool EndsWith(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. + + Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return this.AsBytes().EndsWith(runeBytes.Slice(0, runeBytesWritten)); + } + + // Ordinal search + public int IndexOf(char value) + { + return Rune.TryCreate(value, out Rune result) ? IndexOf(result) : -1; + } + + // Ordinal search + public int IndexOf(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. 
+ + Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return SpanHelpers.IndexOf( + ref DangerousGetMutableReference(), Length, + ref MemoryMarshal.GetReference(runeBytes), runeBytesWritten); + } + + // Ordinal search + public bool StartsWith(char value) + { + return Rune.TryCreate(value, out Rune result) && StartsWith(result); + } + + // Ordinal search + public bool StartsWith(Rune value) + { + // TODO_UTF8STRING: This should be split into two methods: + // One which operates on a single-byte (ASCII) search value, + // the other which operates on a multi-byte (non-ASCII) search value. + + Span<byte> runeBytes = stackalloc byte[Utf8Utility.MaxBytesPerScalar]; + int runeBytesWritten = value.EncodeToUtf8(runeBytes); + + return this.AsBytes().StartsWith(runeBytes.Slice(0, runeBytesWritten)); + } + } +} diff --git a/src/System.Private.CoreLib/src/System/Utf8String.cs b/src/System.Private.CoreLib/src/System/Utf8String.cs new file mode 100644 index 0000000000..1a4357a06f --- /dev/null +++ b/src/System.Private.CoreLib/src/System/Utf8String.cs @@ -0,0 +1,252 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text; +using Internal.Runtime.CompilerServices; + +namespace System +{ + /// <summary> + /// Represents an immutable string of UTF-8 code units. + /// </summary> + public sealed partial class Utf8String : IEquatable<Utf8String> + { + /* + * STATIC FIELDS + */ + + public static readonly Utf8String Empty = FastAllocate(0); + + /* + * INSTANCE FIELDS + * Do not reorder these fields. They must match the layout of Utf8StringObject in object.h. 
+ */ + + private readonly int _length; + private readonly byte _firstByte; + + /* + * OPERATORS + */ + + /// <summary> + /// Compares two <see cref="Utf8String"/> instances for equality using a <see cref="StringComparison.Ordinal"/> comparer. + /// </summary> + public static bool operator ==(Utf8String left, Utf8String right) => Equals(left, right); + + /// <summary> + /// Compares two <see cref="Utf8String"/> instances for inequality using a <see cref="StringComparison.Ordinal"/> comparer. + /// </summary> + public static bool operator !=(Utf8String left, Utf8String right) => !Equals(left, right); + + /// <summary> + /// Projects a <see cref="Utf8String"/> instance as a <see cref="ReadOnlySpan{Byte}"/>. + /// </summary> + public static explicit operator ReadOnlySpan<byte>(Utf8String value) => value.AsBytes(); + + /// <summary> + /// Projects a <see cref="Utf8String"/> instance as a <see cref="ReadOnlySpan{Char8}"/>. + /// </summary> + public static implicit operator ReadOnlySpan<Char8>(Utf8String value) => value.AsSpan(); + + /* + * INSTANCE PROPERTIES + */ + + /// <summary> + /// Returns the length (in UTF-8 code units) of this instance. + /// </summary> + public int Length => _length; + + /* + * INSTANCE INDEXERS + */ + + /// <summary> + /// Gets the <see cref="Char8"/> at the specified position. + /// </summary> + public Char8 this[int index] + { + get + { + // Just like String, we don't allow indexing into the null terminator itself. + + if ((uint)index >= (uint)Length) + { + ThrowHelper.ThrowArgumentOutOfRange_IndexException(); + } + + return Unsafe.Add(ref DangerousGetMutableReference(), index); + } + } + + /// <summary> + /// Gets the <see cref="Char8"/> at the specified position. + /// </summary> + public Char8 this[Index index] + { + get + { + // Just like String, we don't allow indexing into the null terminator itself. 
+ + int actualIndex = index.GetOffset(Length); + return this[actualIndex]; + } + } + + /// <summary> + /// Gets a substring of this <see cref="Utf8String"/> based on the provided <paramref name="range"/>. + /// </summary> + public Utf8String this[Range range] => Substring(range); + + /* + * METHODS + */ + + /// <summary> + /// Returns a <em>mutable</em> reference to the first byte of this <see cref="Utf8String"/> + /// (or the null terminator if the string is empty). + /// </summary> + /// <returns></returns> + internal ref byte DangerousGetMutableReference() => ref Unsafe.AsRef(in _firstByte); + + /// <summary> + /// Returns a <em>mutable</em> reference to the element at index <paramref name="index"/> + /// of this <see cref="Utf8String"/> instance. The index is not bounds-checked. + /// </summary> + internal ref byte DangerousGetMutableReference(int index) + { + // Allow retrieving references to the null terminator. + Debug.Assert((uint)index <= (uint)Length, "Caller should've performed bounds checking."); + + return ref Unsafe.Add(ref DangerousGetMutableReference(), index); + } + + /// <summary> + /// Performs an equality comparison using a <see cref="StringComparison.Ordinal"/> comparer. + /// </summary> + public override bool Equals(object obj) + { + return obj is Utf8String other && this.Equals(other); + } + + /// <summary> + /// Performs an equality comparison using a <see cref="StringComparison.Ordinal"/> comparer. + /// </summary> + public bool Equals(Utf8String value) + { + // First, a very quick check for referential equality. + + if (ReferenceEquals(this, value)) + { + return true; + } + + // Otherwise, perform a simple bitwise equality check. 
+ + return !(value is null) + && this.Length == value.Length + && SpanHelpers.SequenceEqual(ref this.DangerousGetMutableReference(), ref value.DangerousGetMutableReference(), (uint)Length); + } + + /// <summary> + /// Compares two <see cref="Utf8String"/> instances using a <see cref="StringComparison.Ordinal"/> comparer. + /// </summary> + public static bool Equals(Utf8String left, Utf8String right) + { + // First, a very quick check for referential equality. + + if (ReferenceEquals(left, right)) + { + return true; + } + + // Otherwise, perform a simple bitwise equality check. + + return !(left is null) + && !(right is null) + && left.Length == right.Length + && SpanHelpers.SequenceEqual(ref left.DangerousGetMutableReference(), ref right.DangerousGetMutableReference(), (uint)left.Length); + } + + /// <summary> + /// Returns a hash code using a <see cref="StringComparison.Ordinal"/> comparison. + /// </summary> + public override int GetHashCode() + { + // TODO_UTF8STRING: Consider whether this should use a different seed than String.GetHashCode. + + ulong seed = Marvin.DefaultSeed; + return Marvin.ComputeHash32(ref DangerousGetMutableReference(), _length /* in bytes */, (uint)seed, (uint)(seed >> 32)); + } + + /// <summary> + /// Gets an immutable reference that can be used in a <see langword="fixed"/> statement. The resulting + /// reference can be pinned and used as a null-terminated <em>LPCUTF8STR</em>. + /// </summary> + /// <remarks> + /// If this <see cref="Utf8String"/> instance is empty, returns a reference to the null terminator. + /// </remarks> + [EditorBrowsable(EditorBrowsableState.Never)] // for compiler use only + public ref readonly byte GetPinnableReference() => ref _firstByte; + + /// <summary> + /// Returns <see langword="true"/> if <paramref name="value"/> is <see langword="null"/> or zero length; + /// <see langword="false"/> otherwise. 
+ /// </summary> + public static bool IsNullOrEmpty(Utf8String value) + { + // Copied from String.IsNullOrEmpty. See that method for detailed comments on why this pattern is used. + return (value is null || 0u >= (uint)value.Length) ? true : false; + } + + /// <summary> + /// Returns the entire <see cref="Utf8String"/> as an array of bytes. + /// </summary> + public byte[] ToByteArray() + { + if (Length == 0) + { + return Array.Empty<byte>(); + } + + byte[] bytes = new byte[Length]; + Buffer.Memmove(ref bytes.GetRawSzArrayData(), ref DangerousGetMutableReference(), (uint)Length); + return bytes; + } + + /// <summary> + /// Returns a substring of this <see cref="Utf8String"/> as an array of bytes. + /// </summary> + public byte[] ToByteArray(int startIndex, int length) + { + ValidateStartIndexAndLength(startIndex, length); + + if (length == 0) + { + return Array.Empty<byte>(); + } + + byte[] bytes = new byte[length]; + Buffer.Memmove(ref bytes.GetRawSzArrayData(), ref DangerousGetMutableReference(startIndex), (uint)length); + return bytes; + } + + /// <summary> + /// Converts this <see cref="Utf8String"/> instance to a <see cref="string"/>. + /// </summary> + /// <remarks> + /// Invalid subsequences are replaced with U+FFFD during conversion. + /// </remarks> + public override string ToString() + { + // TODO_UTF8STRING: Call into optimized transcoding routine when it's available. 
+ + return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref DangerousGetMutableReference(), Length)); + } + } +} diff --git a/src/classlibnative/bcltype/objectnative.cpp b/src/classlibnative/bcltype/objectnative.cpp index a90a37a692..64914d8807 100644 --- a/src/classlibnative/bcltype/objectnative.cpp +++ b/src/classlibnative/bcltype/objectnative.cpp @@ -253,6 +253,9 @@ FCIMPL1(Object*, ObjectNative::Clone, Object* pThisUNSAFE) // assert that String has overloaded the Clone() method _ASSERTE(pMT != g_pStringClass); +#ifdef FEATURE_UTF8STRING + _ASSERTE(pMT != g_pUtf8StringClass); +#endif // FEATURE_UTF8STRING if (pMT->IsArray()) { refClone = DupArrayForCloning((BASEARRAYREF)refThis); diff --git a/src/inc/dacvars.h b/src/inc/dacvars.h index fc5be15590..cec6d74dd7 100644 --- a/src/inc/dacvars.h +++ b/src/inc/dacvars.h @@ -168,6 +168,9 @@ DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pObjectClass, ::g_pObjectClass DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pRuntimeTypeClass, ::g_pRuntimeTypeClass) DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pCanonMethodTableClass, ::g_pCanonMethodTableClass) DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pStringClass, ::g_pStringClass) +#ifdef FEATURE_UTF8STRING +DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pUtf8StringClass, ::g_pUtf8StringClass) +#endif // FEATURE_UTF8STRING DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pArrayClass, ::g_pArrayClass) DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pSZArrayHelperClass, ::g_pSZArrayHelperClass) DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pNullableClass, ::g_pNullableClass) diff --git a/src/strongname/api/common.h b/src/strongname/api/common.h index 26c545cff9..626d9bb720 100644 --- a/src/strongname/api/common.h +++ b/src/strongname/api/common.h @@ -146,6 +146,9 @@ typedef DPTR(class ReJitManager) PTR_ReJitManager; typedef DPTR(struct ReJitInfo) PTR_ReJitInfo; typedef DPTR(struct SharedReJitInfo) PTR_SharedReJitInfo; typedef DPTR(class StringObject) 
PTR_StringObject; +#ifdef FEATURE_UTF8STRING +typedef DPTR(class Utf8StringObject) PTR_Utf8StringObject; +#endif // FEATURE_UTF8STRING typedef DPTR(class TypeHandle) PTR_TypeHandle; #ifdef STUB_DISPATCH typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager; diff --git a/src/vm/appdomain.cpp b/src/vm/appdomain.cpp index 4eb716164c..9362dd9c7b 100644 --- a/src/vm/appdomain.cpp +++ b/src/vm/appdomain.cpp @@ -2485,6 +2485,11 @@ void SystemDomain::LoadBaseSystemClasses() // Load String g_pStringClass = MscorlibBinder::LoadPrimitiveType(ELEMENT_TYPE_STRING); +#ifdef FEATURE_UTF8STRING + // Load Utf8String + g_pUtf8StringClass = MscorlibBinder::GetClass(CLASS__UTF8_STRING); +#endif // FEATURE_UTF8STRING + // Used by Buffer::BlockCopy g_pByteArrayMT = ClassLoader::LoadArrayTypeThrowing( TypeHandle(MscorlibBinder::GetElementType(ELEMENT_TYPE_U1))).AsArray()->GetMethodTable(); diff --git a/src/vm/classnames.h b/src/vm/classnames.h index cb71df362a..f45311f0de 100644 --- a/src/vm/classnames.h +++ b/src/vm/classnames.h @@ -139,6 +139,10 @@ #define g_ThreadClassName "System.Threading.Thread" #define g_TypeClassName "System.Type" +#ifdef FEATURE_UTF8STRING +#define g_Utf8StringName "Utf8String" +#endif // FEATURE_UTF8STRING + #define g_VariantClassName "System.Variant" #define g_GuidClassName "System.Guid" diff --git a/src/vm/common.h b/src/vm/common.h index 2a91e77220..61ba2a7514 100644 --- a/src/vm/common.h +++ b/src/vm/common.h @@ -167,6 +167,9 @@ typedef DPTR(class ReJitManager) PTR_ReJitManager; typedef DPTR(struct ReJitInfo) PTR_ReJitInfo; typedef DPTR(struct SharedReJitInfo) PTR_SharedReJitInfo; typedef DPTR(class StringObject) PTR_StringObject; +#ifdef FEATURE_UTF8STRING +typedef DPTR(class Utf8StringObject) PTR_Utf8StringObject; +#endif // FEATURE_UTF8STRING typedef DPTR(class TypeHandle) PTR_TypeHandle; typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager; typedef VPTR(class VirtualCallStubManagerManager) 
PTR_VirtualCallStubManagerManager; diff --git a/src/vm/ecall.cpp b/src/vm/ecall.cpp index b8e0d64e8f..dfeff95d6f 100644 --- a/src/vm/ecall.cpp +++ b/src/vm/ecall.cpp @@ -29,6 +29,36 @@ extern const int c_nECClasses; #endif // CROSSGEN_COMPILE +/********** + +The constructors of string-like types (String, Utf8String) are special since the JIT will +replace newobj instructions with calls to the corresponding 'Ctor' method. Depending on the +CLR in use, the ctor methods may be instance methods (with a null 'this' parameter) or +static methods. See the managed definitions of String.Ctor and Utf8String.Ctor for more +information. + +To add a new ctor overload, in addition to defining the constructor and Ctor methods on +the managed side, make changes to the following files. (These instructions are for +Utf8String, but String is similar.) + +- src/vm/ecall.cpp (this file), update the definition of "NumberOfUtf8StringConstructors" + and add the appropriate static asserts immediately above the definition. + +- src/vm/ecall.h, search for "Utf8StringCtor" and add the DYNAMICALLY_ASSIGNED_FCALL_IMPL + definitions corresponding to the new overloads. + +- src/vm/ecalllist.h, search for "FCFuncStart(gUtf8StringFuncs)" and add the overloads + within that block. + +- src/vm/metasig.h, add the new Utf8String-returning metasig declarations; and, if necessary, + add any void-returning metasig declarations if they haven't already been defined elsewhere. + search "String_RetUtf8Str" for an example of how to do this. + +- src/vm/mscorlib.h, search "DEFINE_CLASS(UTF8_STRING" and add the new DEFINE_METHOD + declarations for the Utf8String-returning Ctor methods, referencing the new metasig declarations. 
+ +**********/ + // METHOD__STRING__CTORF_XXX has to be in same order as ECall::CtorCharXxx #define METHOD__STRING__CTORF_FIRST METHOD__STRING__CTORF_CHARARRAY static_assert_no_msg(METHOD__STRING__CTORF_FIRST + 0 == METHOD__STRING__CTORF_CHARARRAY); @@ -55,14 +85,38 @@ static_assert_no_msg(ECallCtor_First + 8 == ECall::CtorSBytePtrStartLengthEncodi #define NumberOfStringConstructors 9 +#ifdef FEATURE_UTF8STRING +// METHOD__UTF8STRING__CTORF_XXX has to be in same order as ECall::Utf8StringCtorCharXxx +#define METHOD__UTF8STRING__CTORF_FIRST METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 0 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 1 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFCHAR); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 2 == METHOD__UTF8_STRING__CTORF_BYTEARRAY_START_LEN); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 3 == METHOD__UTF8_STRING__CTORF_BYTEPTR); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 4 == METHOD__UTF8_STRING__CTORF_CHARARRAY_START_LEN); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 5 == METHOD__UTF8_STRING__CTORF_CHARPTR); +static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 6 == METHOD__UTF8_STRING__CTORF_STRING); + +// ECall::Utf8StringCtorCharXxx has to be in same order as METHOD__UTF8STRING__CTORF_XXX +#define ECallUtf8String_Ctor_First ECall::Utf8StringCtorReadOnlySpanOfByteManaged +static_assert_no_msg(ECallUtf8String_Ctor_First + 0 == ECall::Utf8StringCtorReadOnlySpanOfByteManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 1 == ECall::Utf8StringCtorReadOnlySpanOfCharManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 2 == ECall::Utf8StringCtorByteArrayStartLengthManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 3 == ECall::Utf8StringCtorBytePtrManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 4 == 
ECall::Utf8StringCtorCharArrayStartLengthManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 5 == ECall::Utf8StringCtorCharPtrManaged); +static_assert_no_msg(ECallUtf8String_Ctor_First + 6 == ECall::Utf8StringCtorStringManaged); + +#define NumberOfUtf8StringConstructors 7 +#endif // FEATURE_UTF8STRING + void ECall::PopulateManagedStringConstructors() { STANDARD_VM_CONTRACT; INDEBUG(static bool fInitialized = false); _ASSERTE(!fInitialized); // assume this method is only called once - _ASSERTE(g_pStringClass != NULL); + _ASSERTE(g_pStringClass != NULL); for (int i = 0; i < NumberOfStringConstructors; i++) { MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(METHOD__STRING__CTORF_FIRST + i)); @@ -72,6 +126,20 @@ void ECall::PopulateManagedStringConstructors() ECall::DynamicallyAssignFCallImpl(pDest, ECallCtor_First + i); } + +#ifdef FEATURE_UTF8STRING + _ASSERTE(g_pUtf8StringClass != NULL); + for (int i = 0; i < NumberOfUtf8StringConstructors; i++) + { + MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(METHOD__UTF8STRING__CTORF_FIRST + i)); + _ASSERTE(pMD != NULL); + + PCODE pDest = pMD->GetMultiCallableAddrOfCode(); + + ECall::DynamicallyAssignFCallImpl(pDest, ECallUtf8String_Ctor_First + i); + } +#endif // FEATURE_UTF8STRING + INDEBUG(fInitialized = true); } diff --git a/src/vm/ecall.h b/src/vm/ecall.h index c809109c4c..58b4f0c34e 100644 --- a/src/vm/ecall.h +++ b/src/vm/ecall.h @@ -103,7 +103,7 @@ class ECall static void EnumFCallMethods(); #endif // DACCESS_COMPILE -#define DYNAMICALLY_ASSIGNED_FCALLS() \ +#define _DYNAMICALLY_ASSIGNED_FCALLS_BASE() \ DYNAMICALLY_ASSIGNED_FCALL_IMPL(FastAllocateString, FramedAllocateString) \ DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorCharArrayManaged, NULL) \ DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorCharArrayStartLengthManaged, NULL) \ @@ -116,6 +116,22 @@ class ECall DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorSBytePtrStartLengthEncodingManaged, NULL) \ DYNAMICALLY_ASSIGNED_FCALL_IMPL(InternalGetCurrentThread, NULL) 
\ +#define _DYNAMICALLY_ASSIGNED_FCALLS_UTF8STRING() \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(FastAllocateUtf8String, FramedAllocateUtf8String) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfByteManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfCharManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorByteArrayStartLengthManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorBytePtrManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorCharArrayStartLengthManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorCharPtrManaged, NULL) \ + DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorStringManaged, NULL) \ + +#ifdef FEATURE_UTF8STRING +#define DYNAMICALLY_ASSIGNED_FCALLS() _DYNAMICALLY_ASSIGNED_FCALLS_BASE() _DYNAMICALLY_ASSIGNED_FCALLS_UTF8STRING() +#else +#define DYNAMICALLY_ASSIGNED_FCALLS() _DYNAMICALLY_ASSIGNED_FCALLS_BASE() +#endif // FEATURE_UTF8STRING + enum { #undef DYNAMICALLY_ASSIGNED_FCALL_IMPL diff --git a/src/vm/ecalllist.h b/src/vm/ecalllist.h index b44669ea75..7302bb4e4a 100644 --- a/src/vm/ecalllist.h +++ b/src/vm/ecalllist.h @@ -116,6 +116,19 @@ FCFuncStart(gStringFuncs) FCFuncElement("Intern", AppDomainNative::GetOrInternString) FCFuncEnd() +#ifdef FEATURE_UTF8STRING +FCFuncStart(gUtf8StringFuncs) + FCDynamic("FastAllocate", CORINFO_INTRINSIC_Illegal, ECall::FastAllocateUtf8String) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfByte_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfByteManaged) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfChar_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfCharManaged) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ArrByte_Int_Int_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorByteArrayStartLengthManaged) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_PtrByte_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorBytePtrManaged) + 
FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ArrChar_Int_Int_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorCharArrayStartLengthManaged) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_PtrChar_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorCharPtrManaged) + FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_Str_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorStringManaged) +FCFuncEnd() +#endif // FEATURE_UTF8STRING + FCFuncStart(gValueTypeFuncs) FCFuncElement("CanCompareBits", ValueTypeHelper::CanCompareBits) FCFuncElement("FastEqualsCheck", ValueTypeHelper::FastEqualsCheck) @@ -1270,6 +1283,9 @@ FCClassElement("TypedReference", "System", gTypedReferenceFuncs) #ifdef FEATURE_COMINTEROP FCClassElement("UriMarshaler", "System.StubHelpers", gUriMarshalerFuncs) #endif +#ifdef FEATURE_UTF8STRING +FCClassElement("Utf8String", "System", gUtf8StringFuncs) +#endif // FEATURE_UTF8STRING FCClassElement("ValueClassMarshaler", "System.StubHelpers", gValueClassMarshalerFuncs) FCClassElement("ValueType", "System", gValueTypeFuncs) #ifdef FEATURE_COMINTEROP diff --git a/src/vm/gchelpers.cpp b/src/vm/gchelpers.cpp index a52e10bb4f..af3a1602b3 100644 --- a/src/vm/gchelpers.cpp +++ b/src/vm/gchelpers.cpp @@ -981,6 +981,8 @@ STRINGREF SlowAllocateString( DWORD cchStringLength ) // Limit the maximum string size to <2GB to mitigate risk of security issues caused by 32-bit integer // overflows in buffer size calculations. + // + // If the value below is changed, also change SlowAllocateUtf8String. 
if (cchStringLength > 0x3FFFFFDF) ThrowOutOfMemory(); @@ -1028,6 +1030,81 @@ STRINGREF SlowAllocateString( DWORD cchStringLength ) return( ObjectToSTRINGREF(orObject) ); } +#ifdef FEATURE_UTF8STRING +UTF8STRINGREF SlowAllocateUtf8String(DWORD cchStringLength) +{ + CONTRACTL{ + THROWS; + GC_TRIGGERS; + MODE_COOPERATIVE; // returns an objref without pinning it => cooperative + } CONTRACTL_END; + + Utf8StringObject *orObject = NULL; + +#ifdef _DEBUG + if (g_pConfig->ShouldInjectFault(INJECTFAULT_GCHEAP)) + { + char *a = new char; + delete a; + } +#endif + + // Limit the maximum string size to <2GB to mitigate risk of security issues caused by 32-bit integer + // overflows in buffer size calculations. + // + // 0x7FFFFFBF is derived from the const 0x3FFFFFDF in SlowAllocateString. + // Adding +1 (for null terminator) and multiplying by sizeof(WCHAR) means that + // SlowAllocateString allows a maximum of 0x7FFFFFC0 bytes to be used for the + // string data itself, with some additional buffer for object headers and other + // data. Since we don't have the sizeof(WCHAR) multiplication here, we only need + // -1 to account for the null terminator, leading to a max size of 0x7FFFFFBF. 
+ if (cchStringLength > 0x7FFFFFBF) + ThrowOutOfMemory(); + + SIZE_T ObjectSize = PtrAlign(Utf8StringObject::GetSize(cchStringLength)); + _ASSERTE(ObjectSize > cchStringLength); + + SetTypeHandleOnThreadForAlloc(TypeHandle(g_pUtf8StringClass)); + + orObject = (Utf8StringObject *)Alloc(ObjectSize, FALSE, FALSE); + + // Object is zero-init already + _ASSERTE(orObject->HasEmptySyncBlockInfo()); + + // Initialize Object + orObject->SetMethodTable(g_pUtf8StringClass); + orObject->SetLength(cchStringLength); + + if (ObjectSize >= LARGE_OBJECT_SIZE) + { + GCHeapUtilities::GetGCHeap()->PublishObject((BYTE*)orObject); + } + + // Notify the profiler of the allocation + if (TrackAllocations()) + { + OBJECTREF objref = ObjectToOBJECTREF((Object*)orObject); + GCPROTECT_BEGIN(objref); + ProfilerObjectAllocatedCallback(objref, (ClassID)orObject->GetTypeHandle().AsPtr()); + GCPROTECT_END(); + + orObject = (Utf8StringObject *)OBJECTREFToObject(objref); + } + +#ifdef FEATURE_EVENT_TRACE + // Send ETW event for allocation + if (ETW::TypeSystemLog::IsHeapAllocEventEnabled()) + { + ETW::TypeSystemLog::SendObjectAllocatedEvent(orObject); + } +#endif // FEATURE_EVENT_TRACE + + LogAlloc(ObjectSize, g_pUtf8StringClass, orObject); + + return( ObjectToUTF8STRINGREF(orObject) ); +} +#endif // FEATURE_UTF8STRING + #ifdef FEATURE_COMINTEROP_UNMANAGED_ACTIVATION // OBJECTREF AllocateComClassObject(ComClassFactory* pComClsFac) void AllocateComClassObject(ComClassFactory* pComClsFac, OBJECTREF* ppRefClass) diff --git a/src/vm/gchelpers.h b/src/vm/gchelpers.h index 0e407c6e61..8f6a16ade9 100644 --- a/src/vm/gchelpers.h +++ b/src/vm/gchelpers.h @@ -71,6 +71,10 @@ STRINGREF AllocateString( DWORD cchStringLength ); // The slow version, implemented in gcscan.cpp STRINGREF SlowAllocateString( DWORD cchStringLength ); +#ifdef FEATURE_UTF8STRING +UTF8STRINGREF SlowAllocateUtf8String( DWORD cchStringLength ); +#endif // FEATURE_UTF8STRING + #else // On other platforms, go to the (somewhat less efficient) 
implementations in gcscan.cpp @@ -83,6 +87,10 @@ OBJECTREF AllocateObjectArray(DWORD cElements, TypeHandle ElementType, BOOL bAll STRINGREF SlowAllocateString( DWORD cchStringLength ); +#ifdef FEATURE_UTF8STRING +UTF8STRINGREF SlowAllocateUtf8String( DWORD cchStringLength ); +#endif // FEATURE_UTF8STRING + inline STRINGREF AllocateString( DWORD cchStringLength ) { WRAPPER_NO_CONTRACT; @@ -92,6 +100,15 @@ inline STRINGREF AllocateString( DWORD cchStringLength ) #endif +#ifdef FEATURE_UTF8STRING +inline UTF8STRINGREF AllocateUtf8String(DWORD cchStringLength) +{ + WRAPPER_NO_CONTRACT; + + return SlowAllocateUtf8String(cchStringLength); +} +#endif // FEATURE_UTF8STRING + OBJECTREF DupArrayForCloning(BASEARRAYREF pRef, BOOL bAllocateInLargeHeap = FALSE); // The JIT requests the EE to specify an allocation helper to use at each new-site. diff --git a/src/vm/jithelpers.cpp b/src/vm/jithelpers.cpp index 303f06130f..0576ca7336 100644 --- a/src/vm/jithelpers.cpp +++ b/src/vm/jithelpers.cpp @@ -2895,6 +2895,61 @@ HCIMPL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength) } HCIMPLEND +#ifdef FEATURE_UTF8STRING +HCIMPL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength) +{ + FCALL_CONTRACT; + + do + { + _ASSERTE(GCHeapUtilities::UseThreadAllocationContexts()); + + // Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow + // problems, as well as making sure big string objects are correctly allocated in the big object heap. + if (stringLength >= LARGE_OBJECT_SIZE - 256) + { + break; + } + + // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler + // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates + // some reshuffling of intermediate values into nonvolatile registers around the call. 
+ Thread *thread = GetThread(); + + SIZE_T totalSize = Utf8StringObject::GetSize(stringLength); + + // The method table's base size includes space for a terminating null character + _ASSERTE(totalSize >= g_pUtf8StringClass->GetBaseSize()); + _ASSERTE(totalSize - g_pUtf8StringClass->GetBaseSize() == stringLength); + + SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); + _ASSERTE(alignedTotalSize >= totalSize); + totalSize = alignedTotalSize; + + gc_alloc_context *allocContext = thread->GetAllocContext(); + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr <= allocContext->alloc_limit); + if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr)) + { + break; + } + allocContext->alloc_ptr = allocPtr + totalSize; + + _ASSERTE(allocPtr != nullptr); + Utf8StringObject *stringObject = reinterpret_cast<Utf8StringObject *>(allocPtr); + stringObject->SetMethodTable(g_pUtf8StringClass); + stringObject->SetLength(stringLength); + + return stringObject; + } while (false); + + // Tail call to the slow helper + ENDFORBIDGC(); + return HCCALL1(FramedAllocateUtf8String, stringLength); +} +HCIMPLEND +#endif // FEATURE_UTF8STRING + #include <optdefault.h> /*********************************************************************/ @@ -2933,6 +2988,22 @@ HCIMPL1(StringObject*, FramedAllocateString, DWORD stringLength) } HCIMPLEND +#ifdef FEATURE_UTF8STRING +HCIMPL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength) +{ + FCALL_CONTRACT; + + UTF8STRINGREF result = NULL; + HELPER_METHOD_FRAME_BEGIN_RET_0(); // Set up a frame + + result = SlowAllocateUtf8String(stringLength); + + HELPER_METHOD_FRAME_END(); + return((Utf8StringObject*) OBJECTREFToObject(result)); +} +HCIMPLEND +#endif // FEATURE_UTF8STRING + /*********************************************************************/ OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok) { diff --git a/src/vm/jitinterface.cpp b/src/vm/jitinterface.cpp index 
af5fdbac33..b3ede3baa2 100644 --- a/src/vm/jitinterface.cpp +++ b/src/vm/jitinterface.cpp @@ -7514,6 +7514,9 @@ bool getILIntrinsicImplementationForRuntimeHelpers(MethodDesc * ftn, if (methodTable == MscorlibBinder::GetClass(CLASS__BOOLEAN) || methodTable == MscorlibBinder::GetClass(CLASS__BYTE) || methodTable == MscorlibBinder::GetClass(CLASS__SBYTE) +#ifdef FEATURE_UTF8STRING + || methodTable == MscorlibBinder::GetClass(CLASS__CHAR8) +#endif // FEATURE_UTF8STRING || methodTable == MscorlibBinder::GetClass(CLASS__CHAR) || methodTable == MscorlibBinder::GetClass(CLASS__INT16) || methodTable == MscorlibBinder::GetClass(CLASS__UINT16) diff --git a/src/vm/jitinterface.h b/src/vm/jitinterface.h index fe7dd4a922..af42bd29ab 100644 --- a/src/vm/jitinterface.h +++ b/src/vm/jitinterface.h @@ -231,6 +231,11 @@ extern FCDECL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength extern FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength); extern FCDECL1(StringObject*, FramedAllocateString, DWORD stringLength); +#ifdef FEATURE_UTF8STRING +extern FCDECL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength); +extern FCDECL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength); +#endif // FEATURE_UTF8STRING + extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); extern FCDECL2(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size); extern FCDECL2(Object*, JIT_NewArr1_R2R, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size); diff --git a/src/vm/jitinterfacegen.cpp b/src/vm/jitinterfacegen.cpp index f86011d3ef..3a5b618c26 100644 --- a/src/vm/jitinterfacegen.cpp +++ b/src/vm/jitinterfacegen.cpp @@ -80,6 +80,9 @@ void InitJITHelpers1() SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_FastPortable); ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateString_MP_FastPortable), ECall::FastAllocateString); +#ifdef 
FEATURE_UTF8STRING + ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String); +#endif // FEATURE_UTF8STRING #else // FEATURE_PAL // if (multi-proc || server GC) if (GCHeapUtilities::UseThreadAllocationContexts()) @@ -91,6 +94,9 @@ void InitJITHelpers1() SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_MP_InlineGetThread); ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP_InlineGetThread), ECall::FastAllocateString); +#ifdef FEATURE_UTF8STRING + ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String); +#endif // FEATURE_UTF8STRING } else { @@ -105,6 +111,9 @@ void InitJITHelpers1() SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP); ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString); +#ifdef FEATURE_UTF8STRING + ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateUtf8String_MP_FastPortable), ECall::FastAllocateUtf8String); +#endif // FEATURE_UTF8STRING } #endif // FEATURE_PAL } diff --git a/src/vm/marshalnative.cpp b/src/vm/marshalnative.cpp index 334a4a88e4..23df97dcb7 100644 --- a/src/vm/marshalnative.cpp +++ b/src/vm/marshalnative.cpp @@ -266,6 +266,11 @@ FCIMPL1(FC_BOOL_RET, MarshalNative::IsPinnable, Object* obj) if (obj->GetMethodTable() == g_pStringClass) FC_RETURN_BOOL(TRUE); +#ifdef FEATURE_UTF8STRING + if (obj->GetMethodTable() == g_pUtf8StringClass) + FC_RETURN_BOOL(TRUE); +#endif // FEATURE_UTF8STRING + if (obj->GetMethodTable()->IsArray()) { BASEARRAYREF asArray = (BASEARRAYREF)ObjectToOBJECTREF(obj); @@ -527,6 +532,11 @@ void ValidatePinnedObject(OBJECTREF obj) if (obj->GetMethodTable() == g_pStringClass) return; +#ifdef FEATURE_UTF8STRING + if (obj->GetMethodTable() == g_pUtf8StringClass) + return; +#endif // FEATURE_UTF8STRING + if (obj->GetMethodTable()->IsArray()) { BASEARRAYREF asArray = (BASEARRAYREF) 
obj; diff --git a/src/vm/metasig.h b/src/vm/metasig.h index 5321fd3ee3..5e0a821e44 100644 --- a/src/vm/metasig.h +++ b/src/vm/metasig.h @@ -402,6 +402,7 @@ DEFINE_METASIG(IM(Bool_Bool_RetStr, F F, s)) DEFINE_METASIG(IM(PtrChar_RetVoid, P(u), v)) DEFINE_METASIG(IM(PtrChar_Int_Int_RetVoid, P(u) i i, v)) +DEFINE_METASIG_T(IM(ReadOnlySpanOfByte_RetVoid, GI(g(READONLY_SPAN), 1, b), v)) DEFINE_METASIG_T(IM(ReadOnlySpanOfChar_RetVoid, GI(g(READONLY_SPAN), 1, u), v)) DEFINE_METASIG(IM(PtrSByt_RetVoid, P(B), v)) DEFINE_METASIG(IM(PtrSByt_Int_Int_RetVoid, P(B) i i, v)) @@ -420,6 +421,19 @@ DEFINE_METASIG(IM(PtrSByt_Int_Int_RetStr, P(B) i i, s)) DEFINE_METASIG_T(IM(PtrSByt_Int_Int_Encoding_RetStr, P(B) i i C(ENCODING), s)) DEFINE_METASIG(IM(Obj_Int_RetIntPtr, j i, I)) +DEFINE_METASIG(IM(ArrByte_Int_Int_RetVoid, a(b) i i, v)) +DEFINE_METASIG(IM(PtrByte_RetVoid, P(b), v)) + +#ifdef FEATURE_UTF8STRING +DEFINE_METASIG_T(IM(ReadOnlySpanOfByte_RetUtf8Str, GI(g(READONLY_SPAN), 1, b), C(UTF8_STRING))) +DEFINE_METASIG_T(IM(ReadOnlySpanOfChar_RetUtf8Str, GI(g(READONLY_SPAN), 1, u), C(UTF8_STRING))) +DEFINE_METASIG_T(IM(ArrByte_Int_Int_RetUtf8Str, a(b) i i, C(UTF8_STRING))) +DEFINE_METASIG_T(IM(PtrByte_RetUtf8Str, P(b), C(UTF8_STRING))) +DEFINE_METASIG_T(IM(ArrChar_Int_Int_RetUtf8Str, a(u) i i, C(UTF8_STRING))) +DEFINE_METASIG_T(IM(PtrChar_RetUtf8Str, P(u), C(UTF8_STRING))) +DEFINE_METASIG_T(IM(String_RetUtf8Str, s, C(UTF8_STRING))) +#endif // FEATURE_UTF8STRING + DEFINE_METASIG(IM(Char_Char_RetStr, u u, s)) DEFINE_METASIG(IM(Char_Int_RetVoid, u i, v)) DEFINE_METASIG_T(SM(RetCultureInfo, _, C(CULTURE_INFO))) diff --git a/src/vm/methodtable.h b/src/vm/methodtable.h index 9f9b25e37b..84f8399dc2 100644 --- a/src/vm/methodtable.h +++ b/src/vm/methodtable.h @@ -1743,7 +1743,7 @@ public: BOOL IsString() { LIMITED_METHOD_DAC_CONTRACT; - return HasComponentSize() && !IsArray(); + return HasComponentSize() && !IsArray() && RawGetComponentSize() == 2; } BOOL HasComponentSize() const diff --git 
a/src/vm/methodtablebuilder.cpp b/src/vm/methodtablebuilder.cpp index 568a23136e..d4ce5b0df0 100644 --- a/src/vm/methodtablebuilder.cpp +++ b/src/vm/methodtablebuilder.cpp @@ -9711,6 +9711,19 @@ void MethodTableBuilder::CheckForSystemTypes() pMT->SetComponentSize(2); } +#ifdef FEATURE_UTF8STRING + else if (strcmp(name, g_Utf8StringName) == 0 && strcmp(nameSpace, g_SystemNS) == 0) + { + // Utf8Strings are not "normal" objects, so we need to mess with their method table a bit + // so that the GC can figure out how big each string is... + DWORD baseSize = Utf8StringObject::GetBaseSize(); + pMT->SetBaseSize(baseSize); // NULL character included + + GetHalfBakedClass()->SetBaseSizePadding(baseSize - bmtFP->NumInstanceFieldBytes); + + pMT->SetComponentSize(1); + } +#endif // FEATURE_UTF8STRING else if (strcmp(name, g_CriticalFinalizerObjectName) == 0 && strcmp(nameSpace, g_ConstrainedExecutionNS) == 0) { // To introduce a class with a critical finalizer, diff --git a/src/vm/mscorlib.h b/src/vm/mscorlib.h index 264408f26a..c54a635abe 100644 --- a/src/vm/mscorlib.h +++ b/src/vm/mscorlib.h @@ -322,6 +322,10 @@ DEFINE_CLASS(ENCODING, Text, Encoding) DEFINE_CLASS(RUNE, Text, Rune) +#ifdef FEATURE_UTF8STRING +DEFINE_CLASS(CHAR8, System, Char8) +#endif // FEATURE_UTF8STRING + DEFINE_CLASS(ENUM, System, Enum) DEFINE_CLASS(ENVIRONMENT, System, Environment) @@ -818,6 +822,17 @@ DEFINE_METHOD(STRING, WCSLEN, wcslen, DEFINE_METHOD(STRING, STRLEN, strlen, SM_PtrByte_RetInt) DEFINE_PROPERTY(STRING, LENGTH, Length, Int) +#ifdef FEATURE_UTF8STRING +DEFINE_CLASS(UTF8_STRING, System, Utf8String) +DEFINE_METHOD(UTF8_STRING, CTORF_READONLYSPANOFBYTE,Ctor, IM_ReadOnlySpanOfByte_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, CTORF_READONLYSPANOFCHAR,Ctor, IM_ReadOnlySpanOfChar_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, CTORF_BYTEARRAY_START_LEN,Ctor, IM_ArrByte_Int_Int_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, CTORF_BYTEPTR, Ctor, IM_PtrByte_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, 
CTORF_CHARARRAY_START_LEN,Ctor, IM_ArrChar_Int_Int_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, CTORF_CHARPTR, Ctor, IM_PtrChar_RetUtf8Str) +DEFINE_METHOD(UTF8_STRING, CTORF_STRING, Ctor, IM_String_RetUtf8Str) +#endif // FEATURE_UTF8STRING + DEFINE_CLASS(STRING_BUILDER, Text, StringBuilder) DEFINE_PROPERTY(STRING_BUILDER, LENGTH, Length, Int) DEFINE_PROPERTY(STRING_BUILDER, CAPACITY, Capacity, Int) diff --git a/src/vm/object.h b/src/vm/object.h index 6bc3a74471..9087afa4a5 100644 --- a/src/vm/object.h +++ b/src/vm/object.h @@ -35,7 +35,10 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref); * | sync block index, which is at a negative offset * | * +-- code:StringObject - String objects are specialized objects for string - * | storage/retrieval for higher performance + * | storage/retrieval for higher performance (UCS-2 / UTF-16 data) + * | + * +-- code:Utf8StringObject - String objects are specialized objects for string + * | storage/retrieval for higher performance (UTF-8 data) * | * +-- BaseObjectWithCachedData - Object Plus one object field for caching. * | | @@ -870,6 +873,9 @@ typedef DPTR(UPTRArray) PTR_UPTRArray; typedef DPTR(PTRArray) PTR_PTRArray; class StringObject; +#ifdef FEATURE_UTF8STRING +class Utf8StringObject; +#endif // FEATURE_UTF8STRING #ifdef USE_CHECKED_OBJECTREFS typedef REF<ArrayBase> BASEARRAYREF; @@ -888,6 +894,9 @@ typedef REF<UPTRArray> UPTRARRAYREF; typedef REF<CHARArray> CHARARRAYREF; typedef REF<PTRArray> PTRARRAYREF; // Warning: Use PtrArray only for single dimensional arrays, not multidim arrays. typedef REF<StringObject> STRINGREF; +#ifdef FEATURE_UTF8STRING +typedef REF<Utf8StringObject> UTF8STRINGREF; +#endif // FEATURE_UTF8STRING #else // USE_CHECKED_OBJECTREFS @@ -907,6 +916,9 @@ typedef PTR_UPTRArray UPTRARRAYREF; typedef PTR_CHARArray CHARARRAYREF; typedef PTR_PTRArray PTRARRAYREF; // Warning: Use PtrArray only for single dimensional arrays, not multidim arrays. 
typedef PTR_StringObject STRINGREF; +#ifdef FEATURE_UTF8STRING +typedef PTR_Utf8StringObject UTF8STRINGREF; +#endif // FEATURE_UTF8STRING #endif // USE_CHECKED_OBJECTREFS @@ -1199,6 +1211,56 @@ public: }; +#ifdef FEATURE_UTF8STRING +class Utf8StringObject : public Object +{ +#ifdef DACCESS_COMPILE + friend class ClrDataAccess; +#endif + +private: + DWORD m_StringLength; + BYTE m_FirstChar; + +public: + VOID SetLength(DWORD len) { LIMITED_METHOD_CONTRACT; _ASSERTE(len >= 0); m_StringLength = len; } + +protected: + Utf8StringObject() { LIMITED_METHOD_CONTRACT; } + ~Utf8StringObject() { LIMITED_METHOD_CONTRACT; } + +public: + + /*=================RefInterpretGetStringValuesDangerousForGC====================== + **N.B.: This performs no range checking and relies on the caller to have done this. + **Args: (IN)ref -- the Utf8String to be interpreted. + ** (OUT)chars -- a pointer to the characters in the buffer. + ** (OUT)length -- a pointer to the length of the buffer. + **Returns: void. + **Exceptions: None. + ==============================================================================*/ + // !!!! If you use this function, you have to be careful because chars is a pointer + // !!!! to the data buffer of ref. If GC happens after this call, you need to make + // !!!! sure that you have a pin handle on ref, or use GCPROTECT_BEGINPINNING on ref.
+ void RefInterpretGetStringValuesDangerousForGC(__deref_out_ecount(*length + 1) CHAR **chars, int *length) { + WRAPPER_NO_CONTRACT; + + _ASSERTE(GetGCSafeMethodTable() == g_pUtf8StringClass); + *length = GetStringLength(); + *chars = GetBuffer(); +#ifdef _DEBUG + EnableStressHeapHelper(); +#endif + } + + DWORD GetStringLength() { LIMITED_METHOD_DAC_CONTRACT; return( m_StringLength );} + CHAR* GetBuffer() { LIMITED_METHOD_CONTRACT; _ASSERTE(this != nullptr); return (CHAR*)( dac_cast<TADDR>(this) + offsetof(Utf8StringObject, m_FirstChar) ); } + + static DWORD GetBaseSize(); + static SIZE_T GetSize(DWORD stringLength); +}; +#endif // FEATURE_UTF8STRING + // This is the Method version of the Reflection object. // A Method has adddition information. // m_pMD - A pointer to the actual MethodDesc of the method. diff --git a/src/vm/object.inl b/src/vm/object.inl index 9652909250..ebf9d364c9 100644 --- a/src/vm/object.inl +++ b/src/vm/object.inl @@ -71,6 +71,22 @@ __forceinline /*static*/ SIZE_T StringObject::GetSize(DWORD strLen) return GetBaseSize() + strLen * sizeof(WCHAR); } +#ifdef FEATURE_UTF8STRING +__forceinline /*static*/ DWORD Utf8StringObject::GetBaseSize() +{ + LIMITED_METHOD_DAC_CONTRACT; + + return OBJECT_BASESIZE + sizeof(DWORD) /* length */ + sizeof(BYTE) /* null terminator */; +} + +__forceinline /*static*/ SIZE_T Utf8StringObject::GetSize(DWORD strLen) +{ + LIMITED_METHOD_DAC_CONTRACT; + + return GetBaseSize() + strLen; +} +#endif // FEATURE_UTF8STRING + #ifdef DACCESS_COMPILE inline void Object::EnumMemoryRegions(void) diff --git a/src/vm/reflectioninvocation.cpp b/src/vm/reflectioninvocation.cpp index 1f8aa04593..954d6ae267 100644 --- a/src/vm/reflectioninvocation.cpp +++ b/src/vm/reflectioninvocation.cpp @@ -1001,6 +1001,7 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod, // Skip the activation optimization for remoting because of remoting proxy is not always activated. 
// It would be nice to clean this up and get remoting to always activate methodtable behind the proxy. BOOL fForceActivationForRemoting = FALSE; + BOOL fCtorOfVariableSizedObject = FALSE; if (fConstructor) { @@ -1018,7 +1019,8 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod, MethodTable * pMT = ownerType.AsMethodTable(); { - if (pMT != g_pStringClass) + fCtorOfVariableSizedObject = pMT->HasComponentSize(); + if (!fCtorOfVariableSizedObject) gc.retVal = pMT->Allocate(); } } @@ -1324,7 +1326,11 @@ FCIMPL5(Object*, RuntimeMethodHandle::InvokeMethod, if (fConstructor) { // We have a special case for Strings...The object is returned... - if (ownerType == TypeHandle(g_pStringClass)) { + if (ownerType == TypeHandle(g_pStringClass) +#ifdef FEATURE_UTF8STRING + || ownerType == TypeHandle(g_pUtf8StringClass) +#endif // FEATURE_UTF8STRING + ) { PVOID pReturnValue = &callDescrData.returnValue; gc.retVal = *(OBJECTREF *)pReturnValue; } @@ -2590,8 +2596,12 @@ FCIMPL1(Object*, ReflectionSerialization::GetUninitializedObject, ReflectClassBa MethodTable *pMT = type.GetMethodTable(); PREFIX_ASSUME(pMT != NULL); - //We don't allow unitialized strings. - if (pMT == g_pStringClass) { + //We don't allow unitialized Strings or Utf8Strings. 
+ if (pMT == g_pStringClass +#ifdef FEATURE_UTF8STRING + || pMT == g_pUtf8StringClass +#endif // FEATURE_UTF8STRING + ) { COMPlusThrow(kArgumentException, W("Argument_NoUninitializedStrings")); } diff --git a/src/vm/vars.cpp b/src/vm/vars.cpp index 179acda8af..8b329d4c2e 100644 --- a/src/vm/vars.cpp +++ b/src/vm/vars.cpp @@ -61,6 +61,9 @@ GPTR_IMPL(MethodTable, g_pObjectClass); GPTR_IMPL(MethodTable, g_pRuntimeTypeClass); GPTR_IMPL(MethodTable, g_pCanonMethodTableClass); // System.__Canon GPTR_IMPL(MethodTable, g_pStringClass); +#ifdef FEATURE_UTF8STRING +GPTR_IMPL(MethodTable, g_pUtf8StringClass); +#endif // FEATURE_UTF8STRING GPTR_IMPL(MethodTable, g_pArrayClass); GPTR_IMPL(MethodTable, g_pSZArrayHelperClass); GPTR_IMPL(MethodTable, g_pNullableClass); diff --git a/src/vm/vars.hpp b/src/vm/vars.hpp index 91ad42a91c..d8ffc60e25 100644 --- a/src/vm/vars.hpp +++ b/src/vm/vars.hpp @@ -79,6 +79,9 @@ class LoaderHeap; class IGCHeap; class Object; class StringObject; +#ifdef FEATURE_UTF8STRING +class Utf8StringObject; +#endif // FEATURE_UTF8STRING class ArrayClass; class MethodTable; class MethodDesc; @@ -313,6 +316,10 @@ class REF : public OBJECTREF #define OBJECTREFToObject(objref) ((objref).operator-> ()) #define ObjectToSTRINGREF(obj) (STRINGREF(obj)) #define STRINGREFToObject(objref) (*( (StringObject**) &(objref) )) +#ifdef FEATURE_UTF8STRING +#define ObjectToUTF8STRINGREF(obj) (UTF8STRINGREF(obj)) +#define UTF8STRINGREFToObject(objref) (*( (Utf8StringObject**) &(objref) )) +#endif // FEATURE_UTF8STRING #else // _DEBUG_IMPL @@ -323,6 +330,10 @@ class REF : public OBJECTREF #define OBJECTREFToObject(objref) ((PTR_Object) (objref)) #define ObjectToSTRINGREF(obj) ((PTR_StringObject) (obj)) #define STRINGREFToObject(objref) ((PTR_StringObject) (objref)) +#ifdef FEATURE_UTF8STRING +#define ObjectToUTF8STRINGREF(obj) ((PTR_Utf8StringObject) (obj)) +#define UTF8STRINGREFToObject(objref) ((PTR_Utf8StringObject) (objref)) +#endif // FEATURE_UTF8STRING #endif // _DEBUG_IMPL 
@@ -363,6 +374,9 @@ GPTR_DECL(MethodTable, g_pObjectClass); GPTR_DECL(MethodTable, g_pRuntimeTypeClass); GPTR_DECL(MethodTable, g_pCanonMethodTableClass); // System.__Canon GPTR_DECL(MethodTable, g_pStringClass); +#ifdef FEATURE_UTF8STRING +GPTR_DECL(MethodTable, g_pUtf8StringClass); +#endif // FEATURE_UTF8STRING GPTR_DECL(MethodTable, g_pArrayClass); GPTR_DECL(MethodTable, g_pSZArrayHelperClass); GPTR_DECL(MethodTable, g_pNullableClass); |