diff options
Diffstat (limited to 'src/mscorlib/shared/System/Text')
-rw-r--r-- | src/mscorlib/shared/System/Text/ASCIIEncoding.cs | 973 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/Decoder.cs | 339 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/Encoder.cs | 333 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/EncodingInfo.cs | 72 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/EncodingNLS.cs | 322 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/EncodingProvider.cs | 136 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/Normalization.cs | 29 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/StringBuilder.cs | 2409 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/UTF32Encoding.cs | 1234 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/UTF8Encoding.cs | 2668 | ||||
-rw-r--r-- | src/mscorlib/shared/System/Text/UnicodeEncoding.cs | 2058 |
11 files changed, 10573 insertions, 0 deletions
diff --git a/src/mscorlib/shared/System/Text/ASCIIEncoding.cs b/src/mscorlib/shared/System/Text/ASCIIEncoding.cs new file mode 100644 index 0000000000..e5c1194849 --- /dev/null +++ b/src/mscorlib/shared/System/Text/ASCIIEncoding.cs @@ -0,0 +1,973 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Diagnostics; +using System.Diagnostics.Contracts; +using System.Runtime.Serialization; + +namespace System.Text +{ + // ASCIIEncoding + // + // Note that ASCIIEncoding is optimized with no best fit and ? for fallback. + // It doesn't come in other flavors. + // + // Note: ASCIIEncoding is the only encoding that doesn't do best fit (windows has best fit). + // + // Note: IsAlwaysNormalized remains false because 1/2 the code points are unassigned, so they'd + // use fallbacks, and we cannot guarantee that fallbacks are normalized. + + [Serializable] + public class ASCIIEncoding : Encoding + { + // Allow for devirtualization (see https://github.com/dotnet/coreclr/pull/9230) + [Serializable] + internal sealed class ASCIIEncodingSealed : ASCIIEncoding { } + + // Used by Encoding.ASCII for lazy initialization + // The initialization code will not be run until a static member of the class is referenced + internal static readonly ASCIIEncodingSealed s_default = new ASCIIEncodingSealed(); + + public ASCIIEncoding() : base(Encoding.CodePageASCII) + { + } + + internal override void SetDefaultFallbacks() + { + // For ASCIIEncoding we just use default replacement fallback + this.encoderFallback = EncoderFallback.ReplacementFallback; + this.decoderFallback = DecoderFallback.ReplacementFallback; + } + + // WARNING: GetByteCount(string chars), GetBytes(string chars,...), and GetString(byte[] byteIndex...) 
+ // WARNING: have different variable names than EncodingNLS.cs, so this can't just be cut & pasted, + // WARNING: or it'll break VB's way of calling these. + // + // The following methods are copied from EncodingNLS.cs. + // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here. + // These should be kept in sync for the following classes: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + // Returns the number of bytes required to encode a range of characters in + // a character array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(char[] chars, int index, int count) + { + // Validate input parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - index < count) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input, return 0, avoid fixed empty array problem + if (count == 0) + return 0; + + // Just call the pointer version + fixed (char* pChars = chars) + return GetByteCount(pChars + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(String chars) + { + // Validate input + if (chars==null) + throw new ArgumentNullException("chars"); + Contract.EndContractBlock(); + + fixed (char* pChars = chars) + return GetByteCount(pChars, chars.Length, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetByteCount(char* chars, int count) + { + // Validate Parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (count < 0) + throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Call it with empty encoder + return GetByteCount(chars, count, null); + } + + // Parent method is safe. + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + public override unsafe int GetBytes(String chars, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + if (chars == null || bytes == null) + throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? 
"charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCount); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + int byteCount = bytes.Length - byteIndex; + + // Fixed doesn't like empty byte arrays + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0]) + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // Encodes a range of characters in a character array into a range of bytes + // in a byte array. An exception occurs if the byte array is not large + // enough to hold the complete encoding of the characters. The + // GetByteCount method can be used to determine the exact number of + // bytes that will be produced for a given range of characters. + // Alternatively, the GetMaxByteCount method can be used to + // determine the maximum number of bytes that will be produced for a given + // number of characters, regardless of the actual character values. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetBytes(char[] chars, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + // Validate parameters + if (chars == null || bytes == null) + throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? 
"charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + // If nothing to encode return 0, avoid fixed problem + if (charCount == 0) + return 0; + + // Just call pointer version + int byteCount = bytes.Length - byteIndex; + + // Fixed doesn't like empty byte arrays + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0]) + // Remember that byteCount is # to decode, not size of array. + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetBytes(chars, charCount, bytes, byteCount, null); + } + + // Returns the number of characters produced by decoding a range of bytes + // in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetCharCount(byte[] bytes, int index, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (bytes.Length - index < count) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input just return 0, fixed doesn't like 0 length arrays + if (count == 0) + return 0; + + // Just call pointer version + fixed (byte* pBytes = bytes) + return GetCharCount(pBytes + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetCharCount(byte* bytes, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (count < 0) + throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetCharCount(bytes, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount, + char[] chars, int charIndex) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (byteIndex < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if ( bytes.Length - byteIndex < byteCount) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + + if (charIndex < 0 || charIndex > chars.Length) + throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + // If no input, return 0 & avoid fixed problem + if (byteCount == 0) + return 0; + + // Just call pointer version + int charCount = chars.Length - charIndex; + + // Fixed doesn't like empty char arrays + if (chars.Length == 0) + chars = new char[1]; + + fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0]) + // Remember that charCount is # to decode, not size of array + return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? 
"charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetChars(bytes, byteCount, chars, charCount, null); + } + + // Returns a string containing the decoded representation of a range of + // bytes in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe String GetString(byte[] bytes, int byteIndex, int byteCount) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (byteIndex < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + + if (bytes.Length - byteIndex < byteCount) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // Avoid problems with empty input buffer + if (byteCount == 0) return String.Empty; + + fixed (byte* pBytes = bytes) + return String.CreateStringFromEncoding( + pBytes + byteIndex, byteCount, this); + } + + // + // End of standard methods copied from EncodingNLS.cs + // + + // GetByteCount + // Note: We start by assuming that the output will be the same as count. Having + // an encoder or fallback may change that assumption + internal override unsafe int GetByteCount(char* chars, int charCount, EncoderNLS encoder) + { + // Just need to ASSERT, this is called by something else internal that checked parameters already + Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetByteCount]count is negative"); + Debug.Assert(chars != null, "[ASCIIEncoding.GetByteCount]chars is null"); + + // Assert because we shouldn't be able to have a null encoder. 
+ Debug.Assert(encoderFallback != null, "[ASCIIEncoding.GetByteCount]Attempting to use null fallback encoder"); + + char charLeftOver = (char)0; + EncoderReplacementFallback fallback = null; + + // Start by assuming default count, then +/- for fallback characters + char* charEnd = chars + charCount; + + // For fallback we may need a fallback buffer, we know we aren't default fallback. + EncoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), + "[ASCIIEncoding.GetByteCount]leftover character should be high surrogate"); + + fallback = encoder.Fallback as EncoderReplacementFallback; + + // We mustn't have left over fallback data when counting + if (encoder.InternalHasFallbackBuffer) + { + // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary + fallbackBuffer = encoder.FallbackBuffer; + if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow) + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); + } + + // Verify that we have no fallbackbuffer, for ASCII its always empty, so just assert + Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer || + encoder.FallbackBuffer.Remaining == 0, + "[ASCIICodePageEncoding.GetByteCount]Expected empty fallback buffer"); + } + else + { + fallback = this.EncoderFallback as EncoderReplacementFallback; + } + + // If we have an encoder AND we aren't using default fallback, + // then we may have a complicated count. + if (fallback != null && fallback.MaxCharCount == 1) + { + // Replacement fallback encodes surrogate pairs as two ?? (or two whatever), so return size is always + // same as input size. 
+ // Note that no existing SBCS code pages map code points to supplimentary characters, so this is easy. + + // We could however have 1 extra byte if the last call had an encoder and a funky fallback and + // if we don't use the funky fallback this time. + + // Do we have an extra char left over from last time? + if (charLeftOver > 0) + charCount++; + + return (charCount); + } + + // Count is more complicated if you have a funky fallback + // For fallback we may need a fallback buffer, we know we're not default fallback + int byteCount = 0; + + // We may have a left over character from last time, try and process it. + if (charLeftOver > 0) + { + Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ASCIIEncoding.GetByteCount]leftover character should be high surrogate"); + Debug.Assert(encoder != null, "[ASCIIEncoding.GetByteCount]Expected encoder"); + + // Since left over char was a surrogate, it'll have to be fallen back. + // Get Fallback + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); + + // This will fallback a pair if *chars is a low surrogate + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + } + + // Now we may have fallback char[] already from the encoder + + // Go ahead and do it, including the fallback. + char ch; + while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || + chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, just get next char + ch = *chars; + chars++; + } + + // Check for fallback, this'll catch surrogate pairs too. + // no chars >= 0x80 are allowed. 
+ if (ch > 0x7f) + { + if (fallbackBuffer == null) + { + // Initialize the buffer + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, false); + } + + // Get Fallback + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + continue; + } + + // We'll use this one + byteCount++; + } + + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[ASCIIEncoding.GetByteCount]Expected Empty fallback buffer"); + + return byteCount; + } + + internal override unsafe int GetBytes(char* chars, int charCount, + byte* bytes, int byteCount, EncoderNLS encoder) + { + // Just need to ASSERT, this is called by something else internal that checked parameters already + Debug.Assert(bytes != null, "[ASCIIEncoding.GetBytes]bytes is null"); + Debug.Assert(byteCount >= 0, "[ASCIIEncoding.GetBytes]byteCount is negative"); + Debug.Assert(chars != null, "[ASCIIEncoding.GetBytes]chars is null"); + Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetBytes]charCount is negative"); + + // Assert because we shouldn't be able to have a null encoder. + Debug.Assert(encoderFallback != null, "[ASCIIEncoding.GetBytes]Attempting to use null encoder fallback"); + + // Get any left over characters + char charLeftOver = (char)0; + EncoderReplacementFallback fallback = null; + + // For fallback we may need a fallback buffer, we know we aren't default fallback. 
+ EncoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + // prepare our end + char* charEnd = chars + charCount; + byte* byteStart = bytes; + char* charStart = chars; + + if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + fallback = encoder.Fallback as EncoderReplacementFallback; + + // We mustn't have left over fallback data when counting + if (encoder.InternalHasFallbackBuffer) + { + // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary + fallbackBuffer = encoder.FallbackBuffer; + if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow) + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), + "[ASCIIEncoding.GetBytes]leftover character should be high surrogate"); + + // Verify that we have no fallbackbuffer, for ASCII its always empty, so just assert + Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer || + encoder.FallbackBuffer.Remaining == 0, + "[ASCIICodePageEncoding.GetBytes]Expected empty fallback buffer"); + } + else + { + fallback = this.EncoderFallback as EncoderReplacementFallback; + } + + + // See if we do the fast default or slightly slower fallback + if (fallback != null && fallback.MaxCharCount == 1) + { + // Fast version + char cReplacement = fallback.DefaultString[0]; + + // Check for replacements in range, otherwise fall back to slow version. + if (cReplacement <= (char)0x7f) + { + // We should have exactly as many output bytes as input bytes, unless there's a left + // over character, in which case we may need one more. + // If we had a left over character will have to add a ? (This happens if they had a funky + // fallback last time, but not this time.) 
(We can't spit any out though + // because with fallback encoder each surrogate is treated as a seperate code point) + if (charLeftOver > 0) + { + // Have to have room + // Throw even if doing no throw version because this is just 1 char, + // so buffer will never be big enough + if (byteCount == 0) + ThrowBytesOverflow(encoder, true); + + // This'll make sure we still have more room and also make sure our return value is correct. + *(bytes++) = (byte)cReplacement; + byteCount--; // We used one of the ones we were counting. + } + + // This keeps us from overrunning our output buffer + if (byteCount < charCount) + { + // Throw or make buffer smaller? + ThrowBytesOverflow(encoder, byteCount < 1); + + // Just use what we can + charEnd = chars + byteCount; + } + + // We just do a quick copy + while (chars < charEnd) + { + char ch2 = *(chars++); + if (ch2 >= 0x0080) *(bytes++) = (byte)cReplacement; + else *(bytes++) = unchecked((byte)(ch2)); + } + + // Clear encoder + if (encoder != null) + { + encoder.charLeftOver = (char)0; + encoder.m_charsUsed = (int)(chars - charStart); + } + + return (int)(bytes - byteStart); + } + } + + // Slower version, have to do real fallback. + + // prepare our end + byte* byteEnd = bytes + byteCount; + + // We may have a left over character from last time, try and process it. + if (charLeftOver > 0) + { + // Initialize the buffer + Debug.Assert(encoder != null, + "[ASCIIEncoding.GetBytes]Expected non null encoder if we have surrogate left over"); + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(chars, charEnd, encoder, true); + + // Since left over char was a surrogate, it'll have to be fallen back. 
+ // Get Fallback + // This will fallback a pair if *chars is a low surrogate + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + } + + // Now we may have fallback char[] already from the encoder + + // Go ahead and do it, including the fallback. + char ch; + while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || + chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, just get next char + ch = *chars; + chars++; + } + + // Check for fallback, this'll catch surrogate pairs too. + // All characters >= 0x80 must fall back. + if (ch > 0x7f) + { + // Initialize the buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, true); + } + + // Get Fallback + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + + // Go ahead & continue (& do the fallback) + continue; + } + + // We'll use this one + // Bounds check + if (bytes >= byteEnd) + { + // didn't use this char, we'll throw or use buffer + if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) + { + Debug.Assert(chars > charStart || bytes == byteStart, + "[ASCIIEncoding.GetBytes]Expected chars to have advanced already."); + chars--; // don't use last char + } + else + fallbackBuffer.MovePrevious(); + + // Are we throwing or using buffer? + ThrowBytesOverflow(encoder, bytes == byteStart); // throw? 
+ break; // don't throw, stop + } + + // Go ahead and add it + *bytes = unchecked((byte)ch); + bytes++; + } + + // Need to do encoder stuff + if (encoder != null) + { + // Fallback stuck it in encoder if necessary, but we have to clear MustFlush cases + if (fallbackBuffer != null && !fallbackBuffer.bUsedEncoder) + // Clear it in case of MustFlush + encoder.charLeftOver = (char)0; + + // Set our chars used count + encoder.m_charsUsed = (int)(chars - charStart); + } + + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || + (encoder != null && !encoder.m_throwOnOverflow), + "[ASCIIEncoding.GetBytes]Expected Empty fallback buffer at end"); + + return (int)(bytes - byteStart); + } + + // This is internal and called by something else, + internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder) + { + // Just assert, we're called internally so these should be safe, checked already + Debug.Assert(bytes != null, "[ASCIIEncoding.GetCharCount]bytes is null"); + Debug.Assert(count >= 0, "[ASCIIEncoding.GetCharCount]byteCount is negative"); + + // ASCII doesn't do best fit, so don't have to check for it, find out which decoder fallback we're using + DecoderReplacementFallback fallback = null; + + if (decoder == null) + fallback = this.DecoderFallback as DecoderReplacementFallback; + else + { + fallback = decoder.Fallback as DecoderReplacementFallback; + Debug.Assert(!decoder.m_throwOnOverflow || !decoder.InternalHasFallbackBuffer || + decoder.FallbackBuffer.Remaining == 0, + "[ASCIICodePageEncoding.GetCharCount]Expected empty fallback buffer"); + } + + if (fallback != null && fallback.MaxCharCount == 1) + { + // Just return length, SBCS stay the same length because they don't map to surrogate + // pairs and we don't have a decoder fallback. 
+ + return count; + } + + // Only need decoder fallback buffer if not using default replacement fallback, no best fit for ASCII + DecoderFallbackBuffer fallbackBuffer = null; + + // Have to do it the hard way. + // Assume charCount will be == count + int charCount = count; + byte[] byteBuffer = new byte[1]; + + // Do it our fast way + byte* byteEnd = bytes + count; + + // Quick loop + while (bytes < byteEnd) + { + // Faster if don't use *bytes++; + byte b = *bytes; + bytes++; + + // If unknown we have to do fallback count + if (b >= 0x80) + { + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(byteEnd - count, null); + } + + // Use fallback buffer + byteBuffer[0] = b; + charCount--; // Have to unreserve the one we already allocated for b + charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); + } + } + + // Fallback buffer must be empty + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[ASCIIEncoding.GetCharCount]Expected Empty fallback buffer"); + + // Converted sequence is same length as input + return charCount; + } + + internal override unsafe int GetChars(byte* bytes, int byteCount, + char* chars, int charCount, DecoderNLS decoder) + { + // Just need to ASSERT, this is called by something else internal that checked parameters already + Debug.Assert(bytes != null, "[ASCIIEncoding.GetChars]bytes is null"); + Debug.Assert(byteCount >= 0, "[ASCIIEncoding.GetChars]byteCount is negative"); + Debug.Assert(chars != null, "[ASCIIEncoding.GetChars]chars is null"); + Debug.Assert(charCount >= 0, "[ASCIIEncoding.GetChars]charCount is negative"); + + // Do it fast way if using ? 
replacement fallback + byte* byteEnd = bytes + byteCount; + byte* byteStart = bytes; + char* charStart = chars; + + // Note: ASCII doesn't do best fit, but we have to fallback if they use something > 0x7f + // Only need decoder fallback buffer if not using ? fallback. + // ASCII doesn't do best fit, so don't have to check for it, find out which decoder fallback we're using + DecoderReplacementFallback fallback = null; + char* charsForFallback; + + if (decoder == null) + fallback = this.DecoderFallback as DecoderReplacementFallback; + else + { + fallback = decoder.Fallback as DecoderReplacementFallback; + Debug.Assert(!decoder.m_throwOnOverflow || !decoder.InternalHasFallbackBuffer || + decoder.FallbackBuffer.Remaining == 0, + "[ASCIICodePageEncoding.GetChars]Expected empty fallback buffer"); + } + + if (fallback != null && fallback.MaxCharCount == 1) + { + // Try it the fast way + char replacementChar = fallback.DefaultString[0]; + + // Need byteCount chars, otherwise too small buffer + if (charCount < byteCount) + { + // Need at least 1 output byte, throw if must throw + ThrowCharsOverflow(decoder, charCount < 1); + + // Not throwing, use what we can + byteEnd = bytes + charCount; + } + + // Quick loop, just do '?' replacement because we don't have fallbacks for decodings. + while (bytes < byteEnd) + { + byte b = *(bytes++); + if (b >= 0x80) + // This is an invalid byte in the ASCII encoding. 
+ *(chars++) = replacementChar; + else + *(chars++) = unchecked((char)b); + } + + // bytes & chars used are the same + if (decoder != null) + decoder.m_bytesUsed = (int)(bytes - byteStart); + return (int)(chars - charStart); + } + + // Slower way's going to need a fallback buffer + DecoderFallbackBuffer fallbackBuffer = null; + byte[] byteBuffer = new byte[1]; + char* charEnd = chars + charCount; + + // Not quite so fast loop + while (bytes < byteEnd) + { + // Faster if don't use *bytes++; + byte b = *(bytes); + bytes++; + + if (b >= 0x80) + { + // This is an invalid byte in the ASCII encoding. + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + fallbackBuffer.InternalInitialize(byteEnd - byteCount, charEnd); + } + + // Use fallback buffer + byteBuffer[0] = b; + + // Note that chars won't get updated unless this succeeds + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // May or may not throw, but we didn't get this byte + Debug.Assert(bytes > byteStart || chars == charStart, + "[ASCIIEncoding.GetChars]Expected bytes to have advanced already (fallback case)"); + bytes--; // unused byte + fallbackBuffer.InternalReset(); // Didn't fall this back + ThrowCharsOverflow(decoder, chars == charStart); // throw? + break; // don't throw, but stop loop + } + } + else + { + // Make sure we have buffer space + if (chars >= charEnd) + { + Debug.Assert(bytes > byteStart || chars == charStart, + "[ASCIIEncoding.GetChars]Expected bytes to have advanced already (normal case)"); + bytes--; // unused byte + ThrowCharsOverflow(decoder, chars == charStart); // throw? 
+ break; // don't throw, but stop loop + } + + *(chars) = unchecked((char)b); + chars++; + } + } + + // Might have had decoder fallback stuff. + if (decoder != null) + decoder.m_bytesUsed = (int)(bytes - byteStart); + + // Expect Empty fallback buffer for GetChars + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[ASCIIEncoding.GetChars]Expected Empty fallback buffer"); + + return (int)(chars - charStart); + } + + + public override int GetMaxByteCount(int charCount) + { + if (charCount < 0) + throw new ArgumentOutOfRangeException(nameof(charCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Characters would be # of characters + 1 in case high surrogate is ? * max fallback + long byteCount = (long)charCount + 1; + + if (EncoderFallback.MaxCharCount > 1) + byteCount *= EncoderFallback.MaxCharCount; + + // 1 to 1 for most characters. Only surrogates with fallbacks have less. + + if (byteCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow); + return (int)byteCount; + } + + + public override int GetMaxCharCount(int byteCount) + { + if (byteCount < 0) + throw new ArgumentOutOfRangeException(nameof(byteCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Just return length, SBCS stay the same length because they don't map to surrogate + long charCount = (long)byteCount; + + // 1 to 1 for most characters. Only surrogates with fallbacks have less, unknown fallbacks could be longer. + if (DecoderFallback.MaxCharCount > 1) + charCount *= DecoderFallback.MaxCharCount; + + if (charCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow); + + return (int)charCount; + } + + // True if and only if the encoding only uses single byte code points. 
(Ie, ASCII, 1252, etc) + + public override bool IsSingleByte + { + get + { + return true; + } + } + + public override Decoder GetDecoder() + { + return new DecoderNLS(this); + } + + + public override Encoder GetEncoder() + { + return new EncoderNLS(this); + } + } +} diff --git a/src/mscorlib/shared/System/Text/Decoder.cs b/src/mscorlib/shared/System/Text/Decoder.cs new file mode 100644 index 0000000000..b2a003037b --- /dev/null +++ b/src/mscorlib/shared/System/Text/Decoder.cs @@ -0,0 +1,339 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.Serialization; +using System.Text; +using System; +using System.Diagnostics; +using System.Diagnostics.Contracts; + +namespace System.Text +{ + // A Decoder is used to decode a sequence of blocks of bytes into a + // sequence of blocks of characters. Following instantiation of a decoder, + // sequential blocks of bytes are converted into blocks of characters through + // calls to the GetChars method. The decoder maintains state between the + // conversions, allowing it to correctly decode byte sequences that span + // adjacent blocks. + // + // Instances of specific implementations of the Decoder abstract base + // class are typically obtained through calls to the GetDecoder method + // of Encoding objects. + // + [Serializable] + public abstract class Decoder + { + internal DecoderFallback m_fallback = null; + + [NonSerialized] + internal DecoderFallbackBuffer m_fallbackBuffer = null; + + internal void SerializeDecoder(SerializationInfo info) + { + info.AddValue("m_fallback", this.m_fallback); + } + + protected Decoder() + { + // We don't call default reset because default reset probably isn't good if we aren't initialized. 
+ } + + public DecoderFallback Fallback + { + get + { + return m_fallback; + } + + set + { + if (value == null) + throw new ArgumentNullException(nameof(value)); + Contract.EndContractBlock(); + + // Can't change fallback if buffer is wrong + if (m_fallbackBuffer != null && m_fallbackBuffer.Remaining > 0) + throw new ArgumentException( + SR.Argument_FallbackBufferNotEmpty, nameof(value)); + + m_fallback = value; + m_fallbackBuffer = null; + } + } + + // Note: we don't test for threading here because async access to Encoders and Decoders + // doesn't work anyway. + public DecoderFallbackBuffer FallbackBuffer + { + get + { + if (m_fallbackBuffer == null) + { + if (m_fallback != null) + m_fallbackBuffer = m_fallback.CreateFallbackBuffer(); + else + m_fallbackBuffer = DecoderFallback.ReplacementFallback.CreateFallbackBuffer(); + } + + return m_fallbackBuffer; + } + } + + internal bool InternalHasFallbackBuffer + { + get + { + return m_fallbackBuffer != null; + } + } + + // Reset the Decoder + // + // Normally if we call GetChars() and an error is thrown we don't change the state of the Decoder. This + // would allow the caller to correct the error condition and try again (such as if they need a bigger buffer.) + // + // If the caller doesn't want to try again after GetChars() throws an error, then they need to call Reset(). + // + // Virtual implementation has to call GetChars with flush and a big enough buffer to clear a 0 byte string + // We avoid GetMaxCharCount() because a) we can't call the base encoder and b) it might be really big. + public virtual void Reset() + { + byte[] byteTemp = Array.Empty<byte>(); + char[] charTemp = new char[GetCharCount(byteTemp, 0, 0, true)]; + GetChars(byteTemp, 0, 0, charTemp, 0, true); + if (m_fallbackBuffer != null) + m_fallbackBuffer.Reset(); + } + + // Returns the number of characters the next call to GetChars will + // produce if presented with the given range of bytes. 
// The returned value takes into account the state in which the decoder was
// left following the last call to GetChars. The state of the decoder is not
// affected by a call to this method.
//
public abstract int GetCharCount(byte[] bytes, int index, int count);

// Flush-aware overload. This default implementation ignores flush and simply
// forwards to the three-argument overload above; because it is virtual, a
// derived decoder can override it to take flush into account.
public virtual int GetCharCount(byte[] bytes, int index, int count, bool flush)
{
    return GetCharCount(bytes, index, count);
}

// We expect this to be the workhorse for NLS Encodings, but for existing
// ones we need a working (if slow) default implementation.
//
// Copies count bytes from the unmanaged buffer into a temporary managed array
// and delegates to the array-based overload.
//
//   bytes - pointer to the first byte to count; must not be null.
//   count - number of bytes to read from bytes; must not be negative.
//   flush - forwarded to the flush-aware array overload.
//
// Returns whatever the array-based overload reports for the copied bytes.
// Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when count is negative.
[CLSCompliant(false)]
public virtual unsafe int GetCharCount(byte* bytes, int count, bool flush)
{
    // Validate input parameters
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes),
              SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException(nameof(count),
              SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Snapshot the unmanaged input so the array-based overload can be used.
    byte[] arrbyte = new byte[count];
    int index;

    for (index = 0; index < count; index++)
        arrbyte[index] = bytes[index];

    // Forward flush instead of dropping it: GetChars(byte*, ...) passes flush on
    // to its array overload, so counting without it could disagree with the
    // number of chars the matching GetChars call produces.
    return GetCharCount(arrbyte, 0, count, flush);
}

// Decodes a range of bytes in a byte array into a range of characters
// in a character array. The method decodes byteCount bytes from
// bytes starting at index byteIndex, storing the resulting
// characters in chars starting at index charIndex. The
// decoding takes into account the state in which the decoder was left
// following the last call to this method.
//
// An exception occurs if the character array is not large enough to
// hold the complete decoding of the bytes. The GetCharCount method
// can be used to determine the exact number of characters that will be
// produced for a given range of bytes. Alternatively, the
// GetMaxCharCount method of the Encoding that produced this
// decoder can be used to determine the maximum number of characters that
// will be produced for a given number of bytes, regardless of the actual
// byte values.
//
public abstract int GetChars(byte[] bytes, int byteIndex, int byteCount,
                             char[] chars, int charIndex);

// Flush-aware overload. The default implementation ignores flush and hands
// the call to the five-argument overload above.
public virtual int GetChars(byte[] bytes, int byteIndex, int byteCount,
                            char[] chars, int charIndex, bool flush)
{
    return GetChars(bytes, byteIndex, byteCount, chars, charIndex);
}

// We expect this to be the workhorse for NLS Encodings, but for existing
// ones we need a working (if slow) default implementation.
//
// WARNING WARNING WARNING
//
// WARNING: If this breaks it could be a security threat. Obviously we
// call this internally, so you need to make sure that your pointers, counts
// and indexes are correct when you call this method.
//
// In addition, we have internal code, which will be marked as "safe" calling
// this code. However this code is dependent upon the implementation of an
// external GetChars() method, which could be overridden by a third party and
// the results of which cannot be guaranteed. We use that result to copy
// the char[] to our char* output buffer. If the result count was wrong, we
// could easily overflow our output buffer. Therefore the number of chars
// copied out is capped at charCount no matter what the callee reports.
[CLSCompliant(false)]
public virtual unsafe int GetChars(byte* bytes, int byteCount,
                                   char* chars, int charCount, bool flush)
{
    // Null checks: chars is reported first when both pointers are null.
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);

    // Range checks: byteCount is reported first when both counts are negative.
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    if (charCount < 0)
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Managed snapshot of the input bytes
    byte[] input = new byte[byteCount];
    for (int i = 0; i < byteCount; i++)
        input[i] = bytes[i];

    // Managed scratch buffer that receives the decoded chars
    char[] output = new char[charCount];

    // Let the array-based overload do the actual decoding
    int produced = GetChars(input, 0, byteCount, output, 0, flush);

    Debug.Assert(produced <= charCount, "Returned more chars than we have space for");

    // WARNING: never trust 'produced' on its own, it may come from a 3rd party
    // override. Copy at most charCount chars back through the pointer.
    int copyCount = produced < charCount ? produced : charCount;
    for (int i = 0; i < copyCount; i++)
        chars[i] = output[i];

    return copyCount;
}

// This method is used when the output buffer might not be large enough.
// It will decode until it runs out of bytes, and then it will return
// true if the entire input was converted. In either case it
// will also return the number of converted bytes and output characters used.
// It will only throw a buffer overflow exception if the entire length of chars[] is
// too small to store the next char. (like 0 or maybe 1 or 4 for some encodings)
// We're done processing this buffer only if completed returns true.
//
// Might consider checking Max...Count to avoid the extra counting step.
//
// Note that if all of the input bytes are not consumed, then we'll do a /2, which means
// that its likely that we didn't consume as many bytes as we could have. For some
// applications this could be slow.
// (Like trying to exactly fill an output buffer from a bigger stream)
//
// bytesUsed receives the number of input bytes that were decoded, charsUsed the
// number of chars written, and completed is true only when all byteCount bytes
// were consumed and the fallback buffer (if one exists) has nothing remaining.
public virtual void Convert(byte[] bytes, int byteIndex, int byteCount,
                            char[] chars, int charIndex, int charCount, bool flush,
                            out int bytesUsed, out int charsUsed, out bool completed)
{
    // Argument checks, in the same precedence as always:
    // null arrays, negative byte range, negative char range, then buffer bounds.
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    if (byteIndex < 0)
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (charIndex < 0)
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    if (charCount < 0)
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (byteCount > bytes.Length - byteIndex)
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);

    if (charCount > chars.Length - charIndex)
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // Start with the whole input and keep halving the attempt until the
    // predicted output fits into charCount.
    for (bytesUsed = byteCount; bytesUsed > 0; bytesUsed /= 2)
    {
        int charsNeeded = GetCharCount(bytes, byteIndex, bytesUsed, flush);
        if (charsNeeded <= charCount)
        {
            charsUsed = GetChars(bytes, byteIndex, bytesUsed, chars, charIndex, flush);
            completed = bytesUsed == byteCount &&
                        (m_fallbackBuffer == null || m_fallbackBuffer.Remaining == 0);
            return;
        }

        // The retry will not read all of the input, so it must not flush.
        flush = false;
    }

    // Not even the smallest attempt fit (or there was nothing to attempt): overflow.
    throw new ArgumentException(SR.Argument_ConversionOverflow);
}

// This is the version that uses *.
// We're done processing this buffer only if completed returns true.
//
// Might consider checking Max...Count to avoid the extra counting step.
+ // + // Note that if all of the input bytes are not consumed, then we'll do a /2, which means + // that its likely that we didn't consume as many bytes as we could have. For some + // applications this could be slow. (Like trying to exactly fill an output buffer from a bigger stream) + [CLSCompliant(false)] + public virtual unsafe void Convert(byte* bytes, int byteCount, + char* chars, int charCount, bool flush, + out int bytesUsed, out int charsUsed, out bool completed) + { + // Validate input parameters + if (chars == null || bytes == null) + throw new ArgumentNullException(chars == null ? nameof(chars) : nameof(bytes), + SR.ArgumentNull_Array); + + if (byteCount < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((byteCount < 0 ? nameof(byteCount) : nameof(charCount)), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Get ready to do it + bytesUsed = byteCount; + + // Its easy to do if it won't overrun our buffer. + while (bytesUsed > 0) + { + if (GetCharCount(bytes, bytesUsed, flush) <= charCount) + { + charsUsed = GetChars(bytes, bytesUsed, chars, charCount, flush); + completed = (bytesUsed == byteCount && + (m_fallbackBuffer == null || m_fallbackBuffer.Remaining == 0)); + return; + } + + // Try again with 1/2 the count, won't flush then 'cause won't read it all + flush = false; + bytesUsed /= 2; + } + + // Oops, we didn't have anything, we'll have to throw an overflow + throw new ArgumentException(SR.Argument_ConversionOverflow); + } + } +} diff --git a/src/mscorlib/shared/System/Text/Encoder.cs b/src/mscorlib/shared/System/Text/Encoder.cs new file mode 100644 index 0000000000..e4e91765e1 --- /dev/null +++ b/src/mscorlib/shared/System/Text/Encoder.cs @@ -0,0 +1,333 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Runtime.Serialization; +using System.Text; +using System; +using System.Diagnostics; +using System.Diagnostics.Contracts; + +namespace System.Text +{ + // An Encoder is used to encode a sequence of blocks of characters into + // a sequence of blocks of bytes. Following instantiation of an encoder, + // sequential blocks of characters are converted into blocks of bytes through + // calls to the GetBytes method. The encoder maintains state between the + // conversions, allowing it to correctly encode character sequences that span + // adjacent blocks. + // + // Instances of specific implementations of the Encoder abstract base + // class are typically obtained through calls to the GetEncoder method + // of Encoding objects. + // + [Serializable] + public abstract class Encoder + { + internal EncoderFallback m_fallback = null; + + [NonSerialized] + internal EncoderFallbackBuffer m_fallbackBuffer = null; + + internal void SerializeEncoder(SerializationInfo info) + { + info.AddValue("m_fallback", this.m_fallback); + } + + protected Encoder() + { + // We don't call default reset because default reset probably isn't good if we aren't initialized. + } + + public EncoderFallback Fallback + { + get + { + return m_fallback; + } + + set + { + if (value == null) + throw new ArgumentNullException(nameof(value)); + Contract.EndContractBlock(); + + // Can't change fallback if buffer is wrong + if (m_fallbackBuffer != null && m_fallbackBuffer.Remaining > 0) + throw new ArgumentException( + SR.Argument_FallbackBufferNotEmpty, nameof(value)); + + m_fallback = value; + m_fallbackBuffer = null; + } + } + + // Note: we don't test for threading here because async access to Encoders and Decoders + // doesn't work anyway. 
// Lazily creates the fallback buffer on first access. The buffer comes from
// m_fallback when one has been set, otherwise from
// EncoderFallback.ReplacementFallback.
public EncoderFallbackBuffer FallbackBuffer
{
    get
    {
        if (m_fallbackBuffer == null)
        {
            if (m_fallback != null)
                m_fallbackBuffer = m_fallback.CreateFallbackBuffer();
            else
                m_fallbackBuffer = EncoderFallback.ReplacementFallback.CreateFallbackBuffer();
        }

        return m_fallbackBuffer;
    }
}

// True once a fallback buffer has been created; reading this never creates one.
internal bool InternalHasFallbackBuffer
{
    get
    {
        return m_fallbackBuffer != null;
    }
}

// Reset the Encoder
//
// Normally if we call GetBytes() and an error is thrown we don't change the state of the encoder. This
// would allow the caller to correct the error condition and try again (such as if they need a bigger buffer.)
//
// If the caller doesn't want to try again after GetBytes() throws an error, then they need to call Reset().
//
// Virtual implementation has to call GetBytes with flush and a big enough buffer to clear a 0 char string
// We avoid GetMaxByteCount() because a) we can't call the base encoder and b) it might be really big.
public virtual void Reset()
{
    // Shared empty array (same approach as Decoder.Reset) instead of allocating
    // a fresh zero-length char[] on every call.
    char[] charTemp = Array.Empty<char>();

    // Size the output for whatever a flush of zero chars would produce, then flush.
    byte[] byteTemp = new byte[GetByteCount(charTemp, 0, 0, true)];
    GetBytes(charTemp, 0, 0, byteTemp, 0, true);

    // Reset the fallback buffer too, but only if one was ever created.
    if (m_fallbackBuffer != null)
        m_fallbackBuffer.Reset();
}

// Returns the number of bytes the next call to GetBytes will
// produce if presented with the given range of characters and the given
// value of the flush parameter. The returned value takes into
// account the state in which the encoder was left following the last call
// to GetBytes. The state of the encoder is not affected by a call
// to this method.
//
public abstract int GetByteCount(char[] chars, int index, int count, bool flush);

// We expect this to be the workhorse for NLS encodings
// unfortunately for existing overrides, it has to call the [] version,
// which is really slow, so avoid this method if you might be calling external encodings.
// Pointer-based count: copies the chars into a managed array and asks the
// array-based overload, passing flush along unchanged.
[CLSCompliant(false)]
public virtual unsafe int GetByteCount(char* chars, int count, bool flush)
{
    // Reject a null pointer first, then a negative length.
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Managed snapshot of the unmanaged input
    char[] managedChars = new char[count];
    int i = 0;
    while (i < count)
    {
        managedChars[i] = chars[i];
        i++;
    }

    return GetByteCount(managedChars, 0, count, flush);
}

// Encodes a range of characters in a character array into a range of bytes
// in a byte array. The method encodes charCount characters from
// chars starting at index charIndex, storing the resulting
// bytes in bytes starting at index byteIndex. The encoding
// takes into account the state in which the encoder was left following the
// last call to this method. The flush parameter indicates whether
// the encoder should flush any shift-states and partial characters at the
// end of the conversion. To ensure correct termination of a sequence of
// blocks of encoded bytes, the last call to GetBytes should specify
// a value of true for the flush parameter.
//
// An exception occurs if the byte array is not large enough to hold the
// complete encoding of the characters. The GetByteCount method can
// be used to determine the exact number of bytes that will be produced for
// a given range of characters. Alternatively, the GetMaxByteCount
// method of the Encoding that produced this encoder can be used to
// determine the maximum number of bytes that will be produced for a given
// number of characters, regardless of the actual character values.
//
public abstract int GetBytes(char[] chars, int charIndex, int charCount,
                             byte[] bytes, int byteIndex, bool flush);

// We expect this to be the workhorse for NLS Encodings, but for existing
// ones we need a working (if slow) default implementation.
//
// WARNING WARNING WARNING
//
// WARNING: If this breaks it could be a security threat. Obviously we
// call this internally, so you need to make sure that your pointers, counts
// and indexes are correct when you call this method.
//
// In addition, we have internal code, which will be marked as "safe" calling
// this code. However this code is dependent upon the implementation of an
// external GetBytes() method, which could be overridden by a third party and
// the results of which cannot be guaranteed. We use that result to copy
// the byte[] to our byte* output buffer. If the result count was wrong, we
// could easily overflow our output buffer. Therefore the number of bytes
// copied out is capped at byteCount no matter what the callee reports.
[CLSCompliant(false)]
public virtual unsafe int GetBytes(char* chars, int charCount,
                                   byte* bytes, int byteCount, bool flush)
{
    // Null checks: bytes is reported first when both pointers are null.
    if (bytes == null)
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    // Range checks: charCount is reported first when both counts are negative.
    if (charCount < 0)
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Managed snapshot of the input chars
    char[] input = new char[charCount];
    for (int i = 0; i < charCount; i++)
        input[i] = chars[i];

    // Managed scratch buffer that receives the encoded bytes
    byte[] output = new byte[byteCount];

    // Let the array-based overload do the actual encoding
    int produced = GetBytes(input, 0, charCount, output, 0, flush);

    Debug.Assert(produced <= byteCount, "Returned more bytes than we have space for");

    // WARNING: never trust 'produced' on its own, it may come from a 3rd party
    // override. Copy at most byteCount bytes back through the pointer.
    int copyCount = produced < byteCount ? produced : byteCount;
    for (int i = 0; i < copyCount; i++)
        bytes[i] = output[i];

    return copyCount;
}

// This method is used to avoid running out of output buffer space.
// It will encode until it runs out of chars, and then it will return
// true if the entire input was converted. In either case it
// will also return the number of converted chars and output bytes used.
// It will only throw a buffer overflow exception if the entire length of bytes[] is
// too small to store the next byte. (like 0 or maybe 1 or 4 for some encodings)
// We're done processing this buffer only if completed returns true.
//
// Might consider checking Max...Count to avoid the extra counting step.
//
// Note that if all of the input chars are not consumed, then we'll do a /2, which means
// that its likely that we didn't consume as many chars as we could have. For some
// applications this could be slow.
(Like trying to exactly fill an output buffer from a bigger stream) + public virtual void Convert(char[] chars, int charIndex, int charCount, + byte[] bytes, int byteIndex, int byteCount, bool flush, + out int charsUsed, out int bytesUsed, out bool completed) + { + // Validate parameters + if (chars == null || bytes == null) + throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), + SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), + SR.ArgumentOutOfRange_NeedNonNegNum); + + if (byteIndex < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), + SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException(nameof(chars), + SR.ArgumentOutOfRange_IndexCountBuffer); + + if (bytes.Length - byteIndex < byteCount) + throw new ArgumentOutOfRangeException(nameof(bytes), + SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + charsUsed = charCount; + + // Its easy to do if it won't overrun our buffer. + // Note: We don't want to call unsafe version because that might be an untrusted version + // which could be really unsafe and we don't want to mix it up. 
+ while (charsUsed > 0) + { + if (GetByteCount(chars, charIndex, charsUsed, flush) <= byteCount) + { + bytesUsed = GetBytes(chars, charIndex, charsUsed, bytes, byteIndex, flush); + completed = (charsUsed == charCount && + (m_fallbackBuffer == null || m_fallbackBuffer.Remaining == 0)); + return; + } + + // Try again with 1/2 the count, won't flush then 'cause won't read it all + flush = false; + charsUsed /= 2; + } + + // Oops, we didn't have anything, we'll have to throw an overflow + throw new ArgumentException(SR.Argument_ConversionOverflow); + } + + // Same thing, but using pointers + // + // Might consider checking Max...Count to avoid the extra counting step. + // + // Note that if all of the input chars are not consumed, then we'll do a /2, which means + // that its likely that we didn't consume as many chars as we could have. For some + // applications this could be slow. (Like trying to exactly fill an output buffer from a bigger stream) + [CLSCompliant(false)] + public virtual unsafe void Convert(char* chars, int charCount, + byte* bytes, int byteCount, bool flush, + out int charsUsed, out int bytesUsed, out bool completed) + { + // Validate input parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), + SR.ArgumentNull_Array); + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Get ready to do it + charsUsed = charCount; + + // Its easy to do if it won't overrun our buffer. 
+ while (charsUsed > 0) + { + if (GetByteCount(chars, charsUsed, flush) <= byteCount) + { + bytesUsed = GetBytes(chars, charsUsed, bytes, byteCount, flush); + completed = (charsUsed == charCount && + (m_fallbackBuffer == null || m_fallbackBuffer.Remaining == 0)); + return; + } + + // Try again with 1/2 the count, won't flush then 'cause won't read it all + flush = false; + charsUsed /= 2; + } + + // Oops, we didn't have anything, we'll have to throw an overflow + throw new ArgumentException(SR.Argument_ConversionOverflow); + } + } +} + diff --git a/src/mscorlib/shared/System/Text/EncodingInfo.cs b/src/mscorlib/shared/System/Text/EncodingInfo.cs new file mode 100644 index 0000000000..360dd7f638 --- /dev/null +++ b/src/mscorlib/shared/System/Text/EncodingInfo.cs @@ -0,0 +1,72 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Text; + +namespace System.Text +{ + [Serializable] + public sealed class EncodingInfo + { + private int iCodePage; // Code Page # + private string strEncodingName; // Short name (web name) + private string strDisplayName; // Full localized name + + internal EncodingInfo(int codePage, string name, string displayName) + { + iCodePage = codePage; + strEncodingName = name; + strDisplayName = displayName; + } + + + public int CodePage + { + get + { + return iCodePage; + } + } + + + public string Name + { + get + { + return strEncodingName; + } + } + + + public string DisplayName + { + get + { + return strDisplayName; + } + } + + + public Encoding GetEncoding() + { + return Encoding.GetEncoding(iCodePage); + } + + public override bool Equals(Object value) + { + EncodingInfo that = value as EncodingInfo; + if (that != null) + { + return (this.CodePage == that.CodePage); + } + return (false); + } + + public override int GetHashCode() + { + return this.CodePage; + } + 
} +} diff --git a/src/mscorlib/shared/System/Text/EncodingNLS.cs b/src/mscorlib/shared/System/Text/EncodingNLS.cs new file mode 100644 index 0000000000..205ae26902 --- /dev/null +++ b/src/mscorlib/shared/System/Text/EncodingNLS.cs @@ -0,0 +1,322 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Diagnostics.Contracts; +using System.Collections; +using System.Globalization; +using System.Threading; + +namespace System.Text +{ + // This class overrides Encoding with the things we need for our NLS Encodings + // + // All of the GetBytes/Chars GetByte/CharCount methods are just wrappers for the pointer + // plus decoder/encoder method that is our real workhorse. Note that this is an internal + // class, so our public classes cannot derive from this class. Because of this, all of the + // GetBytes/Chars GetByte/CharCount wrapper methods are duplicated in all of our public + // encodings, which currently include: + // + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, & UnicodeEncoding + // + // So if you change the wrappers in this class, you must change the wrappers in the other classes + // as well because they should have the same behavior. + + [Serializable] + internal abstract class EncodingNLS : Encoding + { + protected EncodingNLS(int codePage) : base(codePage) + { + } + + // Returns the number of bytes required to encode a range of characters in + // a character array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
// Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
//
// Validates the (chars, index, count) range and forwards to the pointer-based
// overload with a null encoder. An empty range returns 0 without pinning.
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when index/count are negative or exceed the array.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    // Validate input parameters (nameof keeps the reported names in sync with the signature)
    if (chars == null)
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (chars.Length - index < count)
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // If no input, return 0, avoid fixed empty array problem
    if (count == 0)
        return 0;

    // Just call the pointer version
    fixed (char* pChars = chars)
        return GetByteCount(pChars + index, count, null);
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
//
// Pins the string and forwards all s.Length chars to the pointer-based
// overload with a null encoder. Throws ArgumentNullException when s is null.
public override unsafe int GetByteCount(String s)
{
    // Validate input
    if (s == null)
        throw new ArgumentNullException(nameof(s));
    Contract.EndContractBlock();

    fixed (char* pChars = s)
        return GetByteCount(pChars, s.Length, null);
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others.
// Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Pointer-based byte count: validates the arguments and forwards to the
        // encoder-aware overload with no encoder.
        public override unsafe int GetByteCount(char* chars, int count)
        {
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            return GetByteCount(chars, count, null);
        }

        // Parent method is safe.
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Encodes charCount characters of s, starting at charIndex, into bytes starting at
        // byteIndex. All of the space from byteIndex to the end of the array is offered
        // to the pointer-based overload as the output budget.
        public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                              byte[] bytes, int byteIndex)
        {
            // s is checked before bytes, matching the order in which they are reported.
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount > s.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // Computed from the caller's array, before any substitution below.
            int byteCount = bytes.Length - byteIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in;
            // byteCount is 0 in that case, so nothing is written to it.
            byte[] pinnable = bytes.Length == 0 ? new byte[1] : bytes;

            fixed (char* charPtr = s)
            fixed (byte* bytePtr = &pinnable[0])
            {
                return GetBytes(charPtr + charIndex, charCount, bytePtr + byteIndex, byteCount, null);
            }
        }

        // Encodes a range of characters in a character array into a range of bytes
        // in a byte array. An exception occurs if the byte array is not large
        // enough to hold the complete encoding of the characters.
// The
        // GetByteCount method can be used to determine the exact number of
        // bytes that will be produced for a given range of characters.
        // Alternatively, the GetMaxByteCount method can be used to
        // determine the maximum number of bytes that will be produced for a given
        // number of characters, regardless of the actual character values.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe
        public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                               byte[] bytes, int byteIndex)
        {
            // chars is checked before bytes, matching the order in which they are reported.
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount > chars.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // Nothing to encode; this also avoids pinning an empty chars array.
            if (charCount == 0)
            {
                return 0;
            }

            // byteCount is the room available after byteIndex, not the size of the array.
            int byteCount = bytes.Length - byteIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in.
            byte[] pinnable = bytes.Length == 0 ? new byte[1] : bytes;

            fixed (char* charPtr = chars)
            fixed (byte* bytePtr = &pinnable[0])
            {
                return GetBytes(charPtr + charIndex, charCount, bytePtr + byteIndex, byteCount, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Pointer-based encode: validates the arguments and forwards to the
        // encoder-aware overload with no encoder.
        public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
        {
            // bytes is checked before chars, matching the order in which they are reported.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            return GetBytes(chars, charCount, bytes, byteCount, null);
        }

        // Returns the number of characters produced by decoding a range of bytes
        // in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe
        public override unsafe int GetCharCount(byte[] bytes, int index, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            // When both are negative, index is the one reported.
            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count > bytes.Length - index)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }
            Contract.EndContractBlock();

            // Nothing to decode; this also avoids pinning an empty array.
            if (count == 0)
            {
                return 0;
            }

            fixed (byte* bytePtr = bytes)
            {
                return GetCharCount(bytePtr + index, count, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Pointer-based char count: validates the arguments and forwards to the
        // decoder-aware overload with no decoder.
        public override unsafe int GetCharCount(byte* bytes, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            return GetCharCount(bytes, count, null);
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Decodes byteCount bytes starting at bytes[byteIndex] into chars starting at
        // charIndex. All of the space from charIndex to the end of the array is offered
        // to the pointer-based overload. The parent method is safe.
        public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                              char[] chars, int charIndex)
        {
            // bytes is checked before chars, matching the order in which they are reported.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (byteIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount > bytes.Length - byteIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
            }
            Contract.EndContractBlock();

            // Nothing to decode; this also avoids pinning an empty bytes array.
            if (byteCount == 0)
            {
                return 0;
            }

            // charCount is the room available after charIndex, not the size of the array.
            int charCount = chars.Length - charIndex;

            // &array[0] cannot be taken on an empty array, so pin a one-element stand-in.
            char[] pinnable = chars.Length == 0 ? new char[1] : chars;

            fixed (byte* bytePtr = bytes)
            fixed (char* charPtr = &pinnable[0])
            {
                return GetChars(bytePtr + byteIndex, byteCount, charPtr + charIndex, charCount, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Pointer-based decode: validates the arguments and forwards to the
        // decoder-aware overload with no decoder.
        public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            // When both are negative, charCount is the one reported.
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            return GetChars(bytes, byteCount, chars, charCount, null);
        }

        // Returns a string containing the decoded representation of a range of
        // bytes in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        //      EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //
        // Decodes count bytes starting at bytes[index] into a new string.
        // The parent method is safe.
        public override unsafe String GetString(byte[] bytes, int index, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            // When both are negative, index is the one reported.
            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count > bytes.Length - index)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }
            Contract.EndContractBlock();

            // Avoid pinning an empty input buffer.
            if (count == 0)
            {
                return String.Empty;
            }

            fixed (byte* bytePtr = bytes)
            {
                return String.CreateStringFromEncoding(bytePtr + index, count, this);
            }
        }

        // Hands out a DecoderNLS bound to this encoding.
        public override Decoder GetDecoder() => new DecoderNLS(this);

        // Hands out an EncoderNLS bound to this encoding.
        public override Encoder GetEncoder() => new EncoderNLS(this);
    }
}
diff --git a/src/mscorlib/shared/System/Text/EncodingProvider.cs b/src/mscorlib/shared/System/Text/EncodingProvider.cs
new file mode 100644
index 0000000000..ce8c3e0208
--- /dev/null
+++ b/src/mscorlib/shared/System/Text/EncodingProvider.cs
@@ -0,0 +1,136 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using System.Collections.Generic;

namespace System.Text
{
    public abstract class EncodingProvider
    {
        public EncodingProvider() { }
        public abstract Encoding GetEncoding(string name);
        public abstract Encoding GetEncoding(int codepage);

        // GetEncoding should return either valid encoding or null.
// It shouldn't throw any exception except on a null name.
        //
        // Looks the encoding up by name and, when found, returns a clone carrying the
        // requested fallbacks. Returns null when this provider does not know the name.
        public virtual Encoding GetEncoding(string name, EncoderFallback encoderFallback, DecoderFallback decoderFallback)
        {
            Encoding enc = GetEncoding(name);
            if (enc != null)
            {
                // Clone the instance we already have. Asking the provider a second time
                // costs another lookup and would throw NullReferenceException if that
                // second call returned null.
                enc = (Encoding)enc.Clone();
                enc.EncoderFallback = encoderFallback;
                enc.DecoderFallback = decoderFallback;
            }

            return enc;
        }

        // Looks the encoding up by code page and, when found, returns a clone carrying the
        // requested fallbacks. Returns null when this provider does not know the code page.
        public virtual Encoding GetEncoding(int codepage, EncoderFallback encoderFallback, DecoderFallback decoderFallback)
        {
            Encoding enc = GetEncoding(codepage);
            if (enc != null)
            {
                // Clone the instance we already have rather than performing a second lookup.
                enc = (Encoding)enc.Clone();
                enc.EncoderFallback = encoderFallback;
                enc.DecoderFallback = decoderFallback;
            }

            return enc;
        }

        // Registers a provider. Adding a provider that is already registered is a no-op.
        // The provider array is never mutated in place: a new, longer array is built and
        // then published through the volatile field.
        internal static void AddProvider(EncodingProvider provider)
        {
            if (provider == null)
                throw new ArgumentNullException(nameof(provider));

            lock (s_InternalSyncObject)
            {
                EncodingProvider[] current = s_providers;
                if (current == null)
                {
                    s_providers = new EncodingProvider[1] { provider };
                    return;
                }

                if (Array.IndexOf(current, provider) >= 0)
                {
                    return;
                }

                EncodingProvider[] providers = new EncodingProvider[current.Length + 1];
                Array.Copy(current, providers, current.Length);
                providers[providers.Length - 1] = provider;
                s_providers = providers;
            }
        }

        // Returns the first non-null encoding any registered provider yields for the
        // code page, or null when there are no providers or none recognizes it.
        internal static Encoding GetEncodingFromProvider(int codepage)
        {
            // Read the volatile field once so the null check and the loop see the same array.
            EncodingProvider[] providers = s_providers;
            if (providers == null)
                return null;

            foreach (EncodingProvider provider in providers)
            {
                Encoding enc = provider.GetEncoding(codepage);
                if (enc != null)
                    return enc;
            }

            return null;
        }

        // Returns the first non-null encoding any registered provider yields for the
        // name, or null when there are no providers or none recognizes it.
        internal static Encoding GetEncodingFromProvider(string encodingName)
        {
            EncodingProvider[] providers = s_providers;
            if (providers == null)
                return null;

            foreach (EncodingProvider provider in providers)
            {
                Encoding enc = provider.GetEncoding(encodingName);
                if (enc != null)
                    return enc;
            }

            return null;
        }

        // Same as the code page lookup above, but asks each provider for an encoding
        // configured with the given fallbacks.
        internal static Encoding GetEncodingFromProvider(int codepage, EncoderFallback enc, DecoderFallback dec)
        {
            EncodingProvider[] providers = s_providers;
            if (providers == null)
                return null;

            foreach (EncodingProvider provider in providers)
            {
                Encoding encoding = provider.GetEncoding(codepage, enc, dec);
                if (encoding != null)
                    return encoding;
            }

            return null;
        }

        // Same as the name lookup above, but asks each provider for an encoding
        // configured with the given fallbacks.
        internal static Encoding GetEncodingFromProvider(string encodingName, EncoderFallback enc, DecoderFallback dec)
        {
            EncodingProvider[] providers = s_providers;
            if (providers == null)
                return null;

            foreach (EncodingProvider provider in providers)
            {
                Encoding encoding = provider.GetEncoding(encodingName, enc, dec);
                if (encoding != null)
                    return encoding;
            }

            return null;
        }

        // Guards writes to s_providers; readonly so the lock target can never be swapped.
        private static readonly Object s_InternalSyncObject = new Object();
        // Replaced wholesale under the lock in AddProvider; readers take a snapshot.
        private static volatile EncodingProvider[] s_providers;
    }
}
diff --git a/src/mscorlib/shared/System/Text/Normalization.cs b/src/mscorlib/shared/System/Text/Normalization.cs
new file mode 100644
index 0000000000..dc8bc2af71
--- /dev/null
+++ b/src/mscorlib/shared/System/Text/Normalization.cs
@@ -0,0 +1,29 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+ +namespace System.Text +{ + // This is the enumeration for Normalization Forms + public enum NormalizationForm + { + FormC = 1, + FormD = 2, + FormKC = 5, + FormKD = 6 + } + + internal enum ExtendedNormalizationForms + { + FormC = 1, + FormD = 2, + FormKC = 5, + FormKD = 6, + FormIdna = 0xd, + FormCDisallowUnassigned = 0x101, + FormDDisallowUnassigned = 0x102, + FormKCDisallowUnassigned = 0x105, + FormKDDisallowUnassigned = 0x106, + FormIdnaDisallowUnassigned = 0x10d + } +} diff --git a/src/mscorlib/shared/System/Text/StringBuilder.cs b/src/mscorlib/shared/System/Text/StringBuilder.cs new file mode 100644 index 0000000000..df1a889823 --- /dev/null +++ b/src/mscorlib/shared/System/Text/StringBuilder.cs @@ -0,0 +1,2409 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Text; +using System.Runtime; +using System.Runtime.Serialization; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Versioning; +using System.Security; +using System.Threading; +using System.Globalization; +using System.Diagnostics; +using System.Diagnostics.Contracts; +using System.Collections.Generic; + +namespace System.Text +{ + // This class represents a mutable string. It is convenient for situations in + // which it is desirable to modify a string, perhaps by removing, replacing, or + // inserting characters, without creating a new String subsequent to + // each modification. + // + // The methods contained within this class do not return a new StringBuilder + // object unless specified otherwise. This class may be used in conjunction with the String + // class to carry out modifications upon strings. + // + // When passing null into a constructor in VJ and VC, the null + // should be explicitly type cast. 
// For Example:
    //      StringBuilder sb1 = new StringBuilder((StringBuilder)null);
    //      StringBuilder sb2 = new StringBuilder((String)null);
    //      Console.WriteLine(sb1);
    //      Console.WriteLine(sb2);
    //
    [Serializable]
    public sealed partial class StringBuilder : ISerializable
    {
        // A StringBuilder is internally represented as a linked list of blocks each of which holds
        // a chunk of the string.  It turns out string as a whole can also be represented as just a chunk,
        // so that is what we do.

        //
        //
        //  CLASS VARIABLES
        //
        //
        internal char[] m_ChunkChars;                // The characters in this block
        internal StringBuilder m_ChunkPrevious;      // Link to the block logically before this block
        internal int m_ChunkLength;                  // The index in m_ChunkChars that represent the end of the block
        internal int m_ChunkOffset;                  // The logical offset (sum of all characters in previous blocks)
        internal int m_MaxCapacity = 0;

        //
        //
        //  STATIC CONSTANTS
        //
        //
        internal const int DefaultCapacity = 16;
        private const String CapacityField = "Capacity";
        private const String MaxCapacityField = "m_MaxCapacity";
        private const String StringValueField = "m_StringValue";
        private const String ThreadIDField = "m_currentThread";
        // We want to keep chunk arrays out of large object heap (< 85K bytes ~ 40K chars) to be sure.
        // Making the maximum chunk size big means less allocation code called, but also more waste
        // in unused characters and slower inserts / replaces (since you do need to slide characters over
        // within a buffer).
        internal const int MaxChunkSize = 8000;

        //
        //
        //  CONSTRUCTORS
        //
        //

        // Builds an empty string builder with DefaultCapacity (16) characters of
        // storage and a maximum capacity of int.MaxValue.
        public StringBuilder()
        {
            m_ChunkChars = new char[DefaultCapacity];
            m_MaxCapacity = int.MaxValue;
        }

        // Builds an empty string builder with the requested initial capacity and a
        // maximum capacity of int.MaxValue.
        public StringBuilder(int capacity)
            : this(capacity, int.MaxValue)
        {
        }

        // Builds a string builder holding a copy of value, using DefaultCapacity as the
        // requested capacity.
        public StringBuilder(String value)
            : this(value, DefaultCapacity)
        {
        }

        // Builds a string builder holding a copy of value with the requested capacity.
        // A null value contributes a length of 0 here.
        public StringBuilder(String value, int capacity)
            : this(value, 0, (value == null ? 0 : value.Length), capacity)
        {
        }

        // Creates a new string builder from the specified substring with the specified
        // capacity.
// A null value is treated as String.Empty. The initial capacity is the larger of
        // capacity and length, with a capacity of 0 meaning DefaultCapacity; the maximum
        // capacity is always Int32.MaxValue.
        public StringBuilder(String value, int startIndex, int length, int capacity)
        {
            if (capacity < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(capacity),
                    SR.Format(SR.ArgumentOutOfRange_MustBePositive, nameof(capacity)));
            }
            if (length < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(length),
                    SR.Format(SR.ArgumentOutOfRange_MustBeNonNegNum, nameof(length)));
            }
            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndex);
            }
            Contract.EndContractBlock();

            String source = value ?? String.Empty;
            if (startIndex > source.Length - length)
            {
                throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_IndexLength);
            }

            m_MaxCapacity = Int32.MaxValue;

            // 0 requests the default; either way the buffer must hold the copied substring.
            int initialCapacity = (capacity == 0) ? DefaultCapacity : capacity;
            if (initialCapacity < length)
            {
                initialCapacity = length;
            }

            m_ChunkChars = new char[initialCapacity];
            m_ChunkLength = length;

            unsafe
            {
                fixed (char* sourcePtr = source)
                {
                    ThreadSafeCopy(sourcePtr + startIndex, m_ChunkChars, 0, length);
                }
            }
        }

        // Creates an empty StringBuilder with a minimum capacity of capacity
        // and a maximum capacity of maxCapacity.
// A capacity of 0 requests the default, capped at maxCapacity.
        public StringBuilder(int capacity, int maxCapacity)
        {
            // The checks run in this order; it determines which exception the caller sees.
            if (capacity > maxCapacity)
            {
                throw new ArgumentOutOfRangeException(nameof(capacity), SR.ArgumentOutOfRange_Capacity);
            }
            if (maxCapacity < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(maxCapacity), SR.ArgumentOutOfRange_SmallMaxCapacity);
            }
            if (capacity < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(capacity),
                    SR.Format(SR.ArgumentOutOfRange_MustBePositive, nameof(capacity)));
            }
            Contract.EndContractBlock();

            int initialCapacity = (capacity == 0) ? Math.Min(DefaultCapacity, maxCapacity) : capacity;

            m_MaxCapacity = maxCapacity;
            m_ChunkChars = new char[initialCapacity];
        }

        // Deserialization constructor: rebuilds a single-chunk builder from the persisted
        // max capacity, capacity and string value, rejecting inconsistent combinations.
        private StringBuilder(SerializationInfo info, StreamingContext context)
        {
            if (info == null)
                throw new ArgumentNullException(nameof(info));
            Contract.EndContractBlock();

            int capacity = 0;
            string text = null;
            int maxCapacity = Int32.MaxValue;
            bool sawCapacity = false;

            // Pick out the entries we know by name.
            SerializationInfoEnumerator entries = info.GetEnumerator();
            while (entries.MoveNext())
            {
                switch (entries.Name)
                {
                    case MaxCapacityField:
                        maxCapacity = info.GetInt32(MaxCapacityField);
                        break;
                    case StringValueField:
                        text = info.GetString(StringValueField);
                        break;
                    case CapacityField:
                        capacity = info.GetInt32(CapacityField);
                        sawCapacity = true;
                        break;
                    default:
                        // Ignore other fields for forward compatibility.
                        break;
                }
            }

            // Check values and set defaults.
            if (text == null)
            {
                text = String.Empty;
            }
            if (maxCapacity < 1 || text.Length > maxCapacity)
            {
                throw new SerializationException(SR.Serialization_StringBuilderMaxCapacity);
            }

            if (!sawCapacity)
            {
                // StringBuilder in V1.X did not persist the Capacity, so this is a valid legacy code path.
                // Default to at least the string length, but never above the maximum.
                capacity = Math.Min(Math.Max(DefaultCapacity, text.Length), maxCapacity);
            }
            if (capacity < 0 || capacity < text.Length || capacity > maxCapacity)
            {
                throw new SerializationException(SR.Serialization_StringBuilderCapacity);
            }

            // Assign.
            m_MaxCapacity = maxCapacity;
            m_ChunkChars = new char[capacity];
            text.CopyTo(0, m_ChunkChars, 0, text.Length);
            m_ChunkLength = text.Length;
            m_ChunkPrevious = null;
            VerifyClassInvariant();
        }

        // Persists the max capacity, the capacity and the string value, plus a dummy
        // thread id entry.
        void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
        {
            if (info == null)
            {
                throw new ArgumentNullException(nameof(info));
            }
            Contract.EndContractBlock();

            VerifyClassInvariant();
            info.AddValue(MaxCapacityField, m_MaxCapacity);
            info.AddValue(CapacityField, Capacity);
            info.AddValue(StringValueField, ToString());
            // Note: persist "m_currentThread" to be compatible with old versions
            info.AddValue(ThreadIDField, 0);
        }

        // Walks the chunk list from this (last) chunk back to the first one, asserting
        // the structural invariants. Only compiled into calls when _DEBUG is defined.
        [System.Diagnostics.Conditional("_DEBUG")]
        private void VerifyClassInvariant()
        {
            Debug.Assert((uint)(m_ChunkOffset + m_ChunkChars.Length) >= m_ChunkOffset, "Integer Overflow");

            int expectedMaxCapacity = this.m_MaxCapacity;
            StringBuilder block = this;
            while (true)
            {
                // All blocks have copy of the maxCapacity.
                Debug.Assert(block.m_MaxCapacity == expectedMaxCapacity, "Bad maxCapacity");
                Debug.Assert(block.m_ChunkChars != null, "Empty Buffer");

                Debug.Assert(block.m_ChunkLength <= block.m_ChunkChars.Length, "Out of range length");
                Debug.Assert(block.m_ChunkLength >= 0, "Negative length");
                Debug.Assert(block.m_ChunkOffset >= 0, "Negative offset");

                StringBuilder previous = block.m_ChunkPrevious;
                if (previous == null)
                {
                    Debug.Assert(block.m_ChunkOffset == 0, "First chunk's offset is not 0");
                    return;
                }

                // There are no gaps in the blocks.
                Debug.Assert(block.m_ChunkOffset == previous.m_ChunkOffset + previous.m_ChunkLength, "There is a gap between chunks!");
                block = previous;
            }
        }

        // Total character storage: everything before the last chunk plus the last
        // chunk's array size. Setting it reallocates only the last chunk's array.
        public int Capacity
        {
            get { return m_ChunkChars.Length + m_ChunkOffset; }
            set
            {
                if (value < 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(value), SR.ArgumentOutOfRange_NegativeCapacity);
                }
                if (value > MaxCapacity)
                {
                    throw new ArgumentOutOfRangeException(nameof(value), SR.ArgumentOutOfRange_Capacity);
                }
                if (value < Length)
                {
                    throw new ArgumentOutOfRangeException(nameof(value), SR.ArgumentOutOfRange_SmallCapacity);
                }
                Contract.EndContractBlock();

                if (Capacity == value)
                {
                    return;
                }

                // Resize the last chunk so that offset + array length equals the new capacity.
                char[] resized = new char[value - m_ChunkOffset];
                Array.Copy(m_ChunkChars, 0, resized, 0, m_ChunkLength);
                m_ChunkChars = resized;
            }
        }

        public int MaxCapacity
        {
            get { return m_MaxCapacity; }
        }

        // Ensures that the capacity of this string builder is at least the specified value.
        // If capacity is greater than the capacity of this string builder, then the capacity
        // is set to capacity; otherwise the capacity is unchanged.
// Returns the resulting capacity.
        public int EnsureCapacity(int capacity)
        {
            if (capacity < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(capacity), SR.ArgumentOutOfRange_NegativeCapacity);
            }
            Contract.EndContractBlock();

            if (capacity > Capacity)
            {
                Capacity = capacity;
            }
            return Capacity;
        }

        // Materializes the whole builder as a string by copying each chunk to its
        // logical offset in a freshly allocated string.
        public override String ToString()
        {
            Contract.Ensures(Contract.Result<String>() != null);

            VerifyClassInvariant();

            if (Length == 0)
            {
                return String.Empty;
            }

            string ret = string.FastAllocateString(Length);
            unsafe
            {
                fixed (char* destinationPtr = ret)
                {
                    // Chunks are linked newest-to-oldest; each one knows its own offset.
                    for (StringBuilder chunk = this; chunk != null; chunk = chunk.m_ChunkPrevious)
                    {
                        if (chunk.m_ChunkLength <= 0)
                        {
                            continue;
                        }

                        // Copy these into local variables so that they are stable even in the presence of race conditions
                        char[] sourceArray = chunk.m_ChunkChars;
                        int chunkOffset = chunk.m_ChunkOffset;
                        int chunkLength = chunk.m_ChunkLength;

                        // Refuse to copy if that would overrun either the destination or the source.
                        if ((uint)(chunkLength + chunkOffset) > (uint)ret.Length || (uint)chunkLength > (uint)sourceArray.Length)
                        {
                            throw new ArgumentOutOfRangeException(nameof(chunkLength), SR.ArgumentOutOfRange_Index);
                        }

                        fixed (char* sourcePtr = &sourceArray[0])
                        {
                            string.wstrcpy(destinationPtr + chunkOffset, sourcePtr, chunkLength);
                        }
                    }
                }
            }

            return ret;
        }


        // Converts a substring of this string builder to a String.
// The result is filled from its end towards its start while walking the chunks backwards.
        public String ToString(int startIndex, int length)
        {
            Contract.Ensures(Contract.Result<String>() != null);

            int currentLength = this.Length;
            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndex);
            }
            if (startIndex > currentLength)
            {
                throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndexLargerThanLength);
            }
            if (length < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_NegativeLength);
            }
            if (startIndex > (currentLength - length))
            {
                throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_IndexLength);
            }

            VerifyClassInvariant();

            int sourceEndIndex = startIndex + length;

            string ret = string.FastAllocateString(length);
            int curDestIndex = length;   // characters of ret still to be filled
            unsafe
            {
                fixed (char* destinationPtr = ret)
                {
                    for (StringBuilder chunk = this; curDestIndex > 0; chunk = chunk.m_ChunkPrevious)
                    {
                        // Where the requested range ends, relative to this chunk.
                        int chunkEndIndex = sourceEndIndex - chunk.m_ChunkOffset;
                        if (chunkEndIndex < 0)
                        {
                            // The requested range ends before this chunk begins.
                            continue;
                        }

                        if (chunkEndIndex > chunk.m_ChunkLength)
                            chunkEndIndex = chunk.m_ChunkLength;

                        int chunkCount = curDestIndex;
                        int chunkStartIndex = chunkEndIndex - curDestIndex;
                        if (chunkStartIndex < 0)
                        {
                            // The range starts in an earlier chunk; take only this chunk's share.
                            chunkCount += chunkStartIndex;
                            chunkStartIndex = 0;
                        }
                        curDestIndex -= chunkCount;

                        if (chunkCount <= 0)
                        {
                            continue;
                        }

                        // work off of local variables so that they are stable even in the presence of race conditions
                        char[] sourceArray = chunk.m_ChunkChars;

                        // Refuse to copy if that would overrun either the destination or the source.
                        if ((uint)(chunkCount + curDestIndex) > (uint)length || (uint)(chunkCount + chunkStartIndex) > (uint)sourceArray.Length)
                        {
                            throw new ArgumentOutOfRangeException(nameof(chunkCount), SR.ArgumentOutOfRange_Index);
                        }

                        fixed (char* sourcePtr = &sourceArray[chunkStartIndex])
                        {
                            string.wstrcpy(destinationPtr + curDestIndex, sourcePtr, chunkCount);
                        }
                    }
                }
            }

            return ret;
        }

        // Convenience method for sb.Length=0;
        public StringBuilder Clear()
        {
            this.Length = 0;
            return this;
        }

        // Gets or sets the number of characters in this builder. Setting a smaller value
        // truncates; setting a larger value appends '\0' characters. Setting the length
        // is not meant to decrease the capacity (asserted below).
        public int Length
        {
            get
            {
                Contract.Ensures(Contract.Result<int>() >= 0);
                return m_ChunkOffset + m_ChunkLength;
            }
            set
            {
                // Reject a negative length or one beyond the maximum capacity.
                if (value < 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(value), SR.ArgumentOutOfRange_NegativeLength);
                }

                if (value > MaxCapacity)
                {
                    throw new ArgumentOutOfRangeException(nameof(value), SR.ArgumentOutOfRange_SmallCapacity);
                }
                Contract.EndContractBlock();

                int originalCapacity = Capacity;

                // Fast path: clearing a single-chunk builder.
                if (value == 0 && m_ChunkPrevious == null)
                {
                    m_ChunkLength = 0;
                    m_ChunkOffset = 0;
                    Debug.Assert(Capacity >= originalCapacity, "setting the Length should never decrease the Capacity");
                    return;
                }

                int delta = value - Length;
                if (delta > 0)
                {
                    // Growing: pad the end with the Unicode NULL character.
                    Append('\0', delta);        // We could improve on this, but who does this anyway?
                }
                else
                {
                    // Shrinking (or unchanged): find the chunk that will hold the new end.
                    StringBuilder chunk = FindChunkForIndex(value);
                    if (chunk != this)
                    {
                        // we crossed a chunk boundary when reducing the Length, we must replace this middle-chunk with a new
                        // larger chunk to ensure the original capacity is preserved
                        int newLen = originalCapacity - chunk.m_ChunkOffset;
                        char[] newArray = new char[newLen];

                        Debug.Assert(newLen > chunk.m_ChunkChars.Length, "the new chunk should be larger than the one it is replacing");
                        Array.Copy(chunk.m_ChunkChars, 0, newArray, 0, chunk.m_ChunkLength);

                        // This instance takes over that chunk's position in the list.
                        m_ChunkChars = newArray;
                        m_ChunkPrevious = chunk.m_ChunkPrevious;
                        m_ChunkOffset = chunk.m_ChunkOffset;
                    }
                    m_ChunkLength = value - chunk.m_ChunkOffset;
                    VerifyClassInvariant();
                }
                Debug.Assert(Capacity >= originalCapacity, "setting the Length should never decrease the Capacity");
            }
        }

        // Character access by logical index. The chunk list is searched backwards from
        // the last chunk. Note the asymmetry: the getter throws IndexOutOfRangeException
        // while the setter throws ArgumentOutOfRangeException.
        [System.Runtime.CompilerServices.IndexerName("Chars")]
        public char this[int index]
        {
            get
            {
                for (StringBuilder chunk = this; chunk != null; chunk = chunk.m_ChunkPrevious)
                {
                    int indexInBlock = index - chunk.m_ChunkOffset;
                    if (indexInBlock < 0)
                    {
                        // The index belongs to an earlier chunk.
                        continue;
                    }

                    if (indexInBlock >= chunk.m_ChunkLength)
                        throw new IndexOutOfRangeException();
                    return chunk.m_ChunkChars[indexInBlock];
                }

                throw new IndexOutOfRangeException();
            }
            set
            {
                for (StringBuilder chunk = this; chunk != null; chunk = chunk.m_ChunkPrevious)
                {
                    int indexInBlock = index - chunk.m_ChunkOffset;
                    if (indexInBlock < 0)
                    {
                        // The index belongs to an earlier chunk.
                        continue;
                    }

                    if (indexInBlock >= chunk.m_ChunkLength)
                        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
                    chunk.m_ChunkChars[indexInBlock] = value;
                    return;
                }

                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
            }
        }

        // Appends a character at the end of this string builder. The capacity is adjusted as needed.
// Writes repeatCount copies of value after the current content, moving on to a
// freshly allocated chunk whenever the current chunk's buffer is full.
// Throws ArgumentOutOfRangeException when repeatCount is negative or when the
// resulting length would exceed m_MaxCapacity (or overflow an int).
public StringBuilder Append(char value, int repeatCount)
{
    if (repeatCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(repeatCount), SR.ArgumentOutOfRange_NegativeCount);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    if (repeatCount == 0)
    {
        // Nothing to write.
        return this;
    }

    // Validate the final length up front so that a failure cannot leave the
    // builder half-written. A sum smaller than repeatCount means the int addition wrapped.
    int totalLength = Length + repeatCount;
    bool wrapped = totalLength < repeatCount;
    if (totalLength > m_MaxCapacity || wrapped)
    {
        throw new ArgumentOutOfRangeException(nameof(repeatCount), SR.ArgumentOutOfRange_LengthGreaterThanCapacity);
    }

    int remaining = repeatCount;
    int writePos = m_ChunkLength;
    while (remaining > 0)
    {
        if (writePos >= m_ChunkChars.Length)
        {
            // Current buffer is full: publish how much of it is used, then switch
            // to a new (empty) chunk and continue writing from its start.
            m_ChunkLength = writePos;
            ExpandByABlock(remaining);
            Debug.Assert(m_ChunkLength == 0, "Expand should create a new block");
            writePos = 0;
            continue;
        }

        m_ChunkChars[writePos] = value;
        writePos++;
        remaining--;
    }

    m_ChunkLength = writePos;
    VerifyClassInvariant();
    return this;
}

// Appends an array of characters at the end of this string builder. The capacity is adjusted as needed.
// Appends charCount characters of value, starting at startIndex.
// A null array is tolerated only for the empty range (startIndex == 0 and charCount == 0).
public StringBuilder Append(char[] value, int startIndex, int charCount)
{
    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_GenericPositive);
    }
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GenericPositive);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    if (value == null)
    {
        bool emptyRange = startIndex == 0 && charCount == 0;
        if (!emptyRange)
        {
            throw new ArgumentNullException(nameof(value));
        }
        return this;
    }

    int available = value.Length - startIndex;
    if (charCount > available)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_Index);
    }

    if (charCount != 0)
    {
        unsafe
        {
            // Pin the source so the pointer-based Append can copy from it.
            fixed (char* firstChar = &value[startIndex])
            {
                Append(firstChar, charCount);
            }
        }
    }
    return this;
}


// Appends a copy of the given string to the end of this builder.
// A null string leaves the builder unchanged.
public StringBuilder Append(String value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    if (value == null)
    {
        return this;
    }

    // Hand-specialized fast path for the case where the string fits in the
    // current chunk; everything else is delegated to AppendHelper.
    char[] buffer = m_ChunkChars;
    int writeAt = m_ChunkLength;
    int charsToCopy = value.Length;
    int endAt = writeAt + charsToCopy;

    // The fast path requires endAt to be strictly less than the buffer length;
    // this keeps &buffer[writeAt] valid even when charsToCopy == 0.
    if (endAt >= buffer.Length)
    {
        AppendHelper(value);
        return this;
    }

    if (charsToCopy > 2)
    {
        unsafe
        {
            fixed (char* source = value)
            fixed (char* destination = &buffer[writeAt])
            {
                string.wstrcpy(destination, source, charsToCopy);
            }
        }
    }
    else
    {
        // One or two characters: plain stores, no pinning needed.
        if (charsToCopy > 0)
        {
            buffer[writeAt] = value[0];
        }
        if (charsToCopy > 1)
        {
            buffer[writeAt + 1] = value[1];
        }
    }

    m_ChunkLength = endAt;
    return this;
}


// The fixed statement lives in its own method so that Append(String) does not pay
// for zero-initializing the pinned local when the fast path is taken.
private void AppendHelper(string value)
{
    unsafe
    {
        fixed (char* source = value)
        {
            Append(source, value.Length);
        }
    }
}

// Appends count characters of value, starting at startIndex.
// A null string is tolerated only for the empty range (startIndex == 0 and count == 0).
public StringBuilder Append(String value, int startIndex, int count)
{
    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_Index);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GenericPositive);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    if (value == null)
    {
        bool emptyRange = startIndex == 0 && count == 0;
        if (!emptyRange)
        {
            throw new ArgumentNullException(nameof(value));
        }
        return this;
    }

    if (count == 0)
    {
        return this;
    }

    int lastValidStart = value.Length - count;
    if (startIndex > lastValidStart)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_Index);
    }

    unsafe
    {
        fixed (char* source = value)
        {
            Append(source + startIndex, count);
        }
    }
    return this;
}

// Appends Environment.NewLine.
public StringBuilder AppendLine()
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string terminator = Environment.NewLine;
    return Append(terminator);
}

// Appends value followed by Environment.NewLine.
public StringBuilder AppendLine(string value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Append(value);
    string terminator = Environment.NewLine;
    return Append(terminator);
}

// Copies count characters, starting at sourceIndex in this builder, into
// destination starting at destinationIndex. The chunk list is walked from the
// last chunk backwards, so the destination is filled from its end towards its start.
public void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
{
    if (destination == null)
    {
        throw new ArgumentNullException(nameof(destination));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.Arg_NegativeArgCount);
    }

    if (destinationIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(destinationIndex),
            SR.Format(SR.ArgumentOutOfRange_MustBeNonNegNum, nameof(destinationIndex)));
    }

    if (destinationIndex > destination.Length - count)
    {
        throw new ArgumentException(SR.ArgumentOutOfRange_OffsetOut);
    }

    if ((uint)sourceIndex > (uint)Length)
    {
        throw new ArgumentOutOfRangeException(nameof(sourceIndex), SR.ArgumentOutOfRange_Index);
    }

    if (sourceIndex > Length - count)
    {
        throw new ArgumentException(SR.Arg_LongerThanSrcString);
    }
    Contract.EndContractBlock();

    VerifyClassInvariant();

    int sourceEnd = sourceIndex + count;    // exclusive end of the range, as a logical index
    int destEnd = destinationIndex + count; // exclusive end of the not-yet-written destination range
    int remaining = count;

    for (StringBuilder current = this; remaining > 0; current = current.m_ChunkPrevious)
    {
        int endInChunk = sourceEnd - current.m_ChunkOffset;
        if (endInChunk < 0)
        {
            // This chunk lies entirely after the requested range.
            continue;
        }

        if (endInChunk > current.m_ChunkLength)
        {
            endInChunk = current.m_ChunkLength;
        }

        int charsHere = remaining;
        int startInChunk = endInChunk - remaining;
        if (startInChunk < 0)
        {
            // The range starts in an earlier chunk; take only the part that lives in this one.
            charsHere += startInChunk;
            startInChunk = 0;
        }

        destEnd -= charsHere;
        remaining -= charsHere;

        // startInChunk + charsHere stays within the chunk's used region and
        // destEnd + charsHere stays within destination (see the checks above).
        ThreadSafeCopy(current.m_ChunkChars, startInChunk, destination, destEnd, charsHere);
    }
}

// Inserts count consecutive copies of value at position index, shifting the
// existing characters to make room. A null or empty value, or a count of zero,
// leaves the builder unchanged.
//
public StringBuilder Insert(int index, String value, int count)
{
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    int currentLength = Length;
    if ((uint)index > (uint)currentLength)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    if (string.IsNullOrEmpty(value) || count == 0)
    {
        return this;
    }

    // Computed as a long so that value.Length * count cannot overflow before it is checked.
    long insertingChars = (long)value.Length * count;
    if (insertingChars > MaxCapacity - this.Length)
    {
        throw new OutOfMemoryException();
    }
    Debug.Assert(insertingChars + this.Length < Int32.MaxValue);

    StringBuilder chunk;
    int indexInChunk;
    MakeRoom(index, (int)insertingChars, out chunk, out indexInChunk, false);

    unsafe
    {
        fixed (char* source = value)
        {
            // Fill the gap opened by MakeRoom, one copy of value at a time.
            for (int copy = 0; copy < count; copy++)
            {
                ReplaceInPlaceAtChunk(ref chunk, ref indexInChunk, source, value.Length);
            }
        }
    }
    return this;
}

// Removes length characters starting at startIndex.
//
public StringBuilder Remove(int startIndex, int length)
{
    if (length < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_NegativeLength);
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndex);
    }

    if (length > Length - startIndex)
    {
        throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_Index);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    bool removesEverything = Length == length && startIndex == 0;
    if (removesEverything)
    {
        // Shortcut: deleting the whole content is the same as resetting the length.
        Length = 0;
        return this;
    }

    if (length > 0)
    {
        StringBuilder chunk;
        int indexInChunk;
        Remove(startIndex, length, out chunk, out indexInChunk);
    }
    return this;
}

// Appends the result of value.ToString() for a boolean.
public StringBuilder Append(bool value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends an sbyte to this string builder.
// The capacity is adjusted as needed.
[CLSCompliant(false)]
public StringBuilder Append(sbyte value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a byte.
public StringBuilder Append(byte value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends a single character. When the current chunk still has room the character
// is stored directly; otherwise the repeat-count overload handles the expansion.
public StringBuilder Append(char value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    if (m_ChunkLength >= m_ChunkChars.Length)
    {
        Append(value, 1);
    }
    else
    {
        m_ChunkChars[m_ChunkLength++] = value;
    }
    return this;
}

// Appends the result of value.ToString() for a short.
public StringBuilder Append(short value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for an int.
public StringBuilder Append(int value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a long.
public StringBuilder Append(long value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a float.
public StringBuilder Append(float value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends a double to this string builder.
// The capacity is adjusted as needed.
public StringBuilder Append(double value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a decimal.
public StringBuilder Append(decimal value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a ushort.
[CLSCompliant(false)]
public StringBuilder Append(ushort value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a uint.
[CLSCompliant(false)]
public StringBuilder Append(uint value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for a ulong.
[CLSCompliant(false)]
public StringBuilder Append(ulong value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Append(text);
}

// Appends the result of value.ToString() for an arbitrary object.
// A null reference appends nothing.
public StringBuilder Append(Object value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    if (value != null)
    {
        return Append(value.ToString());
    }
    return this;
}

// Appends every character of the array. A null or empty array appends nothing.
public StringBuilder Append(char[] value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    if (value == null || value.Length == 0)
    {
        return this;
    }

    unsafe
    {
        fixed (char* firstChar = &value[0])
        {
            Append(firstChar, value.Length);
        }
    }
    return this;
}

// Append joined values with a separator between each value.
public unsafe StringBuilder AppendJoin<T>(char separator, params T[] values)
{
    // The core routine validates values; the separator is passed as a one-character buffer.
    char* separatorPtr = &separator;
    return AppendJoinCore(separatorPtr, 1, values);
}

// Joins an array of values with a string separator. A null separator is treated as empty.
public unsafe StringBuilder AppendJoin<T>(string separator, params T[] values)
{
    string effectiveSeparator = separator ?? string.Empty;
    fixed (char* separatorPtr = effectiveSeparator)
    {
        // The core routine validates values.
        return AppendJoinCore(separatorPtr, effectiveSeparator.Length, values);
    }
}

// Joins a sequence of values with a single-character separator.
public unsafe StringBuilder AppendJoin<T>(char separator, IEnumerable<T> values)
{
    // The core routine validates values; the separator is passed as a one-character buffer.
    char* separatorPtr = &separator;
    return AppendJoinCore(separatorPtr, 1, values);
}

// Joins a sequence of values with a string separator. A null separator is treated as empty.
public unsafe StringBuilder AppendJoin<T>(string separator, IEnumerable<T> values)
{
    string effectiveSeparator = separator ?? string.Empty;
    fixed (char* separatorPtr = effectiveSeparator)
    {
        // The core routine validates values.
        return AppendJoinCore(separatorPtr, effectiveSeparator.Length, values);
    }
}

// Array flavour of the join: appends ToString() of each non-null element, writing
// the separator before every element except the first.
// Throws ArgumentNullException when values is null.
private unsafe StringBuilder AppendJoinCore<T>(char* separator, int separatorLength, params T[] values)
{
    if (values == null)
        throw new ArgumentNullException(nameof(values));
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    for (int i = 0; i < values.Length; i++)
    {
        if (i > 0)
        {
            Append(separator, separatorLength);
        }

        T item = values[i];
        if (item != null)
        {
            Append(item.ToString());
        }
    }
    return this;
}

// Sequence flavour of the join: appends ToString() of each non-null element, writing
// the separator before every element except the first.
// Throws ArgumentNullException when values is null.
private unsafe StringBuilder AppendJoinCore<T>(char* separator, int separatorLength, IEnumerable<T> values)
{
    if (values == null)
        throw new ArgumentNullException(nameof(values));
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    using (IEnumerator<T> items = values.GetEnumerator())
    {
        bool separatorDue = false;
        while (items.MoveNext())
        {
            if (separatorDue)
            {
                Append(separator, separatorLength);
            }
            separatorDue = true;

            T item = items.Current;
            if (item != null)
            {
                Append(item.ToString());
            }
        }
    }
    return this;
}

/*====================================Insert====================================
**
==============================================================================*/

// Inserts value at position index, shifting the existing characters to make room.
// A null value leaves the builder unchanged; index is range-checked either way.
//
public StringBuilder Insert(int index, String value)
{
    if ((uint)index > (uint)Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    if (value == null)
    {
        return this;
    }

    unsafe
    {
        fixed (char* source = value)
        {
            Insert(index, source, value.Length);
        }
    }
    return this;
}

// Inserts the result of value.ToString() for a boolean at position index.
//
public StringBuilder Insert(int index, bool value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Returns a reference to the StringBuilder with ; value inserted into
// the buffer at index. Existing characters are shifted to make room for the new text.
// The capacity is adjusted as needed. If value equals String.Empty, the
// StringBuilder is not changed.
//
[CLSCompliant(false)]
public StringBuilder Insert(int index, sbyte value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a byte at position index.
//
public StringBuilder Insert(int index, byte value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a short at position index.
//
public StringBuilder Insert(int index, short value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts a single character at position index by passing its address to the
// pointer-based Insert as a one-character buffer.
public StringBuilder Insert(int index, char value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    unsafe
    {
        char* single = &value;
        Insert(index, single, 1);
    }
    return this;
}

// Returns a reference to the StringBuilder with ; value inserted into
// the buffer at index. Existing characters are shifted to make room for the new text.
// The capacity is adjusted as needed. If value equals String.Empty, the
// StringBuilder is not changed.
//
public StringBuilder Insert(int index, char[] value)
{
    if ((uint)index > (uint)Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    // A null array inserts nothing; index has already been range-checked.
    if (value == null)
    {
        return this;
    }

    Insert(index, value, 0, value.Length);
    return this;
}

// Inserts charCount characters of value, taken from startIndex onwards, at position
// index. A null array is tolerated only for the empty range
// (startIndex == 0 and charCount == 0).
public StringBuilder Insert(int index, char[] value, int startIndex, int charCount)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);

    int currentLength = Length;
    if ((uint)index > (uint)currentLength)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    if (value == null)
    {
        bool emptyRange = startIndex == 0 && charCount == 0;
        if (!emptyRange)
        {
            throw new ArgumentNullException(nameof(value), SR.ArgumentNull_String);
        }
        return this;
    }

    // Validate the requested slice of the array.
    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndex);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GenericPositive);
    }

    int lastValidStart = value.Length - charCount;
    if (startIndex > lastValidStart)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_Index);
    }

    if (charCount <= 0)
    {
        return this;
    }

    unsafe
    {
        fixed (char* source = &value[startIndex])
        {
            Insert(index, source, charCount);
        }
    }
    return this;
}

// Inserts the result of value.ToString() for an int at position index.
//
public StringBuilder Insert(int index, int value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a long at position index.
//
public StringBuilder Insert(int index, long value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a float at position index.
//
public StringBuilder Insert(int index, float value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a double at position index.
//
public StringBuilder Insert(int index, double value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a decimal at position index.
public StringBuilder Insert(int index, decimal value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a ushort at position index.
//
[CLSCompliant(false)]
public StringBuilder Insert(int index, ushort value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a uint at position index.
//
[CLSCompliant(false)]
public StringBuilder Insert(int index, uint value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Inserts the result of value.ToString() for a ulong at position index.
//
[CLSCompliant(false)]
public StringBuilder Insert(int index, ulong value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    string text = value.ToString();
    return Insert(index, text, 1);
}

// Returns a reference to this string builder with value inserted into
// the buffer at index. Existing characters are shifted to make room for the
// new text. The capacity is adjusted as needed. If value equals String.Empty, the
// StringBuilder is not changed. No changes are made if value is null.
//
public StringBuilder Insert(int index, Object value)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    if (null == value)
    {
        // Inserting a null reference is a no-op.
        return this;
    }
    return Insert(index, value.ToString(), 1);
}

// Appends format with its single argument hole filled from arg0.
// No format provider is supplied (null is passed to the helper).
public StringBuilder AppendFormat(String format, Object arg0)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(null, format, new ParamsArray(arg0));
}

// Appends format with argument holes filled from arg0 and arg1.
public StringBuilder AppendFormat(String format, Object arg0, Object arg1)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(null, format, new ParamsArray(arg0, arg1));
}

// Appends format with argument holes filled from arg0, arg1 and arg2.
public StringBuilder AppendFormat(String format, Object arg0, Object arg1, Object arg2)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(null, format, new ParamsArray(arg0, arg1, arg2));
}

// Appends format with argument holes filled from the args array.
// Throws ArgumentNullException when args is null; the exception names format
// instead when format is null as well.
public StringBuilder AppendFormat(String format, params Object[] args)
{
    if (args == null)
    {
        // To preserve the original exception behavior, throw an exception about format if both
        // args and format are null. The actual null check for format is in AppendFormatHelper.
        throw new ArgumentNullException((format == null) ? nameof(format) : nameof(args));
    }
    // The postcondition describes this method's own return value, which is a StringBuilder.
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    return AppendFormatHelper(null, format, new ParamsArray(args));
}

// Same as AppendFormat(format, arg0), but passes provider on to the formatting helper.
public StringBuilder AppendFormat(IFormatProvider provider, String format, Object arg0)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(provider, format, new ParamsArray(arg0));
}

// Same as AppendFormat(format, arg0, arg1), but passes provider on to the formatting helper.
public StringBuilder AppendFormat(IFormatProvider provider, String format, Object arg0, Object arg1)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(provider, format, new ParamsArray(arg0, arg1));
}

// Same as AppendFormat(format, arg0, arg1, arg2), but passes provider on to the formatting helper.
public StringBuilder AppendFormat(IFormatProvider provider, String format, Object arg0, Object arg1, Object arg2)
{
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    return AppendFormatHelper(provider, format, new ParamsArray(arg0, arg1, arg2));
}

// Same as AppendFormat(format, args), but passes provider on to the formatting helper.
// Throws ArgumentNullException when args is null; the exception names format
// instead when format is null as well.
public StringBuilder AppendFormat(IFormatProvider provider, String format, params Object[] args)
{
    if (args == null)
    {
        // To preserve the original exception behavior, throw an exception about format if both
        // args and format are null. The actual null check for format is in AppendFormatHelper.
        throw new ArgumentNullException((format == null) ? nameof(format) : nameof(args));
    }
    // The postcondition describes this method's own return value, which is a StringBuilder.
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    return AppendFormatHelper(provider, format, new ParamsArray(args));
}

// Single throw site for malformed format strings; always throws FormatException.
private static void FormatError()
{
    throw new FormatException(SR.Format_InvalidString);
}

// undocumented exclusive limits on the range for Argument Hole Index and Argument Hole Alignment.
private const int Index_Limit = 1000000; // Note: 0 <= ArgIndex < Index_Limit
private const int Width_Limit = 1000000; // Note: -Width_Limit < ArgAlign < Width_Limit

// Parses the composite format string and appends the formatted result to this builder.
// Literal text is appended character by character ("{{" and "}}" yield a single brace).
// Each argument hole has the shape { Index (, Alignment)? (: Formatting)? }.
// The text for a hole comes from, in order: the ICustomFormatter obtained from provider
// (if any, and if it returns non-null), IFormattable.ToString(itemFormat, provider) when
// the argument implements IFormattable, otherwise arg.ToString(); a null result becomes
// String.Empty. The text is then padded with spaces up to the requested width.
//
// Throws ArgumentNullException when format is null, and FormatException when the format
// string is malformed or a hole's index is not less than args.Length.
internal StringBuilder AppendFormatHelper(IFormatProvider provider, String format, ParamsArray args)
{
    if (format == null)
    {
        throw new ArgumentNullException(nameof(format));
    }
    Contract.Ensures(Contract.Result<StringBuilder>() != null);
    Contract.EndContractBlock();

    int pos = 0;
    int len = format.Length;
    char ch = '\x0';
    // Scratch builder for item formats containing escaped braces; created lazily and
    // cleared after each use so it can be reused by later holes.
    StringBuilder unescapedItemFormat = null;

    ICustomFormatter cf = null;
    if (provider != null)
    {
        cf = (ICustomFormatter)provider.GetFormat(typeof(ICustomFormatter));
    }

    while (true)
    {
        // Copy literal text until the opening brace of an argument hole or the end of the string.
        while (pos < len)
        {
            ch = format[pos];

            pos++;
            // Is it a closing brace?
            if (ch == '}')
            {
                // Check next character (if there is one) to see if it is escaped. eg }}
                if (pos < len && format[pos] == '}')
                    pos++;
                else
                    // Otherwise treat it as an error (Mismatched closing brace)
                    FormatError();
            }
            // Is it an opening brace?
            if (ch == '{')
            {
                // Check next character (if there is one) to see if it is escaped. eg {{
                if (pos < len && format[pos] == '{')
                    pos++;
                else
                {
                    // Otherwise treat it as the opening brace of an Argument Hole.
                    pos--;
                    break;
                }
            }
            // If it is neither then treat the character as just text.
            Append(ch);
        }

        //
        // Start of parsing of Argument Hole.
        // Argument Hole ::= { Index (, WS* Alignment WS*)? (: Formatting)? }
        //
        if (pos == len) break;

        //
        // Start of parsing required Index parameter.
        // Index ::= ('0'-'9')+ WS*
        //
        pos++;
        // If reached end of text then error (Unexpected end of text)
        // or character is not a digit then error (Unexpected Character)
        if (pos == len || (ch = format[pos]) < '0' || ch > '9') FormatError();
        int index = 0;
        do
        {
            index = index * 10 + ch - '0';
            pos++;
            // If reached end of text then error (Unexpected end of text)
            if (pos == len) FormatError();
            ch = format[pos];
            // so long as character is digit and value of the index is less than 1000000 ( index limit )
        } while (ch >= '0' && ch <= '9' && index < Index_Limit);

        // If value of index is not within the range of the arguments passed in then error (Index out of range)
        if (index >= args.Length) throw new FormatException(SR.Format_IndexOutOfRange);

        // Consume optional whitespace.
        while (pos < len && (ch = format[pos]) == ' ') pos++;
        // End of parsing index parameter.

        //
        // Start of parsing of optional Alignment
        // Alignment ::= comma WS* minus? ('0'-'9')+ WS*
        //
        bool leftJustify = false;
        int width = 0;
        // Is the character a comma, which indicates the start of alignment parameter.
        if (ch == ',')
        {
            pos++;

            // Consume Optional whitespace
            while (pos < len && format[pos] == ' ') pos++;

            // If reached the end of the text then error (Unexpected end of text)
            if (pos == len) FormatError();

            // Is there a minus sign?
            ch = format[pos];
            if (ch == '-')
            {
                // Yes, then alignment is left justified.
                leftJustify = true;
                pos++;
                // If reached end of text then error (Unexpected end of text)
                if (pos == len) FormatError();
                ch = format[pos];
            }

            // If current character is not a digit then error (Unexpected character)
            if (ch < '0' || ch > '9') FormatError();
            // Parse alignment digits.
            do
            {
                width = width * 10 + ch - '0';
                pos++;
                // If reached end of text then error. (Unexpected end of text)
                if (pos == len) FormatError();
                ch = format[pos];
                // So long as current character is a digit and the value of width is less than 1000000 ( width limit )
            } while (ch >= '0' && ch <= '9' && width < Width_Limit);
            // end of parsing Argument Alignment
        }

        // Consume optional whitespace
        while (pos < len && (ch = format[pos]) == ' ') pos++;

        //
        // Start of parsing of optional formatting parameter.
        //
        Object arg = args[index];
        String itemFormat = null;
        // Is current character a colon? which indicates start of formatting parameter.
        if (ch == ':')
        {
            pos++;
            // startPos marks the beginning of the item-format segment not yet copied
            // into unescapedItemFormat.
            int startPos = pos;

            while (true)
            {
                // If reached end of text then error. (Unexpected end of text)
                if (pos == len) FormatError();
                ch = format[pos];
                pos++;

                // Is character an opening or closing brace?
                if (ch == '}' || ch == '{')
                {
                    if (ch == '{')
                    {
                        // Yes, is next character also an opening brace, then treat as escaped. eg {{
                        if (pos < len && format[pos] == '{')
                            pos++;
                        else
                            // Error Argument Holes can not be nested.
                            FormatError();
                    }
                    else
                    {
                        // Yes, is next character also a closing brace, then treat as escaped. eg }}
                        if (pos < len && format[pos] == '}')
                            pos++;
                        else
                        {
                            // No, then treat it as the closing brace of an Arg Hole.
                            pos--;
                            break;
                        }
                    }

                    // Reaching here means the brace has been escaped
                    // so we need to build up the format string in segments
                    if (unescapedItemFormat == null)
                    {
                        unescapedItemFormat = new StringBuilder();
                    }
                    // Copy the segment up to and including one brace of the escaped pair
                    // (the "- 1" drops the second brace).
                    unescapedItemFormat.Append(format, startPos, pos - startPos - 1);
                    startPos = pos;
                }
            }

            if (unescapedItemFormat == null || unescapedItemFormat.Length == 0)
            {
                if (startPos != pos)
                {
                    // There was no brace escaping, extract the item format as a single string
                    itemFormat = format.Substring(startPos, pos - startPos);
                }
            }
            else
            {
                // Add the final segment, take the result, and reset the scratch builder for the next hole.
                unescapedItemFormat.Append(format, startPos, pos - startPos);
                itemFormat = unescapedItemFormat.ToString();
                unescapedItemFormat.Clear();
            }
        }
        // If current character is not a closing brace then error. (Unexpected Character)
        if (ch != '}') FormatError();
        // Construct the output for this arg hole.
        pos++;
        String s = null;
        if (cf != null)
        {
            s = cf.Format(itemFormat, arg, provider);
        }

        // Fall back to the argument's own formatting when there is no custom formatter
        // or it returned null.
        if (s == null)
        {
            IFormattable formattableArg = arg as IFormattable;

            if (formattableArg != null)
            {
                s = formattableArg.ToString(itemFormat, provider);
            }
            else if (arg != null)
            {
                s = arg.ToString();
            }
        }
        // Append it to the final output of the Format String.
        if (s == null) s = String.Empty;
        int pad = width - s.Length;
        // Right-justified text gets its padding first; left-justified text gets it afterwards.
        if (!leftJustify && pad > 0) Append(' ', pad);
        Append(s);
        if (leftJustify && pad > 0) Append(' ', pad);
        // Continue to parse other characters.
    }
    return this;
}

// Returns a reference to the current StringBuilder with all instances of oldString
// replaced with newString. If startIndex and count are specified,
// we only replace strings completely contained in the range of startIndex to startIndex +
// count. The strings to be replaced are checked on an ordinal basis (e.g. not culture aware). If
// newValue is null, instances of oldValue are removed (e.g. replaced with nothing.).
+ // + public StringBuilder Replace(String oldValue, String newValue) + { + Contract.Ensures(Contract.Result<StringBuilder>() != null); + return Replace(oldValue, newValue, 0, Length); + } + + public bool Equals(StringBuilder sb) + { + if (sb == null) + return false; + if (Capacity != sb.Capacity || MaxCapacity != sb.MaxCapacity || Length != sb.Length) + return false; + if (sb == this) + return true; + + StringBuilder thisChunk = this; + int thisChunkIndex = thisChunk.m_ChunkLength; + StringBuilder sbChunk = sb; + int sbChunkIndex = sbChunk.m_ChunkLength; + for (;;) + { + // Decrement the pointer to the 'this' StringBuilder + --thisChunkIndex; + --sbChunkIndex; + + while (thisChunkIndex < 0) + { + thisChunk = thisChunk.m_ChunkPrevious; + if (thisChunk == null) + break; + thisChunkIndex = thisChunk.m_ChunkLength + thisChunkIndex; + } + + // Decrement the pointer to the 'this' StringBuilder + while (sbChunkIndex < 0) + { + sbChunk = sbChunk.m_ChunkPrevious; + if (sbChunk == null) + break; + sbChunkIndex = sbChunk.m_ChunkLength + sbChunkIndex; + } + + if (thisChunkIndex < 0) + return sbChunkIndex < 0; + if (sbChunkIndex < 0) + return false; + if (thisChunk.m_ChunkChars[thisChunkIndex] != sbChunk.m_ChunkChars[sbChunkIndex]) + return false; + } + } + + public StringBuilder Replace(String oldValue, String newValue, int startIndex, int count) + { + Contract.Ensures(Contract.Result<StringBuilder>() != null); + + int currentLength = Length; + if ((uint)startIndex > (uint)currentLength) + { + throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_Index); + } + if (count < 0 || startIndex > currentLength - count) + { + throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_Index); + } + if (oldValue == null) + { + throw new ArgumentNullException(nameof(oldValue)); + } + if (oldValue.Length == 0) + { + throw new ArgumentException(SR.Argument_EmptyName, nameof(oldValue)); + } + + if (newValue == null) + newValue = ""; + + int 
deltaLength = newValue.Length - oldValue.Length; + + int[] replacements = null; // A list of replacement positions in a chunk to apply + int replacementsCount = 0; + + // Find the chunk, indexInChunk for the starting point + StringBuilder chunk = FindChunkForIndex(startIndex); + int indexInChunk = startIndex - chunk.m_ChunkOffset; + while (count > 0) + { + // Look for a match in the chunk,indexInChunk pointer + if (StartsWith(chunk, indexInChunk, count, oldValue)) + { + // Push it on my replacements array (with growth), we will do all replacements in a + // given chunk in one operation below (see ReplaceAllInChunk) so we don't have to slide + // many times. + if (replacements == null) + replacements = new int[5]; + else if (replacementsCount >= replacements.Length) + { + Array.Resize(ref replacements, replacements.Length * 3 / 2 + 4); // grow by 1.5X but more in the beginning + } + replacements[replacementsCount++] = indexInChunk; + indexInChunk += oldValue.Length; + count -= oldValue.Length; + } + else + { + indexInChunk++; + --count; + } + + if (indexInChunk >= chunk.m_ChunkLength || count == 0) // Have we moved out of the current chunk + { + // Replacing mutates the blocks, so we need to convert to logical index and back afterward. + int index = indexInChunk + chunk.m_ChunkOffset; + int indexBeforeAdjustment = index; + + // See if we accumulated any replacements, if so apply them + ReplaceAllInChunk(replacements, replacementsCount, chunk, oldValue.Length, newValue); + // The replacement has affected the logical index. Adjust it. + index += ((newValue.Length - oldValue.Length) * replacementsCount); + replacementsCount = 0; + + chunk = FindChunkForIndex(index); + indexInChunk = index - chunk.m_ChunkOffset; + Debug.Assert(chunk != null || count == 0, "Chunks ended prematurely"); + } + } + VerifyClassInvariant(); + return this; + } + + // Returns a StringBuilder with all instances of oldChar replaced with + // newChar. 
The size of the StringBuilder is unchanged because we're only + // replacing characters. If startIndex and count are specified, we + // only replace characters in the range from startIndex to startIndex+count + // + public StringBuilder Replace(char oldChar, char newChar) + { + return Replace(oldChar, newChar, 0, Length); + } + public StringBuilder Replace(char oldChar, char newChar, int startIndex, int count) + { + Contract.Ensures(Contract.Result<StringBuilder>() != null); + + int currentLength = Length; + if ((uint)startIndex > (uint)currentLength) + { + throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_Index); + } + + if (count < 0 || startIndex > currentLength - count) + { + throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_Index); + } + + int endIndex = startIndex + count; + StringBuilder chunk = this; + for (;;) + { + int endIndexInChunk = endIndex - chunk.m_ChunkOffset; + int startIndexInChunk = startIndex - chunk.m_ChunkOffset; + if (endIndexInChunk >= 0) + { + int curInChunk = Math.Max(startIndexInChunk, 0); + int endInChunk = Math.Min(chunk.m_ChunkLength, endIndexInChunk); + while (curInChunk < endInChunk) + { + if (chunk.m_ChunkChars[curInChunk] == oldChar) + chunk.m_ChunkChars[curInChunk] = newChar; + curInChunk++; + } + } + if (startIndexInChunk >= 0) + break; + chunk = chunk.m_ChunkPrevious; + } + return this; + } + + /// <summary> + /// Appends 'value' of length 'count' to the stringBuilder. + /// </summary> + [CLSCompliant(false)] + public unsafe StringBuilder Append(char* value, int valueCount) + { + // We don't check null value as this case will throw null reference exception anyway + if (valueCount < 0) + { + throw new ArgumentOutOfRangeException(nameof(valueCount), SR.ArgumentOutOfRange_NegativeCount); + } + + // this is where we can check if the valueCount will put us over m_MaxCapacity + // We are doing the check here to prevent the corruption of the StringBuilder. 
+ int newLength = Length + valueCount; + if (newLength > m_MaxCapacity || newLength < valueCount) + { + throw new ArgumentOutOfRangeException(nameof(valueCount), SR.ArgumentOutOfRange_LengthGreaterThanCapacity); + } + + // This case is so common we want to optimize for it heavily. + int newIndex = valueCount + m_ChunkLength; + if (newIndex <= m_ChunkChars.Length) + { + ThreadSafeCopy(value, m_ChunkChars, m_ChunkLength, valueCount); + m_ChunkLength = newIndex; + } + else + { + // Copy the first chunk + int firstLength = m_ChunkChars.Length - m_ChunkLength; + if (firstLength > 0) + { + ThreadSafeCopy(value, m_ChunkChars, m_ChunkLength, firstLength); + m_ChunkLength = m_ChunkChars.Length; + } + + // Expand the builder to add another chunk. + int restLength = valueCount - firstLength; + ExpandByABlock(restLength); + Debug.Assert(m_ChunkLength == 0, "Expand did not make a new block"); + + // Copy the second chunk + ThreadSafeCopy(value + firstLength, m_ChunkChars, 0, restLength); + m_ChunkLength = restLength; + } + VerifyClassInvariant(); + return this; + } + + /// <summary> + /// Inserts 'value' of length 'cou + /// </summary> + unsafe private void Insert(int index, char* value, int valueCount) + { + if ((uint)index > (uint)Length) + { + throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index); + } + + if (valueCount > 0) + { + StringBuilder chunk; + int indexInChunk; + MakeRoom(index, valueCount, out chunk, out indexInChunk, false); + ReplaceInPlaceAtChunk(ref chunk, ref indexInChunk, value, valueCount); + } + } + + /// <summary> + /// 'replacements' is a list of index (relative to the begining of the 'chunk' to remove + /// 'removeCount' characters and replace them with 'value'. This routine does all those + /// replacements in bulk (and therefore very efficiently. + /// with the string 'value'. 
+ /// </summary> + private void ReplaceAllInChunk(int[] replacements, int replacementsCount, StringBuilder sourceChunk, int removeCount, string value) + { + if (replacementsCount <= 0) + return; + + unsafe + { + fixed (char* valuePtr = value) + { + // calculate the total amount of extra space or space needed for all the replacements. + int delta = (value.Length - removeCount) * replacementsCount; + + StringBuilder targetChunk = sourceChunk; // the target as we copy chars down + int targetIndexInChunk = replacements[0]; + + // Make the room needed for all the new characters if needed. + if (delta > 0) + MakeRoom(targetChunk.m_ChunkOffset + targetIndexInChunk, delta, out targetChunk, out targetIndexInChunk, true); + // We made certain that characters after the insertion point are not moved, + int i = 0; + for (;;) + { + // Copy in the new string for the ith replacement + ReplaceInPlaceAtChunk(ref targetChunk, ref targetIndexInChunk, valuePtr, value.Length); + int gapStart = replacements[i] + removeCount; + i++; + if (i >= replacementsCount) + break; + + int gapEnd = replacements[i]; + Debug.Assert(gapStart < sourceChunk.m_ChunkChars.Length, "gap starts at end of buffer. Should not happen"); + Debug.Assert(gapStart <= gapEnd, "negative gap size"); + Debug.Assert(gapEnd <= sourceChunk.m_ChunkLength, "gap too big"); + if (delta != 0) // can skip the sliding of gaps if source an target string are the same size. + { + // Copy the gap data between the current replacement and the the next replacement + fixed (char* sourcePtr = &sourceChunk.m_ChunkChars[gapStart]) + ReplaceInPlaceAtChunk(ref targetChunk, ref targetIndexInChunk, sourcePtr, gapEnd - gapStart); + } + else + { + targetIndexInChunk += gapEnd - gapStart; + Debug.Assert(targetIndexInChunk <= targetChunk.m_ChunkLength, "gap not in chunk"); + } + } + + // Remove extra space if necessary. 
+ if (delta < 0) + Remove(targetChunk.m_ChunkOffset + targetIndexInChunk, -delta, out targetChunk, out targetIndexInChunk); + } + } + } + + /// <summary> + /// Returns true if the string that is starts at 'chunk' and 'indexInChunk, and has a logical + /// length of 'count' starts with the string 'value'. + /// </summary> + private bool StartsWith(StringBuilder chunk, int indexInChunk, int count, string value) + { + for (int i = 0; i < value.Length; i++) + { + if (count == 0) + return false; + if (indexInChunk >= chunk.m_ChunkLength) + { + chunk = Next(chunk); + if (chunk == null) + return false; + indexInChunk = 0; + } + + // See if there no match, break out of the inner for loop + if (value[i] != chunk.m_ChunkChars[indexInChunk]) + return false; + + indexInChunk++; + --count; + } + return true; + } + + /// <summary> + /// ReplaceInPlaceAtChunk is the logical equivalent of 'memcpy'. Given a chunk and ann index in + /// that chunk, it copies in 'count' characters from 'value' and updates 'chunk, and indexInChunk to + /// point at the end of the characters just copyied (thus you can splice in strings from multiple + /// places by calling this mulitple times. + /// </summary> + unsafe private void ReplaceInPlaceAtChunk(ref StringBuilder chunk, ref int indexInChunk, char* value, int count) + { + if (count != 0) + { + for (;;) + { + int lengthInChunk = chunk.m_ChunkLength - indexInChunk; + Debug.Assert(lengthInChunk >= 0, "index not in chunk"); + + int lengthToCopy = Math.Min(lengthInChunk, count); + ThreadSafeCopy(value, chunk.m_ChunkChars, indexInChunk, lengthToCopy); + + // Advance the index. + indexInChunk += lengthToCopy; + if (indexInChunk >= chunk.m_ChunkLength) + { + chunk = Next(chunk); + indexInChunk = 0; + } + count -= lengthToCopy; + if (count == 0) + break; + value += lengthToCopy; + } + } + } + + /// <summary> + /// We have to prevent modification off the end of an array. 
+ /// The only way to do this is to copy all interesting variables out of the heap and then do the + /// bounds check. This is what we do here. + /// </summary> + private static unsafe void ThreadSafeCopy(char* sourcePtr, char[] destination, int destinationIndex, int count) + { + if (count > 0) + { + if ((uint)destinationIndex <= (uint)destination.Length && (destinationIndex + count) <= destination.Length) + { + fixed (char* destinationPtr = &destination[destinationIndex]) + string.wstrcpy(destinationPtr, sourcePtr, count); + } + else + { + throw new ArgumentOutOfRangeException(nameof(destinationIndex), SR.ArgumentOutOfRange_Index); + } + } + } + + private static void ThreadSafeCopy(char[] source, int sourceIndex, char[] destination, int destinationIndex, int count) + { + if (count > 0) + { + if ((uint)sourceIndex <= (uint)source.Length && (sourceIndex + count) <= source.Length) + { + unsafe + { + fixed (char* sourcePtr = &source[sourceIndex]) + ThreadSafeCopy(sourcePtr, destination, destinationIndex, count); + } + } + else + { + throw new ArgumentOutOfRangeException(nameof(sourceIndex), SR.ArgumentOutOfRange_Index); + } + } + } + + /// <summary> + /// Finds the chunk for the logical index (number of characters in the whole stringbuilder) 'index' + /// YOu can then get the offset in this chunk by subtracting the m_BlockOffset field from 'index' + /// </summary> + /// <param name="index"></param> + /// <returns></returns> + private StringBuilder FindChunkForIndex(int index) + { + Debug.Assert(0 <= index && index <= Length, "index not in string"); + + StringBuilder ret = this; + while (ret.m_ChunkOffset > index) + ret = ret.m_ChunkPrevious; + + Debug.Assert(ret != null, "index not in string"); + return ret; + } + + /// <summary> + /// Finds the chunk for the logical byte index 'byteIndex' + /// </summary> + /// <param name="index"></param> + /// <returns></returns> + private StringBuilder FindChunkForByte(int byteIndex) + { + Debug.Assert(0 <= byteIndex && byteIndex 
<= Length * sizeof(char), "Byte Index not in string"); + + StringBuilder ret = this; + while (ret.m_ChunkOffset * sizeof(char) > byteIndex) + ret = ret.m_ChunkPrevious; + + Debug.Assert(ret != null, "Byte Index not in string"); + return ret; + } + + /// <summary> + /// Finds the chunk that logically follows the 'chunk' chunk. Chunks only persist the pointer to + /// the chunk that is logically before it, so this routine has to start at the this pointer (which + /// is a assumed to point at the chunk representing the whole stringbuilder) and search + /// until it finds the current chunk (thus is O(n)). So it is more expensive than a field fetch! + /// </summary> + private StringBuilder Next(StringBuilder chunk) + { + if (chunk == this) + return null; + return FindChunkForIndex(chunk.m_ChunkOffset + chunk.m_ChunkLength); + } + + /// <summary> + /// Assumes that 'this' is the last chunk in the list and that it is full. Upon return the 'this' + /// block is updated so that it is a new block that has at least 'minBlockCharCount' characters. + /// that can be used to copy characters into it. + /// </summary> + private void ExpandByABlock(int minBlockCharCount) + { + Contract.Requires(Capacity == Length, "Expand expect to be called only when there is no space left"); // We are currently full + Contract.Requires(minBlockCharCount > 0, "Expansion request must be positive"); + + VerifyClassInvariant(); + + if ((minBlockCharCount + Length) > m_MaxCapacity || minBlockCharCount + Length < minBlockCharCount) + throw new ArgumentOutOfRangeException("requiredLength", SR.ArgumentOutOfRange_SmallCapacity); + + // Compute the length of the new block we need + // We make the new chunk at least big enough for the current need (minBlockCharCount) + // But also as big as the current length (thus doubling capacity), up to a maximum + // (so we stay in the small object heap, and never allocate really big chunks even if + // the string gets really big. 
+ int newBlockLength = Math.Max(minBlockCharCount, Math.Min(Length, MaxChunkSize)); + + // Copy the current block to the new block, and initialize this to point at the new buffer. + m_ChunkPrevious = new StringBuilder(this); + m_ChunkOffset += m_ChunkLength; + m_ChunkLength = 0; + + // Check for integer overflow (logical buffer size > int.MaxInt) + if (m_ChunkOffset + newBlockLength < newBlockLength) + { + m_ChunkChars = null; + throw new OutOfMemoryException(); + } + m_ChunkChars = new char[newBlockLength]; + + VerifyClassInvariant(); + } + + /// <summary> + /// Used by ExpandByABlock to create a new chunk. The new chunk is a copied from 'from' + /// In particular the buffer is shared. It is expected that 'from' chunk (which represents + /// the whole list, is then updated to point to point to this new chunk. + /// </summary> + private StringBuilder(StringBuilder from) + { + m_ChunkLength = from.m_ChunkLength; + m_ChunkOffset = from.m_ChunkOffset; + m_ChunkChars = from.m_ChunkChars; + m_ChunkPrevious = from.m_ChunkPrevious; + m_MaxCapacity = from.m_MaxCapacity; + VerifyClassInvariant(); + } + + /// <summary> + /// Creates a gap of size 'count' at the logical offset (count of characters in the whole string + /// builder) 'index'. It returns the 'chunk' and 'indexInChunk' which represents a pointer to + /// this gap that was just created. You can then use 'ReplaceInPlaceAtChunk' to fill in the + /// chunk + /// + /// ReplaceAllChunks relies on the fact that indexes above 'index' are NOT moved outside 'chunk' + /// by this process (because we make the space by creating the cap BEFORE the chunk). If we + /// change this ReplaceAllChunks needs to be updated. 
+ /// + /// If dontMoveFollowingChars is true, then the room must be made by inserting a chunk BEFORE the + /// current chunk (this is what it does most of the time anyway) + /// </summary> + private void MakeRoom(int index, int count, out StringBuilder chunk, out int indexInChunk, bool doneMoveFollowingChars) + { + VerifyClassInvariant(); + Debug.Assert(count > 0, "Count must be strictly positive"); + Debug.Assert(index >= 0, "Index can't be negative"); + if (count + Length > m_MaxCapacity || count + Length < count) + throw new ArgumentOutOfRangeException("requiredLength", SR.ArgumentOutOfRange_SmallCapacity); + + chunk = this; + while (chunk.m_ChunkOffset > index) + { + chunk.m_ChunkOffset += count; + chunk = chunk.m_ChunkPrevious; + } + indexInChunk = index - chunk.m_ChunkOffset; + + // Cool, we have some space in this block, and you don't have to copy much to get it, go ahead + // and use it. This happens typically when you repeatedly insert small strings at a spot + // (typically the absolute front) of the buffer. + if (!doneMoveFollowingChars && chunk.m_ChunkLength <= DefaultCapacity * 2 && chunk.m_ChunkChars.Length - chunk.m_ChunkLength >= count) + { + for (int i = chunk.m_ChunkLength; i > indexInChunk;) + { + --i; + chunk.m_ChunkChars[i + count] = chunk.m_ChunkChars[i]; + } + chunk.m_ChunkLength += count; + return; + } + + // Allocate space for the new chunk (will go before this one) + StringBuilder newChunk = new StringBuilder(Math.Max(count, DefaultCapacity), chunk.m_MaxCapacity, chunk.m_ChunkPrevious); + newChunk.m_ChunkLength = count; + + // Copy the head of the buffer to the new buffer. + int copyCount1 = Math.Min(count, indexInChunk); + if (copyCount1 > 0) + { + unsafe + { + fixed (char* chunkCharsPtr = &chunk.m_ChunkChars[0]) + { + ThreadSafeCopy(chunkCharsPtr, newChunk.m_ChunkChars, 0, copyCount1); + + // Slide characters in the current buffer over to make room. 
+ int copyCount2 = indexInChunk - copyCount1; + if (copyCount2 >= 0) + { + ThreadSafeCopy(chunkCharsPtr + copyCount1, chunk.m_ChunkChars, 0, copyCount2); + indexInChunk = copyCount2; + } + } + } + } + + chunk.m_ChunkPrevious = newChunk; // Wire in the new chunk + chunk.m_ChunkOffset += count; + if (copyCount1 < count) + { + chunk = newChunk; + indexInChunk = copyCount1; + } + + VerifyClassInvariant(); + } + + /// <summary> + /// Used by MakeRoom to allocate another chunk. + /// </summary> + private StringBuilder(int size, int maxCapacity, StringBuilder previousBlock) + { + Debug.Assert(size > 0, "size not positive"); + Debug.Assert(maxCapacity > 0, "maxCapacity not positive"); + m_ChunkChars = new char[size]; + m_MaxCapacity = maxCapacity; + m_ChunkPrevious = previousBlock; + if (previousBlock != null) + m_ChunkOffset = previousBlock.m_ChunkOffset + previousBlock.m_ChunkLength; + VerifyClassInvariant(); + } + + /// <summary> + /// Removes 'count' characters from the logical index 'startIndex' and returns the chunk and + /// index in the chunk of that logical index in the out parameters. + /// </summary> + private void Remove(int startIndex, int count, out StringBuilder chunk, out int indexInChunk) + { + VerifyClassInvariant(); + Debug.Assert(startIndex >= 0 && startIndex < Length, "startIndex not in string"); + + int endIndex = startIndex + count; + + // Find the chunks for the start and end of the block to delete. 
+ chunk = this; + StringBuilder endChunk = null; + int endIndexInChunk = 0; + for (;;) + { + if (endIndex - chunk.m_ChunkOffset >= 0) + { + if (endChunk == null) + { + endChunk = chunk; + endIndexInChunk = endIndex - endChunk.m_ChunkOffset; + } + if (startIndex - chunk.m_ChunkOffset >= 0) + { + indexInChunk = startIndex - chunk.m_ChunkOffset; + break; + } + } + else + { + chunk.m_ChunkOffset -= count; + } + chunk = chunk.m_ChunkPrevious; + } + Debug.Assert(chunk != null, "fell off beginning of string!"); + + int copyTargetIndexInChunk = indexInChunk; + int copyCount = endChunk.m_ChunkLength - endIndexInChunk; + if (endChunk != chunk) + { + copyTargetIndexInChunk = 0; + // Remove the characters after startIndex to end of the chunk + chunk.m_ChunkLength = indexInChunk; + + // Remove the characters in chunks between start and end chunk + endChunk.m_ChunkPrevious = chunk; + endChunk.m_ChunkOffset = chunk.m_ChunkOffset + chunk.m_ChunkLength; + + // If the start is 0 then we can throw away the whole start chunk + if (indexInChunk == 0) + { + endChunk.m_ChunkPrevious = chunk.m_ChunkPrevious; + chunk = endChunk; + } + } + endChunk.m_ChunkLength -= (endIndexInChunk - copyTargetIndexInChunk); + + // SafeCritical: We ensure that endIndexInChunk + copyCount is within range of m_ChunkChars and + // also ensure that copyTargetIndexInChunk + copyCount is within the chunk + // + // Remove any characters in the end chunk, by sliding the characters down. 
+ if (copyTargetIndexInChunk != endIndexInChunk) // Sometimes no move is necessary + ThreadSafeCopy(endChunk.m_ChunkChars, endIndexInChunk, endChunk.m_ChunkChars, copyTargetIndexInChunk, copyCount); + + Debug.Assert(chunk != null, "fell off beginning of string!"); + VerifyClassInvariant(); + } + } +} diff --git a/src/mscorlib/shared/System/Text/UTF32Encoding.cs b/src/mscorlib/shared/System/Text/UTF32Encoding.cs new file mode 100644 index 0000000000..e4cd6c960e --- /dev/null +++ b/src/mscorlib/shared/System/Text/UTF32Encoding.cs @@ -0,0 +1,1234 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +// +// Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused. +// + +using System; +using System.Diagnostics; +using System.Diagnostics.Contracts; +using System.Globalization; + +namespace System.Text +{ + // Encodes text into and out of UTF-32. UTF-32 is a way of writing + // Unicode characters with a single storage unit (32 bits) per character, + // + // The UTF-32 byte order mark is simply the Unicode byte order mark + // (0x00FEFF) written in UTF-32 (0x0000FEFF or 0xFFFE0000). The byte order + // mark is used mostly to distinguish UTF-32 text from other encodings, and doesn't + // switch the byte orderings. 
+ + [Serializable] + public sealed class UTF32Encoding : Encoding + { + /* + words bits UTF-32 representation + ----- ---- ----------------------------------- + 1 16 00000000 00000000 xxxxxxxx xxxxxxxx + 2 21 00000000 000xxxxx hhhhhhll llllllll + ----- ---- ----------------------------------- + + Surrogate: + Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000 + */ + + // Used by Encoding.UTF32/BigEndianUTF32 for lazy initialization + // The initialization code will not be run until a static member of the class is referenced + internal static readonly UTF32Encoding s_default = new UTF32Encoding(bigEndian: false, byteOrderMark: true); + internal static readonly UTF32Encoding s_bigEndianDefault = new UTF32Encoding(bigEndian: true, byteOrderMark: true); + + private bool _emitUTF32ByteOrderMark = false; + private bool _isThrowException = false; + private bool _bigEndian = false; + + + public UTF32Encoding() : this(false, true, false) + { + } + + + public UTF32Encoding(bool bigEndian, bool byteOrderMark) : + this(bigEndian, byteOrderMark, false) + { + } + + + public UTF32Encoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidCharacters) : + base(bigEndian ? 
12001 : 12000) + { + _bigEndian = bigEndian; + _emitUTF32ByteOrderMark = byteOrderMark; + _isThrowException = throwOnInvalidCharacters; + + // Encoding constructor already did this, but it'll be wrong if we're throwing exceptions + if (_isThrowException) + SetDefaultFallbacks(); + } + + internal override void SetDefaultFallbacks() + { + // For UTF-X encodings, we use a replacement fallback with an empty string + if (_isThrowException) + { + this.encoderFallback = EncoderFallback.ExceptionFallback; + this.decoderFallback = DecoderFallback.ExceptionFallback; + } + else + { + this.encoderFallback = new EncoderReplacementFallback("\xFFFD"); + this.decoderFallback = new DecoderReplacementFallback("\xFFFD"); + } + } + + + // The following methods are copied from EncodingNLS.cs. + // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimpliment them here. + // These should be kept in sync for the following classes: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + // Returns the number of bytes required to encode a range of characters in + // a character array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(char[] chars, int index, int count) + { + // Validate input parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? 
"index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - index < count) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input, return 0, avoid fixed empty array problem + if (count == 0) + return 0; + + // Just call the pointer version + fixed (char* pChars = chars) + return GetByteCount(pChars + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(String s) + { + // Validate input + if (s==null) + throw new ArgumentNullException("s"); + Contract.EndContractBlock(); + + fixed (char* pChars = s) + return GetByteCount(pChars, s.Length, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetByteCount(char* chars, int count) + { + // Validate Parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (count < 0) + throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Call it with empty encoder + return GetByteCount(chars, count, null); + } + + // Parent method is safe. + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + public override unsafe int GetBytes(String s, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + if (s == null || bytes == null) + throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (s.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + int byteCount = bytes.Length - byteIndex; + + // Fix our input array if 0 length because fixed doesn't like 0 length arrays + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0]) + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // Encodes a range of characters in a character array into a range of bytes + // in a byte array. An exception occurs if the byte array is not large + // enough to hold the complete encoding of the characters. The + // GetByteCount method can be used to determine the exact number of + // bytes that will be produced for a given range of characters. + // Alternatively, the GetMaxByteCount method can be used to + // determine the maximum number of bytes that will be produced for a given + // number of characters, regardless of the actual character values. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetBytes(char[] chars, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + // Validate parameters + if (chars == null || bytes == null) + throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + // If nothing to encode return 0, avoid fixed problem + if (charCount == 0) + return 0; + + // Just call pointer version + int byteCount = bytes.Length - byteIndex; + + // Fix our input array if 0 length because fixed doesn't like 0 length arrays + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0]) + // Remember that byteCount is # to decode, not size of array. + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? 
"bytes" : "chars", SR.ArgumentNull_Array); + + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetBytes(chars, charCount, bytes, byteCount, null); + } + + // Returns the number of characters produced by decoding a range of bytes + // in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetCharCount(byte[] bytes, int index, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (bytes.Length - index < count) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input just return 0, fixed doesn't like 0 length arrays. + if (count == 0) + return 0; + + // Just call pointer version + fixed (byte* pBytes = bytes) + return GetCharCount(pBytes + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. 
// Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Pointer overload: validates its arguments and forwards to the worker with no decoder.
        [CLSCompliant(false)]
        public override unsafe int GetCharCount(byte* bytes, int count)
        {
            if (bytes == null)
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);

            if (count < 0)
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            Contract.EndContractBlock();

            return GetCharCount(bytes, count, null);
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        // Decodes byteCount bytes starting at bytes[byteIndex] into chars starting at
        // chars[charIndex] and returns what the pointer-based worker returns.
        public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                              char[] chars, int charIndex)
        {
            // Null arguments: bytes is reported before chars when both are null.
            if (bytes == null)
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            if (chars == null)
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

            // Negative index/count: byteIndex is reported before byteCount.
            if (byteIndex < 0)
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);

            // The requested range has to fit inside the byte array.
            if (bytes.Length - byteIndex < byteCount)
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);

            // charIndex may equal chars.Length (zero chars of room), but not exceed it.
            if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
            Contract.EndContractBlock();

            // No input means no output; returning early also avoids pinning an empty array.
            if (byteCount == 0)
                return 0;

            // Room left in the output, measured before chars is possibly swapped below.
            int availableChars = chars.Length - charIndex;

            // &chars[0] needs at least one element, so substitute a dummy array when empty.
            if (chars.Length == 0)
                chars = new char[1];

            fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
            {
                // availableChars is the room to write into, not the size of the array.
                return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, availableChars, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Pointer overload: validates its arguments and forwards to the worker with no decoder.
        [CLSCompliant(false)]
        public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
        {
            // Null pointers: bytes is reported before chars when both are null.
            if (bytes == null)
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            if (chars == null)
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);

            // Negative counts: charCount is reported before byteCount.
            if (charCount < 0)
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            Contract.EndContractBlock();

            return GetChars(bytes, byteCount, chars, charCount, null);
        }

        // Returns a string containing the decoded representation of a range of
        // bytes in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe String GetString(byte[] bytes, int index, int count)
        {
            // Validate Parameters
            if (bytes == null)
                throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

            if (index < 0 || count < 0)
                throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);

            if (bytes.Length - index < count)
                throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
            Contract.EndContractBlock();

            // Avoid problems with empty input buffer
            if (count == 0) return String.Empty;

            fixed (byte* pBytes = bytes)
                return String.CreateStringFromEncoding(
                    pBytes + index, count, this);
        }

        //
        // End of standard methods copied from EncodingNLS.cs
        //

        // Counts the bytes needed to encode count chars: 4 for every BMP char and 4 for every
        // surrogate pair. Lone surrogates are handed to the encoder fallback, and whatever
        // chars the fallback produces are counted the same way.
        internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
        {
            Debug.Assert(chars != null, "[UTF32Encoding.GetByteCount]chars!=null");
            Debug.Assert(count >= 0, "[UTF32Encoding.GetByteCount]count >=0");

            char* end = chars + count;
            char* charStart = chars;
            int byteCount = 0;

            char highSurrogate = '\0';

            // For fallback we may need a fallback buffer
            EncoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            if (encoder != null)
            {
                highSurrogate = encoder.charLeftOver;
                fallbackBuffer = encoder.FallbackBuffer;

                // We mustn't have left over fallback data when counting
                if (fallbackBuffer.Remaining > 0)
                    throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
            }
            else
            {
                fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
            }

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(charStart, end, encoder, false);

            char ch;
        TryAgain:

            while (((ch = fallbackBuffer.InternalGetNextChar()) != 0) || chars < end)
            {
                // First unwind any fallback
                if (ch == 0)
                {
                    // No fallback, just get next char
                    ch = *chars;
                    chars++;
                }

                // Do we need a low surrogate?
                if (highSurrogate != '\0')
                {
                    //
                    // In previous char, we encountered a high surrogate, so we are expecting a low surrogate here.
                    //
                    if (Char.IsLowSurrogate(ch))
                    {
                        // They're all legal
                        highSurrogate = '\0';

                        //
                        // One surrogate pair will be translated into 4 bytes UTF32.
                        //

                        byteCount += 4;
                        continue;
                    }

                    // We are missing our low surrogate, decrement chars and fallback the high surrogate
                    // The high surrogate may have come from the encoder, but nothing else did.
                    Debug.Assert(chars > charStart,
                        "[UTF32Encoding.GetByteCount]Expected chars to have advanced if no low surrogate");
                    chars--;

                    // Do the fallback
                    charsForFallback = chars;
                    fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
                    chars = charsForFallback;

                    // We're going to fallback the old high surrogate.
                    highSurrogate = '\0';
                    continue;
                }

                // Do we have another high surrogate?
                if (Char.IsHighSurrogate(ch))
                {
                    //
                    // We'll have a high surrogate to check next time.
                    //
                    highSurrogate = ch;
                    continue;
                }

                // Check for illegal characters
                if (Char.IsLowSurrogate(ch))
                {
                    // We have a leading low surrogate, do the fallback
                    charsForFallback = chars;
                    fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                    chars = charsForFallback;

                    // Try again with fallback buffer
                    continue;
                }

                // We get to add the character (4 bytes UTF32)
                byteCount += 4;
            }

            // May have to do our last surrogate
            if ((encoder == null || encoder.MustFlush) && highSurrogate > 0)
            {
                // We have to do the fallback for the lonely high surrogate
                charsForFallback = chars;
                fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
                chars = charsForFallback;

                highSurrogate = (char)0;
                goto TryAgain;
            }

            // Check for overflows.
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);

            // Shouldn't have anything in fallback buffer for GetByteCount
            // (don't have to check m_throwOnOverflow for count)
            Debug.Assert(fallbackBuffer.Remaining == 0,
                "[UTF32Encoding.GetByteCount]Expected empty fallback buffer at end");

            // Return our count
            return byteCount;
        }

        // Encodes charCount chars into at most byteCount bytes of UTF-32, in the byte order
        // selected by _bigEndian, and returns the number of bytes written. When an encoder is
        // supplied, its left-over high surrogate is consumed on entry, and the new left-over
        // surrogate and the number of chars used are stored back on exit.
        internal override unsafe int GetBytes(char* chars, int charCount,
                                                 byte* bytes, int byteCount, EncoderNLS encoder)
        {
            Debug.Assert(chars != null, "[UTF32Encoding.GetBytes]chars!=null");
            Debug.Assert(bytes != null, "[UTF32Encoding.GetBytes]bytes!=null");
            Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetBytes]byteCount >=0");
            Debug.Assert(charCount >= 0, "[UTF32Encoding.GetBytes]charCount >=0");

            char* charStart = chars;
            char* charEnd = chars + charCount;
            byte* byteStart = bytes;
            byte* byteEnd = bytes + byteCount;

            char highSurrogate = '\0';

            // For fallback we may need a fallback buffer
            EncoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            if (encoder != null)
            {
                highSurrogate = encoder.charLeftOver;
                fallbackBuffer = encoder.FallbackBuffer;

                // We mustn't have left over fallback data when not converting
                if (encoder.m_throwOnOverflow && fallbackBuffer.Remaining > 0)
                    throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
            }
            else
            {
                fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
            }

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);

            char ch;
        TryAgain:

            while (((ch = fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
            {
                // First unwind any fallback
                if (ch == 0)
                {
                    // No fallback, just get next char
                    ch = *chars;
                    chars++;
                }

                // Do we need a low surrogate?
                if (highSurrogate != '\0')
                {
                    //
                    // In previous char, we encountered a high surrogate, so we are expecting a low surrogate here.
                    //
                    if (Char.IsLowSurrogate(ch))
                    {
                        // Is it a legal one?
                        uint iTemp = GetSurrogate(highSurrogate, ch);
                        highSurrogate = '\0';

                        //
                        // One surrogate pair will be translated into 4 bytes UTF32.
                        //
                        if (bytes + 3 >= byteEnd)
                        {
                            // Don't have 4 bytes
                            if (fallbackBuffer.bFallingBack)
                            {
                                fallbackBuffer.MovePrevious();                     // Aren't using these 2 fallback chars
                                fallbackBuffer.MovePrevious();
                            }
                            else
                            {
                                // If we don't have enough room, then either we should've advanced a while
                                // or we should have bytes==byteStart and throw below
                                Debug.Assert(chars > charStart + 1 || bytes == byteStart,
                                    "[UTF32Encoding.GetBytes]Expected chars to have when no room to add surrogate pair");
                                chars -= 2;                                        // Aren't using those 2 chars
                            }
                            ThrowBytesOverflow(encoder, bytes == byteStart);    // Throw maybe (if no bytes written)
                            highSurrogate = (char)0;                            // Nothing left over (we backed up to start of pair if supplementary)
                            break;
                        }

                        if (_bigEndian)
                        {
                            *(bytes++) = (byte)(0x00);
                            *(bytes++) = (byte)(iTemp >> 16);       // Implies & 0xFF, which isn't needed cause high are all 0
                            *(bytes++) = (byte)(iTemp >> 8);        // Implies & 0xFF
                            *(bytes++) = (byte)(iTemp);             // Implies & 0xFF
                        }
                        else
                        {
                            *(bytes++) = (byte)(iTemp);             // Implies & 0xFF
                            *(bytes++) = (byte)(iTemp >> 8);        // Implies & 0xFF
                            *(bytes++) = (byte)(iTemp >> 16);       // Implies & 0xFF, which isn't needed cause high are all 0
                            *(bytes++) = (byte)(0x00);
                        }
                        continue;
                    }

                    // We are missing our low surrogate, decrement chars and fallback the high surrogate
                    // The high surrogate may have come from the encoder, but nothing else did.
                    Debug.Assert(chars > charStart,
                        "[UTF32Encoding.GetBytes]Expected chars to have advanced if no low surrogate");
                    chars--;

                    // Do the fallback
                    charsForFallback = chars;
                    fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
                    chars = charsForFallback;

                    // We're going to fallback the old high surrogate.
                    highSurrogate = '\0';
                    continue;
                }

                // Do we have another high surrogate?, if so remember it
                if (Char.IsHighSurrogate(ch))
                {
                    //
                    // We'll have a high surrogate to check next time.
                    //
                    highSurrogate = ch;
                    continue;
                }

                // Check for illegal characters (low surrogate)
                if (Char.IsLowSurrogate(ch))
                {
                    // We have a leading low surrogate, do the fallback
                    charsForFallback = chars;
                    fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                    chars = charsForFallback;

                    // Try again with fallback buffer
                    continue;
                }

                // We get to add the character, yippee.
                if (bytes + 3 >= byteEnd)
                {
                    // Don't have 4 bytes
                    if (fallbackBuffer.bFallingBack)
                        fallbackBuffer.MovePrevious();                          // Aren't using this fallback char
                    else
                    {
                        // Must've advanced already
                        Debug.Assert(chars > charStart,
                            "[UTF32Encoding.GetBytes]Expected chars to have advanced if normal character");
                        chars--;                                                // Aren't using this char
                    }
                    ThrowBytesOverflow(encoder, bytes == byteStart);            // Throw maybe (if no bytes written)
                    break;                                                      // Didn't throw, stop
                }

                if (_bigEndian)
                {
                    *(bytes++) = (byte)(0x00);
                    *(bytes++) = (byte)(0x00);
                    *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
                    *(bytes++) = (byte)(ch);            // Implies & 0xFF
                }
                else
                {
                    *(bytes++) = (byte)(ch);            // Implies & 0xFF
                    *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
                    *(bytes++) = (byte)(0x00);
                    *(bytes++) = (byte)(0x00);
                }
            }

            // May have to do our last surrogate
            if ((encoder == null || encoder.MustFlush) && highSurrogate > 0)
            {
                // We have to do the fallback for the lonely high surrogate
                charsForFallback = chars;
                fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
                chars = charsForFallback;

                highSurrogate = (char)0;
                goto TryAgain;
            }

            // Fix our encoder if we have one
            Debug.Assert(highSurrogate == 0 || (encoder != null && !encoder.MustFlush),
                "[UTF32Encoding.GetBytes]Expected encoder to be flushed.");

            if (encoder != null)
            {
                // Remember our left over surrogate (or 0 if flushing)
                encoder.charLeftOver = highSurrogate;

                // Need # chars used
                encoder.m_charsUsed = (int)(chars - charStart);
            }

            // return the new length
            return (int)(bytes - byteStart);
        }

        // Counts the chars that decoding count bytes would produce: 1 per BMP code point, 2 per
        // supplementary code point, plus whatever the decoder fallback reports for invalid or
        // incomplete 4-byte groups. A non-null baseDecoder supplies bytes left over from a
        // previous call.
        internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
        {
            Debug.Assert(bytes != null, "[UTF32Encoding.GetCharCount]bytes!=null");
            Debug.Assert(count >= 0, "[UTF32Encoding.GetCharCount]count >=0");

            UTF32Decoder decoder = (UTF32Decoder)baseDecoder;

            // None so far!
            int charCount = 0;
            byte* end = bytes + count;
            byte* byteStart = bytes;

            // Set up decoder
            int readCount = 0;
            uint iChar = 0;

            // For fallback we may need a fallback buffer
            DecoderFallbackBuffer fallbackBuffer = null;

            // See if there's anything in our decoder
            if (decoder != null)
            {
                readCount = decoder.readByteCount;
                iChar = (uint)decoder.iChar;
                fallbackBuffer = decoder.FallbackBuffer;

                // Shouldn't have anything in fallback buffer for GetCharCount
                // (don't have to check m_throwOnOverflow for chars or count)
                Debug.Assert(fallbackBuffer.Remaining == 0,
                    "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at start");
            }
            else
            {
                fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
            }

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(byteStart, null);

            // Loop through our input, 4 bytes at a time!
            // (charCount >= 0 stops the loop once the counter has wrapped negative.)
            while (bytes < end && charCount >= 0)
            {
                // Get our next character
                if (_bigEndian)
                {
                    // Scoot left and add it to the bottom
                    iChar <<= 8;
                    iChar += *(bytes++);
                }
                else
                {
                    // Scoot right and add it to the top
                    iChar >>= 8;
                    iChar += (uint)(*(bytes++)) << 24;
                }

                readCount++;

                // See if we have all the bytes yet
                if (readCount < 4)
                    continue;

                // Have the bytes
                readCount = 0;

                // See if its valid to encode
                if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
                {
                    // Need to fall back these 4 bytes
                    byte[] fallbackBytes;
                    if (_bigEndian)
                    {
                        fallbackBytes = new byte[] {
                            unchecked((byte)(iChar>>24)), unchecked((byte)(iChar>>16)),
                            unchecked((byte)(iChar>>8)), unchecked((byte)(iChar)) };
                    }
                    else
                    {
                        fallbackBytes = new byte[] {
                            unchecked((byte)(iChar)), unchecked((byte)(iChar>>8)),
                            unchecked((byte)(iChar>>16)), unchecked((byte)(iChar>>24)) };
                    }

                    charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);

                    // Ignore the illegal character
                    iChar = 0;
                    continue;
                }

                // Ok, we have something we can add to our output
                if (iChar >= 0x10000)
                {
                    // Surrogates take 2
                    charCount++;
                }

                // Add the rest of the surrogate or our normal character
                charCount++;

                // iChar is back to 0
                iChar = 0;
            }

            // See if we have something left over that has to be decoded
            if (readCount > 0 && (decoder == null || decoder.MustFlush))
            {
                // Oops, there's something left over with no place to go.
                byte[] fallbackBytes = new byte[readCount];
                if (_bigEndian)
                {
                    while (readCount > 0)
                    {
                        fallbackBytes[--readCount] = unchecked((byte)iChar);
                        iChar >>= 8;
                    }
                }
                else
                {
                    while (readCount > 0)
                    {
                        fallbackBytes[--readCount] = unchecked((byte)(iChar >> 24));
                        iChar <<= 8;
                    }
                }

                charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);
            }

            // Check for overflows. This method counts chars, so report the char-count
            // overflow message rather than the byte-count one.
            if (charCount < 0)
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetCharCountOverflow);

            // Shouldn't have anything in fallback buffer for GetCharCount
            // (don't have to check m_throwOnOverflow for chars or count)
            Debug.Assert(fallbackBuffer.Remaining == 0,
                "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at end");

            // Return our count
            return charCount;
        }

        // Decodes byteCount bytes of UTF-32 into at most charCount chars and returns the number
        // of chars written. Supplementary code points are written as a surrogate pair. When a
        // decoder is supplied, its partial 4-byte group is consumed on entry, and the new partial
        // group and the number of bytes used are stored back on exit.
        internal override unsafe int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, DecoderNLS baseDecoder)
        {
            Debug.Assert(chars != null, "[UTF32Encoding.GetChars]chars!=null");
            Debug.Assert(bytes != null, "[UTF32Encoding.GetChars]bytes!=null");
            Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetChars]byteCount >=0");
            Debug.Assert(charCount >= 0, "[UTF32Encoding.GetChars]charCount >=0");

            UTF32Decoder decoder = (UTF32Decoder)baseDecoder;

            // None so far!
            char* charStart = chars;
            char* charEnd = chars + charCount;

            byte* byteStart = bytes;
            byte* byteEnd = bytes + byteCount;

            // See if there's anything in our decoder (but don't clear it yet)
            int readCount = 0;
            uint iChar = 0;

            // For fallback we may need a fallback buffer
            DecoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            // See if there's anything in our decoder
            if (decoder != null)
            {
                readCount = decoder.readByteCount;
                iChar = (uint)decoder.iChar;
                fallbackBuffer = baseDecoder.FallbackBuffer;

                // Shouldn't have anything in fallback buffer for GetChars
                // (don't have to check m_throwOnOverflow for chars)
                Debug.Assert(fallbackBuffer.Remaining == 0,
                    "[UTF32Encoding.GetChars]Expected empty fallback buffer at start");
            }
            else
            {
                fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
            }

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(bytes, chars + charCount);

            // Loop through our input, 4 bytes at a time!
            while (bytes < byteEnd)
            {
                // Get our next character
                if (_bigEndian)
                {
                    // Scoot left and add it to the bottom
                    iChar <<= 8;
                    iChar += *(bytes++);
                }
                else
                {
                    // Scoot right and add it to the top
                    iChar >>= 8;
                    iChar += (uint)(*(bytes++)) << 24;
                }

                readCount++;

                // See if we have all the bytes yet
                if (readCount < 4)
                    continue;

                // Have the bytes
                readCount = 0;

                // See if its valid to encode
                if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
                {
                    // Need to fall back these 4 bytes
                    byte[] fallbackBytes;
                    if (_bigEndian)
                    {
                        fallbackBytes = new byte[] {
                            unchecked((byte)(iChar>>24)), unchecked((byte)(iChar>>16)),
                            unchecked((byte)(iChar>>8)), unchecked((byte)(iChar)) };
                    }
                    else
                    {
                        fallbackBytes = new byte[] {
                            unchecked((byte)(iChar)), unchecked((byte)(iChar>>8)),
                            unchecked((byte)(iChar>>16)), unchecked((byte)(iChar>>24)) };
                    }

                    // Chars won't be updated unless this works.
                    charsForFallback = chars;
                    bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
                    chars = charsForFallback;

                    if (!fallbackResult)
                    {
                        // Couldn't fallback, throw or wait til next time
                        // We either read enough bytes for bytes-=4 to work, or we're
                        // going to throw in ThrowCharsOverflow because chars == charStart
                        Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                            "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (bad surrogate)");
                        bytes -= 4;                                       // get back to where we were
                        iChar = 0;                                        // Remembering nothing
                        fallbackBuffer.InternalReset();
                        ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                        break;                                          // Stop here, didn't throw
                    }

                    // Ignore the illegal character
                    iChar = 0;
                    continue;
                }

                // Ok, we have something we can add to our output
                if (iChar >= 0x10000)
                {
                    // Surrogates take 2
                    if (chars >= charEnd - 1)
                    {
                        // Throwing or stopping
                        // We either read enough bytes for bytes-=4 to work, or we're
                        // going to throw in ThrowCharsOverflow because chars == charStart
                        Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                            "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (surrogate)");
                        bytes -= 4;                                       // get back to where we were
                        iChar = 0;                                        // Remembering nothing
                        ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                        break;                                          // Stop here, didn't throw
                    }

                    *(chars++) = GetHighSurrogate(iChar);
                    iChar = GetLowSurrogate(iChar);
                }
                // Bounds check for normal character
                else if (chars >= charEnd)
                {
                    // Throwing or stopping
                    // We either read enough bytes for bytes-=4 to work, or we're
                    // going to throw in ThrowCharsOverflow because chars == charStart
                    Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
                        "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (normal char)");
                    bytes -= 4;                                       // get back to where we were
                    iChar = 0;                                        // Remembering nothing
                    ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                    break;                                          // Stop here, didn't throw
                }

                // Add the rest of the surrogate or our normal character
                *(chars++) = (char)iChar;

                // iChar is back to 0
                iChar = 0;
            }

            // See if we have something left over that has to be decoded
            if (readCount > 0 && (decoder == null || decoder.MustFlush))
            {
                // Oops, there's something left over with no place to go.
                byte[] fallbackBytes = new byte[readCount];
                int tempCount = readCount;
                if (_bigEndian)
                {
                    while (tempCount > 0)
                    {
                        fallbackBytes[--tempCount] = unchecked((byte)iChar);
                        iChar >>= 8;
                    }
                }
                else
                {
                    while (tempCount > 0)
                    {
                        fallbackBytes[--tempCount] = unchecked((byte)(iChar >> 24));
                        iChar <<= 8;
                    }
                }

                charsForFallback = chars;
                bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
                chars = charsForFallback;

                if (!fallbackResult)
                {
                    // Couldn't fallback.
                    fallbackBuffer.InternalReset();
                    ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                    // Stop here, didn't throw, backed up, so still nothing in buffer
                }
                else
                {
                    // Don't clear our decoder unless we could fall it back.
                    // If we caught the if above, then we're a convert() and will catch this next time.
                    readCount = 0;
                    iChar = 0;
                }
            }

            // Remember any left over stuff, clearing buffer as well for MustFlush
            if (decoder != null)
            {
                decoder.iChar = (int)iChar;
                decoder.readByteCount = readCount;
                decoder.m_bytesUsed = (int)(bytes - byteStart);
            }

            // Shouldn't have anything in fallback buffer for GetChars
            // (don't have to check m_throwOnOverflow for chars)
            Debug.Assert(fallbackBuffer.Remaining == 0,
                "[UTF32Encoding.GetChars]Expected empty fallback buffer at end");

            // Return our count
            return (int)(chars - charStart);
        }


        // Combines a high/low surrogate pair into the code point it represents.
        private uint GetSurrogate(char cHigh, char cLow)
        {
            return (((uint)cHigh - 0xD800) * 0x400) + ((uint)cLow - 0xDC00) + 0x10000;
        }

        // High (leading) surrogate for a supplementary code point (iChar >= 0x10000).
        private char GetHighSurrogate(uint iChar)
        {
            return (char)((iChar - 0x10000) / 0x400 + 0xD800);
        }

        // Low (trailing) surrogate for a supplementary code point (iChar >= 0x10000).
        private char GetLowSurrogate(uint iChar)
        {
            return (char)((iChar - 0x10000) % 0x400 + 0xDC00);
        }


        public override Decoder GetDecoder()
        {
            return new UTF32Decoder(this);
        }


        public override Encoder GetEncoder()
        {
            return new EncoderNLS(this);
        }


        // Upper bound on the bytes needed to encode charCount chars.
        public override int GetMaxByteCount(int charCount)
        {
            if (charCount < 0)
                throw new ArgumentOutOfRangeException(nameof(charCount),
                     SR.ArgumentOutOfRange_NeedNonNegNum);
            Contract.EndContractBlock();

            // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
            long byteCount = (long)charCount + 1;

            if (EncoderFallback.MaxCharCount > 1)
                byteCount *= EncoderFallback.MaxCharCount;

            // 4 bytes per char
            byteCount *= 4;

            if (byteCount > 0x7fffffff)
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);

            return (int)byteCount;
        }


        // Upper bound on the chars produced by decoding byteCount bytes.
        public override int GetMaxCharCount(int byteCount)
        {
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(byteCount),
                     SR.ArgumentOutOfRange_NeedNonNegNum);
            Contract.EndContractBlock();

            // A supplementary character becomes 2 surrogate characters, so 4 input bytes becomes 2 chars,
            // plus we may have 1 surrogate char left over if the decoder has 3 bytes in it already for a non-bmp char.
            // Have to add another one because 1/2 == 0, but 3 bytes left over could be 2 char surrogate pair
            //
            // Computed as a long so that the multiplication by the fallback size below cannot wrap
            // around before the overflow check runs (with an int the check could never fire).
            long charCount = ((long)byteCount / 2) + 2;

            // Also consider fallback because our input bytes could be out of range of unicode.
            // Since fallback would fallback 4 bytes at a time, we'll only fall back 1/2 of MaxCharCount.
            if (DecoderFallback.MaxCharCount > 2)
            {
                // Multiply time fallback size
                charCount *= DecoderFallback.MaxCharCount;

                // We were already figuring 2 chars per 4 bytes, but fallback will be different #
                charCount /= 2;
            }

            if (charCount > 0x7fffffff)
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);

            return (int)charCount;
        }


        // Returns the UTF-32 byte order mark for this instance's byte order, or an empty
        // array when this instance was configured not to emit one.
        public override byte[] GetPreamble()
        {
            if (_emitUTF32ByteOrderMark)
            {
                // Allocate new array to prevent users from modifying it.
                if (_bigEndian)
                {
                    return new byte[4] { 0x00, 0x00, 0xFE, 0xFF };
                }
                else
                {
                    return new byte[4] { 0xFF, 0xFE, 0x00, 0x00 }; // U+FEFF in little-endian order
                }
            }
            else
                return Array.Empty<byte>();
        }


        // Two UTF32Encodings are equal when their BOM setting, byte order and both fallbacks match.
        public override bool Equals(Object value)
        {
            UTF32Encoding that = value as UTF32Encoding;
            if (that != null)
            {
                return (_emitUTF32ByteOrderMark == that._emitUTF32ByteOrderMark) &&
                       (_bigEndian == that._bigEndian) &&
                       (EncoderFallback.Equals(that.EncoderFallback)) &&
                       (DecoderFallback.Equals(that.DecoderFallback));
            }
            return (false);
        }


        public override int GetHashCode()
        {
            //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
            return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
                   CodePage + (_emitUTF32ByteOrderMark ? 4 : 0) + (_bigEndian ? 8 : 0);
        }

        // Decoder that carries a partially read 4-byte group from one call to the next.
        [Serializable]
        private sealed class UTF32Decoder : DecoderNLS
        {
            // Need a place to store any extra bytes we may have picked up
            internal int iChar = 0;
            internal int readByteCount = 0;

            public UTF32Decoder(UTF32Encoding encoding) : base(encoding)
            {
                // base calls reset
            }

            public override void Reset()
            {
                this.iChar = 0;
                this.readByteCount = 0;
                if (m_fallbackBuffer != null)
                    m_fallbackBuffer.Reset();
            }

            // Anything left in our decoder?
            internal override bool HasState
            {
                get
                {
                    // ReadByteCount is our flag.  (iChar==0 doesn't mean much).
                    return (this.readByteCount != 0);
                }
            }
        }
    }
}
diff --git a/src/mscorlib/shared/System/Text/UTF8Encoding.cs b/src/mscorlib/shared/System/Text/UTF8Encoding.cs new file mode 100644 index 0000000000..5cfa89018a --- /dev/null +++ b/src/mscorlib/shared/System/Text/UTF8Encoding.cs @@ -0,0 +1,2668 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+ +// The worker functions in this file was optimized for performance. If you make changes +// you should use care to consider all of the interesting cases. + +// The code of all worker functions in this file is written twice: Once as as a slow loop, and the +// second time as a fast loop. The slow loops handles all special cases, throws exceptions, etc. +// The fast loops attempts to blaze through as fast as possible with optimistic range checks, +// processing multiple characters at a time, and falling back to the slow loop for all special cases. + +// This define can be used to turn off the fast loops. Useful for finding whether +// the problem is fastloop-specific. +#define FASTLOOP + +using System; +using System.Runtime.Serialization; +using System.Diagnostics; +using System.Diagnostics.Contracts; +using System.Globalization; + +namespace System.Text +{ + // Encodes text into and out of UTF-8. UTF-8 is a way of writing + // Unicode characters with variable numbers of bytes per character, + // optimized for the lower 127 ASCII characters. It's an efficient way + // of encoding US English in an internationalizable way. + // + // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused. + // + // The UTF-8 byte order mark is simply the Unicode byte order mark + // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is + // used mostly to distinguish UTF-8 text from other encodings, and doesn't + // switch the byte orderings. 
+ + [Serializable] + public class UTF8Encoding : Encoding + { + /* + bytes bits UTF-8 representation + ----- ---- ----------------------------------- + 1 7 0vvvvvvv + 2 11 110vvvvv 10vvvvvv + 3 16 1110vvvv 10vvvvvv 10vvvvvv + 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv + ----- ---- ----------------------------------- + + Surrogate: + Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000 + */ + + private const int UTF8_CODEPAGE = 65001; + + // Allow for devirtualization (see https://github.com/dotnet/coreclr/pull/9230) + [Serializable] + internal sealed class UTF8EncodingSealed : UTF8Encoding + { + public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { } + } + + // Used by Encoding.UTF8 for lazy initialization + // The initialization code will not be run until a static member of the class is referenced + internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true); + + // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into + // the standard. 
private bool _emitUTF8Identifier;

        // When set, both fallbacks are the exception fallbacks (see SetDefaultFallbacks).
        private bool _isThrowException;


        // No UTF-8 identifier, replacement fallbacks.
        public UTF8Encoding() : this(false)
        {
        }


        // Caller chooses whether the UTF-8 identifier is emitted; replacement fallbacks.
        public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) :
            this(encoderShouldEmitUTF8Identifier, false)
        {
        }


        // Caller chooses both the identifier setting and whether the fallbacks throw.
        public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) :
            base(UTF8_CODEPAGE)
        {
            _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
            _isThrowException = throwOnInvalidBytes;

            // Encoding's constructor already set the fallbacks, but that ran before
            // _isThrowException was assigned, so redo it when we are meant to throw.
            if (_isThrowException)
                SetDefaultFallbacks();
        }

        internal override void SetDefaultFallbacks()
        {
            // Throwing instances get the exception fallbacks.
            if (_isThrowException)
            {
                this.encoderFallback = EncoderFallback.ExceptionFallback;
                this.decoderFallback = DecoderFallback.ExceptionFallback;
                return;
            }

            // Otherwise use replacement fallbacks whose replacement string is U+FFFD.
            this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
            this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
        }


        // WARNING: GetByteCount(string chars)
        // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
        // WARNING: otherwise it'll break VB's way of declaring these.
        //
        // The following methods are copied from EncodingNLS.cs.
        // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
        // These should be kept in sync for the following classes:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Returns the number of bytes required to encode a range of characters in
        // a character array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

// Validates (chars, index, count) and returns the number of bytes needed to encode that
// range of the array.
//   ArgumentNullException       - chars is null
//   ArgumentOutOfRangeException - index or count is negative, or the range runs past the array
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    // Validate input parameters
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (chars.Length - index < count)
        throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // If no input, return 0, avoid fixed empty array problem
    if (count == 0)
        return 0;

    // Just call the pointer version
    fixed (char* pChars = chars)
        return GetByteCount(pChars + index, count, null);
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

// Returns the number of bytes needed to encode the whole string.
//   ArgumentNullException - chars is null
public override unsafe int GetByteCount(String chars)
{
    // Validate input. The reported parameter name must be the one this overload declares
    // ("chars"), so that ArgumentNullException.ParamName points at a real parameter.
    if (chars == null)
        throw new ArgumentNullException("chars");
    Contract.EndContractBlock();

    fixed (char* pChars = chars)
        return GetByteCount(pChars, chars.Length, null);
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others.
// Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

// Pointer flavor: checks the two arguments, then counts with no encoder state.
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // No encoder is involved for the stateless public entry point.
    int needed = GetByteCount(chars, count, null);
    return needed;
}

// Parent method is safe.
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

// Encodes charCount characters of s, starting at charIndex, into bytes starting at byteIndex
// and returns what the pointer-based worker reports.
public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                      byte[] bytes, int byteIndex)
{
    // Null checks: s is reported before bytes.
    if (s == null)
        throw new ArgumentNullException("s", SR.ArgumentNull_Array);
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

    // Sign checks: charIndex is reported before charCount.
    if (charIndex < 0)
        throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);

    // The requested range has to fit inside the string.
    if (charCount > s.Length - charIndex)
        throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);

    // byteIndex may equal bytes.Length (zero room left) but not exceed it.
    if (byteIndex < 0)
        throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
    if (byteIndex > bytes.Length)
        throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // Room left in the destination, measured before the array may be swapped below.
    int roomInBytes = bytes.Length - byteIndex;

    // &bytes[0] needs at least one element, so substitute a dummy array for an empty one.
    if (bytes.Length == 0)
        bytes = new byte[1];

    int written;
    fixed (char* pChars = s)
    fixed (byte* pBytes = &bytes[0])
    {
        written = GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, roomInBytes, null);
    }
    return written;
}

// Encodes a range of characters in a character array into a range of bytes
// in a byte array.
// An exception occurs if the byte array is not large
// enough to hold the complete encoding of the characters. The
// GetByteCount method can be used to determine the exact number of
// bytes that will be produced for a given range of characters.
// Alternatively, the GetMaxByteCount method can be used to
// determine the maximum number of bytes that will be produced for a given
// number of characters, regardless of the actual character values.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                       byte[] bytes, int byteIndex)
{
    // Null checks: chars is reported before bytes.
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

    // Sign checks: charIndex is reported before charCount.
    if (charIndex < 0)
        throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);

    // The requested range has to fit inside the source array.
    if (charCount > chars.Length - charIndex)
        throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);

    // byteIndex may equal bytes.Length (zero room left) but not exceed it.
    if (byteIndex < 0)
        throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
    if (byteIndex > bytes.Length)
        throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // Nothing to encode; also sidesteps pinning an empty source array.
    if (charCount == 0)
        return 0;

    // Room left in the destination (not the array size), measured before the swap below.
    int roomInBytes = bytes.Length - byteIndex;

    // &bytes[0] needs at least one element, so substitute a dummy array for an empty one.
    if (bytes.Length == 0)
        bytes = new byte[1];

    int written;
    fixed (char* pChars = chars)
    fixed (byte* pBytes = &bytes[0])
    {
        written = GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, roomInBytes, null);
    }
    return written;
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    // Null checks: bytes is reported before chars.
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    // Sign checks: charCount is reported before byteCount.
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Stateless call: no encoder.
    int written = GetBytes(chars, charCount, bytes, byteCount, null);
    return written;
}

// Returns the number of characters produced by decoding a range of bytes
// in a byte array.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

    // Sign checks: index is reported before count.
    if (index < 0)
        throw new ArgumentOutOfRangeException("index", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (count < 0)
        throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);

    // The requested range has to fit inside the array.
    if (count > bytes.Length - index)
        throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // Nothing to decode; also sidesteps pinning an empty array.
    if (count == 0)
        return 0;

    int decoded;
    fixed (byte* pBytes = bytes)
    {
        decoded = GetCharCount(pBytes + index, count, null);
    }
    return decoded;
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

    if (count < 0)
        throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Stateless call: no decoder.
    int decoded = GetCharCount(bytes, count, null);
    return decoded;
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                      char[] chars, int charIndex)
{
    // Null checks: bytes is reported before chars.
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    // Sign checks: byteIndex is reported before byteCount.
    if (byteIndex < 0)
        throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);

    // The requested range has to fit inside the source array.
    if (byteCount > bytes.Length - byteIndex)
        throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);

    // charIndex may equal chars.Length (zero room left) but not exceed it.
    if (charIndex < 0)
        throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
    if (charIndex > chars.Length)
        throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
    Contract.EndContractBlock();

    // Nothing to decode; also sidesteps pinning an empty source array.
    if (byteCount == 0)
        return 0;

    // Room left in the destination (not the array size), measured before the swap below.
    int roomInChars = chars.Length - charIndex;

    // &chars[0] needs at least one element, so substitute a dummy array for an empty one.
    if (chars.Length == 0)
        chars = new char[1];

    int produced;
    fixed (byte* pBytes = bytes)
    fixed (char* pChars = &chars[0])
    {
        produced = GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, roomInChars, null);
    }
    return produced;
}

// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

[CLSCompliant(false)]
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    // Null checks: bytes is reported before chars.
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
    if (chars == null)
        throw new ArgumentNullException("chars", SR.ArgumentNull_Array);

    // Sign checks: charCount is reported before byteCount.
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount", SR.ArgumentOutOfRange_NeedNonNegNum);
    Contract.EndContractBlock();

    // Stateless call: no decoder.
    int produced = GetChars(bytes, byteCount, chars, charCount, null);
    return produced;
}

// Returns a string containing the decoded representation of a range of
// bytes in a byte array.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others.
// Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe

// Validates (bytes, index, count) and decodes that range into a new string.
// Returns String.Empty when count is 0.
//   ArgumentNullException       - bytes is null
//   ArgumentOutOfRangeException - index or count is negative, or the range runs past the array
public override unsafe String GetString(byte[] bytes, int index, int count)
{
    // Validate Parameters
    if (bytes == null)
        throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

    if (index < 0 || count < 0)
        throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);

    if (bytes.Length - index < count)
        throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
    Contract.EndContractBlock();

    // Avoid problems with empty input buffer
    if (count == 0) return String.Empty;

    fixed (byte* pBytes = bytes)
        return String.CreateStringFromEncoding(
            pBytes + index, count, this);
}

//
// End of standard methods copied from EncodingNLS.cs
//

// To simplify maintenance, the structure of GetByteCount and GetBytes should be
// kept the same as much as possible
//
// Counts the bytes needed to encode `count` chars starting at `chars`.
//
// The running total starts at one byte per input char and is corrected as each char is
// classified: +1 for anything above 0x7F, +1 more for anything above 0x7FF, -1 for a high
// surrogate whose bytes are charged when its partner is seen, and -1 for a char that was
// handed to the fallback instead of being encoded.
//
// baseEncoder may be null. When it is not, it is cast to UTF8Encoder, its surrogateChar
// seeds the pending char, and its MustFlush flag decides whether a pending surrogate left
// over at the end of input is pushed through the fallback or simply left uncounted.
//
// Throws ArgumentException if the encoder's fallback buffer still holds data on entry, and
// (BIT64 builds only) if the total goes negative, i.e. overflows.
//
// NOTE(review): neither `chars` nor `count` is validated here; the public wrappers in this
// file do that before calling in.
internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
{
    // For fallback we may need a fallback buffer.
    // We wait to initialize it though in case we don't have any broken input unicode
    EncoderFallbackBuffer fallbackBuffer = null;
    char* pSrcForFallback;

    char* pSrc = chars;
    char* pEnd = pSrc + count;

    // Start by assuming we have as many as count
    int byteCount = count;

    // Pending char: 0 means "nothing pending", otherwise a high surrogate waiting for its partner.
    int ch = 0;

    if (baseEncoder != null)
    {
        UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
        ch = encoder.surrogateChar;

        // We mustn't have left over fallback data when counting
        if (encoder.InternalHasFallbackBuffer)
        {
            fallbackBuffer = encoder.FallbackBuffer;
            if (fallbackBuffer.Remaining > 0)
                throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
        }
    }

    for (;;)
    {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
        if (pSrc >= pEnd)
        {
            if (ch == 0)
            {
                // Unroll any fallback that happens at the end
                ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
                if (ch > 0)
                {
                    // Fallback chars were not part of the initial one-byte-per-char estimate.
                    byteCount++;
                    goto ProcessChar;
                }
            }
            else
            {
                // Case of surrogates in the fallback.
                if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                {
                    Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
                        "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));

                    ch = fallbackBuffer.InternalGetNextChar();
                    byteCount++;

                    if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                    {
                        // Any non-surrogate value above 0x7FF counts the same, so use a stand-in.
                        ch = 0xfffd;
                        byteCount++;
                        goto EncodeChar;
                    }
                    else if (ch > 0)
                    {
                        goto ProcessChar;
                    }
                    else
                    {
                        byteCount--; // ignore last one.
                        break;
                    }
                }
            }

            if (ch <= 0)
            {
                break;
            }
            if (baseEncoder != null && !baseEncoder.MustFlush)
            {
                // Not flushing: the pending high surrogate stays pending and is not counted.
                break;
            }

            // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
            byteCount++;
            goto EncodeChar;
        }

        if (ch > 0)
        {
            Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
                "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // count the pending surrogate
            byteCount++;

            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
            // if (IsLowSurrogate(cha)) {
            if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
            {
                // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
                ch = 0xfffd;
                // ch = cha + (ch << 10) +
                // (0x10000
                // - CharUnicodeInfo.LOW_SURROGATE_START
                // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );

                // Use this next char
                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail (so don't add count)

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackBuffer != null)
        {
            ch = fallbackBuffer.InternalGetNextChar();
            if (ch > 0)
            {
                // We have an extra byte we weren't expecting.
                byteCount++;
                goto ProcessChar;
            }
        }

        // read next char. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        // if (IsHighSurrogate(ch)) {
        if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
        {
            // we will count this surrogate next time around
            byteCount--;
            continue;
        }
        // either good char or partial surrogate

    EncodeChar:
        // throw exception on partial surrogate if necessary
        // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
        if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed
            // Have to make a fallback buffer if we don't have one
            if (fallbackBuffer == null)
            {
                // wait on fallbacks if we can
                // For fallback we may need a fallback buffer
                if (baseEncoder == null)
                    fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = baseEncoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
            fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
            pSrc = pSrcForFallback;

            // Ignore it if we don't throw (we had preallocated this ch)
            byteCount--;
            ch = 0;
            continue;
        }

        // Count them
        if (ch > 0x7F)
        {
            if (ch > 0x7FF)
            {
                // the extra surrogate byte was compensated by the second surrogate character
                // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
                byteCount++;
            }
            byteCount++;
        }

#if BIT64
        // check for overflow
        if (byteCount < 0)
        {
            // Leave the loop; the check after the loop turns this into an exception.
            break;
        }
#endif

#if FASTLOOP
        // If still have fallback don't do fast loop
        if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
        {
            // We're reserving 1 byte for each char by default
            byteCount++;
            goto ProcessChar;
        }

        int availableChars = PtrDiff(pEnd, pSrc);

        // don't fall into the fast decoding loop if we don't have enough characters
        if (availableChars <= 13)
        {
            // try to get over the remainder of the ascii characters fast though
            char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F)
                    goto ProcessChar;
            }

            // we are done
            break;
        }

#if BIT64
        // make sure that we won't get a silent overflow inside the fast loop
        // (Fall out to slow loop if we have this many characters)
        availableChars &= 0x0FFFFFFF;
#endif

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        char* pStop = pSrc + availableChars - (3 + 4);

        while (pSrc < pStop)
        {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) // Not ASCII
            {
                if (ch > 0x7FF) // Not 2 Byte
                {
                    if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
                        goto LongCode;
                    byteCount++;
                }
                byteCount++;
            }

            // get pSrc aligned
            if ((unchecked((int)pSrc) & 0x2) != 0)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) // Not ASCII
                {
                    if (ch > 0x7FF) // Not 2 Byte
                    {
                        if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
                            goto LongCode;
                        byteCount++;
                    }
                    byteCount++;
                }
            }

            // Run 2 * 4 characters at a time!
            // Each int read below covers two adjacent chars.
            while (pSrc < pStop)
            {
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);
                if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
                {
                    if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }


                    if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
                        byteCount++;
                    if ((ch & unchecked((int)0xFF80)) != 0)
                        byteCount++;
                    if ((chc & unchecked((int)0xFF800000)) != 0)
                        byteCount++;
                    if ((chc & unchecked((int)0xFF80)) != 0)
                        byteCount++;
                }
                pSrc += 4;

                ch = *(int*)pSrc;
                chc = *(int*)(pSrc + 2);
                if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
                {
                    if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }

                    if ((ch & unchecked((int)0xFF800000)) != 0)
                        byteCount++;
                    if ((ch & unchecked((int)0xFF80)) != 0)
                        byteCount++;
                    if ((chc & unchecked((int)0xFF800000)) != 0)
                        byteCount++;
                    if ((chc & unchecked((int)0xFF80)) != 0)
                        byteCount++;
                }
                pSrc += 4;
            }
            break;

        LongCodeWithMask:
            // Reduce the two-char int to the char that sits first in memory and redo it alone.
#if BIGENDIAN
            // be careful about the sign extension
            ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
            ch = (char)ch;
#endif // BIGENDIAN
            pSrc++;

            if (ch <= 0x7F)
            {
                continue;
            }

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            if (ch > 0x7FF)
            {
                // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
                if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                {
                    // 4 byte encoding - high surrogate + low surrogate

                    int chd = *pSrc;
                    if (
                        // !IsHighSurrogate(ch) // low without high -> bad
                        ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
                        // !IsLowSurrogate(chd) // high not followed by low -> bad
                        !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                    {
                        // Back up and drop out to slow loop to figure out error
                        pSrc--;
                        break;
                    }
                    pSrc++;

                    // byteCount - this byte is compensated by the second surrogate character
                }
                byteCount++;
            }
            byteCount++;

            // byteCount - the last byte is already included
        }
#endif // FASTLOOP

        // no pending char at this point
        ch = 0;
    }

#if BIT64
    // check for overflow
    if (byteCount < 0)
    {
        throw new ArgumentException(
            SR.Argument_ConversionOverflow);
    }
#endif

    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
        "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");

    return byteCount;
}

// diffs two char pointers using unsigned arithmetic.
// The unsigned arithmetic
// is good enough for us, and it tends to generate better code than the signed
// arithmetic generated by default
// Returns the distance from b to a in chars: the byte distance, taken as unsigned, halved.
unsafe private static int PtrDiff(char* a, char* b)
{
    return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
}

// byte* flavor just for parity
// Returns a - b in bytes.
unsafe private static int PtrDiff(byte* a, byte* b)
{
    return (int)(a - b);
}

// Range test done with one unsigned comparison: values below start wrap around to large
// unsigned numbers and fail the test. Callers in this file pass start <= end.
private static bool InRange(int ch, int start, int end)
{
    return (uint)(ch - start) <= (uint)(end - start);
}

// Our workhorse
// Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
//
// Encodes charCount chars starting at `chars` into the buffer at `bytes`, which has room for
// byteCount bytes, and returns the number of bytes written (pTarget - bytes).
//
// baseEncoder may be null. When it is not, it is cast to UTF8Encoder; its surrogateChar seeds
// the pending char on entry, and on exit surrogateChar and m_charsUsed are written back so a
// later call can continue where this one stopped.
//
// Lone surrogates are handed to the fallback buffer, which is created lazily. When the
// destination cannot hold the next encoded char, the source/fallback position is rewound and
// ThrowBytesOverflow is called.
// NOTE(review): ThrowBytesOverflow is defined outside this chunk; from the call site it
// presumably throws when nothing was written (pTarget == bytes) - confirm against its definition.
//
// Throws ArgumentException if the encoder's fallback buffer still holds data on entry while
// encoder.m_throwOnOverflow is set.
internal override unsafe int GetBytes(char* chars, int charCount,
                                        byte* bytes, int byteCount, EncoderNLS baseEncoder)
{
    Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
    Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
    Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
    Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");

    UTF8Encoder encoder = null;

    // For fallback we may need a fallback buffer.
    // We wait to initialize it though in case we don't have any broken input unicode
    EncoderFallbackBuffer fallbackBuffer = null;
    char* pSrcForFallback;

    char* pSrc = chars;
    byte* pTarget = bytes;

    char* pEnd = pSrc + charCount;
    byte* pAllocatedBufferEnd = pTarget + byteCount;

    // Pending char: 0 means "nothing pending", otherwise a high surrogate waiting for its partner.
    int ch = 0;

    // assume that JIT will enregister pSrc, pTarget and ch

    if (baseEncoder != null)
    {
        encoder = (UTF8Encoder)baseEncoder;
        ch = encoder.surrogateChar;

        // We mustn't have left over fallback data when counting
        if (encoder.InternalHasFallbackBuffer)
        {
            // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
            fallbackBuffer = encoder.FallbackBuffer;
            if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
                throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
        }
    }

    for (;;)
    {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow

        if (pSrc >= pEnd)
        {
            if (ch == 0)
            {
                // Check if there's anything left to get out of the fallback buffer
                ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
                if (ch > 0)
                {
                    goto ProcessChar;
                }
            }
            else
            {
                // Case of leftover surrogates in the fallback buffer
                if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                {
                    Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
                        "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));

                    int cha = ch;

                    ch = fallbackBuffer.InternalGetNextChar();

                    if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                    {
                        // Combine the pending high surrogate with this low one into a code point.
                        ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
                        goto EncodeChar;
                    }
                    else if (ch > 0)
                    {
                        goto ProcessChar;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            // attempt to encode the partial surrogate (will fail or ignore)
            if (ch > 0 && (encoder == null || encoder.MustFlush))
                goto EncodeChar;

            // We're done
            break;
        }

        if (ch > 0)
        {
            // We have a high surrogate left over from a previous loop.
            Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
                "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
            // if (IsLowSurrogate(cha)) {
            if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
            {
                ch = cha + (ch << 10) +
                    (0x10000
                    - CharUnicodeInfo.LOW_SURROGATE_START
                    - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));

                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackBuffer != null)
        {
            ch = fallbackBuffer.InternalGetNextChar();
            if (ch > 0) goto ProcessChar;
        }

        // read next char. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        // if (IsHighSurrogate(ch)) {
        if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
        {
            // Keep it pending in ch; the next iteration looks for its low surrogate.
            continue;
        }
        // either good char or partial surrogate

    EncodeChar:
        // throw exception on partial surrogate if necessary
        // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
        if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed, we have to do fallback for them
            // Have to make a fallback buffer if we don't have one
            if (fallbackBuffer == null)
            {
                // wait on fallbacks if we can
                // For fallback we may need a fallback buffer
                if (baseEncoder == null)
                    fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = baseEncoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
            fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
            pSrc = pSrcForFallback;

            // Ignore it if we don't throw
            ch = 0;
            continue;
        }

        // Count bytes needed
        int bytesNeeded = 1;
        if (ch > 0x7F)
        {
            if (ch > 0x7FF)
            {
                if (ch > 0xFFFF)
                {
                    bytesNeeded++; // 4 bytes (surrogate pair)
                }
                bytesNeeded++; // 3 bytes (800-FFFF)
            }
            bytesNeeded++; // 2 bytes (80-7FF)
        }

        if (pTarget > pAllocatedBufferEnd - bytesNeeded)
        {
            // Left over surrogate from last time will cause pSrc == chars, so we'll throw
            if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
            {
                fallbackBuffer.MovePrevious(); // Didn't use this fallback char
                if (ch > 0xFFFF)
                    fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
            }
            else
            {
                pSrc--; // Didn't use this char
                if (ch > 0xFFFF)
                    pSrc--; // Was surrogate, didn't use 2nd part either
            }
            Debug.Assert(pSrc >= chars || pTarget == bytes,
                "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
            ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
            ch = 0; // Nothing left over (we backed up to start of pair if supplementary)
            break;
        }

        // Emit the encoded bytes: lead byte first, then continuation bytes (10vvvvvv) that
        // each carry six payload bits. The final pTarget++ is shared by every length.
        if (ch <= 0x7F)
        {
            *pTarget = (byte)ch;
        }
        else
        {
            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int chb;
            if (ch <= 0x7FF)
            {
                // 2 byte encoding
                chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
            }
            else
            {
                if (ch <= 0xFFFF)
                {
                    chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
                }
                else
                {
                    *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
                    pTarget++;

                    chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
                }
                *pTarget = (byte)chb;
                pTarget++;

                chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
            }
            *pTarget = (byte)chb;
            pTarget++;

            *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
        }
        pTarget++;


#if FASTLOOP
        // If still have fallback don't do fast loop
        if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
            goto ProcessChar;

        int availableChars = PtrDiff(pEnd, pSrc);
        int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);

        // don't fall into the fast decoding loop if we don't have enough characters
        // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
        if (availableChars <= 13)
        {
            // we are hoping for 1 byte per char
            if (availableBytes < availableChars)
            {
                // not enough output room. no pending bits at this point
                ch = 0;
                continue;
            }

            // try to get over the remainder of the ascii characters fast though
            char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd)
            {
                ch = *pSrc;
                pSrc++;

                // Not ASCII, need more than 1 byte per char
                if (ch > 0x7F)
                    goto ProcessChar;

                *pTarget = (byte)ch;
                pTarget++;
            }
            // we are done, let ch be 0 to clear encoder
            ch = 0;
            break;
        }

        // we need at least 1 byte per character, but Convert might allow us to convert
        // only part of the input, so try as much as we can. Reduce charCount if necessary
        if (availableBytes < availableChars)
        {
            availableChars = availableBytes;
        }

        // FASTLOOP:
        // - optimistic range checks
        // - fallbacks to the slow loop for all special cases, exception throwing, etc.

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
        char* pStop = pSrc + availableChars - 5;

        while (pSrc < pStop)
        {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F)
            {
                goto LongCode;
            }
            *pTarget = (byte)ch;
            pTarget++;

            // get pSrc aligned
            if ((unchecked((int)pSrc) & 0x2) != 0)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F)
                {
                    goto LongCode;
                }
                *pTarget = (byte)ch;
                pTarget++;
            }

            // Run 4 characters at a time!
            // Each int read below covers two adjacent chars.
            while (pSrc < pStop)
            {
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);
                if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
                {
                    goto LongCodeWithMask;
                }

                // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                *pTarget = (byte)(ch>>16);
                *(pTarget+1) = (byte)ch;
                pSrc += 4;
                *(pTarget+2) = (byte)(chc>>16);
                *(pTarget+3) = (byte)chc;
                pTarget += 4;
#else // BIGENDIAN
                *pTarget = (byte)ch;
                *(pTarget + 1) = (byte)(ch >> 16);
                pSrc += 4;
                *(pTarget + 2) = (byte)chc;
                *(pTarget + 3) = (byte)(chc >> 16);
                pTarget += 4;
#endif // BIGENDIAN
            }
            continue;

        LongCodeWithMask:
            // Reduce the two-char int to the char that sits first in memory and redo it alone.
#if BIGENDIAN
            // be careful about the sign extension
            ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
            ch = (char)ch;
#endif // BIGENDIAN
            pSrc++;

            if (ch > 0x7F)
            {
                goto LongCode;
            }
            *pTarget = (byte)ch;
            pTarget++;
            continue;

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            int chd;
            if (ch <= 0x7FF)
            {
                // 2 byte encoding
                chd = unchecked((sbyte)0xC0) | (ch >> 6);
            }
            else
            {
                // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
                if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                {
                    // 3 byte encoding
                    chd = unchecked((sbyte)0xE0) | (ch >> 12);
                }
                else
                {
                    // 4 byte encoding - high surrogate + low surrogate
                    // if (!IsHighSurrogate(ch))
                    if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
                    {
                        // low without high -> bad, try again in slow loop
                        pSrc -= 1;
                        break;
                    }

                    chd = *pSrc;
                    pSrc++;

                    // if (!IsLowSurrogate(chd)) {
                    if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
                    {
                        // high not followed by low -> bad, try again in slow loop
                        pSrc -= 2;
                        break;
                    }

                    ch = chd + (ch << 10) +
                        (0x10000
                        - CharUnicodeInfo.LOW_SURROGATE_START
                        - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));

                    *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
                    // pStop - this byte is compensated by the second surrogate character
                    // 2 input chars require 4 output bytes. 2 have been anticipated already
                    // and 2 more will be accounted for by the 2 pStop-- calls below.
                    pTarget++;

                    chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
                }
                *pTarget = (byte)chd;
                pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
                pTarget++;

                chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
            }
            *pTarget = (byte)chd;
            pStop--; // 2 byte sequence for 1 char so need pStop--.
            pTarget++;

            *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
            // pStop - this byte is already included
            pTarget++;
        }

        Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");

#endif // FASTLOOP

        // no pending char at this point
        ch = 0;
    }

    // Do we have to set the encoder bytes?
    if (encoder != null)
    {
        Debug.Assert(!encoder.MustFlush || ch == 0,
            "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));

        // Hand the pending surrogate (or 0) and the consumed-char count back to the encoder.
        encoder.surrogateChar = ch;
        encoder.m_charsUsed = (int)(pSrc - chars);
    }

    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
        baseEncoder == null || !baseEncoder.m_throwOnOverflow,
        "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");

    return (int)(pTarget - bytes);
}


// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
// while the actual character is being built in the lower bits.
// They are shifted together
        // with the actual bits of the character.

        // bits 30 & 31 are used for pending bits fixup
        private const int FinalByte = 1 << 29;
        private const int SupplimentarySeq = 1 << 28;
        private const int ThreeByteSeq = 1 << 27;

        // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
        // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
        //
        // To simplify maintenance, the structure of GetCharCount and GetChars should be
        // kept the same as much as possible
        //
        // Summary: walks `count` bytes starting at `bytes` and returns how many chars decoding them
        // would yield. charCount starts at `count` (one char per byte) and is decremented as
        // multi-byte sequences are recognized; every invalid sequence adds whatever
        // FallbackInvalidByteSequence reports for it. When baseDecoder is non-null, decoding resumes
        // from the partial state in decoder.bits. Leftover partial state at the end of input is only
        // pushed through the fallback when there is no decoder or baseDecoder.MustFlush is set.
        internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
        {
            Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
            Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");

            // Initialize stuff
            byte* pSrc = bytes;
            byte* pEnd = pSrc + count;

            // Start by assuming we have as many as count, charCount always includes the adjustment
            // for the character being decoded
            int charCount = count;
            int ch = 0;
            DecoderFallbackBuffer fallback = null;

            if (baseDecoder != null)
            {
                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
                ch = decoder.bits;
                charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.

                // Shouldn't have anything in fallback buffer for GetCharCount
                // (don't have to check m_throwOnOverflow for count)
                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
                    "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
            }

            for (;;)
            {
                // SLOWLOOP: does all range checks, handles all special cases, but it is slow

                if (pSrc >= pEnd)
                {
                    break;
                }

                if (ch == 0)
                {
                    // no pending bits
                    goto ReadChar;
                }

                // read next byte. The JIT optimization seems to be getting confused when
                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
                int cha = *pSrc;
                pSrc++;

                // we are expecting to see trailing bytes like 10vvvvvv
                if ((cha & unchecked((sbyte)0xC0)) != 0x80)
                {
                    // This can be a valid starting byte for another UTF8 byte sequence, so let's put
                    // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
                    pSrc--;
                    // undo the count adjustment carried in bits 30 & 31 of the abandoned sequence
                    charCount += (ch >> 30);
                    goto InvalidByteSequence;
                }

                // fold in the new byte
                ch = (ch << 6) | (cha & 0x3F);

                if ((ch & FinalByte) == 0)
                {
                    // FinalByte flag has not been shifted into place yet: more trail bytes are expected
                    Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
                        "[UTF8Encoding.GetChars]Invariant volation");

                    if ((ch & SupplimentarySeq) != 0)
                    {
                        if ((ch & (FinalByte >> 6)) != 0)
                        {
                            // this is 3rd byte (of 4 byte supplimentary) - nothing to do
                            continue;
                        }

                        // 2nd byte, check for non-shortest form of supplimentary char and the valid
                        // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
                        if (!InRange(ch & 0x1F0, 0x10, 0x100))
                        {
                            goto InvalidByteSequence;
                        }
                    }
                    else
                    {
                        // Must be 2nd byte of a 3-byte sequence
                        // check for non-shortest form of 3 byte seq
                        if ((ch & (0x1F << 5)) == 0 || // non-shortest form
                            (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
                        {
                            goto InvalidByteSequence;
                        }
                    }
                    continue;
                }

                // ready to punch

                // adjust for surrogates in non-shortest form
                if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
                {
                    charCount--;
                }
                goto EncodeChar;

            InvalidByteSequence:
                // this code fragment should be close to the gotos referencing it
                // Have to do fallback for invalid bytes
                if (fallback == null)
                {
                    // the fallback buffer is created/fetched lazily, on the first invalid sequence only
                    if (baseDecoder == null)
                        fallback = this.decoderFallback.CreateFallbackBuffer();
                    else
                        fallback = baseDecoder.FallbackBuffer;
                    fallback.InternalInitialize(bytes, null);
                }
                charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);

                ch = 0;
                continue;

            ReadChar:
                ch = *pSrc;
                pSrc++;

            ProcessChar:
                if (ch > 0x7F)
                {
                    // If its > 0x7F, its start of a new multi-byte sequence

                    // Long sequence, so unreserve our char.
                    charCount--;

                    // bit 6 has to be non-zero for start of multibyte chars.
                    if ((ch & 0x40) == 0)
                    {
                        // Unexpected trail byte
                        goto InvalidByteSequence;
                    }

                    // start a new long code
                    if ((ch & 0x20) != 0)
                    {
                        if ((ch & 0x10) != 0)
                        {
                            // 4 byte encoding - supplimentary character (2 surrogates)

                            ch &= 0x0F;

                            // check that bit 4 is zero and the valid supplimentary character
                            // range 0x000000 - 0x10FFFF at the same time
                            if (ch > 0x04)
                            {
                                // restore the high nibble so the fallback sees the original lead byte value
                                ch |= 0xf0;
                                goto InvalidByteSequence;
                            }

                            // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
                            // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
                            ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
                                (1 << 30) | // If it dies on next byte we'll need an extra char
                                (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
                                (SupplimentarySeq) | (SupplimentarySeq >> 6) |
                                (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);

                            // Our character count will be 2 characters for these 4 bytes, so subtract another char
                            charCount--;
                        }
                        else
                        {
                            // 3 byte encoding
                            // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
                            ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
                                (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));

                            // We'll expect 1 character for these 3 bytes, so subtract another char.
                            charCount--;
                        }
                    }
                    else
                    {
                        // 2 byte encoding

                        ch &= 0x1F;

                        // check for non-shortest form
                        if (ch <= 1)
                        {
                            // restore the 110xxxxx lead bits so the fallback sees the original byte value
                            ch |= 0xc0;
                            goto InvalidByteSequence;
                        }

                        // Add bit flags so we'll be flagged correctly
                        ch |= (FinalByte >> 6);
                    }
                    continue;
                }

            EncodeChar:

                // NOTE(review): the FASTLOOP symbol is not defined anywhere in the visible part of the
                // file - confirm whether this region is compiled in.
#if FASTLOOP
                int availableBytes = PtrDiff(pEnd, pSrc);

                // don't fall into the fast decoding loop if we don't have enough bytes
                if (availableBytes <= 13)
                {
                    // try to get over the remainder of the ascii characters fast though
                    byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
                    while (pSrc < pLocalEnd)
                    {
                        ch = *pSrc;
                        pSrc++;

                        if (ch > 0x7F)
                            goto ProcessChar;
                    }
                    // we are done
                    ch = 0;
                    break;
                }

                // To compute the upper bound, assume that all characters are ASCII characters at this point,
                // the boundary will be decreased for every non-ASCII character we encounter
                // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
                byte* pStop = pSrc + availableBytes - 7;

                while (pSrc < pStop)
                {
                    ch = *pSrc;
                    pSrc++;

                    if (ch > 0x7F)
                    {
                        goto LongCode;
                    }

                    // get pSrc 2-byte aligned
                    if ((unchecked((int)pSrc) & 0x1) != 0)
                    {
                        ch = *pSrc;
                        pSrc++;
                        if (ch > 0x7F)
                        {
                            goto LongCode;
                        }
                    }

                    // get pSrc 4-byte aligned
                    if ((unchecked((int)pSrc) & 0x2) != 0)
                    {
                        ch = *(ushort*)pSrc;
                        if ((ch & 0x8080) != 0)
                        {
                            goto LongCodeWithMask16;
                        }
                        pSrc += 2;
                    }

                    // Run 8 + 8 characters at a time!
                    while (pSrc < pStop)
                    {
                        ch = *(int*)pSrc;
                        int chb = *(int*)(pSrc + 4);
                        // a set high bit in any of the 8 bytes means a non-ASCII byte is present
                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
                        {
                            goto LongCodeWithMask32;
                        }
                        pSrc += 8;

                        // This is a really small loop - unroll it
                        if (pSrc >= pStop)
                            break;

                        ch = *(int*)pSrc;
                        chb = *(int*)(pSrc + 4);
                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
                        {
                            goto LongCodeWithMask32;
                        }
                        pSrc += 8;
                    }
                    break;

                    // Both labels reduce the multi-byte read in ch to its first byte in memory and
                    // consume just that one byte; the remaining bytes are re-read by the loop.
#if BIGENDIAN
                LongCodeWithMask32:
                    // be careful about the sign extension
                    ch = (int)(((uint)ch) >> 16);
                LongCodeWithMask16:
                    ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
                LongCodeWithMask32:
                LongCodeWithMask16:
                    ch &= 0xFF;
#endif // BIGENDIAN
                    pSrc++;
                    if (ch <= 0x7F)
                    {
                        continue;
                    }

                LongCode:
                    int chc = *pSrc;
                    pSrc++;

                    if (
                        // bit 6 has to be non-zero (otherwise ch is a stray trail byte)
                        (ch & 0x40) == 0 ||
                        // we are expecting to see trailing bytes like 10vvvvvv
                        (chc & unchecked((sbyte)0xC0)) != 0x80)
                    {
                        goto BadLongCode;
                    }

                    chc &= 0x3F;

                    // start a new long code
                    if ((ch & 0x20) != 0)
                    {
                        // fold the first two bytes together
                        chc |= (ch & 0x0F) << 6;

                        if ((ch & 0x10) != 0)
                        {
                            // 4 byte encoding - surrogate
                            ch = *pSrc;
                            if (
                                // check that bit 4 is zero, the non-shortest form of surrogate
                                // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
                                !InRange(chc >> 4, 0x01, 0x10) ||
                                // we are expecting to see trailing bytes like 10vvvvvv
                                (ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }

                            chc = (chc << 6) | (ch & 0x3F);

                            ch = *(pSrc + 1);
                            // we are expecting to see trailing bytes like 10vvvvvv
                            if ((ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }
                            // pSrc is only advanced once both remaining trail bytes have been validated
                            pSrc += 2;

                            // extra byte
                            charCount--;
                        }
                        else
                        {
                            // 3 byte encoding
                            ch = *pSrc;
                            if (
                                // check for non-shortest form of 3 byte seq
                                (chc & (0x1F << 5)) == 0 ||
                                // Can't have surrogates here.
                                (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
                                // we are expecting to see trailing bytes like 10vvvvvv
                                (ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }
                            pSrc++;

                            // extra byte
                            charCount--;
                        }
                    }
                    else
                    {
                        // 2 byte encoding

                        // check for non-shortest form
                        if ((ch & 0x1E) == 0)
                        {
                            goto BadLongCode;
                        }
                    }

                    // extra byte
                    charCount--;
                }
#endif // FASTLOOP

                // no pending bits at this point
                ch = 0;
                continue;

            BadLongCode:
                // Every jump here happens with pSrc exactly two bytes past the lead byte, so this rewinds
                // to the lead byte and lets the slow loop above re-decode the sequence.
                pSrc -= 2;
                ch = 0;
                continue;
            }

            // May have a problem if we have to flush
            if (ch != 0)
            {
                // We were already adjusting for these, so need to unadjust
                charCount += (ch >> 30);
                if (baseDecoder == null || baseDecoder.MustFlush)
                {
                    // Have to do fallback for invalid bytes
                    if (fallback == null)
                    {
                        if (baseDecoder == null)
                            fallback = this.decoderFallback.CreateFallbackBuffer();
                        else
                            fallback = baseDecoder.FallbackBuffer;
                        fallback.InternalInitialize(bytes, null);
                    }
                    charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
                }
            }

            // Shouldn't have anything in fallback buffer for GetCharCount
            // (don't have to check m_throwOnOverflow for count)
            Debug.Assert(fallback == null || fallback.Remaining == 0,
                "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");

            return charCount;
        }

        // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
        // So if we're really broken, then that could also throw an error... recursively.
        // So try to make sure GetChars can at least process all uses by
        // System.Resources.ResourceReader!
        //
        // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
        // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
//
        // To simplify maintenance, the structure of GetCharCount and GetChars should be
        // kept the same as much as possible
        //
        // Summary: decodes byteCount bytes from `bytes` into at most charCount chars at `chars` and
        // returns the number of chars written (PtrDiff(pTarget, chars)). When the output buffer is too
        // small, or a fallback cannot be written, ThrowCharsOverflow is called; if that call returns,
        // decoding stops with pSrc backed up to the start of the sequence that did not fit.
        // With a non-null baseDecoder the pending bits are stored in decoder.bits and
        // baseDecoder.m_bytesUsed is set to the number of input bytes consumed.
        internal override unsafe int GetChars(byte* bytes, int byteCount,
            char* chars, int charCount, DecoderNLS baseDecoder)
        {
            Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
            Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
            Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
            Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");

            byte* pSrc = bytes;
            char* pTarget = chars;

            byte* pEnd = pSrc + byteCount;
            char* pAllocatedBufferEnd = pTarget + charCount;

            int ch = 0;

            DecoderFallbackBuffer fallback = null;
            byte* pSrcForFallback;
            char* pTargetForFallback;
            if (baseDecoder != null)
            {
                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
                // resume from the partial sequence (if any) left by the previous call
                ch = decoder.bits;

                // Shouldn't have anything in fallback buffer for GetChars
                // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
                    "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
            }

            for (;;)
            {
                // SLOWLOOP: does all range checks, handles all special cases, but it is slow

                if (pSrc >= pEnd)
                {
                    break;
                }

                if (ch == 0)
                {
                    // no pending bits
                    goto ReadChar;
                }

                // read next byte. The JIT optimization seems to be getting confused when
                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
                int cha = *pSrc;
                pSrc++;

                // we are expecting to see trailing bytes like 10vvvvvv
                if ((cha & unchecked((sbyte)0xC0)) != 0x80)
                {
                    // This can be a valid starting byte for another UTF8 byte sequence, so let's put
                    // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
                    pSrc--;
                    goto InvalidByteSequence;
                }

                // fold in the new byte
                ch = (ch << 6) | (cha & 0x3F);

                if ((ch & FinalByte) == 0)
                {
                    // Not at last byte yet
                    Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
                        "[UTF8Encoding.GetChars]Invariant volation");

                    if ((ch & SupplimentarySeq) != 0)
                    {
                        // Its a 4-byte supplimentary sequence
                        if ((ch & (FinalByte >> 6)) != 0)
                        {
                            // this is 3rd byte of 4 byte sequence - nothing to do
                            continue;
                        }

                        // 2nd byte of 4 bytes
                        // check for non-shortest form of surrogate and the valid surrogate
                        // range 0x000000 - 0x10FFFF at the same time
                        if (!InRange(ch & 0x1F0, 0x10, 0x100))
                        {
                            goto InvalidByteSequence;
                        }
                    }
                    else
                    {
                        // Must be 2nd byte of a 3-byte sequence
                        // check for non-shortest form of 3 byte seq
                        if ((ch & (0x1F << 5)) == 0 || // non-shortest form
                            (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
                        {
                            goto InvalidByteSequence;
                        }
                    }
                    continue;
                }

                // ready to punch

                // surrogate in shortest form?
                // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
                if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
                {
                    // let the range check for the second char throw the exception
                    if (pTarget < pAllocatedBufferEnd)
                    {
                        // store the high surrogate now; ch becomes the low surrogate written at EncodeChar
                        *pTarget = (char)(((ch >> 10) & 0x7FF) +
                            unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
                        pTarget++;

                        ch = (ch & 0x3FF) +
                            unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
                    }
                }

                goto EncodeChar;

            InvalidByteSequence:
                // this code fragment should be close to the gotos referencing it
                // Have to do fallback for invalid bytes
                if (fallback == null)
                {
                    // the fallback buffer is created/fetched lazily, on the first invalid sequence only
                    if (baseDecoder == null)
                        fallback = this.decoderFallback.CreateFallbackBuffer();
                    else
                        fallback = baseDecoder.FallbackBuffer;
                    fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
                }
                // This'll back us up the appropriate # of bytes if we didn't get anywhere
                pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
                pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
                bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
                pSrc = pSrcForFallback;
                pTarget = pTargetForFallback;

                if (!fallbackResult)
                {
                    // Ran out of buffer space
                    // Need to throw an exception?
                    Debug.Assert(pSrc >= bytes || pTarget == chars,
                        "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
                    fallback.InternalReset();
                    ThrowCharsOverflow(baseDecoder, pTarget == chars);
                    ch = 0;
                    break;
                }
                Debug.Assert(pSrc >= bytes,
                    "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
                ch = 0;
                continue;

            ReadChar:
                ch = *pSrc;
                pSrc++;

            ProcessChar:
                if (ch > 0x7F)
                {
                    // If its > 0x7F, its start of a new multi-byte sequence

                    // bit 6 has to be non-zero
                    if ((ch & 0x40) == 0)
                    {
                        goto InvalidByteSequence;
                    }

                    // start a new long code
                    if ((ch & 0x20) != 0)
                    {
                        if ((ch & 0x10) != 0)
                        {
                            // 4 byte encoding - supplimentary character (2 surrogates)

                            ch &= 0x0F;

                            // check that bit 4 is zero and the valid supplimentary character
                            // range 0x000000 - 0x10FFFF at the same time
                            if (ch > 0x04)
                            {
                                // restore the high nibble so the fallback sees the original lead byte value
                                ch |= 0xf0;
                                goto InvalidByteSequence;
                            }

                            // same state flags as the 4-byte lead case in GetCharCount
                            ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
                                (SupplimentarySeq) | (SupplimentarySeq >> 6) |
                                (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
                        }
                        else
                        {
                            // 3 byte encoding
                            ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
                                (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
                        }
                    }
                    else
                    {
                        // 2 byte encoding

                        ch &= 0x1F;

                        // check for non-shortest form
                        if (ch <= 1)
                        {
                            // restore the 110xxxxx lead bits so the fallback sees the original byte value
                            ch |= 0xc0;
                            goto InvalidByteSequence;
                        }

                        ch |= (FinalByte >> 6);
                    }
                    continue;
                }

            EncodeChar:
                // write the pending character
                if (pTarget >= pAllocatedBufferEnd)
                {
                    // Fix chars so we make sure to throw if we didn't output anything
                    ch &= 0x1fffff;
                    // Back pSrc up over every byte of the sequence that produced ch (1 to 4 bytes);
                    // the decrements below accumulate by falling through the nested ifs.
                    if (ch > 0x7f)
                    {
                        if (ch > 0x7ff)
                        {
                            if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
                                ch <= CharUnicodeInfo.LOW_SURROGATE_END)
                            {
                                pSrc--; // It was 4 bytes
                                pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
                            }
                            else if (ch > 0xffff)
                            {
                                pSrc--; // It was 4 bytes, nothing was stored
                            }
                            pSrc--; // It was at least 3 bytes
                        }
                        pSrc--; // It was at least 2 bytes
                    }
                    pSrc--;

                    // Throw that we don't have enough room (pSrc could be < bytes if we had started to process
                    // a 4 byte sequence already)
                    Debug.Assert(pSrc >= bytes || pTarget == chars,
                        "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
                    ThrowCharsOverflow(baseDecoder, pTarget == chars);

                    // Don't store ch in decoder, we already backed up to its start
                    ch = 0;

                    // Didn't throw, just use this buffer size.
                    break;
                }
                *pTarget = (char)ch;
                pTarget++;

                // NOTE(review): the FASTLOOP symbol is not defined anywhere in the visible part of the
                // file - confirm whether this region is compiled in.
#if FASTLOOP
                int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
                int availableBytes = PtrDiff(pEnd, pSrc);

                // don't fall into the fast decoding loop if we don't have enough bytes
                // Test for availableChars is done because pStop would be <= pTarget.
                if (availableBytes <= 13)
                {
                    // we may need as many as 1 character per byte
                    if (availableChars < availableBytes)
                    {
                        // not enough output room. no pending bits at this point
                        ch = 0;
                        continue;
                    }

                    // try to get over the remainder of the ascii characters fast though
                    byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
                    while (pSrc < pLocalEnd)
                    {
                        ch = *pSrc;
                        pSrc++;

                        if (ch > 0x7F)
                            goto ProcessChar;

                        *pTarget = (char)ch;
                        pTarget++;
                    }
                    // we are done
                    ch = 0;
                    break;
                }

                // we may need as many as 1 character per byte, so reduce the byte count if necessary.
                // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
                if (availableChars < availableBytes)
                {
                    availableBytes = availableChars;
                }

                // To compute the upper bound, assume that all characters are ASCII characters at this point,
                // the boundary will be decreased for every non-ASCII character we encounter
                // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
                char* pStop = pTarget + availableBytes - 7;

                while (pTarget < pStop)
                {
                    ch = *pSrc;
                    pSrc++;

                    if (ch > 0x7F)
                    {
                        goto LongCode;
                    }
                    *pTarget = (char)ch;
                    pTarget++;

                    // get pSrc to be 2-byte aligned
                    if ((unchecked((int)pSrc) & 0x1) != 0)
                    {
                        ch = *pSrc;
                        pSrc++;
                        if (ch > 0x7F)
                        {
                            goto LongCode;
                        }
                        *pTarget = (char)ch;
                        pTarget++;
                    }

                    // get pSrc to be 4-byte aligned
                    if ((unchecked((int)pSrc) & 0x2) != 0)
                    {
                        ch = *(ushort*)pSrc;
                        if ((ch & 0x8080) != 0)
                        {
                            goto LongCodeWithMask16;
                        }

                        // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                        *pTarget = (char)((ch >> 8) & 0x7F);
                        pSrc += 2;
                        *(pTarget+1) = (char)(ch & 0x7F);
                        pTarget += 2;
#else // BIGENDIAN
                        *pTarget = (char)(ch & 0x7F);
                        pSrc += 2;
                        *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
                        pTarget += 2;
#endif // BIGENDIAN
                    }

                    // Run 8 characters at a time!
                    while (pTarget < pStop)
                    {
                        ch = *(int*)pSrc;
                        int chb = *(int*)(pSrc + 4);
                        // a set high bit in any of the 8 bytes means a non-ASCII byte is present
                        if (((ch | chb) & unchecked((int)0x80808080)) != 0)
                        {
                            goto LongCodeWithMask32;
                        }

                        // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                        *pTarget = (char)((ch >> 24) & 0x7F);
                        *(pTarget+1) = (char)((ch >> 16) & 0x7F);
                        *(pTarget+2) = (char)((ch >> 8) & 0x7F);
                        *(pTarget+3) = (char)(ch & 0x7F);
                        pSrc += 8;
                        *(pTarget+4) = (char)((chb >> 24) & 0x7F);
                        *(pTarget+5) = (char)((chb >> 16) & 0x7F);
                        *(pTarget+6) = (char)((chb >> 8) & 0x7F);
                        *(pTarget+7) = (char)(chb & 0x7F);
                        pTarget += 8;
#else // BIGENDIAN
                        *pTarget = (char)(ch & 0x7F);
                        *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
                        *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
                        *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
                        pSrc += 8;
                        *(pTarget + 4) = (char)(chb & 0x7F);
                        *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
                        *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
                        *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
                        pTarget += 8;
#endif // BIGENDIAN
                    }
                    break;

                    // Both labels reduce the multi-byte read in ch to its first byte in memory and
                    // consume just that one byte; the remaining bytes are re-read by the loop.
#if BIGENDIAN
                LongCodeWithMask32:
                    // be careful about the sign extension
                    ch = (int)(((uint)ch) >> 16);
                LongCodeWithMask16:
                    ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
                LongCodeWithMask32:
                LongCodeWithMask16:
                    ch &= 0xFF;
#endif // BIGENDIAN
                    pSrc++;
                    if (ch <= 0x7F)
                    {
                        *pTarget = (char)ch;
                        pTarget++;
                        continue;
                    }

                LongCode:
                    int chc = *pSrc;
                    pSrc++;

                    if (
                        // bit 6 has to be non-zero (otherwise ch is a stray trail byte)
                        (ch & 0x40) == 0 ||
                        // we are expecting to see trailing bytes like 10vvvvvv
                        (chc & unchecked((sbyte)0xC0)) != 0x80)
                    {
                        goto BadLongCode;
                    }

                    chc &= 0x3F;

                    // start a new long code
                    if ((ch & 0x20) != 0)
                    {
                        // fold the first two bytes together
                        chc |= (ch & 0x0F) << 6;

                        if ((ch & 0x10) != 0)
                        {
                            // 4 byte encoding - surrogate
                            ch = *pSrc;
                            if (
                                // check that bit 4 is zero, the non-shortest form of surrogate
                                // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
                                !InRange(chc >> 4, 0x01, 0x10) ||
                                // we are expecting to see trailing bytes like 10vvvvvv
                                (ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }

                            chc = (chc << 6) | (ch & 0x3F);

                            ch = *(pSrc + 1);
                            // we are expecting to see trailing bytes like 10vvvvvv
                            if ((ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }
                            // pSrc is only advanced once both remaining trail bytes have been validated
                            pSrc += 2;

                            ch = (chc << 6) | (ch & 0x3F);

                            *pTarget = (char)(((ch >> 10) & 0x7FF) +
                                unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
                            pTarget++;

                            ch = (ch & 0x3FF) +
                                unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));

                            // extra byte, we're already planning 2 chars for 2 of these bytes,
                            // but the big loop is testing the target against pStop, so we need
                            // to subtract 2 more or we risk overrunning the input. Subtract
                            // one here and one below.
                            pStop--;
                        }
                        else
                        {
                            // 3 byte encoding
                            ch = *pSrc;
                            if (
                                // check for non-shortest form of 3 byte seq
                                (chc & (0x1F << 5)) == 0 ||
                                // Can't have surrogates here.
                                (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
                                // we are expecting to see trailing bytes like 10vvvvvv
                                (ch & unchecked((sbyte)0xC0)) != 0x80)
                            {
                                goto BadLongCode;
                            }
                            pSrc++;

                            ch = (chc << 6) | (ch & 0x3F);

                            // extra byte, we're only expecting 1 char for each of these 3 bytes,
                            // but the loop is testing the target (not source) against pStop, so
                            // we need to subtract 2 more or we risk overrunning the input.
                            // Subtract 1 here and one more below
                            pStop--;
                        }
                    }
                    else
                    {
                        // 2 byte encoding

                        ch &= 0x1F;

                        // check for non-shortest form
                        if (ch <= 1)
                        {
                            goto BadLongCode;
                        }
                        ch = (ch << 6) | chc;
                    }

                    *pTarget = (char)ch;
                    pTarget++;

                    // extra byte, we're only expecting 1 char for each of these 2 bytes,
                    // but the loop is testing the target (not source) against pStop.
                    // subtract an extra count from pStop so that we don't overrun the input.
                    pStop--;
                }
#endif // FASTLOOP

                Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");

                // no pending bits at this point
                ch = 0;
                continue;

            BadLongCode:
                // Every jump here happens with pSrc exactly two bytes past the lead byte and before anything
                // was written for the sequence, so rewind to the lead byte and let the slow loop re-decode it.
                pSrc -= 2;
                ch = 0;
                continue;
            }

            // Input exhausted with a partial sequence pending: only fall it back when there is no decoder
            // to carry the state over, or the decoder has been asked to flush.
            if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
            {
                // Have to do fallback for invalid bytes
                if (fallback == null)
                {
                    if (baseDecoder == null)
                        fallback = this.decoderFallback.CreateFallbackBuffer();
                    else
                        fallback = baseDecoder.FallbackBuffer;
                    fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
                }

                // This'll back us up the appropriate # of bytes if we didn't get anywhere
                pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
                pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
                bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
                pSrc = pSrcForFallback;
                pTarget = pTargetForFallback;

                if (!fallbackResult)
                {
                    Debug.Assert(pSrc >= bytes || pTarget == chars,
                        "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");

                    // Ran out of buffer space
                    // Need to throw an exception?
                    fallback.InternalReset();
                    ThrowCharsOverflow(baseDecoder, pTarget == chars);
                }
                Debug.Assert(pSrc >= bytes,
                    "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
                ch = 0;
            }

            if (baseDecoder != null)
            {
                UTF8Decoder decoder = (UTF8Decoder)baseDecoder;

                // If we're storing flush data we expect all bits to be used or else
                // we're stuck in the middle of a conversion
                Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
                    "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");

                // Remember our leftover bits.
                decoder.bits = ch;

                baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
            }

            // Shouldn't have anything in fallback buffer for GetChars
            // (don't have to check m_throwOnOverflow for chars)
            Debug.Assert(fallback == null || fallback.Remaining == 0,
                "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");

            return PtrDiff(pTarget, chars);
        }

        // During GetChars we had an invalid byte sequence
        // pSrc is backed up to the start of the bad sequence if we didn't have room to
        // fall it back. Otherwise pSrc remains where it is.
        // Returns the result of fallback.InternalFallback: false means the fallback could not be
        // written and pSrc has been moved back to the start of the bad sequence.
        private unsafe bool FallbackInvalidByteSequence(
            ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
        {
            // Get our byte[]
            // (GetBytesUnknown moves pStart back to the first byte of the bad sequence; pSrc is untouched)
            byte* pStart = pSrc;
            byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);

            // Do the actual fallback
            if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
            {
                // Oops, it failed, back up to pStart
                pSrc = pStart;
                return false;
            }

            // It worked
            return true;
        }

        // During GetCharCount we had an invalid byte sequence
        // pSrc is used to find the index that points to the invalid bytes,
        // however the byte[] contains the fallback bytes (in case the index is -1)
        private unsafe int FallbackInvalidByteSequence(
            byte* pSrc, int ch, DecoderFallbackBuffer fallback)
        {
            // Get our byte[]
            // (pSrc is passed by value to this method, so the caller's pointer is not moved)
            byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);

            // Do the actual fallback
            int count = fallback.InternalFallback(bytesUnknown, pSrc);

            // # of fallback chars expected.
            // Note that we only get here for "long" sequences, and have already unreserved
            // the count that we prereserved for the input bytes
            return count;
        }

        // Note that some of these bytes may have come from a previous fallback, so we cannot
        // just decrement the pointer and use the values we read. In those cases we have
        // to regenerate the original values.
+ private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch) + { + // Get our byte[] + byte[] bytesUnknown = null; + + // See if it was a plain char + // (have to check >= 0 because we have all sorts of wierd bit flags) + if (ch < 0x100 && ch >= 0) + { + pSrc--; + bytesUnknown = new byte[] { unchecked((byte)ch) }; + } + // See if its an unfinished 2 byte sequence + else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) + { + pSrc--; + bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) }; + } + // So now we're either 2nd byte of 3 or 4 byte sequence or + // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence + // 1st check if its a 4 byte sequence + else if ((ch & SupplimentarySeq) != 0) + { + // 3rd byte of 4 byte sequence? + if ((ch & (FinalByte >> 6)) != 0) + { + // 3rd byte of 4 byte sequence + pSrc -= 3; + bytesUnknown = new byte[] { + unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)), + unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)), + unchecked((byte)(((ch) & 0x3F) | 0x80)) }; + } + else if ((ch & (FinalByte >> 12)) != 0) + { + // 2nd byte of a 4 byte sequence + pSrc -= 2; + bytesUnknown = new byte[] { + unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)), + unchecked((byte)(((ch) & 0x3F) | 0x80)) }; + } + else + { + // 4th byte of a 4 byte sequence + pSrc--; + bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) }; + } + } + else + { + // 2nd byte of 3 byte sequence? 
+ if ((ch & (FinalByte >> 6)) != 0) + { + // So its 2nd byte of a 3 byte sequence + pSrc -= 2; + bytesUnknown = new byte[] { + unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) }; + } + else + { + // 1st byte of a 3 byte sequence + pSrc--; + bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) }; + } + } + + return bytesUnknown; + } + + + public override Decoder GetDecoder() + { + return new UTF8Decoder(this); + } + + + public override Encoder GetEncoder() + { + return new UTF8Encoder(this); + } + + + public override int GetMaxByteCount(int charCount) + { + if (charCount < 0) + throw new ArgumentOutOfRangeException(nameof(charCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback + long byteCount = (long)charCount + 1; + + if (EncoderFallback.MaxCharCount > 1) + byteCount *= EncoderFallback.MaxCharCount; + + // Max 3 bytes per char. (4 bytes per 2 chars for surrogates) + byteCount *= 3; + + if (byteCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow); + + return (int)byteCount; + } + + + public override int GetMaxCharCount(int byteCount) + { + if (byteCount < 0) + throw new ArgumentOutOfRangeException(nameof(byteCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair + long charCount = ((long)byteCount + 1); + + // Non-shortest form would fall back, so get max count from fallback. + // So would 11... 
// Returns the UTF-8 byte order mark (EF BB BF) when this instance was configured to
// emit it, otherwise an empty array. The BOM array is allocated per call so a caller
// cannot modify a shared instance.
public override byte[] GetPreamble()
{
    return _emitUTF8Identifier
        ? new byte[3] { 0xEF, 0xBB, 0xBF }
        : Array.Empty<byte>();
}


// Two UTF8Encoding instances are equal when they agree on BOM emission and on both
// the encoder and decoder fallbacks. Anything that is not a UTF8Encoding is unequal.
public override bool Equals(Object value)
{
    UTF8Encoding other = value as UTF8Encoding;
    if (other == null)
    {
        return false;
    }

    bool sameIdentifierSetting = _emitUTF8Identifier == other._emitUTF8Identifier;
    return sameIdentifierSetting &&
           EncoderFallback.Equals(other.EncoderFallback) &&
           DecoderFallback.Equals(other.DecoderFallback);
}


// Combines the same inputs Equals compares (plus the code page constant) by simple addition.
// The distribution is weak, which is accepted because encodings are rarely hash keys.
public override int GetHashCode()
{
    int hash = this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode();
    hash += UTF8_CODEPAGE;
    hash += _emitUTF8Identifier ? 1 : 0;
    return hash;
}
// Stateful decoder for UTF8Encoding. Supports both the current ("Whidbey") and the
// legacy ("Everett") serialization layouts.
[Serializable]
private sealed class UTF8Decoder : DecoderNLS, ISerializable
{
    // Partial-sequence state carried between calls; see the comments around the
    // definition of FinalByte for the bit layout.
    internal int bits;

    public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
    {
        // The base constructor calls Reset().
    }

    // Deserialization constructor; must also accept data written by Everett.
    internal UTF8Decoder(SerializationInfo info, StreamingContext context)
    {
        if (info == null)
        {
            throw new ArgumentNullException(nameof(info));
        }
        Contract.EndContractBlock();

        // Present in both layouts.
        this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));

        try
        {
            // The Whidbey layout stores the state under "wbits" together with the fallback.
            this.bits = (int)info.GetValue("wbits", typeof(int));
            this.m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback));
        }
        catch (SerializationException)
        {
            // Everett names the field "bits" rather than "wbits", so a missing entry
            // means an Everett stream: start with no state and no fallback.
            this.bits = 0;
            this.m_fallback = null;
        }
    }

    // Writes the Whidbey fields followed by zeroed Everett fields so the data can be
    // deserialized by either version.
    void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
    {
        if (info == null)
        {
            throw new ArgumentNullException(nameof(info));
        }
        Contract.EndContractBlock();

        // Whidbey data ("wbits" is the Whidbey-specific name for the state).
        info.AddValue("encoding", this.m_encoding);
        info.AddValue("wbits", this.bits);
        info.AddValue("m_fallback", this.m_fallback);

        // Everett-only fields, all written as zero/false.
        info.AddValue("bits", (int)0);
        info.AddValue("trailCount", (int)0);
        info.AddValue("isSurrogate", false);
        info.AddValue("byteSequence", (int)0);
    }

    // Clears the partial-sequence state and any pending fallback buffer content.
    public override void Reset()
    {
        this.bits = 0;
        m_fallbackBuffer?.Reset();
    }

    // True while a partial sequence is being carried between calls.
    internal override bool HasState => this.bits != 0;
}
#region Serialization
// Invoked before the fields are deserialized: seeds isThrowException with false
// (the Everett behaviour); a Whidbey stream overwrites it afterwards.
[OnDeserializing]
private void OnDeserializing(StreamingContext ctx)
{
    this.isThrowException = false;
}
#endregion Serialization

// Selects the fallbacks for this instance: exception fallbacks when isThrowException
// is set, otherwise replacement fallbacks that substitute U+FFFD.
internal override void SetDefaultFallbacks()
{
    if (this.isThrowException)
    {
        this.encoderFallback = EncoderFallback.ExceptionFallback;
        this.decoderFallback = DecoderFallback.ExceptionFallback;
        return;
    }

    this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
    this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
}

// The public overloads below mirror the ones in EncodingNLS.cs. EncodingNLS is internal
// while this class is public, so they are duplicated here and must be kept in sync across:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

// Returns the number of bytes needed to encode count chars of chars starting at index.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (chars.Length - index < count)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }
    Contract.EndContractBlock();

    // Nothing to count; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    fixed (char* pinnedChars = chars)
    {
        return GetByteCount(pinnedChars + index, count, null);
    }
}

// Returns the number of bytes needed to encode the whole string.
public override unsafe int GetByteCount(String s)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }
    Contract.EndContractBlock();

    fixed (char* pinnedChars = s)
    {
        return GetByteCount(pinnedChars, s.Length, null);
    }
}

// Pointer overload: validates the arguments and counts without an encoder (no carried state).
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.EndContractBlock();

    return GetByteCount(chars, count, null);
}
// Encodes charCount chars of s, starting at charIndex, into bytes starting at byteIndex,
// and returns the value produced by the internal pointer-based GetBytes.
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                      byte[] bytes, int byteIndex)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
    }
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (s.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
    }
    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }
    Contract.EndContractBlock();

    // Space available in the caller's buffer; computed before the buffer may be swapped below.
    int availableBytes = bytes.Length - byteIndex;

    // &bytes[0] cannot be taken on an empty array, so pin a one-element stand-in instead.
    if (bytes.Length == 0)
    {
        bytes = new byte[1];
    }

    fixed (char* pinnedChars = s)
    fixed (byte* pinnedBytes = &bytes[0])
    {
        return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, availableBytes, null);
    }
}
// Encodes charCount chars of chars, starting at charIndex, into bytes starting at
// byteIndex, and returns the value produced by the internal pointer-based GetBytes.
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                       byte[] bytes, int byteIndex)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (chars.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }
    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }
    Contract.EndContractBlock();

    // Nothing to encode; also sidesteps pinning an empty chars array.
    if (charCount == 0)
    {
        return 0;
    }

    // Space available in the output, not the size of the whole array.
    int availableBytes = bytes.Length - byteIndex;

    // &bytes[0] cannot be taken on an empty array, so pin a one-element stand-in instead.
    if (bytes.Length == 0)
    {
        bytes = new byte[1];
    }

    fixed (char* pinnedChars = chars)
    fixed (byte* pinnedBytes = &bytes[0])
    {
        return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, availableBytes, null);
    }
}

// Pointer overload: validates the arguments and encodes without an encoder (no carried state).
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.EndContractBlock();

    return GetBytes(chars, charCount, bytes, byteCount, null);
}

// Returns the number of chars produced by decoding count bytes of bytes starting at index.
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (bytes.Length - index < count)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }
    Contract.EndContractBlock();

    // Nothing to decode; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    fixed (byte* pinnedBytes = bytes)
    {
        return GetCharCount(pinnedBytes + index, count, null);
    }
}
// Pointer overload: validates the arguments and counts without a decoder (no carried state).
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.EndContractBlock();

    return GetCharCount(bytes, count, null);
}

// Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at
// charIndex, and returns the value produced by the internal pointer-based GetChars.
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                        char[] chars, int charIndex)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (bytes.Length - byteIndex < byteCount)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }
    if (charIndex < 0 || charIndex > chars.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
    }
    Contract.EndContractBlock();

    // Nothing to decode; also sidesteps pinning an empty bytes array.
    if (byteCount == 0)
    {
        return 0;
    }

    // Space available in the output, not the size of the whole array.
    int availableChars = chars.Length - charIndex;

    // &chars[0] cannot be taken on an empty array, so pin a one-element stand-in instead.
    if (chars.Length == 0)
    {
        chars = new char[1];
    }

    fixed (byte* pinnedBytes = bytes)
    fixed (char* pinnedChars = &chars[0])
    {
        return GetChars(pinnedBytes + byteIndex, byteCount, pinnedChars + charIndex, availableChars, null);
    }
}

// Pointer overload: validates the arguments and decodes without a decoder (no carried state).
// Keep in sync with the equivalent overload in:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[CLSCompliant(false)]
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }
    Contract.EndContractBlock();

    return GetChars(bytes, byteCount, chars, charCount, null);
}
// Counts the bytes needed to encode `count` chars starting at `chars` as UTF-16.
//
// The count starts at 2 bytes per input char and is then corrected:
//   * chars routed to the fallback have their 2 bytes removed, and every char the
//     fallback buffer hands back adds 2 bytes;
//   * a high surrogate left over at the end is not counted; when encoder is null or
//     encoder.MustFlush is set it is sent through the fallback and the loop is re-run once.
//
// encoder may be null; when it is not, its charLeftOver and fallback buffer are used.
//
// Throws ArgumentOutOfRangeException when count << 1 overflows, and ArgumentException
// when the encoder's fallback buffer still holds data on entry or when the flush
// fallback recurses (a high surrogate is still left over after the retry).
internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
{
    Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
    Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");

    // Start by assuming each char gets 2 bytes
    int byteCount = count << 1;

    // Check for overflow in byteCount
    // (If they were all invalid chars, this would actually be wrong,
    // but that's a ridiculously large # so we're not concerned about that case)
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);

    char* charStart = chars;
    char* charEnd = chars + count;
    char charLeftOver = (char)0;

    // Set once the left-over high surrogate has been pushed through the fallback,
    // so a second left-over after the retry is reported as a recursive fallback.
    bool wasHereBefore = false;

    // Need -1 to check 2 at a time.  If we have an even #, longChars will go
    // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longChars
    // will go from longEnd - 1 long to longEnd. (Might not get to use this)
    // NOTE(review): the wording above looks stale -- the code backs off by 3 chars and
    // the fast loop below reads one ulong (4 chars) per iteration.
    ulong* longEnd = (ulong*)(charEnd - 3);

    // For fallback we may need a fallback buffer
    EncoderFallbackBuffer fallbackBuffer = null;
    char* charsForFallback;

    if (encoder != null)
    {
        charLeftOver = encoder.charLeftOver;

        // Assume extra bytes to encode charLeftOver if it existed
        if (charLeftOver > 0)
            byteCount += 2;

        // We mustn't have left over fallback data when counting
        if (encoder.InternalHasFallbackBuffer)
        {
            fallbackBuffer = encoder.FallbackBuffer;
            if (fallbackBuffer.Remaining > 0)
                throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));

            // Set our internal fallback interesting things.
            fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
        }
    }

    char ch;
    // Re-entered via goto after a trailing high surrogate has been handed to the fallback.
    TryAgain:

    // Each iteration takes the next fallback char if one is pending (ch != 0),
    // otherwise the next input char.
    while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
    {
        // First unwind any fallback
        if (ch == 0)
        {
            // No fallback, maybe we can do it fast
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN       // If endianness is backwards then each pair of bytes would be backwards.
            if ( bigEndian &&
#else
            if (!bigEndian &&
#endif // BIGENDIAN

#if BIT64           // 64 bit CPU needs to be long aligned for this to work.
                  charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
#else
                  charLeftOver == 0 && (unchecked((int)chars) & 3) == 0)
#endif
            {
                // Need new char* so we can check 4 at a time
                ulong* longChars = (ulong*)chars;

                while (longChars < longEnd)
                {
                    // See if we potentially have surrogates (0x8000 bit set)
                    // (We're either big endian on a big endian machine or little endian on
                    // a little endian machine so this'll work)
                    if ((0x8000800080008000 & *longChars) != 0)
                    {
                        // See if any of these are high or low surrogates (0xd800 - 0xdfff).  If the high
                        // 5 bits looks like 11011, then its a high or low surrogate.
                        // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
                        // Note that we expect BMP characters to be more common than surrogates
                        // & each char with 11111... then ^ with 11011.  Zeroes then indicate surrogates
                        ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;

                        // Check each of the 4 chars.  0 for those 16 bits means it was a surrogate
                        // but no clue if they're high or low.
                        // If each of the 4 characters are non-zero, then none are surrogates.
                        if ((uTemp & 0xFFFF000000000000) == 0 ||
                            (uTemp & 0x0000FFFF00000000) == 0 ||
                            (uTemp & 0x00000000FFFF0000) == 0 ||
                            (uTemp & 0x000000000000FFFF) == 0)
                        {
                            // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
                            // or if there's 1 or 4 surrogates

                            // If they happen to be high/low/high/low, we may as well continue.  Check the next
                            // bit to see if its set (low) or not (high) in the right pattern
#if BIGENDIAN
                            if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
#else
                            if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
#endif
                            {
                                // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.

                                // Drop out to the slow loop to resolve the surrogates
                                break;
                            }
                            // else they are all surrogates in High/Low/High/Low order, so we can use them.
                        }
                        // else none are surrogates, so we can use them.
                    }
                    // else all < 0x8000 so we can use them

                    // We already counted these four chars, go to next long.
                    longChars++;
                }

                chars = (char*)longChars;

                if (chars >= charEnd)
                    break;
            }
#endif // !NO_FAST_UNICODE_LOOP

            // No fallback, just get next char
            ch = *chars;
            chars++;
        }
        else
        {
            // We weren't preallocating fallback space.
            byteCount += 2;
        }

        // Check for high or low surrogates
        if (ch >= 0xd800 && ch <= 0xdfff)
        {
            // Was it a high surrogate?
            if (ch <= 0xdbff)
            {
                // Its a high surrogate, if we already had a high surrogate do its fallback
                if (charLeftOver > 0)
                {
                    // Unwind the current character, this should be safe because we
                    // don't have leftover data in the fallback, so chars must have
                    // advanced already.
                    Debug.Assert(chars > charStart,
                        "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
                    chars--;

                    // If previous high surrogate deallocate 2 bytes
                    byteCount -= 2;

                    // Fallback the previous surrogate
                    // Need to initialize fallback buffer?
                    if (fallbackBuffer == null)
                    {
                        if (encoder == null)
                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = encoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
                        fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                    }

                    charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                    fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                    chars = charsForFallback;

                    // Now no high surrogate left over
                    charLeftOver = (char)0;
                    continue;
                }

                // Remember this high surrogate
                charLeftOver = ch;
                continue;
            }


            // Its a low surrogate
            if (charLeftOver == 0)
            {
                // Expected a previous high surrogate.
                // Don't count this one (we'll count its fallback if necessary)
                byteCount -= 2;

                // fallback this one
                // Need to initialize fallback buffer?
                if (fallbackBuffer == null)
                {
                    if (encoder == null)
                        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = encoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                }
                charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                chars = charsForFallback;
                continue;
            }

            // Valid surrogate pair, add our charLeftOver
            charLeftOver = (char)0;
            continue;
        }
        else if (charLeftOver > 0)
        {
            // Expected a low surrogate, but this char is normal

            // Rewind the current character, fallback previous character.
            // this should be safe because we don't have leftover data in the
            // fallback, so chars must have advanced already.
            Debug.Assert(chars > charStart,
                "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
            chars--;

            // fallback previous chars
            // Need to initialize fallback buffer?
            if (fallbackBuffer == null)
            {
                if (encoder == null)
                    fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = encoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
            }
            charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
            fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
            chars = charsForFallback;

            // Ignore charLeftOver or throw
            byteCount -= 2;
            charLeftOver = (char)0;

            continue;
        }

        // Ok we had something to add (already counted)
    }

    // Don't allocate space for left over char
    if (charLeftOver > 0)
    {
        byteCount -= 2;

        // If we have to flush, stick it in fallback and try again
        if (encoder == null || encoder.MustFlush)
        {
            if (wasHereBefore)
            {
                // Throw it, using our complete character
                throw new ArgumentException(
                    SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
            }
            else
            {
                // Need to initialize fallback buffer?
                if (fallbackBuffer == null)
                {
                    if (encoder == null)
                        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = encoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                }
                charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                chars = charsForFallback;
                charLeftOver = (char)0;
                wasHereBefore = true;
                goto TryAgain;
            }
        }
    }

    // Shouldn't have anything in fallback buffer for GetByteCount
    // (don't have to check m_throwOnOverflow for count)
    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
        "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");

    // Don't remember fallbackBuffer.encoder for counting
    return byteCount;
}
+ if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + + // We mustn't have left over fallback data when counting + if (encoder.InternalHasFallbackBuffer) + { + // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary + fallbackBuffer = encoder.FallbackBuffer; + if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow) + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + } + + TryAgain: + while (((ch = (fallbackBuffer == null) ? + (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || + chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, maybe we can do it fast +#if !NO_FAST_UNICODE_LOOP +#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. + if ( bigEndian && +#else + if (!bigEndian && +#endif // BIGENDIAN +#if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned + (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && +#else + (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 && +#endif // BIT64 + charLeftOver == 0) + { + // Need -1 to check 2 at a time. If we have an even #, longChars will go + // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars + // will go from longEnd - 1 long to longEnd. (Might not get to use this) + // We can only go iCount units (limited by shorter of char or byte buffers. + ulong* longEnd = (ulong*)(chars - 3 + + (((byteEnd - bytes) >> 1 < charEnd - chars) ? 
+ (byteEnd - bytes) >> 1 : charEnd - chars)); + + // Need new char* so we can check 4 at a time + ulong* longChars = (ulong*)chars; + ulong* longBytes = (ulong*)bytes; + + while (longChars < longEnd) + { + // See if we potentially have surrogates (0x8000 bit set) + // (We're either big endian on a big endian machine or little endian on + // a little endian machine so this'll work) + if ((0x8000800080008000 & *longChars) != 0) + { + // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high + // 5 bits looks like 11011, then its a high or low surrogate. + // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. + // Note that we expect BMP characters to be more common than surrogates + // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates + ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800; + + // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate + // but no clue if they're high or low. + // If each of the 4 characters are non-zero, then none are surrogates. + if ((uTemp & 0xFFFF000000000000) == 0 || + (uTemp & 0x0000FFFF00000000) == 0 || + (uTemp & 0x00000000FFFF0000) == 0 || + (uTemp & 0x000000000000FFFF) == 0) + { + // It has at least 1 surrogate, but we don't know if they're high or low surrogates, + // or if there's 1 or 4 surrogates + + // If they happen to be high/low/high/low, we may as well continue. Check the next + // bit to see if its set (low) or not (high) in the right pattern +#if BIGENDIAN + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) +#else + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) +#endif + { + // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high + // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. 
+ + // Drop out to the slow loop to resolve the surrogates + break; + } + // else they are all surrogates in High/Low/High/Low order, so we can use them. + } + // else none are surrogates, so we can use them. + } + // else all < 0x8000 so we can use them + + // We can use these 4 chars. + *longBytes = *longChars; + longChars++; + longBytes++; + } + + chars = (char*)longChars; + bytes = (byte*)longBytes; + + if (chars >= charEnd) + break; + } + // Not aligned, but maybe we can still be somewhat faster + // Also somehow this optimizes the above loop? It seems to cause something above + // to get enregistered, but I haven't figured out how to make that happen without this loop. + else if ((charLeftOver == 0) && +#if BIGENDIAN + bigEndian && +#else + !bigEndian && +#endif // BIGENDIAN + +#if BIT64 + (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time +#else + (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time +#endif // BIT64 + (unchecked((int)(bytes)) & 1) == 0) + { + // # to use + long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ? 
+ (byteEnd - bytes) >> 1 : charEnd - chars; + + // Need new char* + char* charOut = ((char*)bytes); // a char* for our output + char* tempEnd = chars + iCount - 1; // Our end pointer + + while (chars < tempEnd) + { + if (*chars >= (char)0xd800 && *chars <= (char)0xdfff) + { + // break for fallback for low surrogate + if (*chars >= 0xdc00) + break; + + // break if next one's not a low surrogate (will do fallback) + if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff) + break; + + // They both exist, use them + } + // If 2nd char is surrogate & this one isn't then only add one + else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff) + { + *charOut = *chars; + charOut++; + chars++; + continue; + } + + *charOut = *chars; + *(charOut + 1) = *(chars + 1); + charOut += 2; + chars += 2; + } + + bytes = (byte*)charOut; + + if (chars >= charEnd) + break; + } +#endif // !NO_FAST_UNICODE_LOOP + + // No fallback, just get next char + ch = *chars; + chars++; + } + + // Check for high or low surrogates + if (ch >= 0xd800 && ch <= 0xdfff) + { + // Was it a high surrogate? + if (ch <= 0xdbff) + { + // Its a high surrogate, see if we already had a high surrogate + if (charLeftOver > 0) + { + // Unwind the current character, this should be safe because we + // don't have leftover data in the fallback, so chars must have + // advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate"); + chars--; + + // Fallback the previous surrogate + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. 
+ fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + charLeftOver = (char)0; + continue; + } + + // Remember this high surrogate + charLeftOver = ch; + continue; + } + + // Its a low surrogate + if (charLeftOver == 0) + { + // We'll fall back this one + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + continue; + } + + // Valid surrogate pair, add our charLeftOver + if (bytes + 3 >= byteEnd) + { + // Not enough room to add this surrogate pair + if (fallbackBuffer != null && fallbackBuffer.bFallingBack) + { + // These must have both been from the fallbacks. + // Both of these MUST have been from a fallback because if the 1st wasn't + // from a fallback, then a high surrogate followed by an illegal char + // would've caused the high surrogate to fall back. If a high surrogate + // fell back, then it was consumed and both chars came from the fallback. 
+ fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate + fallbackBuffer.MovePrevious(); + } + else + { + // If we don't have enough room, then either we should've advanced a while + // or we should have bytes==byteStart and throw below + Debug.Assert(chars > charStart + 1 || bytes == byteStart, + "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair"); + chars -= 2; // Didn't use either surrogate + } + ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) + charLeftOver = (char)0; // we'll retry it later + break; // Didn't throw, but stop 'til next time. + } + + if (bigEndian) + { + *(bytes++) = (byte)(charLeftOver >> 8); + *(bytes++) = (byte)charLeftOver; + } + else + { + *(bytes++) = (byte)charLeftOver; + *(bytes++) = (byte)(charLeftOver >> 8); + } + + charLeftOver = (char)0; + } + else if (charLeftOver > 0) + { + // Expected a low surrogate, but this char is normal + + // Rewind the current character, fallback previous character. + // this should be safe because we don't have leftover data in the + // fallback, so chars must have advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate"); + chars--; + + // fallback previous chars + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. 
+ fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + // Ignore charLeftOver or throw + charLeftOver = (char)0; + continue; + } + + // Ok, we have a char to add + if (bytes + 1 >= byteEnd) + { + // Couldn't add this char + if (fallbackBuffer != null && fallbackBuffer.bFallingBack) + fallbackBuffer.MovePrevious(); // Not using this fallback char + else + { + // Lonely charLeftOver (from previous call) would've been caught up above, + // so this must be a case where we've already read an input char. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback"); + chars--; // Not using this char + } + ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) + break; // didn't throw, just stop + } + + if (bigEndian) + { + *(bytes++) = (byte)(ch >> 8); + *(bytes++) = (byte)ch; + } + else + { + *(bytes++) = (byte)ch; + *(bytes++) = (byte)(ch >> 8); + } + } + + // Don't allocate space for left over char + if (charLeftOver > 0) + { + // If we aren't flushing we need to fall this back + if (encoder == null || encoder.MustFlush) + { + if (wasHereBefore) + { + // Throw it, using our complete character + throw new ArgumentException( + SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars)); + } + else + { + // If we have to flush, stick it in fallback and try again + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. 
+ fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + // If we're not flushing, this'll remember the left over character. + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + charLeftOver = (char)0; + wasHereBefore = true; + goto TryAgain; + } + } + } + + // Not flushing, remember it in the encoder + if (encoder != null) + { + encoder.charLeftOver = charLeftOver; + encoder.m_charsUsed = (int)(chars - charStart); + } + + // Remember charLeftOver if we must, or clear it if we're flushing + // (charLeftOver should be 0 if we're flushing) + Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0, + "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing"); + + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || + encoder == null || !encoder.m_throwOnOverflow, + "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting"); + + // We used to copy it fast, but this doesn't check for surrogates + // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount); + + return (int)(bytes - byteStart); + }

        // Returns the number of chars that decoding 'count' bytes at 'bytes' produces.
        //
        // baseDecoder may be null. When it is not, the odd byte (lastByte) and the pending
        // high surrogate (lastChar) it carries are taken into account, and leftovers at the
        // end of the input are only pushed through the fallback when decoder.MustFlush is set.
        // Unpaired surrogates and a dangling odd byte are counted as whatever the decoder
        // fallback's InternalFallback reports for them.
        internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
        {
            Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
            Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");

            UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;

            byte* byteStart = bytes;
            byte* byteEnd = bytes + count;

            // The fast loop consumes 8 bytes per step, so it has to stop while a whole
            // 8-byte block is still available. (May never be used.)
            ulong* longEnd = (ulong*)(byteEnd - 7);

            // Optimistic estimate of one char per byte pair; adjusted as problems are found.
            int charCount = count >> 1;

            // State carried between calls: -1 / 0 mean "nothing pending".
            int lastByte = -1;
            char lastChar = (char)0;

            // Only materialized once something actually has to fall back.
            DecoderFallbackBuffer fallbackBuffer = null;

            if (decoder != null)
            {
                lastByte = decoder.lastByte;
                lastChar = decoder.lastChar;

                // A pending high surrogate is counted up front ...
                if (lastChar > 0)
                    charCount++;

                // ... and a pending byte completes one more char when the input length is odd.
                if (lastByte >= 0 && (count & 1) == 1)
                    charCount++;

                // Counting never starts with data still queued in the fallback buffer
                // (m_throwOnOverflow is irrelevant for counting).
                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
                    "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
            }

            while (bytes < byteEnd)
            {
                // Fast path: native byte order, aligned input, nothing pending.
                // Unaligned input pays for this test on every iteration without ever taking it.
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN
                if (bigEndian &&
#else // BIGENDIAN
                if (!bigEndian &&
#endif // BIGENDIAN
#if BIT64 // win64 has to be long aligned
                    (unchecked((long)bytes) & 7) == 0 &&
#else
                    (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
                    lastByte == -1 && lastChar == 0)
                {
                    // Bit pattern of four code units laid out high/low/high/low in memory,
                    // masked down to the 6 bits that tell high from low surrogates.
#if BIGENDIAN
                    const ulong highLowPattern = 0xd800dc00d800dc00;
#else
                    const ulong highLowPattern = 0xdc00d800dc00d800;
#endif

                    ulong* block = (ulong*)bytes;

                    for (; block < longEnd; block++)
                    {
                        ulong fourChars = *block;

                        // No code unit has 0x8000 set => none can be a surrogate, take all four.
                        if ((0x8000800080008000 & fourChars) == 0)
                            continue;

                        // Keep the top 5 bits of every code unit and xor with 11011:
                        // a 16-bit lane that ends up zero was a surrogate (high or low).
                        ulong lanes = (0xf800f800f800f800 & fourChars) ^ 0xd800d800d800d800;

                        bool hasSurrogate =
                            (lanes & 0xFFFF000000000000) == 0 ||
                            (lanes & 0x0000FFFF00000000) == 0 ||
                            (lanes & 0x00000000FFFF0000) == 0 ||
                            (lanes & 0x000000000000FFFF) == 0;

                        // Surrogates are only acceptable here as two well-formed pairs;
                        // anything else is resolved one char at a time by the slow loop.
                        if (hasSurrogate && ((0xfc00fc00fc00fc00 & fourChars) ^ highLowPattern) != 0)
                            break;
                    }

                    bytes = (byte*)block;

                    if (bytes >= byteEnd)
                        break;
                }
#endif // !NO_FAST_UNICODE_LOOP

                // Pick up the first byte of the pair unless one is already pending.
                if (lastByte < 0)
                {
                    lastByte = *bytes++;
                    if (bytes >= byteEnd) break;
                }

                // Combine with the second byte according to byte order.
                char ch = bigEndian
                    ? (char)(lastByte << 8 | *(bytes++))
                    : (char)(*(bytes++) << 8 | lastByte);
                lastByte = -1;

                if (ch >= 0xd800 && ch <= 0xdfff)
                {
                    if (ch <= 0xdbff)
                    {
                        // High surrogate. If another high surrogate is still pending, that one is bad.
                        if (lastChar > 0)
                        {
                            // Take the pending one out of the count and add its fallback instead.
                            charCount--;

                            // Rebuild its bytes: they may have arrived in an earlier call.
                            byte[] strayBytes = bigEndian
                                ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                                : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                            if (fallbackBuffer == null)
                            {
                                fallbackBuffer = (decoder == null)
                                    ? this.decoderFallback.CreateFallbackBuffer()
                                    : decoder.FallbackBuffer;
                                fallbackBuffer.InternalInitialize(byteStart, null);
                            }

                            charCount += fallbackBuffer.InternalFallback(strayBytes, bytes);
                        }

                        // The current high surrogate now waits for its partner.
                        lastChar = ch;
                        continue;
                    }

                    // Low surrogate.
                    if (lastChar == 0)
                    {
                        // No high surrogate in front of it: uncount it and count its fallback.
                        charCount--;

                        byte[] strayBytes = bigEndian
                            ? new byte[] { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }
                            : new byte[] { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };

                        if (fallbackBuffer == null)
                        {
                            fallbackBuffer = (decoder == null)
                                ? this.decoderFallback.CreateFallbackBuffer()
                                : decoder.FallbackBuffer;
                            fallbackBuffer.InternalInitialize(byteStart, null);
                        }

                        charCount += fallbackBuffer.InternalFallback(strayBytes, bytes);

                        // Nothing more to do for this char.
                        continue;
                    }

                    // Well-formed pair; both halves are already in the count.
                    lastChar = (char)0;
                }
                else if (lastChar > 0)
                {
                    // Ordinary char where a low surrogate was expected:
                    // the pending high surrogate is uncounted and falls back.
                    charCount--;

                    byte[] strayBytes = bigEndian
                        ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                        : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = (decoder == null)
                            ? this.decoderFallback.CreateFallbackBuffer()
                            : decoder.FallbackBuffer;
                        fallbackBuffer.InternalInitialize(byteStart, null);
                    }

                    charCount += fallbackBuffer.InternalFallback(strayBytes, bytes);

                    // Pending surrogate handled; the current char stays counted.
                    lastChar = (char)0;
                }

                // Ordinary char: already part of the initial estimate.
            }

            // Without a decoder, or when flushing, nothing may be left pending.
            if (decoder == null || decoder.MustFlush)
            {
                if (lastChar > 0)
                {
                    // Dangling high surrogate: replace its count by the fallback's count.
                    charCount--;

                    byte[] strayBytes = bigEndian
                        ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                        : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = (decoder == null)
                            ? this.decoderFallback.CreateFallbackBuffer()
                            : decoder.FallbackBuffer;
                        fallbackBuffer.InternalInitialize(byteStart, null);
                    }

                    charCount += fallbackBuffer.InternalFallback(strayBytes, bytes);

                    lastChar = (char)0;
                }

                if (lastByte >= 0)
                {
                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = (decoder == null)
                            ? this.decoderFallback.CreateFallbackBuffer()
                            : decoder.FallbackBuffer;
                        fallbackBuffer.InternalInitialize(byteStart, null);
                    }

                    // Dangling odd byte: count what the fallback produces for it.
                    charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
                    lastByte = -1;
                }
            }

            // A high surrogate that stays pending (not flushing) is not part of this call's output.
            if (lastChar > 0)
                charCount--;

            // Counting must leave the fallback buffer drained
            // (m_throwOnOverflow is irrelevant for counting).
            Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
                "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");

            return charCount;
        }

        // Decodes up to byteCount bytes from 'bytes' into at most charCount chars at 'chars'
        // and returns the number of chars written.
        //
        // baseDecoder may be null. When it is not, decoding resumes from its lastByte/lastChar,
        // and at the end the number of bytes consumed plus any still-pending byte / high
        // surrogate are stored back into it. When output space runs out, the bytes of the
        // char that did not fit are un-consumed and ThrowCharsOverflow is called with a flag
        // saying whether nothing has been written yet; if that call returns, decoding stops.
        internal override unsafe int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, DecoderNLS baseDecoder)
        {
            Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
            Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
            Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
            Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");

            UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;

            byte* byteStart = bytes;
            byte* byteEnd = bytes + byteCount;
            char* charStart = chars;
            char* charEnd = chars + charCount;

            // State carried between calls: -1 / 0 mean "nothing pending".
            int lastByte = -1;
            char lastChar = (char)0;

            // Pick up the decoder's state (it is only written back at the very end).
            if (decoder != null)
            {
                lastByte = decoder.lastByte;
                lastChar = decoder.lastChar;

                // Decoding never starts with data still queued in the fallback buffer
                // (m_throwOnOverflow is irrelevant here).
                Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
                    "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
            }

            // Only materialized once something actually has to fall back.
            DecoderFallbackBuffer fallbackBuffer = null;

            // Scratch copy of 'chars' for the ref parameter of InternalFallback,
            // so that 'chars' itself is never passed by reference and can stay enregistered.
            char* charsForFallback;

            while (bytes < byteEnd)
            {
                // Fast path: native byte order, both buffers aligned, nothing pending.
                // Unaligned buffers pay for this test on every iteration without ever taking it.
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN
                if (bigEndian &&
#else // BIGENDIAN
                if (!bigEndian &&
#endif // BIGENDIAN
#if BIT64 // win64 has to be long aligned
                    (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
#else
                    (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
                    lastByte == -1 && lastChar == 0)
                {
                    // Bit pattern of four code units laid out high/low/high/low in memory,
                    // masked down to the 6 bits that tell high from low surrogates.
#if BIGENDIAN
                    const ulong highLowPattern = 0xd800dc00d800dc00;
#else
                    const ulong highLowPattern = 0xdc00d800dc00d800;
#endif

                    // The block copy is bounded by whichever buffer runs out first,
                    // and must stop while a whole 8-byte block is still available.
                    long bytesLeft = byteEnd - bytes;
                    long charsLeft = charEnd - chars;
                    long usableBytes = (bytesLeft >> 1 < charsLeft) ? bytesLeft : charsLeft << 1;
                    ulong* blockEnd = (ulong*)(bytes - 7 + usableBytes);

                    ulong* source = (ulong*)bytes;
                    ulong* target = (ulong*)chars;

                    while (source < blockEnd)
                    {
                        ulong fourChars = *source;

                        // Only look closer when some code unit has 0x8000 set.
                        if ((0x8000800080008000 & fourChars) != 0)
                        {
                            // Keep the top 5 bits of every code unit and xor with 11011:
                            // a 16-bit lane that ends up zero was a surrogate (high or low).
                            ulong lanes = (0xf800f800f800f800 & fourChars) ^ 0xd800d800d800d800;

                            bool hasSurrogate =
                                (lanes & 0xFFFF000000000000) == 0 ||
                                (lanes & 0x0000FFFF00000000) == 0 ||
                                (lanes & 0x00000000FFFF0000) == 0 ||
                                (lanes & 0x000000000000FFFF) == 0;

                            // Surrogates are only copied here as two well-formed pairs;
                            // anything else is resolved one char at a time by the slow loop.
                            if (hasSurrogate && ((0xfc00fc00fc00fc00 & fourChars) ^ highLowPattern) != 0)
                                break;
                        }

                        // All four code units are fine: copy them as one block.
                        *target = fourChars;
                        source++;
                        target++;
                    }

                    chars = (char*)target;
                    bytes = (byte*)source;

                    if (bytes >= byteEnd)
                        break;
                }
#endif // !NO_FAST_UNICODE_LOOP

                // Pick up the first byte of the pair unless one is already pending.
                if (lastByte < 0)
                {
                    lastByte = *bytes++;
                    continue;
                }

                // Combine with the second byte according to byte order.
                char ch = bigEndian
                    ? (char)(lastByte << 8 | *(bytes++))
                    : (char)(*(bytes++) << 8 | lastByte);
                lastByte = -1;

                if (ch >= 0xd800 && ch <= 0xdfff)
                {
                    if (ch <= 0xdbff)
                    {
                        // High surrogate. If another high surrogate is still pending, that one falls back.
                        if (lastChar > 0)
                        {
                            // Rebuild its bytes: they may have arrived in an earlier call.
                            byte[] strayBytes = bigEndian
                                ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                                : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                            if (fallbackBuffer == null)
                            {
                                fallbackBuffer = (decoder == null)
                                    ? this.decoderFallback.CreateFallbackBuffer()
                                    : decoder.FallbackBuffer;
                                fallbackBuffer.InternalInitialize(byteStart, charEnd);
                            }

                            charsForFallback = chars;
                            bool fallbackResult = fallbackBuffer.InternalFallback(strayBytes, bytes, ref charsForFallback);
                            chars = charsForFallback;

                            if (!fallbackResult)
                            {
                                // The fallback output did not fit.
                                // Either bytes were consumed, or nothing was written and the call below throws.
                                Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                                    "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
                                bytes -= 2;                                         // give back the current char's bytes
                                fallbackBuffer.InternalReset();
                                ThrowCharsOverflow(decoder, chars == charStart);    // may throw
                                break;                                              // did not throw: stop here
                            }
                        }

                        // The previous high surrogate (if any) has been dealt with;
                        // the current one now waits for its partner.
                        lastChar = ch;
                        continue;
                    }

                    // Low surrogate.
                    if (lastChar == 0)
                    {
                        // No high surrogate in front of it: it falls back.
                        byte[] strayBytes = bigEndian
                            ? new byte[] { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }
                            : new byte[] { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };

                        if (fallbackBuffer == null)
                        {
                            fallbackBuffer = (decoder == null)
                                ? this.decoderFallback.CreateFallbackBuffer()
                                : decoder.FallbackBuffer;
                            fallbackBuffer.InternalInitialize(byteStart, charEnd);
                        }

                        charsForFallback = chars;
                        bool fallbackResult = fallbackBuffer.InternalFallback(strayBytes, bytes, ref charsForFallback);
                        chars = charsForFallback;

                        if (!fallbackResult)
                        {
                            // The fallback output did not fit.
                            // Either bytes were consumed, or nothing was written and the call below throws.
                            Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                                "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
                            bytes -= 2;                                         // give back the current char's bytes
                            fallbackBuffer.InternalReset();
                            ThrowCharsOverflow(decoder, chars == charStart);    // may throw
                            break;                                              // did not throw: stop here
                        }

                        // Fallback emitted; nothing more to do for this char.
                        continue;
                    }

                    // Well-formed pair: both halves must fit.
                    if (chars >= charEnd - 1)
                    {
                        // Either bytes were consumed, or nothing was written and the call below throws.
                        Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                            "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
                        bytes -= 2;                                         // give back the low surrogate's bytes
                        ThrowCharsOverflow(decoder, chars == charStart);    // may throw
                        // lastChar is deliberately kept pending for the next call.
                        break;                                              // did not throw: stop here
                    }

                    // Emit the high half now; the low half is emitted below like any other char.
                    *chars++ = lastChar;
                    lastChar = (char)0;
                }
                else if (lastChar > 0)
                {
                    // Ordinary char where a low surrogate was expected: the pending high surrogate falls back.
                    byte[] strayBytes = bigEndian
                        ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                        : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = (decoder == null)
                            ? this.decoderFallback.CreateFallbackBuffer()
                            : decoder.FallbackBuffer;
                        fallbackBuffer.InternalInitialize(byteStart, charEnd);
                    }

                    charsForFallback = chars;
                    bool fallbackResult = fallbackBuffer.InternalFallback(strayBytes, bytes, ref charsForFallback);
                    chars = charsForFallback;

                    if (!fallbackResult)
                    {
                        // The fallback output did not fit.
                        // Either bytes were consumed, or nothing was written and the call below throws.
                        Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                            "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
                        bytes -= 2;                                         // give back the current char's bytes
                        fallbackBuffer.InternalReset();
                        ThrowCharsOverflow(decoder, chars == charStart);    // may throw
                        break;                                              // did not throw: stop here
                    }

                    // Pending surrogate handled; go on and emit the current char.
                    lastChar = (char)0;
                }

                // Emit the current char if there is room for it.
                if (chars >= charEnd)
                {
                    // Either bytes were consumed, or nothing was written and the call below throws.
                    Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                        "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
                    bytes -= 2;                                         // give back the current char's bytes
                    ThrowCharsOverflow(decoder, chars == charStart);    // may throw
                    break;                                              // did not throw: stop here
                }

                *chars++ = ch;
            }

            // Without a decoder, or when flushing, nothing may be left pending.
            if (decoder == null || decoder.MustFlush)
            {
                if (lastChar > 0)
                {
                    // Dangling high surrogate: it falls back.
                    byte[] strayBytes = bigEndian
                        ? new byte[] { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
                        : new byte[] { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };

                    if (fallbackBuffer == null)
                    {
                        fallbackBuffer = (decoder == null)
                            ? this.decoderFallback.CreateFallbackBuffer()
                            : decoder.FallbackBuffer;
                        fallbackBuffer.InternalInitialize(byteStart, charEnd);
                    }

                    charsForFallback = chars;
                    bool fallbackResult = fallbackBuffer.InternalFallback(strayBytes, bytes, ref charsForFallback);
                    chars = charsForFallback;

                    if (!fallbackResult)
                    {
                        // The fallback output did not fit.
                        // Either bytes were consumed, or nothing was written and the call below throws.
                        Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                            "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");

                        // Step back over the surrogate (and a pending odd byte) around the overflow call ...
                        bytes -= 2;
                        if (lastByte >= 0)
                            bytes--;
                        fallbackBuffer.InternalReset();
                        ThrowCharsOverflow(decoder, chars == charStart);    // may throw

                        // ... and forward again: lastChar / lastByte stay set and are stored in the decoder at End.
                        bytes += 2;
                        if (lastByte >= 0)
                            bytes++;
                        goto End;
                    }

                    // Surrogate fully handled.
                    lastChar = (char)0;
                }

                if (lastByte >= 0)
                {
                    if (fallbackBuffer == null)
                    {
                        if (decoder == null)
                            fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = decoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
+ fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + // No hanging odd bytes allowed if must flush + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // odd byte couldn't fall back + bytes--; // didn't use this byte + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + // didn't throw, but we'll remember it in the decoder + bytes++; + goto End; + } + + // Didn't fail, clear buffer + lastByte = -1; + } + } + + End: + + // Remember our decoder if we must + if (decoder != null) + { + Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)), + "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing" + // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2") + ); + + decoder.m_bytesUsed = (int)(bytes - byteStart); + decoder.lastChar = lastChar; + decoder.lastByte = lastByte; + } + + // Used to do this the old way + // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount); + + // Shouldn't have anything in fallback buffer for GetChars + // (don't have to check m_throwOnOverflow for count or chars) + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end"); + + return (int)(chars - charStart); + }


        // Returns a new EncoderNLS bound to this encoding.
        public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);


        // Returns a new UnicodeEncoding.Decoder bound to this encoding.
        public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);


        // Returns the byte order mark for this instance: FE FF for big endian, FF FE otherwise,
        // or an empty array when this instance was created without a byte order mark.
        public override byte[] GetPreamble()
        {
            if (!byteOrderMark)
                return Array.Empty<Byte>();

            // Deliberately a fresh array on every call: handing out a cached one
            // would let a caller modify what everybody else receives.
            return bigEndian
                ? new byte[2] { 0xfe, 0xff }
                : new byte[2] { 0xff, 0xfe };
        }


        // Upper bound for the number of bytes produced when encoding charCount chars.
        // Throws ArgumentOutOfRangeException for a negative charCount or when the bound
        // does not fit into an int.
        public override int GetMaxByteCount(int charCount)
        {
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // One extra char for a high surrogate that may be left over from an earlier call.
            // Computed as long so that int.MaxValue cannot wrap.
            long worstCase = 1L + charCount;

            // Every char might be replaced by the longest fallback sequence.
            if (EncoderFallback.MaxCharCount > 1)
                worstCase *= EncoderFallback.MaxCharCount;

            // Two bytes per UTF-16 code unit.
            worstCase <<= 1;

            if (worstCase > 0x7fffffff)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount),
                    SR.ArgumentOutOfRange_GetByteCountOverflow);
            }

            return (int)worstCase;
        }


        // Upper bound for the number of chars produced when decoding byteCount bytes.
        // Throws ArgumentOutOfRangeException for a negative byteCount or when the bound
        // does not fit into an int.
        public override int GetMaxCharCount(int byteCount)
        {
            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount),
                    SR.ArgumentOutOfRange_NeedNonNegNum);
            }
            Contract.EndContractBlock();

            // Computed as long because byteCount may be int.MaxValue.
            long worstCase = (long)(byteCount >> 1);    // one char per byte pair
            worstCase += byteCount & 1;                 // odd byte may pair up with one left in the decoder
            worstCase += 1;                             // high surrogate possibly left in the decoder

            // Every char might be replaced by the longest fallback sequence
            // (e.g. input consisting of nothing but lonely surrogates).
            if (DecoderFallback.MaxCharCount > 1)
                worstCase *= DecoderFallback.MaxCharCount;

            if (worstCase > 0x7fffffff)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount),
                    SR.ArgumentOutOfRange_GetCharCountOverflow);
            }

            return (int)worstCase;
        }


        // Two UnicodeEncoding instances are equal when code page, byte order mark setting,
        // endianness and both fallbacks agree. Anything that is not a UnicodeEncoding is unequal.
        public override bool Equals(Object value)
        {
            UnicodeEncoding that = value as UnicodeEncoding;
            if (that == null)
                return false;

            // The code page is compared as well: big endian (1201) and little endian (1200)
            // use different code pages.
            // (isThrowException is not compared separately; it is covered by the
            // encoder/decoder fallbacks being exception fallbacks.)
            return CodePage == that.CodePage
                && byteOrderMark == that.byteOrderMark
                && bigEndian == that.bigEndian
                && EncoderFallback.Equals(that.EncoderFallback)
                && DecoderFallback.Equals(that.DecoderFallback);
        }

        // Combines the same members that Equals compares.
        public override int GetHashCode()
        {
            int hash = CodePage;
            hash += this.EncoderFallback.GetHashCode();
            hash += this.DecoderFallback.GetHashCode();
            hash += byteOrderMark ? 4 : 0;
            hash += bigEndian ? 8 : 0;
            return hash;
        }

+ [Serializable] + private sealed class Decoder : System.Text.DecoderNLS, ISerializable + { + internal int lastByte = -1; + internal char lastChar = '\0'; + + public Decoder(UnicodeEncoding encoding) : base(encoding) + { + // base calls reset + } + + // Constructor called by serialization, have to handle deserializing from Everett + internal Decoder(SerializationInfo info, StreamingContext context) + { + // Any info? + if (info == null) throw new ArgumentNullException(nameof(info)); + Contract.EndContractBlock(); + + // Get Common Info + this.lastByte = (int)info.GetValue("lastByte", typeof(int)); + + try + { + // Try the encoding, which is only serialized in Whidbey + this.m_encoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); + this.lastChar = (char)info.GetValue("lastChar", typeof(char)); + this.m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback)); + } + catch (SerializationException) + { + // Everett didn't serialize the UnicodeEncoding, get the default one + bool bigEndian = (bool)info.GetValue("bigEndian", typeof(bool)); + this.m_encoding = new UnicodeEncoding(bigEndian, false); + } + } + + // ISerializable implementation, get data for this object + void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) + { + // Any info?
+ if (info == null) throw new ArgumentNullException(nameof(info)); + Contract.EndContractBlock(); + + // Save Whidbey data + info.AddValue("m_encoding", this.m_encoding); + info.AddValue("m_fallback", this.m_fallback); + info.AddValue("lastChar", this.lastChar); // Unused by everett so it'll probably get lost + info.AddValue("lastByte", this.lastByte); + + // Everett Only + info.AddValue("bigEndian", ((UnicodeEncoding)(this.m_encoding)).bigEndian); + } + + public override void Reset() + { + lastByte = -1; + lastChar = '\0'; + if (m_fallbackBuffer != null) + m_fallbackBuffer.Reset(); + } + + // Anything left in our decoder? + internal override bool HasState + { + get + { + return (this.lastByte != -1 || this.lastChar != '\0'); + } + } + } + } +} + |