diff options
Diffstat (limited to 'src/mscorlib/shared/System/Text/UnicodeEncoding.cs')
-rw-r--r-- | src/mscorlib/shared/System/Text/UnicodeEncoding.cs | 2058 |
1 files changed, 2058 insertions, 0 deletions
diff --git a/src/mscorlib/shared/System/Text/UnicodeEncoding.cs b/src/mscorlib/shared/System/Text/UnicodeEncoding.cs new file mode 100644 index 0000000000..0e4db9aaad --- /dev/null +++ b/src/mscorlib/shared/System/Text/UnicodeEncoding.cs @@ -0,0 +1,2058 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +// +// Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused. +// + +using System; +using System.Globalization; +using System.Runtime.Serialization; +using System.Diagnostics; +using System.Diagnostics.Contracts; + +namespace System.Text +{ + [Serializable] + public class UnicodeEncoding : Encoding + { + // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization + // The initialization code will not be run until a static member of the class is referenced + internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true); + internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true); + + [OptionalField(VersionAdded = 2)] + internal bool isThrowException = false; + + internal bool bigEndian = false; + internal bool byteOrderMark = true; + + // Unicode version 2.0 character size in bytes + public const int CharSize = 2; + + + public UnicodeEncoding() + : this(false, true) + { + } + + + public UnicodeEncoding(bool bigEndian, bool byteOrderMark) + : this(bigEndian, byteOrderMark, false) + { + } + + + public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes) + : base(bigEndian ? 1201 : 1200) //Set the data item. + { + this.isThrowException = throwOnInvalidBytes; + this.bigEndian = bigEndian; + this.byteOrderMark = byteOrderMark; + + // Encoding constructor already did this, but it'll be wrong if we're throwing exceptions + if (this.isThrowException) + SetDefaultFallbacks(); + } + + #region Serialization + [OnDeserializing] + private void OnDeserializing(StreamingContext ctx) + { + // In Everett it is false. Whidbey will overwrite this value. + isThrowException = false; + } + #endregion Serialization + + internal override void SetDefaultFallbacks() + { + // For UTF-X encodings, we use a replacement fallback with an empty string + if (this.isThrowException) + { + this.encoderFallback = EncoderFallback.ExceptionFallback; + this.decoderFallback = DecoderFallback.ExceptionFallback; + } + else + { + this.encoderFallback = new EncoderReplacementFallback("\xFFFD"); + this.decoderFallback = new DecoderReplacementFallback("\xFFFD"); + } + } + + // The following methods are copied from EncodingNLS.cs. + // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimpliment them here. + // These should be kept in sync for the following classes: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // + + // Returns the number of bytes required to encode a range of characters in + // a character array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(char[] chars, int index, int count) + { + // Validate input parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - index < count) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input, return 0, avoid fixed empty array problem + if (count == 0) + return 0; + + // Just call the pointer version + fixed (char* pChars = chars) + return GetByteCount(pChars + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetByteCount(String s) + { + // Validate input + if (s==null) + throw new ArgumentNullException("s"); + Contract.EndContractBlock(); + + fixed (char* pChars = s) + return GetByteCount(pChars, s.Length, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetByteCount(char* chars, int count) + { + // Validate Parameters + if (chars == null) + throw new ArgumentNullException("chars", SR.ArgumentNull_Array); + + if (count < 0) + throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Call it with empty encoder + return GetByteCount(chars, count, null); + } + + // Parent method is safe. + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + public override unsafe int GetBytes(String s, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + if (s == null || bytes == null) + throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (s.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + int byteCount = bytes.Length - byteIndex; + + // Fixed doesn't like 0 length arrays. + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0]) + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // Encodes a range of characters in a character array into a range of bytes + // in a byte array. An exception occurs if the byte array is not large + // enough to hold the complete encoding of the characters. The + // GetByteCount method can be used to determine the exact number of + // bytes that will be produced for a given range of characters. + // Alternatively, the GetMaxByteCount method can be used to + // determine the maximum number of bytes that will be produced for a given + // number of characters, regardless of the actual character values. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetBytes(char[] chars, int charIndex, int charCount, + byte[] bytes, int byteIndex) + { + // Validate parameters + if (chars == null || bytes == null) + throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array); + + if (charIndex < 0 || charCount < 0) + throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (chars.Length - charIndex < charCount) + throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer); + + if (byteIndex < 0 || byteIndex > bytes.Length) + throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + // If nothing to encode return 0, avoid fixed problem + if (charCount == 0) + return 0; + + // Just call pointer version + int byteCount = bytes.Length - byteIndex; + + // Fixed doesn't like 0 length arrays. + if (bytes.Length == 0) + bytes = new byte[1]; + + fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0]) + // Remember that byteCount is # to decode, not size of array. + return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetBytes(chars, charCount, bytes, byteCount, null); + } + + // Returns the number of characters produced by decoding a range of bytes + // in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetCharCount(byte[] bytes, int index, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (bytes.Length - index < count) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // If no input just return 0, fixed doesn't like 0 length arrays + if (count == 0) + return 0; + + // Just call pointer version + fixed (byte* pBytes = bytes) + return GetCharCount(pBytes + index, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public override unsafe int GetCharCount(byte* bytes, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (count < 0) + throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetCharCount(bytes, count, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount, + char[] chars, int charIndex) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (byteIndex < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if ( bytes.Length - byteIndex < byteCount) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + + if (charIndex < 0 || charIndex > chars.Length) + throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index); + Contract.EndContractBlock(); + + // If no input, return 0 & avoid fixed problem + if (byteCount == 0) + return 0; + + // Just call pointer version + int charCount = chars.Length - charIndex; + + // Fixed doesn't like 0 length arrays. + if (chars.Length == 0) + chars = new char[1]; + + fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0]) + // Remember that charCount is # to decode, not size of array + return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null); + } + + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + + [CLSCompliant(false)] + public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) + { + // Validate Parameters + if (bytes == null || chars == null) + throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array); + + if (charCount < 0 || byteCount < 0) + throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + return GetChars(bytes, byteCount, chars, charCount, null); + } + + // Returns a string containing the decoded representation of a range of + // bytes in a byte array. + // + // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS) + // So if you fix this, fix the others. Currently those include: + // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding + // parent method is safe + + public override unsafe string GetString(byte[] bytes, int index, int count) + { + // Validate Parameters + if (bytes == null) + throw new ArgumentNullException("bytes", SR.ArgumentNull_Array); + + if (index < 0 || count < 0) + throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum); + + if (bytes.Length - index < count) + throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer); + Contract.EndContractBlock(); + + // Avoid problems with empty input buffer + if (count == 0) return String.Empty; + + fixed (byte* pBytes = bytes) + return String.CreateStringFromEncoding( + pBytes + index, count, this); + } + + // + // End of standard methods copied from EncodingNLS.cs + // + + internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder) + { + Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null"); + Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0"); + + // Start by assuming each char gets 2 bytes + int byteCount = count << 1; + + // Check for overflow in byteCount + // (If they were all invalid chars, this would actually be wrong, + // but that's a ridiculously large # so we're not concerned about that case) + if (byteCount < 0) + throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow); + + char* charStart = chars; + char* charEnd = chars + count; + char charLeftOver = (char)0; + + bool wasHereBefore = false; + + // Need -1 to check 2 at a time. If we have an even #, longChars will go + // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars + // will go from longEnd - 1 long to longEnd. (Might not get to use this) + ulong* longEnd = (ulong*)(charEnd - 3); + + // For fallback we may need a fallback buffer + EncoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + + // Assume extra bytes to encode charLeftOver if it existed + if (charLeftOver > 0) + byteCount += 2; + + // We mustn't have left over fallback data when counting + if (encoder.InternalHasFallbackBuffer) + { + fallbackBuffer = encoder.FallbackBuffer; + if (fallbackBuffer.Remaining > 0) + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + } + + char ch; + TryAgain: + + while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, maybe we can do it fast +#if !NO_FAST_UNICODE_LOOP +#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. + if ( bigEndian && +#else + if (!bigEndian && +#endif // BIGENDIAN + +#if BIT64 // 64 bit CPU needs to be long aligned for this to work. + charLeftOver == 0 && (unchecked((long)chars) & 7) == 0) +#else + charLeftOver == 0 && (unchecked((int)chars) & 3) == 0) +#endif + { + // Need new char* so we can check 4 at a time + ulong* longChars = (ulong*)chars; + + while (longChars < longEnd) + { + // See if we potentially have surrogates (0x8000 bit set) + // (We're either big endian on a big endian machine or little endian on + // a little endian machine so this'll work) + if ((0x8000800080008000 & *longChars) != 0) + { + // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high + // 5 bits looks like 11011, then its a high or low surrogate. + // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. + // Note that we expect BMP characters to be more common than surrogates + // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates + ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800; + + // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate + // but no clue if they're high or low. + // If each of the 4 characters are non-zero, then none are surrogates. + if ((uTemp & 0xFFFF000000000000) == 0 || + (uTemp & 0x0000FFFF00000000) == 0 || + (uTemp & 0x00000000FFFF0000) == 0 || + (uTemp & 0x000000000000FFFF) == 0) + { + // It has at least 1 surrogate, but we don't know if they're high or low surrogates, + // or if there's 1 or 4 surrogates + + // If they happen to be high/low/high/low, we may as well continue. Check the next + // bit to see if its set (low) or not (high) in the right pattern +#if BIGENDIAN + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) +#else + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) +#endif + { + // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high + // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. + + // Drop out to the slow loop to resolve the surrogates + break; + } + // else they are all surrogates in High/Low/High/Low order, so we can use them. + } + // else none are surrogates, so we can use them. + } + // else all < 0x8000 so we can use them + + // We already counted these four chars, go to next long. + longChars++; + } + + chars = (char*)longChars; + + if (chars >= charEnd) + break; + } +#endif // !NO_FAST_UNICODE_LOOP + + // No fallback, just get next char + ch = *chars; + chars++; + } + else + { + // We weren't preallocating fallback space. + byteCount += 2; + } + + // Check for high or low surrogates + if (ch >= 0xd800 && ch <= 0xdfff) + { + // Was it a high surrogate? + if (ch <= 0xdbff) + { + // Its a high surrogate, if we already had a high surrogate do its fallback + if (charLeftOver > 0) + { + // Unwind the current character, this should be safe because we + // don't have leftover data in the fallback, so chars must have + // advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate"); + chars--; + + // If previous high surrogate deallocate 2 bytes + byteCount -= 2; + + // Fallback the previous surrogate + // Need to initialize fallback buffer? + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + // Now no high surrogate left over + charLeftOver = (char)0; + continue; + } + + // Remember this high surrogate + charLeftOver = ch; + continue; + } + + + // Its a low surrogate + if (charLeftOver == 0) + { + // Expected a previous high surrogate. + // Don't count this one (we'll count its fallback if necessary) + byteCount -= 2; + + // fallback this one + // Need to initialize fallback buffer? + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + continue; + } + + // Valid surrogate pair, add our charLeftOver + charLeftOver = (char)0; + continue; + } + else if (charLeftOver > 0) + { + // Expected a low surrogate, but this char is normal + + // Rewind the current character, fallback previous character. + // this should be safe because we don't have leftover data in the + // fallback, so chars must have advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate"); + chars--; + + // fallback previous chars + // Need to initialize fallback buffer? + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + // Ignore charLeftOver or throw + byteCount -= 2; + charLeftOver = (char)0; + + continue; + } + + // Ok we had something to add (already counted) + } + + // Don't allocate space for left over char + if (charLeftOver > 0) + { + byteCount -= 2; + + // If we have to flush, stick it in fallback and try again + if (encoder == null || encoder.MustFlush) + { + if (wasHereBefore) + { + // Throw it, using our complete character + throw new ArgumentException( + SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars)); + } + else + { + // Need to initialize fallback buffer? + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + charLeftOver = (char)0; + wasHereBefore = true; + goto TryAgain; + } + } + } + + // Shouldn't have anything in fallback buffer for GetByteCount + // (don't have to check m_throwOnOverflow for count) + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end"); + + // Don't remember fallbackBuffer.encoder for counting + return byteCount; + } + + internal override unsafe int GetBytes(char* chars, int charCount, + byte* bytes, int byteCount, EncoderNLS encoder) + { + Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null"); + Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0"); + Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0"); + Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null"); + + char charLeftOver = (char)0; + char ch; + bool wasHereBefore = false; + + + byte* byteEnd = bytes + byteCount; + char* charEnd = chars + charCount; + byte* byteStart = bytes; + char* charStart = chars; + + // For fallback we may need a fallback buffer + EncoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + // Get our encoder, but don't clear it yet. + if (encoder != null) + { + charLeftOver = encoder.charLeftOver; + + // We mustn't have left over fallback data when counting + if (encoder.InternalHasFallbackBuffer) + { + // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary + fallbackBuffer = encoder.FallbackBuffer; + if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow) + throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); + } + } + + TryAgain: + while (((ch = (fallbackBuffer == null) ? + (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || + chars < charEnd) + { + // First unwind any fallback + if (ch == 0) + { + // No fallback, maybe we can do it fast +#if !NO_FAST_UNICODE_LOOP +#if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. + if ( bigEndian && +#else + if (!bigEndian && +#endif // BIGENDIAN +#if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned + (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && +#else + (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 && +#endif // BIT64 + charLeftOver == 0) + { + // Need -1 to check 2 at a time. If we have an even #, longChars will go + // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars + // will go from longEnd - 1 long to longEnd. (Might not get to use this) + // We can only go iCount units (limited by shorter of char or byte buffers. + ulong* longEnd = (ulong*)(chars - 3 + + (((byteEnd - bytes) >> 1 < charEnd - chars) ? + (byteEnd - bytes) >> 1 : charEnd - chars)); + + // Need new char* so we can check 4 at a time + ulong* longChars = (ulong*)chars; + ulong* longBytes = (ulong*)bytes; + + while (longChars < longEnd) + { + // See if we potentially have surrogates (0x8000 bit set) + // (We're either big endian on a big endian machine or little endian on + // a little endian machine so this'll work) + if ((0x8000800080008000 & *longChars) != 0) + { + // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high + // 5 bits looks like 11011, then its a high or low surrogate. + // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. + // Note that we expect BMP characters to be more common than surrogates + // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates + ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800; + + // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate + // but no clue if they're high or low. + // If each of the 4 characters are non-zero, then none are surrogates. + if ((uTemp & 0xFFFF000000000000) == 0 || + (uTemp & 0x0000FFFF00000000) == 0 || + (uTemp & 0x00000000FFFF0000) == 0 || + (uTemp & 0x000000000000FFFF) == 0) + { + // It has at least 1 surrogate, but we don't know if they're high or low surrogates, + // or if there's 1 or 4 surrogates + + // If they happen to be high/low/high/low, we may as well continue. Check the next + // bit to see if its set (low) or not (high) in the right pattern +#if BIGENDIAN + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) +#else + if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) +#endif + { + // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high + // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. + + // Drop out to the slow loop to resolve the surrogates + break; + } + // else they are all surrogates in High/Low/High/Low order, so we can use them. + } + // else none are surrogates, so we can use them. + } + // else all < 0x8000 so we can use them + + // We can use these 4 chars. + *longBytes = *longChars; + longChars++; + longBytes++; + } + + chars = (char*)longChars; + bytes = (byte*)longBytes; + + if (chars >= charEnd) + break; + } + // Not aligned, but maybe we can still be somewhat faster + // Also somehow this optimizes the above loop? It seems to cause something above + // to get enregistered, but I haven't figured out how to make that happen without this loop. + else if ((charLeftOver == 0) && +#if BIGENDIAN + bigEndian && +#else + !bigEndian && +#endif // BIGENDIAN + +#if BIT64 + (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time +#else + (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) && // Only do this if chars & bytes are out of line, otherwise faster loop'll be faster next time +#endif // BIT64 + (unchecked((int)(bytes)) & 1) == 0) + { + // # to use + long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ? + (byteEnd - bytes) >> 1 : charEnd - chars; + + // Need new char* + char* charOut = ((char*)bytes); // a char* for our output + char* tempEnd = chars + iCount - 1; // Our end pointer + + while (chars < tempEnd) + { + if (*chars >= (char)0xd800 && *chars <= (char)0xdfff) + { + // break for fallback for low surrogate + if (*chars >= 0xdc00) + break; + + // break if next one's not a low surrogate (will do fallback) + if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff) + break; + + // They both exist, use them + } + // If 2nd char is surrogate & this one isn't then only add one + else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff) + { + *charOut = *chars; + charOut++; + chars++; + continue; + } + + *charOut = *chars; + *(charOut + 1) = *(chars + 1); + charOut += 2; + chars += 2; + } + + bytes = (byte*)charOut; + + if (chars >= charEnd) + break; + } +#endif // !NO_FAST_UNICODE_LOOP + + // No fallback, just get next char + ch = *chars; + chars++; + } + + // Check for high or low surrogates + if (ch >= 0xd800 && ch <= 0xdfff) + { + // Was it a high surrogate? + if (ch <= 0xdbff) + { + // Its a high surrogate, see if we already had a high surrogate + if (charLeftOver > 0) + { + // Unwind the current character, this should be safe because we + // don't have leftover data in the fallback, so chars must have + // advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate"); + chars--; + + // Fallback the previous surrogate + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + charLeftOver = (char)0; + continue; + } + + // Remember this high surrogate + charLeftOver = ch; + continue; + } + + // Its a low surrogate + if (charLeftOver == 0) + { + // We'll fall back this one + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(ch, ref charsForFallback); + chars = charsForFallback; + continue; + } + + // Valid surrogate pair, add our charLeftOver + if (bytes + 3 >= byteEnd) + { + // Not enough room to add this surrogate pair + if (fallbackBuffer != null && fallbackBuffer.bFallingBack) + { + // These must have both been from the fallbacks. + // Both of these MUST have been from a fallback because if the 1st wasn't + // from a fallback, then a high surrogate followed by an illegal char + // would've caused the high surrogate to fall back. If a high surrogate + // fell back, then it was consumed and both chars came from the fallback. + fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate + fallbackBuffer.MovePrevious(); + } + else + { + // If we don't have enough room, then either we should've advanced a while + // or we should have bytes==byteStart and throw below + Debug.Assert(chars > charStart + 1 || bytes == byteStart, + "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair"); + chars -= 2; // Didn't use either surrogate + } + ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) + charLeftOver = (char)0; // we'll retry it later + break; // Didn't throw, but stop 'til next time. + } + + if (bigEndian) + { + *(bytes++) = (byte)(charLeftOver >> 8); + *(bytes++) = (byte)charLeftOver; + } + else + { + *(bytes++) = (byte)charLeftOver; + *(bytes++) = (byte)(charLeftOver >> 8); + } + + charLeftOver = (char)0; + } + else if (charLeftOver > 0) + { + // Expected a low surrogate, but this char is normal + + // Rewind the current character, fallback previous character. + // this should be safe because we don't have leftover data in the + // fallback, so chars must have advanced already. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate"); + chars--; + + // fallback previous chars + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + // Ignore charLeftOver or throw + charLeftOver = (char)0; + continue; + } + + // Ok, we have a char to add + if (bytes + 1 >= byteEnd) + { + // Couldn't add this char + if (fallbackBuffer != null && fallbackBuffer.bFallingBack) + fallbackBuffer.MovePrevious(); // Not using this fallback char + else + { + // Lonely charLeftOver (from previous call) would've been caught up above, + // so this must be a case where we've already read an input char. + Debug.Assert(chars > charStart, + "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback"); + chars--; // Not using this char + } + ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) + break; // didn't throw, just stop + } + + if (bigEndian) + { + *(bytes++) = (byte)(ch >> 8); + *(bytes++) = (byte)ch; + } + else + { + *(bytes++) = (byte)ch; + *(bytes++) = (byte)(ch >> 8); + } + } + + // Don't allocate space for left over char + if (charLeftOver > 0) + { + // If we aren't flushing we need to fall this back + if (encoder == null || encoder.MustFlush) + { + if (wasHereBefore) + { + // Throw it, using our complete character + throw new ArgumentException( + SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars)); + } + else + { + // If we have to flush, stick it in fallback and try again + // Might need to create our fallback buffer + if (fallbackBuffer == null) + { + if (encoder == null) + fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = encoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); + } + + // If we're not flushing, this'll remember the left over character. + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); + chars = charsForFallback; + + charLeftOver = (char)0; + wasHereBefore = true; + goto TryAgain; + } + } + } + + // Not flushing, remember it in the encoder + if (encoder != null) + { + encoder.charLeftOver = charLeftOver; + encoder.m_charsUsed = (int)(chars - charStart); + } + + // Remember charLeftOver if we must, or clear it if we're flushing + // (charLeftOver should be 0 if we're flushing) + Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0, + "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing"); + + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 || + encoder == null || !encoder.m_throwOnOverflow, + "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting"); + + // We used to copy it fast, but this doesn't check for surrogates + // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount); + + return (int)(bytes - byteStart); + } + + internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) + { + Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null"); + Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0"); + + UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder; + + byte* byteEnd = bytes + count; + byte* byteStart = bytes; + + // Need last vars + int lastByte = -1; + char lastChar = (char)0; + + // Start by assuming same # of chars as bytes + int charCount = count >> 1; + + // Need -1 to check 2 at a time. If we have an even #, longBytes will go + // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes + // will go from longEnd - 1 long to longEnd. (Might not get to use this) + ulong* longEnd = (ulong*)(byteEnd - 7); + + // For fallback we may need a fallback buffer + DecoderFallbackBuffer fallbackBuffer = null; + + if (decoder != null) + { + lastByte = decoder.lastByte; + lastChar = decoder.lastChar; + + // Assume extra char if last char was around + if (lastChar > 0) + charCount++; + + // Assume extra char if extra last byte makes up odd # of input bytes + if (lastByte >= 0 && (count & 1) == 1) + { + charCount++; + } + + // Shouldn't have anything in fallback buffer for GetCharCount + // (don't have to check m_throwOnOverflow for count) + Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start"); + } + + while (bytes < byteEnd) + { + // If we're aligned then maybe we can do it fast + // This'll hurt if we're unaligned because we'll always test but never be aligned +#if !NO_FAST_UNICODE_LOOP +#if BIGENDIAN + if (bigEndian && +#else // BIGENDIAN + if (!bigEndian && +#endif // BIGENDIAN +#if BIT64 // win64 has to be long aligned + (unchecked((long)bytes) & 7) == 0 && +#else + (unchecked((int)bytes) & 3) == 0 && +#endif // BIT64 + lastByte == -1 && lastChar == 0) + { + // Need new char* so we can check 4 at a time + ulong* longBytes = (ulong*)bytes; + + while (longBytes < longEnd) + { + // See if we potentially have surrogates (0x8000 bit set) + // (We're either big endian on a big endian machine or little endian on + // a little endian machine so this'll work) + if ((0x8000800080008000 & *longBytes) != 0) + { + // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high + // 5 bits looks like 11011, then its a high or low surrogate. + // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. + // Note that we expect BMP characters to be more common than surrogates + // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates + ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800; + + // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate + // but no clue if they're high or low. + // If each of the 4 characters are non-zero, then none are surrogates. + if ((uTemp & 0xFFFF000000000000) == 0 || + (uTemp & 0x0000FFFF00000000) == 0 || + (uTemp & 0x00000000FFFF0000) == 0 || + (uTemp & 0x000000000000FFFF) == 0) + { + // It has at least 1 surrogate, but we don't know if they're high or low surrogates, + // or if there's 1 or 4 surrogates + + // If they happen to be high/low/high/low, we may as well continue. Check the next + // bit to see if its set (low) or not (high) in the right pattern +#if BIGENDIAN + if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0) +#else + if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0) +#endif + { + // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high + // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. + + // Drop out to the slow loop to resolve the surrogates + break; + } + // else they are all surrogates in High/Low/High/Low order, so we can use them. + } + // else none are surrogates, so we can use them. + } + // else all < 0x8000 so we can use them + + // We can use these 4 chars. + longBytes++; + } + + bytes = (byte*)longBytes; + + if (bytes >= byteEnd) + break; + } +#endif // !NO_FAST_UNICODE_LOOP + + // Get 1st byte + if (lastByte < 0) + { + lastByte = *bytes++; + if (bytes >= byteEnd) break; + } + + // Get full char + char ch; + if (bigEndian) + { + ch = (char)(lastByte << 8 | *(bytes++)); + } + else + { + ch = (char)(*(bytes++) << 8 | lastByte); + } + lastByte = -1; + + // See if the char's valid + if (ch >= 0xd800 && ch <= 0xdfff) + { + // Was it a high surrogate? + if (ch <= 0xdbff) + { + // Its a high surrogate, if we had one then do fallback for previous one + if (lastChar > 0) + { + // Ignore previous bad high surrogate + charCount--; + + // Get fallback for previous high surrogate + // Note we have to reconstruct bytes because some may have been in decoder + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, null); + } + + // Get fallback. + charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); + } + + // Ignore the last one which fell back already, + // and remember the new high surrogate + lastChar = ch; + continue; + } + + // Its a low surrogate + if (lastChar == 0) + { + // Expected a previous high surrogate + charCount--; + + // Get fallback for this low surrogate + // Note we have to reconstruct bytes because some may have been in decoder + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)ch), unchecked((byte)(ch >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, null); + } + + charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); + + // Ignore this one (we already did its fallback) + continue; + } + + // Valid surrogate pair, already counted. + lastChar = (char)0; + } + else if (lastChar > 0) + { + // Had a high surrogate, expected a low surrogate + // Uncount the last high surrogate + charCount--; + + // fall back the high surrogate. + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, null); + } + + // Already subtracted high surrogate + charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); + + // Not left over now, clear previous high surrogate and continue to add current char + lastChar = (char)0; + } + + // Valid char, already counted + } + + // Extra space if we can't use decoder + if (decoder == null || decoder.MustFlush) + { + if (lastChar > 0) + { + // No hanging high surrogates allowed, do fallback and remove count for it + charCount--; + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, null); + } + + charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); + + lastChar = (char)0; + } + + if (lastByte >= 0) + { + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, null); + } + + // No hanging odd bytes allowed if must flush + charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes); + lastByte = -1; + } + } + + // If we had a high surrogate left over, we can't count it + if (lastChar > 0) + charCount--; + + // Shouldn't have anything in fallback buffer for GetCharCount + // (don't have to check m_throwOnOverflow for count) + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end"); + + return charCount; + } + + internal override unsafe int GetChars(byte* bytes, int byteCount, + char* chars, int charCount, DecoderNLS baseDecoder) + { + Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null"); + Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0"); + Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0"); + Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null"); + + UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder; + + // Need last vars + int lastByte = -1; + char lastChar = (char)0; + + // Get our decoder (but don't clear it yet) + if (decoder != null) + { + lastByte = decoder.lastByte; + lastChar = decoder.lastChar; + + // Shouldn't have anything in fallback buffer for GetChars + // (don't have to check m_throwOnOverflow for chars) + Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start"); + } + + // For fallback we may need a fallback buffer + DecoderFallbackBuffer fallbackBuffer = null; + char* charsForFallback; + + byte* byteEnd = bytes + byteCount; + char* charEnd = chars + charCount; + byte* byteStart = bytes; + char* charStart = chars; + + while (bytes < byteEnd) + { + // If we're aligned then maybe we can do it fast + // This'll hurt if we're unaligned because we'll always test but never be aligned +#if !NO_FAST_UNICODE_LOOP +#if BIGENDIAN + if (bigEndian && +#else // BIGENDIAN + if (!bigEndian && +#endif // BIGENDIAN +#if BIT64 // win64 has to be long aligned + (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && +#else + (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 && +#endif // BIT64 + lastByte == -1 && lastChar == 0) + { + // Need -1 to check 2 at a time. If we have an even #, longChars will go + // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars + // will go from longEnd - 1 long to longEnd. (Might not get to use this) + // We can only go iCount units (limited by shorter of char or byte buffers. + ulong* longEnd = (ulong*)(bytes - 7 + + (((byteEnd - bytes) >> 1 < charEnd - chars) ? + (byteEnd - bytes) : (charEnd - chars) << 1)); + + // Need new char* so we can check 4 at a time + ulong* longBytes = (ulong*)bytes; + ulong* longChars = (ulong*)chars; + + while (longBytes < longEnd) + { + // See if we potentially have surrogates (0x8000 bit set) + // (We're either big endian on a big endian machine or little endian on + // a little endian machine so this'll work) + if ((0x8000800080008000 & *longBytes) != 0) + { + // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high + // 5 bits looks like 11011, then its a high or low surrogate. + // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. + // Note that we expect BMP characters to be more common than surrogates + // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates + ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800; + + // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate + // but no clue if they're high or low. + // If each of the 4 characters are non-zero, then none are surrogates. + if ((uTemp & 0xFFFF000000000000) == 0 || + (uTemp & 0x0000FFFF00000000) == 0 || + (uTemp & 0x00000000FFFF0000) == 0 || + (uTemp & 0x000000000000FFFF) == 0) + { + // It has at least 1 surrogate, but we don't know if they're high or low surrogates, + // or if there's 1 or 4 surrogates + + // If they happen to be high/low/high/low, we may as well continue. Check the next + // bit to see if its set (low) or not (high) in the right pattern +#if BIGENDIAN + if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0) +#else + if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0) +#endif + { + // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high + // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. + + // Drop out to the slow loop to resolve the surrogates + break; + } + // else they are all surrogates in High/Low/High/Low order, so we can use them. + } + // else none are surrogates, so we can use them. + } + // else all < 0x8000 so we can use them + + // We can use these 4 chars. + *longChars = *longBytes; + longBytes++; + longChars++; + } + + chars = (char*)longChars; + bytes = (byte*)longBytes; + + if (bytes >= byteEnd) + break; + } +#endif // !NO_FAST_UNICODE_LOOP + + // Get 1st byte + if (lastByte < 0) + { + lastByte = *bytes++; + continue; + } + + // Get full char + char ch; + if (bigEndian) + { + ch = (char)(lastByte << 8 | *(bytes++)); + } + else + { + ch = (char)(*(bytes++) << 8 | lastByte); + } + lastByte = -1; + + // See if the char's valid + if (ch >= 0xd800 && ch <= 0xdfff) + { + // Was it a high surrogate? + if (ch <= 0xdbff) + { + // Its a high surrogate, if we had one then do fallback for previous one + if (lastChar > 0) + { + // Get fallback for previous high surrogate + // Note we have to reconstruct bytes because some may have been in decoder + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // couldn't fall back lonely surrogate + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)"); + bytes -= 2; // didn't use these 2 bytes + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + break; // couldn't fallback but didn't throw + } + } + + // Ignore the previous high surrogate which fell back already, + // yet remember the current high surrogate for next time. + lastChar = ch; + continue; + } + + // Its a low surrogate + if (lastChar == 0) + { + // Expected a previous high surrogate + // Get fallback for this low surrogate + // Note we have to reconstruct bytes because some may have been in decoder + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)ch), unchecked((byte)(ch >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // couldn't fall back lonely surrogate + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)"); + bytes -= 2; // didn't use these 2 bytes + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + break; // couldn't fallback but didn't throw + } + + // Didn't throw, ignore this one (we already did its fallback) + continue; + } + + // Valid surrogate pair, add our lastChar (will need 2 chars) + if (chars >= charEnd - 1) + { + // couldn't find room for this surrogate pair + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)"); + bytes -= 2; // didn't use these 2 bytes + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + // Leave lastChar for next call to Convert() + break; // couldn't fallback but didn't throw + } + + *chars++ = lastChar; + lastChar = (char)0; + } + else if (lastChar > 0) + { + // Had a high surrogate, expected a low surrogate, fall back the high surrogate. + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // couldn't fall back high surrogate, or char that would be next + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)"); + bytes -= 2; // didn't use these 2 bytes + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + break; // couldn't fallback but didn't throw + } + + // Not left over now, clear previous high surrogate and continue to add current char + lastChar = (char)0; + } + + // Valid char, room for it? + if (chars >= charEnd) + { + // 2 bytes couldn't fall back + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)"); + bytes -= 2; // didn't use these bytes + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + break; // couldn't fallback but didn't throw + } + + // add it + *chars++ = ch; + } + + // Remember our decoder if we must + if (decoder == null || decoder.MustFlush) + { + if (lastChar > 0) + { + // No hanging high surrogates allowed, do fallback and remove count for it + byte[] byteBuffer = null; + if (bigEndian) + { + byteBuffer = new byte[] + { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }; + } + else + { + byteBuffer = new byte[] + { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }; + } + + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // 2 bytes couldn't fall back + // We either advanced bytes or chars should == charStart and throw below + Debug.Assert(bytes >= byteStart + 2 || chars == charStart, + "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)"); + bytes -= 2; // didn't use these bytes + if (lastByte >= 0) + bytes--; // had an extra last byte hanging around + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + // We'll remember these in our decoder though + bytes += 2; + if (lastByte >= 0) + bytes++; + goto End; + } + + // done with this one + lastChar = (char)0; + } + + if (lastByte >= 0) + { + if (fallbackBuffer == null) + { + if (decoder == null) + fallbackBuffer = this.decoderFallback.CreateFallbackBuffer(); + else + fallbackBuffer = decoder.FallbackBuffer; + + // Set our internal fallback interesting things. + fallbackBuffer.InternalInitialize(byteStart, charEnd); + } + + // No hanging odd bytes allowed if must flush + charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered + bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback); + chars = charsForFallback; + + if (!fallbackResult) + { + // odd byte couldn't fall back + bytes--; // didn't use this byte + fallbackBuffer.InternalReset(); + ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output + // didn't throw, but we'll remember it in the decoder + bytes++; + goto End; + } + + // Didn't fail, clear buffer + lastByte = -1; + } + } + + End: + + // Remember our decoder if we must + if (decoder != null) + { + Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)), + "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing" + // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2") + ); + + decoder.m_bytesUsed = (int)(bytes - byteStart); + decoder.lastChar = lastChar; + decoder.lastByte = lastByte; + } + + // Used to do this the old way + // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount); + + // Shouldn't have anything in fallback buffer for GetChars + // (don't have to check m_throwOnOverflow for count or chars) + Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, + "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end"); + + return (int)(chars - charStart); + } + + + public override System.Text.Encoder GetEncoder() + { + return new EncoderNLS(this); + } + + + public override System.Text.Decoder GetDecoder() + { + return new UnicodeEncoding.Decoder(this); + } + + + public override byte[] GetPreamble() + { + if (byteOrderMark) + { + // Note - we must allocate new byte[]'s here to prevent someone + // from modifying a cached byte[]. + if (bigEndian) + return new byte[2] { 0xfe, 0xff }; + else + return new byte[2] { 0xff, 0xfe }; + } + return Array.Empty<Byte>(); + } + + + public override int GetMaxByteCount(int charCount) + { + if (charCount < 0) + throw new ArgumentOutOfRangeException(nameof(charCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback + long byteCount = (long)charCount + 1; + + if (EncoderFallback.MaxCharCount > 1) + byteCount *= EncoderFallback.MaxCharCount; + + // 2 bytes per char + byteCount <<= 1; + + if (byteCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow); + + return (int)byteCount; + } + + + public override int GetMaxCharCount(int byteCount) + { + if (byteCount < 0) + throw new ArgumentOutOfRangeException(nameof(byteCount), + SR.ArgumentOutOfRange_NeedNonNegNum); + Contract.EndContractBlock(); + + // long because byteCount could be biggest int. + // 1 char per 2 bytes. Round up in case 1 left over in decoder. + // Round up using &1 in case byteCount is max size + // Might also need an extra 1 if there's a left over high surrogate in the decoder. + long charCount = (long)(byteCount >> 1) + (byteCount & 1) + 1; + + // Don't forget fallback (in case they have a bunch of lonely surrogates or something bizzare like that) + if (DecoderFallback.MaxCharCount > 1) + charCount *= DecoderFallback.MaxCharCount; + + if (charCount > 0x7fffffff) + throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow); + + return (int)charCount; + } + + + public override bool Equals(Object value) + { + UnicodeEncoding that = value as UnicodeEncoding; + if (that != null) + { + // + // Big Endian Unicode has different code page (1201) than small Endian one (1200), + // so we still have to check m_codePage here. + // + return (CodePage == that.CodePage) && + byteOrderMark == that.byteOrderMark && + // isThrowException == that.isThrowException && // Same as Encoder/Decoder being exception fallbacks + bigEndian == that.bigEndian && + (EncoderFallback.Equals(that.EncoderFallback)) && + (DecoderFallback.Equals(that.DecoderFallback)); + } + return (false); + } + + public override int GetHashCode() + { + return CodePage + this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() + + (byteOrderMark ? 4 : 0) + (bigEndian ? 8 : 0); + } + + [Serializable] + private sealed class Decoder : System.Text.DecoderNLS, ISerializable + { + internal int lastByte = -1; + internal char lastChar = '\0'; + + public Decoder(UnicodeEncoding encoding) : base(encoding) + { + // base calls reset + } + + // Constructor called by serialization, have to handle deserializing from Everett + internal Decoder(SerializationInfo info, StreamingContext context) + { + // Any info? + if (info == null) throw new ArgumentNullException(nameof(info)); + Contract.EndContractBlock(); + + // Get Common Info + this.lastByte = (int)info.GetValue("lastByte", typeof(int)); + + try + { + // Try the encoding, which is only serialized in Whidbey + this.m_encoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); + this.lastChar = (char)info.GetValue("lastChar", typeof(char)); + this.m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback)); + } + catch (SerializationException) + { + // Everett didn't serialize the UnicodeEncoding, get the default one + bool bigEndian = (bool)info.GetValue("bigEndian", typeof(bool)); + this.m_encoding = new UnicodeEncoding(bigEndian, false); + } + } + + // ISerializable implementation, get data for this object + void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) + { + // Any info? + if (info == null) throw new ArgumentNullException(nameof(info)); + Contract.EndContractBlock(); + + // Save Whidbey data + info.AddValue("m_encoding", this.m_encoding); + info.AddValue("m_fallback", this.m_fallback); + info.AddValue("lastChar", this.lastChar); // Unused by everett so it'll probably get lost + info.AddValue("lastByte", this.lastByte); + + // Everett Only + info.AddValue("bigEndian", ((UnicodeEncoding)(this.m_encoding)).bigEndian); + } + + public override void Reset() + { + lastByte = -1; + lastChar = '\0'; + if (m_fallbackBuffer != null) + m_fallbackBuffer.Reset(); + } + + // Anything left in our decoder? + internal override bool HasState + { + get + { + return (this.lastByte != -1 || this.lastChar != '\0'); + } + } + } + } +} + |