summaryrefslogtreecommitdiff
path: root/src/mscorlib/shared/System/Text/UTF8Encoding.cs
diff options
context:
space:
mode:
Diffstat (limited to 'src/mscorlib/shared/System/Text/UTF8Encoding.cs')
-rw-r--r--src/mscorlib/shared/System/Text/UTF8Encoding.cs2668
1 files changed, 2668 insertions, 0 deletions
diff --git a/src/mscorlib/shared/System/Text/UTF8Encoding.cs b/src/mscorlib/shared/System/Text/UTF8Encoding.cs
new file mode 100644
index 0000000000..5cfa89018a
--- /dev/null
+++ b/src/mscorlib/shared/System/Text/UTF8Encoding.cs
@@ -0,0 +1,2668 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// The worker functions in this file was optimized for performance. If you make changes
+// you should use care to consider all of the interesting cases.
+
+// The code of all worker functions in this file is written twice: Once as as a slow loop, and the
+// second time as a fast loop. The slow loops handles all special cases, throws exceptions, etc.
+// The fast loops attempts to blaze through as fast as possible with optimistic range checks,
+// processing multiple characters at a time, and falling back to the slow loop for all special cases.
+
+// This define can be used to turn off the fast loops. Useful for finding whether
+// the problem is fastloop-specific.
+#define FASTLOOP
+
+using System;
+using System.Runtime.Serialization;
+using System.Diagnostics;
+using System.Diagnostics.Contracts;
+using System.Globalization;
+
+namespace System.Text
+{
+ // Encodes text into and out of UTF-8. UTF-8 is a way of writing
+ // Unicode characters with variable numbers of bytes per character,
+ // optimized for the lower 127 ASCII characters. It's an efficient way
+ // of encoding US English in an internationalizable way.
+ //
+ // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
+ //
+ // The UTF-8 byte order mark is simply the Unicode byte order mark
+ // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
+ // used mostly to distinguish UTF-8 text from other encodings, and doesn't
+ // switch the byte orderings.
+
+ [Serializable]
+ public class UTF8Encoding : Encoding
+ {
+ /*
+ bytes bits UTF-8 representation
+ ----- ---- -----------------------------------
+ 1 7 0vvvvvvv
+ 2 11 110vvvvv 10vvvvvv
+ 3 16 1110vvvv 10vvvvvv 10vvvvvv
+ 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
+ ----- ---- -----------------------------------
+
+ Surrogate:
+ Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
+ */
+
+ private const int UTF8_CODEPAGE = 65001;
+
+ // Allow for devirtualization (see https://github.com/dotnet/coreclr/pull/9230)
+ [Serializable]
+ internal sealed class UTF8EncodingSealed : UTF8Encoding
+ {
+ public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { }
+ }
+
+ // Used by Encoding.UTF8 for lazy initialization
+ // The initialization code will not be run until a static member of the class is referenced
+ internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);
+
+ // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
+ // the standard.
+ private bool _emitUTF8Identifier = false;
+
+ private bool _isThrowException = false;
+
+
+ public UTF8Encoding() : this(false)
+ {
+ }
+
+
+ public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) :
+ this(encoderShouldEmitUTF8Identifier, false)
+ {
+ }
+
+
+ public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) :
+ base(UTF8_CODEPAGE)
+ {
+ _emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
+ _isThrowException = throwOnInvalidBytes;
+
+ // Encoding's constructor already did this, but it'll be wrong if we're throwing exceptions
+ if (_isThrowException)
+ SetDefaultFallbacks();
+ }
+
+ internal override void SetDefaultFallbacks()
+ {
+ // For UTF-X encodings, we use a replacement fallback with an empty string
+ if (_isThrowException)
+ {
+ this.encoderFallback = EncoderFallback.ExceptionFallback;
+ this.decoderFallback = DecoderFallback.ExceptionFallback;
+ }
+ else
+ {
+ this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
+ this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
+ }
+ }
+
+
+ // WARNING: GetByteCount(string chars)
+ // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
+ // WARNING: otherwise it'll break VB's way of declaring these.
+ //
+ // The following methods are copied from EncodingNLS.cs.
+ // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimpliment them here.
+ // These should be kept in sync for the following classes:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ // Returns the number of bytes required to encode a range of characters in
+ // a character array.
+ //
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe int GetByteCount(char[] chars, int index, int count)
+ {
+ // Validate input parameters
+ if (chars == null)
+ throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
+
+ if (index < 0 || count < 0)
+ throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if (chars.Length - index < count)
+ throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
+ Contract.EndContractBlock();
+
+ // If no input, return 0, avoid fixed empty array problem
+ if (count == 0)
+ return 0;
+
+ // Just call the pointer version
+ fixed (char* pChars = chars)
+ return GetByteCount(pChars + index, count, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe int GetByteCount(String chars)
+ {
+ // Validate input
+ if (chars==null)
+ throw new ArgumentNullException("s");
+ Contract.EndContractBlock();
+
+ fixed (char* pChars = chars)
+ return GetByteCount(pChars, chars.Length, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ [CLSCompliant(false)]
+ public override unsafe int GetByteCount(char* chars, int count)
+ {
+ // Validate Parameters
+ if (chars == null)
+ throw new ArgumentNullException("chars", SR.ArgumentNull_Array);
+
+ if (count < 0)
+ throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ // Call it with empty encoder
+ return GetByteCount(chars, count, null);
+ }
+
+ // Parent method is safe.
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ public override unsafe int GetBytes(String s, int charIndex, int charCount,
+ byte[] bytes, int byteIndex)
+ {
+ if (s == null || bytes == null)
+ throw new ArgumentNullException((s == null ? "s" : "bytes"), SR.ArgumentNull_Array);
+
+ if (charIndex < 0 || charCount < 0)
+ throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if (s.Length - charIndex < charCount)
+ throw new ArgumentOutOfRangeException("s", SR.ArgumentOutOfRange_IndexCount);
+
+ if (byteIndex < 0 || byteIndex > bytes.Length)
+ throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
+ Contract.EndContractBlock();
+
+ int byteCount = bytes.Length - byteIndex;
+
+ // Fixed doesn't like 0 length arrays.
+ if (bytes.Length == 0)
+ bytes = new byte[1];
+
+ fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0])
+ return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+ }
+
+ // Encodes a range of characters in a character array into a range of bytes
+ // in a byte array. An exception occurs if the byte array is not large
+ // enough to hold the complete encoding of the characters. The
+ // GetByteCount method can be used to determine the exact number of
+ // bytes that will be produced for a given range of characters.
+ // Alternatively, the GetMaxByteCount method can be used to
+ // determine the maximum number of bytes that will be produced for a given
+ // number of characters, regardless of the actual character values.
+ //
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
+ byte[] bytes, int byteIndex)
+ {
+ // Validate parameters
+ if (chars == null || bytes == null)
+ throw new ArgumentNullException((chars == null ? "chars" : "bytes"), SR.ArgumentNull_Array);
+
+ if (charIndex < 0 || charCount < 0)
+ throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if (chars.Length - charIndex < charCount)
+ throw new ArgumentOutOfRangeException("chars", SR.ArgumentOutOfRange_IndexCountBuffer);
+
+ if (byteIndex < 0 || byteIndex > bytes.Length)
+ throw new ArgumentOutOfRangeException("byteIndex", SR.ArgumentOutOfRange_Index);
+ Contract.EndContractBlock();
+
+ // If nothing to encode return 0, avoid fixed problem
+ if (charCount == 0)
+ return 0;
+
+ // Just call pointer version
+ int byteCount = bytes.Length - byteIndex;
+
+ // Fixed doesn't like 0 length arrays.
+ if (bytes.Length == 0)
+ bytes = new byte[1];
+
+ fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0])
+ // Remember that byteCount is # to decode, not size of array.
+ return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ [CLSCompliant(false)]
+ public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
+ {
+ // Validate Parameters
+ if (bytes == null || chars == null)
+ throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
+
+ if (charCount < 0 || byteCount < 0)
+ throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ return GetBytes(chars, charCount, bytes, byteCount, null);
+ }
+
+ // Returns the number of characters produced by decoding a range of bytes
+ // in a byte array.
+ //
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe int GetCharCount(byte[] bytes, int index, int count)
+ {
+ // Validate Parameters
+ if (bytes == null)
+ throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
+
+ if (index < 0 || count < 0)
+ throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if (bytes.Length - index < count)
+ throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
+ Contract.EndContractBlock();
+
+ // If no input just return 0, fixed doesn't like 0 length arrays.
+ if (count == 0)
+ return 0;
+
+ // Just call pointer version
+ fixed (byte* pBytes = bytes)
+ return GetCharCount(pBytes + index, count, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ [CLSCompliant(false)]
+ public override unsafe int GetCharCount(byte* bytes, int count)
+ {
+ // Validate Parameters
+ if (bytes == null)
+ throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
+
+ if (count < 0)
+ throw new ArgumentOutOfRangeException("count", SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ return GetCharCount(bytes, count, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
+ char[] chars, int charIndex)
+ {
+ // Validate Parameters
+ if (bytes == null || chars == null)
+ throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
+
+ if (byteIndex < 0 || byteCount < 0)
+ throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if ( bytes.Length - byteIndex < byteCount)
+ throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
+
+ if (charIndex < 0 || charIndex > chars.Length)
+ throw new ArgumentOutOfRangeException("charIndex", SR.ArgumentOutOfRange_Index);
+ Contract.EndContractBlock();
+
+ // If no input, return 0 & avoid fixed problem
+ if (byteCount == 0)
+ return 0;
+
+ // Just call pointer version
+ int charCount = chars.Length - charIndex;
+
+ // Fixed doesn't like 0 length arrays.
+ if (chars.Length == 0)
+ chars = new char[1];
+
+ fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0])
+ // Remember that charCount is # to decode, not size of array
+ return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
+ }
+
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+
+ [CLSCompliant(false)]
+ public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
+ {
+ // Validate Parameters
+ if (bytes == null || chars == null)
+ throw new ArgumentNullException(bytes == null ? "bytes" : "chars", SR.ArgumentNull_Array);
+
+ if (charCount < 0 || byteCount < 0)
+ throw new ArgumentOutOfRangeException((charCount < 0 ? "charCount" : "byteCount"), SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ return GetChars(bytes, byteCount, chars, charCount, null);
+ }
+
+ // Returns a string containing the decoded representation of a range of
+ // bytes in a byte array.
+ //
+ // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
+ // So if you fix this, fix the others. Currently those include:
+ // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
+ // parent method is safe
+
+ public override unsafe String GetString(byte[] bytes, int index, int count)
+ {
+ // Validate Parameters
+ if (bytes == null)
+ throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);
+
+ if (index < 0 || count < 0)
+ throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);
+
+ if (bytes.Length - index < count)
+ throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);
+ Contract.EndContractBlock();
+
+ // Avoid problems with empty input buffer
+ if (count == 0) return String.Empty;
+
+ fixed (byte* pBytes = bytes)
+ return String.CreateStringFromEncoding(
+ pBytes + index, count, this);
+ }
+
+ //
+ // End of standard methods copied from EncodingNLS.cs
+ //
+
+ // To simplify maintenance, the structure of GetByteCount and GetBytes should be
+ // kept the same as much as possible
+ internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
+ {
+ // For fallback we may need a fallback buffer.
+ // We wait to initialize it though in case we don't have any broken input unicode
+ EncoderFallbackBuffer fallbackBuffer = null;
+ char* pSrcForFallback;
+
+ char* pSrc = chars;
+ char* pEnd = pSrc + count;
+
+ // Start by assuming we have as many as count
+ int byteCount = count;
+
+ int ch = 0;
+
+ if (baseEncoder != null)
+ {
+ UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
+ ch = encoder.surrogateChar;
+
+ // We mustn't have left over fallback data when counting
+ if (encoder.InternalHasFallbackBuffer)
+ {
+ fallbackBuffer = encoder.FallbackBuffer;
+ if (fallbackBuffer.Remaining > 0)
+ throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
+
+ // Set our internal fallback interesting things.
+ fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
+ }
+ }
+
+ for (;;)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+ if (pSrc >= pEnd)
+ {
+ if (ch == 0)
+ {
+ // Unroll any fallback that happens at the end
+ ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
+ if (ch > 0)
+ {
+ byteCount++;
+ goto ProcessChar;
+ }
+ }
+ else
+ {
+ // Case of surrogates in the fallback.
+ if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
+ {
+ Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
+ "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
+
+ ch = fallbackBuffer.InternalGetNextChar();
+ byteCount++;
+
+ if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ ch = 0xfffd;
+ byteCount++;
+ goto EncodeChar;
+ }
+ else if (ch > 0)
+ {
+ goto ProcessChar;
+ }
+ else
+ {
+ byteCount--; // ignore last one.
+ break;
+ }
+ }
+ }
+
+ if (ch <= 0)
+ {
+ break;
+ }
+ if (baseEncoder != null && !baseEncoder.MustFlush)
+ {
+ break;
+ }
+
+ // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
+ byteCount++;
+ goto EncodeChar;
+ }
+
+ if (ch > 0)
+ {
+ Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
+ "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
+
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int cha = *pSrc;
+
+ // count the pending surrogate
+ byteCount++;
+
+ // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
+ // if (IsLowSurrogate(cha)) {
+ if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
+ ch = 0xfffd;
+ // ch = cha + (ch << 10) +
+ // (0x10000
+ // - CharUnicodeInfo.LOW_SURROGATE_START
+ // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
+
+ // Use this next char
+ pSrc++;
+ }
+ // else ch is still high surrogate and encoding will fail (so don't add count)
+
+ // attempt to encode the surrogate or partial surrogate
+ goto EncodeChar;
+ }
+
+ // If we've used a fallback, then we have to check for it
+ if (fallbackBuffer != null)
+ {
+ ch = fallbackBuffer.InternalGetNextChar();
+ if (ch > 0)
+ {
+ // We have an extra byte we weren't expecting.
+ byteCount++;
+ goto ProcessChar;
+ }
+ }
+
+ // read next char. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ // if (IsHighSurrogate(ch)) {
+ if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
+ {
+ // we will count this surrogate next time around
+ byteCount--;
+ continue;
+ }
+ // either good char or partial surrogate
+
+ EncodeChar:
+ // throw exception on partial surrogate if necessary
+ // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
+ if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // Lone surrogates aren't allowed
+ // Have to make a fallback buffer if we don't have one
+ if (fallbackBuffer == null)
+ {
+ // wait on fallbacks if we can
+ // For fallback we may need a fallback buffer
+ if (baseEncoder == null)
+ fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
+ else
+ fallbackBuffer = baseEncoder.FallbackBuffer;
+
+ // Set our internal fallback interesting things.
+ fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
+ }
+
+ // Do our fallback. Actually we already know its a mixed up surrogate,
+ // so the ref pSrc isn't gonna do anything.
+ pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
+ fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
+ pSrc = pSrcForFallback;
+
+ // Ignore it if we don't throw (we had preallocated this ch)
+ byteCount--;
+ ch = 0;
+ continue;
+ }
+
+ // Count them
+ if (ch > 0x7F)
+ {
+ if (ch > 0x7FF)
+ {
+ // the extra surrogate byte was compensated by the second surrogate character
+ // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
+ byteCount++;
+ }
+ byteCount++;
+ }
+
+#if BIT64
+ // check for overflow
+ if (byteCount < 0)
+ {
+ break;
+ }
+#endif
+
+#if FASTLOOP
+ // If still have fallback don't do fast loop
+ if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
+ {
+ // We're reserving 1 byte for each char by default
+ byteCount++;
+ goto ProcessChar;
+ }
+
+ int availableChars = PtrDiff(pEnd, pSrc);
+
+ // don't fall into the fast decoding loop if we don't have enough characters
+ if (availableChars <= 13)
+ {
+ // try to get over the remainder of the ascii characters fast though
+ char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F)
+ goto ProcessChar;
+ }
+
+ // we are done
+ break;
+ }
+
+#if BIT64
+ // make sure that we won't get a silent overflow inside the fast loop
+ // (Fall out to slow loop if we have this many characters)
+ availableChars &= 0x0FFFFFFF;
+#endif
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+ char* pStop = pSrc + availableChars - (3 + 4);
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F) // Not ASCII
+ {
+ if (ch > 0x7FF) // Not 2 Byte
+ {
+ if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
+ goto LongCode;
+ byteCount++;
+ }
+ byteCount++;
+ }
+
+ // get pSrc aligned
+ if ((unchecked((int)pSrc) & 0x2) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F) // Not ASCII
+ {
+ if (ch > 0x7FF) // Not 2 Byte
+ {
+ if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
+ goto LongCode;
+ byteCount++;
+ }
+ byteCount++;
+ }
+ }
+
+ // Run 2 * 4 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chc = *(int*)(pSrc + 2);
+ if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
+ {
+ if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
+ {
+ goto LongCodeWithMask;
+ }
+
+
+ if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
+ byteCount++;
+ if ((ch & unchecked((int)0xFF80)) != 0)
+ byteCount++;
+ if ((chc & unchecked((int)0xFF800000)) != 0)
+ byteCount++;
+ if ((chc & unchecked((int)0xFF80)) != 0)
+ byteCount++;
+ }
+ pSrc += 4;
+
+ ch = *(int*)pSrc;
+ chc = *(int*)(pSrc + 2);
+ if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
+ {
+ if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
+ {
+ goto LongCodeWithMask;
+ }
+
+ if ((ch & unchecked((int)0xFF800000)) != 0)
+ byteCount++;
+ if ((ch & unchecked((int)0xFF80)) != 0)
+ byteCount++;
+ if ((chc & unchecked((int)0xFF800000)) != 0)
+ byteCount++;
+ if ((chc & unchecked((int)0xFF80)) != 0)
+ byteCount++;
+ }
+ pSrc += 4;
+ }
+ break;
+
+ LongCodeWithMask:
+#if BIGENDIAN
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+#else // BIGENDIAN
+ ch = (char)ch;
+#endif // BIGENDIAN
+ pSrc++;
+
+ if (ch <= 0x7F)
+ {
+ continue;
+ }
+
+ LongCode:
+ // use separate helper variables for slow and fast loop so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ if (ch > 0x7FF)
+ {
+ // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
+ if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // 4 byte encoding - high surrogate + low surrogate
+
+ int chd = *pSrc;
+ if (
+ // !IsHighSurrogate(ch) // low without high -> bad
+ ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
+ // !IsLowSurrogate(chd) // high not followed by low -> bad
+ !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // Back up and drop out to slow loop to figure out error
+ pSrc--;
+ break;
+ }
+ pSrc++;
+
+ // byteCount - this byte is compensated by the second surrogate character
+ }
+ byteCount++;
+ }
+ byteCount++;
+
+ // byteCount - the last byte is already included
+ }
+#endif // FASTLOOP
+
+ // no pending char at this point
+ ch = 0;
+ }
+
+#if BIT64
+ // check for overflow
+ if (byteCount < 0)
+ {
+ throw new ArgumentException(
+ SR.Argument_ConversionOverflow);
+ }
+#endif
+
+ Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
+ "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
+
+ return byteCount;
+ }
+
+ // diffs two char pointers using unsigned arithmetic. The unsigned arithmetic
+ // is good enough for us, and it tends to generate better code than the signed
+ // arithmetic generated by default
+ unsafe private static int PtrDiff(char* a, char* b)
+ {
+ return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
+ }
+
+ // byte* flavor just for parity
+ unsafe private static int PtrDiff(byte* a, byte* b)
+ {
+ return (int)(a - b);
+ }
+
+ private static bool InRange(int ch, int start, int end)
+ {
+ return (uint)(ch - start) <= (uint)(end - start);
+ }
+
+ // Our workhorse
+ // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
+ internal override unsafe int GetBytes(char* chars, int charCount,
+ byte* bytes, int byteCount, EncoderNLS baseEncoder)
+ {
+ Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
+ Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
+ Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
+ Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
+
+ UTF8Encoder encoder = null;
+
+ // For fallback we may need a fallback buffer.
+ // We wait to initialize it though in case we don't have any broken input unicode
+ EncoderFallbackBuffer fallbackBuffer = null;
+ char* pSrcForFallback;
+
+ char* pSrc = chars;
+ byte* pTarget = bytes;
+
+ char* pEnd = pSrc + charCount;
+ byte* pAllocatedBufferEnd = pTarget + byteCount;
+
+ int ch = 0;
+
+ // assume that JIT will enregister pSrc, pTarget and ch
+
+ if (baseEncoder != null)
+ {
+ encoder = (UTF8Encoder)baseEncoder;
+ ch = encoder.surrogateChar;
+
+ // We mustn't have left over fallback data when counting
+ if (encoder.InternalHasFallbackBuffer)
+ {
+ // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
+ fallbackBuffer = encoder.FallbackBuffer;
+ if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
+ throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
+
+ // Set our internal fallback interesting things.
+ fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
+ }
+ }
+
+ for (;;)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+ if (pSrc >= pEnd)
+ {
+ if (ch == 0)
+ {
+ // Check if there's anthing left to get out of the fallback buffer
+ ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
+ if (ch > 0)
+ {
+ goto ProcessChar;
+ }
+ }
+ else
+ {
+ // Case of leftover surrogates in the fallback buffer
+ if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
+ {
+ Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
+ "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
+
+ int cha = ch;
+
+ ch = fallbackBuffer.InternalGetNextChar();
+
+ if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
+ goto EncodeChar;
+ }
+ else if (ch > 0)
+ {
+ goto ProcessChar;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+
+ // attempt to encode the partial surrogate (will fail or ignore)
+ if (ch > 0 && (encoder == null || encoder.MustFlush))
+ goto EncodeChar;
+
+ // We're done
+ break;
+ }
+
+ if (ch > 0)
+ {
+ // We have a high surrogate left over from a previous loop.
+ Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
+ "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
+
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int cha = *pSrc;
+
+ // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
+ // if (IsLowSurrogate(cha)) {
+ if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ ch = cha + (ch << 10) +
+ (0x10000
+ - CharUnicodeInfo.LOW_SURROGATE_START
+ - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
+
+ pSrc++;
+ }
+ // else ch is still high surrogate and encoding will fail
+
+ // attempt to encode the surrogate or partial surrogate
+ goto EncodeChar;
+ }
+
+ // If we've used a fallback, then we have to check for it
+ if (fallbackBuffer != null)
+ {
+ ch = fallbackBuffer.InternalGetNextChar();
+ if (ch > 0) goto ProcessChar;
+ }
+
+ // read next char. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ // if (IsHighSurrogate(ch)) {
+ if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
+ {
+ continue;
+ }
+ // either good char or partial surrogate
+
+ EncodeChar:
+ // throw exception on partial surrogate if necessary
+ // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
+ if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // Lone surrogates aren't allowed, we have to do fallback for them
+ // Have to make a fallback buffer if we don't have one
+ if (fallbackBuffer == null)
+ {
+ // wait on fallbacks if we can
+ // For fallback we may need a fallback buffer
+ if (baseEncoder == null)
+ fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
+ else
+ fallbackBuffer = baseEncoder.FallbackBuffer;
+
+ // Set our internal fallback interesting things.
+ fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
+ }
+
+ // Do our fallback. Actually we already know its a mixed up surrogate,
+ // so the ref pSrc isn't gonna do anything.
+ pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
+ fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
+ pSrc = pSrcForFallback;
+
+ // Ignore it if we don't throw
+ ch = 0;
+ continue;
+ }
+
+ // Count bytes needed
+ int bytesNeeded = 1;
+ if (ch > 0x7F)
+ {
+ if (ch > 0x7FF)
+ {
+ if (ch > 0xFFFF)
+ {
+ bytesNeeded++; // 4 bytes (surrogate pair)
+ }
+ bytesNeeded++; // 3 bytes (800-FFFF)
+ }
+ bytesNeeded++; // 2 bytes (80-7FF)
+ }
+
+ if (pTarget > pAllocatedBufferEnd - bytesNeeded)
+ {
+ // Left over surrogate from last time will cause pSrc == chars, so we'll throw
+ if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
+ {
+ fallbackBuffer.MovePrevious(); // Didn't use this fallback char
+ if (ch > 0xFFFF)
+ fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
+ }
+ else
+ {
+ pSrc--; // Didn't use this char
+ if (ch > 0xFFFF)
+ pSrc--; // Was surrogate, didn't use 2nd part either
+ }
+ Debug.Assert(pSrc >= chars || pTarget == bytes,
+ "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
+ ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
+ ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
+ break;
+ }
+
+ if (ch <= 0x7F)
+ {
+ *pTarget = (byte)ch;
+ }
+ else
+ {
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int chb;
+ if (ch <= 0x7FF)
+ {
+ // 2 byte encoding
+ chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
+ }
+ else
+ {
+ if (ch <= 0xFFFF)
+ {
+ chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
+ }
+ else
+ {
+ *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
+ pTarget++;
+
+ chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
+ }
+ *pTarget = (byte)chb;
+ pTarget++;
+
+ chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
+ }
+ *pTarget = (byte)chb;
+ pTarget++;
+
+ *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
+ }
+ pTarget++;
+
+
+#if FASTLOOP
+ // If still have fallback don't do fast loop
+ if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
+ goto ProcessChar;
+
+ int availableChars = PtrDiff(pEnd, pSrc);
+ int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
+
+ // don't fall into the fast decoding loop if we don't have enough characters
+ // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
+ if (availableChars <= 13)
+ {
+ // we are hoping for 1 byte per char
+ if (availableBytes < availableChars)
+ {
+ // not enough output room. no pending bits at this point
+ ch = 0;
+ continue;
+ }
+
+ // try to get over the remainder of the ascii characters fast though
+ char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ // Not ASCII, need more than 1 byte per char
+ if (ch > 0x7F)
+ goto ProcessChar;
+
+ *pTarget = (byte)ch;
+ pTarget++;
+ }
+ // we are done, let ch be 0 to clear encoder
+ ch = 0;
+ break;
+ }
+
+ // we need at least 1 byte per character, but Convert might allow us to convert
+ // only part of the input, so try as much as we can. Reduce charCount if necessary
+ if (availableBytes < availableChars)
+ {
+ availableChars = availableBytes;
+ }
+
+ // FASTLOOP:
+ // - optimistic range checks
+ // - fallbacks to the slow loop for all special cases, exception throwing, etc.
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+ // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
+ char* pStop = pSrc + availableChars - 5;
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ *pTarget = (byte)ch;
+ pTarget++;
+
+ // get pSrc aligned
+ if ((unchecked((int)pSrc) & 0x2) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ *pTarget = (byte)ch;
+ pTarget++;
+ }
+
+ // Run 4 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chc = *(int*)(pSrc + 2);
+ if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
+ {
+ goto LongCodeWithMask;
+ }
+
+ // Unfortunately, this is endianess sensitive
+#if BIGENDIAN
+ *pTarget = (byte)(ch>>16);
+ *(pTarget+1) = (byte)ch;
+ pSrc += 4;
+ *(pTarget+2) = (byte)(chc>>16);
+ *(pTarget+3) = (byte)chc;
+ pTarget += 4;
+#else // BIGENDIAN
+ *pTarget = (byte)ch;
+ *(pTarget + 1) = (byte)(ch >> 16);
+ pSrc += 4;
+ *(pTarget + 2) = (byte)chc;
+ *(pTarget + 3) = (byte)(chc >> 16);
+ pTarget += 4;
+#endif // BIGENDIAN
+ }
+ continue;
+
+ LongCodeWithMask:
+#if BIGENDIAN
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+#else // BIGENDIAN
+ ch = (char)ch;
+#endif // BIGENDIAN
+ pSrc++;
+
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ *pTarget = (byte)ch;
+ pTarget++;
+ continue;
+
+ LongCode:
+ // use separate helper variables for slow and fast loop so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int chd;
+ if (ch <= 0x7FF)
+ {
+ // 2 byte encoding
+ chd = unchecked((sbyte)0xC0) | (ch >> 6);
+ }
+ else
+ {
+ // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
+ if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // 3 byte encoding
+ chd = unchecked((sbyte)0xE0) | (ch >> 12);
+ }
+ else
+ {
+ // 4 byte encoding - high surrogate + low surrogate
+ // if (!IsHighSurrogate(ch))
+ if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
+ {
+ // low without high -> bad, try again in slow loop
+ pSrc -= 1;
+ break;
+ }
+
+ chd = *pSrc;
+ pSrc++;
+
+ // if (!IsLowSurrogate(chd)) {
+ if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
+ {
+ // high not followed by low -> bad, try again in slow loop
+ pSrc -= 2;
+ break;
+ }
+
+ ch = chd + (ch << 10) +
+ (0x10000
+ - CharUnicodeInfo.LOW_SURROGATE_START
+ - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
+
+ *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
+ // pStop - this byte is compensated by the second surrogate character
+ // 2 input chars require 4 output bytes. 2 have been anticipated already
+ // and 2 more will be accounted for by the 2 pStop-- calls below.
+ pTarget++;
+
+ chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
+ }
+ *pTarget = (byte)chd;
+ pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
+ pTarget++;
+
+ chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
+ }
+ *pTarget = (byte)chd;
+ pStop--; // 2 byte sequence for 1 char so need pStop--.
+ pTarget++;
+
+ *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
+ // pStop - this byte is already included
+ pTarget++;
+ }
+
+ Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
+
+#endif // FASTLOOP
+
+ // no pending char at this point
+ ch = 0;
+ }
+
+ // Do we have to set the encoder bytes?
+ if (encoder != null)
+ {
+ Debug.Assert(!encoder.MustFlush || ch == 0,
+ "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
+
+ encoder.surrogateChar = ch;
+ encoder.m_charsUsed = (int)(pSrc - chars);
+ }
+
+ Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
+ baseEncoder == null || !baseEncoder.m_throwOnOverflow,
+ "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
+
+ return (int)(pTarget - bytes);
+ }
+
+
+ // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
+ // while the actual character is being built in the lower bits. They are shifted together
+ // with the actual bits of the character.
+
+ // bits 30 & 31 are used for pending bits fixup
+ private const int FinalByte = 1 << 29;
+ private const int SupplimentarySeq = 1 << 28;
+ private const int ThreeByteSeq = 1 << 27;
+
+ // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
+ // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+ //
+ // To simplify maintenance, the structure of GetCharCount and GetChars should be
+ // kept the same as much as possible
+ internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
+ {
+ Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
+ Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
+
+ // Initialize stuff
+ byte* pSrc = bytes;
+ byte* pEnd = pSrc + count;
+
+ // Start by assuming we have as many as count, charCount always includes the adjustment
+ // for the character being decoded
+ int charCount = count;
+ int ch = 0;
+ DecoderFallbackBuffer fallback = null;
+
+ if (baseDecoder != null)
+ {
+ UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
+ ch = decoder.bits;
+ charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.
+
+ // Shouldn't have anything in fallback buffer for GetCharCount
+ // (don't have to check m_throwOnOverflow for count)
+ Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
+ "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
+ }
+
+ for (;;)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+ if (pSrc >= pEnd)
+ {
+ break;
+ }
+
+ if (ch == 0)
+ {
+ // no pending bits
+ goto ReadChar;
+ }
+
+ // read next byte. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ int cha = *pSrc;
+ pSrc++;
+
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((cha & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+ // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+ pSrc--;
+ charCount += (ch >> 30);
+ goto InvalidByteSequence;
+ }
+
+ // fold in the new byte
+ ch = (ch << 6) | (cha & 0x3F);
+
+ if ((ch & FinalByte) == 0)
+ {
+ Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
+ "[UTF8Encoding.GetChars]Invariant volation");
+
+ if ((ch & SupplimentarySeq) != 0)
+ {
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // this is 3rd byte (of 4 byte supplimentary) - nothing to do
+ continue;
+ }
+
+ // 2nd byte, check for non-shortest form of supplimentary char and the valid
+ // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
+ if (!InRange(ch & 0x1F0, 0x10, 0x100))
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ else
+ {
+ // Must be 2nd byte of a 3-byte sequence
+ // check for non-shortest form of 3 byte seq
+ if ((ch & (0x1F << 5)) == 0 || // non-shortest form
+ (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ continue;
+ }
+
+ // ready to punch
+
+ // adjust for surrogates in non-shortest form
+ if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
+ {
+ charCount--;
+ }
+ goto EncodeChar;
+
+ InvalidByteSequence:
+ // this code fragment should be close to the gotos referencing it
+ // Have to do fallback for invalid bytes
+ if (fallback == null)
+ {
+ if (baseDecoder == null)
+ fallback = this.decoderFallback.CreateFallbackBuffer();
+ else
+ fallback = baseDecoder.FallbackBuffer;
+ fallback.InternalInitialize(bytes, null);
+ }
+ charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
+
+ ch = 0;
+ continue;
+
+ ReadChar:
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (ch > 0x7F)
+ {
+ // If its > 0x7F, its start of a new multi-byte sequence
+
+ // Long sequence, so unreserve our char.
+ charCount--;
+
+ // bit 6 has to be non-zero for start of multibyte chars.
+ if ((ch & 0x40) == 0)
+ {
+ // Unexpected trail byte
+ goto InvalidByteSequence;
+ }
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - supplimentary character (2 surrogates)
+
+ ch &= 0x0F;
+
+ // check that bit 4 is zero and the valid supplimentary character
+ // range 0x000000 - 0x10FFFF at the same time
+ if (ch > 0x04)
+ {
+ ch |= 0xf0;
+ goto InvalidByteSequence;
+ }
+
+ // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+ // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
+ ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
+ (1 << 30) | // If it dies on next byte we'll need an extra char
+ (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
+ (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+ (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
+
+ // Our character count will be 2 characters for these 4 bytes, so subtract another char
+ charCount--;
+ }
+ else
+ {
+ // 3 byte encoding
+ // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+ ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+ (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+
+ // We'll expect 1 character for these 3 bytes, so subtract another char.
+ charCount--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1)
+ {
+ ch |= 0xc0;
+ goto InvalidByteSequence;
+ }
+
+ // Add bit flags so we'll be flagged correctly
+ ch |= (FinalByte >> 6);
+ }
+ continue;
+ }
+
+ EncodeChar:
+
+#if FASTLOOP
+ int availableBytes = PtrDiff(pEnd, pSrc);
+
+ // don't fall into the fast decoding loop if we don't have enough bytes
+ if (availableBytes <= 13)
+ {
+ // try to get over the remainder of the ascii characters fast though
+ byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ goto ProcessChar;
+ }
+ // we are done
+ ch = 0;
+ break;
+ }
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+ byte* pStop = pSrc + availableBytes - 7;
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+
+ // get pSrc 2-byte aligned
+ if ((unchecked((int)pSrc) & 0x1) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ }
+
+ // get pSrc 4-byte aligned
+ if ((unchecked((int)pSrc) & 0x2) != 0)
+ {
+ ch = *(ushort*)pSrc;
+ if ((ch & 0x8080) != 0)
+ {
+ goto LongCodeWithMask16;
+ }
+ pSrc += 2;
+ }
+
+ // Run 8 + 8 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & unchecked((int)0x80808080)) != 0)
+ {
+ goto LongCodeWithMask32;
+ }
+ pSrc += 8;
+
+ // This is a really small loop - unroll it
+ if (pSrc >= pStop)
+ break;
+
+ ch = *(int*)pSrc;
+ chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & unchecked((int)0x80808080)) != 0)
+ {
+ goto LongCodeWithMask32;
+ }
+ pSrc += 8;
+ }
+ break;
+
+#if BIGENDIAN
+ LongCodeWithMask32:
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ LongCodeWithMask16:
+ ch = (int)(((uint)ch) >> 8);
+#else // BIGENDIAN
+ LongCodeWithMask32:
+ LongCodeWithMask16:
+ ch &= 0xFF;
+#endif // BIGENDIAN
+ pSrc++;
+ if (ch <= 0x7F)
+ {
+ continue;
+ }
+
+ LongCode:
+ int chc = *pSrc;
+ pSrc++;
+
+ if (
+ // bit 6 has to be zero
+ (ch & 0x40) == 0 ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (chc & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc &= 0x3F;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ // fold the first two bytes together
+ chc |= (ch & 0x0F) << 6;
+
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - surrogate
+ ch = *pSrc;
+ if (
+ // check that bit 4 is zero, the non-shortest form of surrogate
+ // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+ !InRange(chc >> 4, 0x01, 0x10) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc = (chc << 6) | (ch & 0x3F);
+
+ ch = *(pSrc + 1);
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc += 2;
+
+ // extra byte
+ charCount--;
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = *pSrc;
+ if (
+ // check for non-shortest form of 3 byte seq
+ (chc & (0x1F << 5)) == 0 ||
+ // Can't have surrogates here.
+ (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc++;
+
+ // extra byte
+ charCount--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ // check for non-shortest form
+ if ((ch & 0x1E) == 0)
+ {
+ goto BadLongCode;
+ }
+ }
+
+ // extra byte
+ charCount--;
+ }
+#endif // FASTLOOP
+
+ // no pending bits at this point
+ ch = 0;
+ continue;
+
+ BadLongCode:
+ pSrc -= 2;
+ ch = 0;
+ continue;
+ }
+
+ // May have a problem if we have to flush
+ if (ch != 0)
+ {
+ // We were already adjusting for these, so need to unadjust
+ charCount += (ch >> 30);
+ if (baseDecoder == null || baseDecoder.MustFlush)
+ {
+ // Have to do fallback for invalid bytes
+ if (fallback == null)
+ {
+ if (baseDecoder == null)
+ fallback = this.decoderFallback.CreateFallbackBuffer();
+ else
+ fallback = baseDecoder.FallbackBuffer;
+ fallback.InternalInitialize(bytes, null);
+ }
+ charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
+ }
+ }
+
+ // Shouldn't have anything in fallback buffer for GetCharCount
+ // (don't have to check m_throwOnOverflow for count)
+ Debug.Assert(fallback == null || fallback.Remaining == 0,
+ "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
+
+ return charCount;
+ }
+
+ // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
+ // So if we're really broken, then that could also throw an error... recursively.
+ // So try to make sure GetChars can at least process all uses by
+ // System.Resources.ResourceReader!
+ //
+ // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
+ // If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
+ //
+ // To simplify maintenance, the structure of GetCharCount and GetChars should be
+ // kept the same as much as possible
+ internal override unsafe int GetChars(byte* bytes, int byteCount,
+ char* chars, int charCount, DecoderNLS baseDecoder)
+ {
+ Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
+ Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
+ Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
+ Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
+
+ byte* pSrc = bytes;
+ char* pTarget = chars;
+
+ byte* pEnd = pSrc + byteCount;
+ char* pAllocatedBufferEnd = pTarget + charCount;
+
+ int ch = 0;
+
+ DecoderFallbackBuffer fallback = null;
+ byte* pSrcForFallback;
+ char* pTargetForFallback;
+ if (baseDecoder != null)
+ {
+ UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
+ ch = decoder.bits;
+
+ // Shouldn't have anything in fallback buffer for GetChars
+ // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
+ Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
+ "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
+ }
+
+ for (;;)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+ if (pSrc >= pEnd)
+ {
+ break;
+ }
+
+ if (ch == 0)
+ {
+ // no pending bits
+ goto ReadChar;
+ }
+
+ // read next byte. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ int cha = *pSrc;
+ pSrc++;
+
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((cha & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+ // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+ pSrc--;
+ goto InvalidByteSequence;
+ }
+
+ // fold in the new byte
+ ch = (ch << 6) | (cha & 0x3F);
+
+ if ((ch & FinalByte) == 0)
+ {
+ // Not at last byte yet
+ Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
+ "[UTF8Encoding.GetChars]Invariant volation");
+
+ if ((ch & SupplimentarySeq) != 0)
+ {
+ // Its a 4-byte supplimentary sequence
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // this is 3rd byte of 4 byte sequence - nothing to do
+ continue;
+ }
+
+ // 2nd byte of 4 bytes
+ // check for non-shortest form of surrogate and the valid surrogate
+ // range 0x000000 - 0x10FFFF at the same time
+ if (!InRange(ch & 0x1F0, 0x10, 0x100))
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ else
+ {
+ // Must be 2nd byte of a 3-byte sequence
+ // check for non-shortest form of 3 byte seq
+ if ((ch & (0x1F << 5)) == 0 || // non-shortest form
+ (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ continue;
+ }
+
+ // ready to punch
+
+ // surrogate in shortest form?
+ // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
+ if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
+ {
+ // let the range check for the second char throw the exception
+ if (pTarget < pAllocatedBufferEnd)
+ {
+ *pTarget = (char)(((ch >> 10) & 0x7FF) +
+ unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
+ pTarget++;
+
+ ch = (ch & 0x3FF) +
+ unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
+ }
+ }
+
+ goto EncodeChar;
+
+ InvalidByteSequence:
+ // this code fragment should be close to the gotos referencing it
+ // Have to do fallback for invalid bytes
+ if (fallback == null)
+ {
+ if (baseDecoder == null)
+ fallback = this.decoderFallback.CreateFallbackBuffer();
+ else
+ fallback = baseDecoder.FallbackBuffer;
+ fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
+ }
+ // This'll back us up the appropriate # of bytes if we didn't get anywhere
+ pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
+ pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
+ bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
+ pSrc = pSrcForFallback;
+ pTarget = pTargetForFallback;
+
+ if (!fallbackResult)
+ {
+ // Ran out of buffer space
+ // Need to throw an exception?
+ Debug.Assert(pSrc >= bytes || pTarget == chars,
+ "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
+ fallback.InternalReset();
+ ThrowCharsOverflow(baseDecoder, pTarget == chars);
+ ch = 0;
+ break;
+ }
+ Debug.Assert(pSrc >= bytes,
+ "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
+ ch = 0;
+ continue;
+
+ ReadChar:
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (ch > 0x7F)
+ {
+ // If its > 0x7F, its start of a new multi-byte sequence
+
+ // bit 6 has to be non-zero
+ if ((ch & 0x40) == 0)
+ {
+ goto InvalidByteSequence;
+ }
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - supplimentary character (2 surrogates)
+
+ ch &= 0x0F;
+
+ // check that bit 4 is zero and the valid supplimentary character
+ // range 0x000000 - 0x10FFFF at the same time
+ if (ch > 0x04)
+ {
+ ch |= 0xf0;
+ goto InvalidByteSequence;
+ }
+
+ ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
+ (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+ (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+ (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1)
+ {
+ ch |= 0xc0;
+ goto InvalidByteSequence;
+ }
+
+ ch |= (FinalByte >> 6);
+ }
+ continue;
+ }
+
+ EncodeChar:
+ // write the pending character
+ if (pTarget >= pAllocatedBufferEnd)
+ {
+ // Fix chars so we make sure to throw if we didn't output anything
+ ch &= 0x1fffff;
+ if (ch > 0x7f)
+ {
+ if (ch > 0x7ff)
+ {
+ if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
+ ch <= CharUnicodeInfo.LOW_SURROGATE_END)
+ {
+ pSrc--; // It was 4 bytes
+ pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
+ }
+ else if (ch > 0xffff)
+ {
+ pSrc--; // It was 4 bytes, nothing was stored
+ }
+ pSrc--; // It was at least 3 bytes
+ }
+ pSrc--; // It was at least 2 bytes
+ }
+ pSrc--;
+
+ // Throw that we don't have enough room (pSrc could be < chars if we had started to process
+ // a 4 byte sequence alredy)
+ Debug.Assert(pSrc >= bytes || pTarget == chars,
+ "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
+ ThrowCharsOverflow(baseDecoder, pTarget == chars);
+
+ // Don't store ch in decoder, we already backed up to its start
+ ch = 0;
+
+ // Didn't throw, just use this buffer size.
+ break;
+ }
+ *pTarget = (char)ch;
+ pTarget++;
+
+#if FASTLOOP
+ int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
+ int availableBytes = PtrDiff(pEnd, pSrc);
+
+ // don't fall into the fast decoding loop if we don't have enough bytes
+ // Test for availableChars is done because pStop would be <= pTarget.
+ if (availableBytes <= 13)
+ {
+ // we may need as many as 1 character per byte
+ if (availableChars < availableBytes)
+ {
+ // not enough output room. no pending bits at this point
+ ch = 0;
+ continue;
+ }
+
+ // try to get over the remainder of the ascii characters fast though
+ byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ goto ProcessChar;
+
+ *pTarget = (char)ch;
+ pTarget++;
+ }
+ // we are done
+ ch = 0;
+ break;
+ }
+
+ // we may need as many as 1 character per byte, so reduce the byte count if necessary.
+ // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
+ if (availableChars < availableBytes)
+ {
+ availableBytes = availableChars;
+ }
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+ char* pStop = pTarget + availableBytes - 7;
+
+ while (pTarget < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ *pTarget = (char)ch;
+ pTarget++;
+
+ // get pSrc to be 2-byte aligned
+ if ((unchecked((int)pSrc) & 0x1) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ *pTarget = (char)ch;
+ pTarget++;
+ }
+
+ // get pSrc to be 4-byte aligned
+ if ((unchecked((int)pSrc) & 0x2) != 0)
+ {
+ ch = *(ushort*)pSrc;
+ if ((ch & 0x8080) != 0)
+ {
+ goto LongCodeWithMask16;
+ }
+
+ // Unfortunately, this is endianess sensitive
+#if BIGENDIAN
+ *pTarget = (char)((ch >> 8) & 0x7F);
+ pSrc += 2;
+ *(pTarget+1) = (char)(ch & 0x7F);
+ pTarget += 2;
+#else // BIGENDIAN
+ *pTarget = (char)(ch & 0x7F);
+ pSrc += 2;
+ *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
+ pTarget += 2;
+#endif // BIGENDIAN
+ }
+
+ // Run 8 characters at a time!
+ while (pTarget < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & unchecked((int)0x80808080)) != 0)
+ {
+ goto LongCodeWithMask32;
+ }
+
+ // Unfortunately, this is endianess sensitive
+#if BIGENDIAN
+ *pTarget = (char)((ch >> 24) & 0x7F);
+ *(pTarget+1) = (char)((ch >> 16) & 0x7F);
+ *(pTarget+2) = (char)((ch >> 8) & 0x7F);
+ *(pTarget+3) = (char)(ch & 0x7F);
+ pSrc += 8;
+ *(pTarget+4) = (char)((chb >> 24) & 0x7F);
+ *(pTarget+5) = (char)((chb >> 16) & 0x7F);
+ *(pTarget+6) = (char)((chb >> 8) & 0x7F);
+ *(pTarget+7) = (char)(chb & 0x7F);
+ pTarget += 8;
+#else // BIGENDIAN
+ *pTarget = (char)(ch & 0x7F);
+ *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
+ *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
+ *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
+ pSrc += 8;
+ *(pTarget + 4) = (char)(chb & 0x7F);
+ *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
+ *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
+ *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
+ pTarget += 8;
+#endif // BIGENDIAN
+ }
+ break;
+
+#if BIGENDIAN
+ LongCodeWithMask32:
+ // be careful about the sign extension
+ ch = (int)(((uint)ch) >> 16);
+ LongCodeWithMask16:
+ ch = (int)(((uint)ch) >> 8);
+#else // BIGENDIAN
+ LongCodeWithMask32:
+ LongCodeWithMask16:
+ ch &= 0xFF;
+#endif // BIGENDIAN
+ pSrc++;
+ if (ch <= 0x7F)
+ {
+ *pTarget = (char)ch;
+ pTarget++;
+ continue;
+ }
+
+ LongCode:
+ int chc = *pSrc;
+ pSrc++;
+
+ if (
+ // bit 6 has to be zero
+ (ch & 0x40) == 0 ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (chc & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc &= 0x3F;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ // fold the first two bytes together
+ chc |= (ch & 0x0F) << 6;
+
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - surrogate
+ ch = *pSrc;
+ if (
+ // check that bit 4 is zero, the non-shortest form of surrogate
+ // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+ !InRange(chc >> 4, 0x01, 0x10) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc = (chc << 6) | (ch & 0x3F);
+
+ ch = *(pSrc + 1);
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc += 2;
+
+ ch = (chc << 6) | (ch & 0x3F);
+
+ *pTarget = (char)(((ch >> 10) & 0x7FF) +
+ unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
+ pTarget++;
+
+ ch = (ch & 0x3FF) +
+ unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
+
+ // extra byte, we're already planning 2 chars for 2 of these bytes,
+ // but the big loop is testing the target against pStop, so we need
+ // to subtract 2 more or we risk overrunning the input. Subtract
+ // one here and one below.
+ pStop--;
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = *pSrc;
+ if (
+ // check for non-shortest form of 3 byte seq
+ (chc & (0x1F << 5)) == 0 ||
+ // Can't have surrogates here.
+ (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & unchecked((sbyte)0xC0)) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc++;
+
+ ch = (chc << 6) | (ch & 0x3F);
+
+ // extra byte, we're only expecting 1 char for each of these 3 bytes,
+ // but the loop is testing the target (not source) against pStop, so
+ // we need to subtract 2 more or we risk overrunning the input.
+ // Subtract 1 here and one more below
+ pStop--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1)
+ {
+ goto BadLongCode;
+ }
+ ch = (ch << 6) | chc;
+ }
+
+ *pTarget = (char)ch;
+ pTarget++;
+
+ // extra byte, we're only expecting 1 char for each of these 2 bytes,
+ // but the loop is testing the target (not source) against pStop.
+ // subtract an extra count from pStop so that we don't overrun the input.
+ pStop--;
+ }
+#endif // FASTLOOP
+
+ Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
+
+ // no pending bits at this point
+ ch = 0;
+ continue;
+
+ BadLongCode:
+ pSrc -= 2;
+ ch = 0;
+ continue;
+ }
+
+ if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
+ {
+ // Have to do fallback for invalid bytes
+ if (fallback == null)
+ {
+ if (baseDecoder == null)
+ fallback = this.decoderFallback.CreateFallbackBuffer();
+ else
+ fallback = baseDecoder.FallbackBuffer;
+ fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
+ }
+
+ // This'll back us up the appropriate # of bytes if we didn't get anywhere
+ pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be enregistered
+ pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be enregistered
+ bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
+ pSrc = pSrcForFallback;
+ pTarget = pTargetForFallback;
+
+ if (!fallbackResult)
+ {
+ Debug.Assert(pSrc >= bytes || pTarget == chars,
+ "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
+
+ // Ran out of buffer space
+ // Need to throw an exception?
+ fallback.InternalReset();
+ ThrowCharsOverflow(baseDecoder, pTarget == chars);
+ }
+ Debug.Assert(pSrc >= bytes,
+ "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
+ ch = 0;
+ }
+
+ if (baseDecoder != null)
+ {
+ UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
+
+ // If we're storing flush data we expect all bits to be used or else
+ // we're stuck in the middle of a conversion
+ Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
+ "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
+
+ // Remember our leftover bits.
+ decoder.bits = ch;
+
+ baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
+ }
+
+ // Shouldn't have anything in fallback buffer for GetChars
+ // (don't have to check m_throwOnOverflow for chars)
+ Debug.Assert(fallback == null || fallback.Remaining == 0,
+ "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
+
+ return PtrDiff(pTarget, chars);
+ }
+
+ // During GetChars we had an invalid byte sequence
+ // pSrc is backed up to the start of the bad sequence if we didn't have room to
+ // fall it back. Otherwise pSrc remains wher it is.
+ private unsafe bool FallbackInvalidByteSequence(
+ ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
+ {
+ // Get our byte[]
+ byte* pStart = pSrc;
+ byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
+
+ // Do the actual fallback
+ if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
+ {
+ // Oops, it failed, back up to pStart
+ pSrc = pStart;
+ return false;
+ }
+
+ // It worked
+ return true;
+ }
+
+ // During GetCharCount we had an invalid byte sequence
+ // pSrc is used to find the index that points to the invalid bytes,
+ // however the byte[] contains the fallback bytes (in case the index is -1)
+ private unsafe int FallbackInvalidByteSequence(
+ byte* pSrc, int ch, DecoderFallbackBuffer fallback)
+ {
+ // Get our byte[]
+ byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
+
+ // Do the actual fallback
+ int count = fallback.InternalFallback(bytesUnknown, pSrc);
+
+ // # of fallback chars expected.
+ // Note that we only get here for "long" sequences, and have already unreserved
+ // the count that we prereserved for the input bytes
+ return count;
+ }
+
+ // Note that some of these bytes may have come from a previous fallback, so we cannot
+ // just decrement the pointer and use the values we read. In those cases we have
+ // to regenerate the original values.
+ private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
+ {
+ // Get our byte[]
+ byte[] bytesUnknown = null;
+
+ // See if it was a plain char
+ // (have to check >= 0 because we have all sorts of wierd bit flags)
+ if (ch < 0x100 && ch >= 0)
+ {
+ pSrc--;
+ bytesUnknown = new byte[] { unchecked((byte)ch) };
+ }
+ // See if its an unfinished 2 byte sequence
+ else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
+ {
+ pSrc--;
+ bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
+ }
+ // So now we're either 2nd byte of 3 or 4 byte sequence or
+ // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
+ // 1st check if its a 4 byte sequence
+ else if ((ch & SupplimentarySeq) != 0)
+ {
+ // 3rd byte of 4 byte sequence?
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // 3rd byte of 4 byte sequence
+ pSrc -= 3;
+ bytesUnknown = new byte[] {
+ unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
+ unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
+ unchecked((byte)(((ch) & 0x3F) | 0x80)) };
+ }
+ else if ((ch & (FinalByte >> 12)) != 0)
+ {
+ // 2nd byte of a 4 byte sequence
+ pSrc -= 2;
+ bytesUnknown = new byte[] {
+ unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
+ unchecked((byte)(((ch) & 0x3F) | 0x80)) };
+ }
+ else
+ {
+ // 4th byte of a 4 byte sequence
+ pSrc--;
+ bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
+ }
+ }
+ else
+ {
+ // 2nd byte of 3 byte sequence?
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // So its 2nd byte of a 3 byte sequence
+ pSrc -= 2;
+ bytesUnknown = new byte[] {
+ unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
+ }
+ else
+ {
+ // 1st byte of a 3 byte sequence
+ pSrc--;
+ bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
+ }
+ }
+
+ return bytesUnknown;
+ }
+
+
+ public override Decoder GetDecoder()
+ {
+ return new UTF8Decoder(this);
+ }
+
+
+ public override Encoder GetEncoder()
+ {
+ return new UTF8Encoder(this);
+ }
+
+
+ public override int GetMaxByteCount(int charCount)
+ {
+ if (charCount < 0)
+ throw new ArgumentOutOfRangeException(nameof(charCount),
+ SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
+ long byteCount = (long)charCount + 1;
+
+ if (EncoderFallback.MaxCharCount > 1)
+ byteCount *= EncoderFallback.MaxCharCount;
+
+ // Max 3 bytes per char. (4 bytes per 2 chars for surrogates)
+ byteCount *= 3;
+
+ if (byteCount > 0x7fffffff)
+ throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
+
+ return (int)byteCount;
+ }
+
+
+ public override int GetMaxCharCount(int byteCount)
+ {
+ if (byteCount < 0)
+ throw new ArgumentOutOfRangeException(nameof(byteCount),
+ SR.ArgumentOutOfRange_NeedNonNegNum);
+ Contract.EndContractBlock();
+
+ // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
+ long charCount = ((long)byteCount + 1);
+
+ // Non-shortest form would fall back, so get max count from fallback.
+ // So would 11... followed by 11..., so you could fall back every byte
+ if (DecoderFallback.MaxCharCount > 1)
+ {
+ charCount *= DecoderFallback.MaxCharCount;
+ }
+
+ if (charCount > 0x7fffffff)
+ throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
+
+ return (int)charCount;
+ }
+
+
+ public override byte[] GetPreamble()
+ {
+ if (_emitUTF8Identifier)
+ {
+ // Allocate new array to prevent users from modifying it.
+ return new byte[3] { 0xEF, 0xBB, 0xBF };
+ }
+ else
+ return Array.Empty<byte>();
+ }
+
+
+ public override bool Equals(Object value)
+ {
+ UTF8Encoding that = value as UTF8Encoding;
+ if (that != null)
+ {
+ return (_emitUTF8Identifier == that._emitUTF8Identifier) &&
+ (EncoderFallback.Equals(that.EncoderFallback)) &&
+ (DecoderFallback.Equals(that.DecoderFallback));
+ }
+ return (false);
+ }
+
+
+ public override int GetHashCode()
+ {
+ //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
+ return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
+ UTF8_CODEPAGE + (_emitUTF8Identifier ? 1 : 0);
+ }
+
+ [Serializable]
+ private sealed class UTF8Encoder : EncoderNLS, ISerializable
+ {
+ // We must save a high surrogate value until the next call, looking
+ // for a low surrogate value.
+ internal int surrogateChar;
+
+ public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
+ {
+ // base calls reset
+ }
+
+ // Constructor called by serialization, have to handle deserializing from Everett
+ internal UTF8Encoder(SerializationInfo info, StreamingContext context)
+ {
+ // Any info?
+ if (info == null) throw new ArgumentNullException(nameof(info));
+ Contract.EndContractBlock();
+
+ // Get common info
+ this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
+
+ // SurrogateChar happens to mean the same thing
+ this.surrogateChar = (int)info.GetValue("surrogateChar", typeof(int));
+
+ try
+ {
+ this.m_fallback = (EncoderFallback)info.GetValue("m_fallback", typeof(EncoderFallback));
+ }
+ catch (SerializationException)
+ {
+ this.m_fallback = null;
+ }
+ }
+
+ // ISerializable implementation, get data for this object
+ void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
+ {
+ // Any info?
+ if (info == null) throw new ArgumentNullException(nameof(info));
+ Contract.EndContractBlock();
+
+ // Save Whidbey data
+ // Just need Everett maxCharSize (BaseCodePageEncoding) or m_maxByteSize (MLangBaseCodePageEncoding)
+ info.AddValue("encoding", this.m_encoding);
+ info.AddValue("surrogateChar", this.surrogateChar);
+
+ info.AddValue("m_fallback", this.m_fallback);
+
+ // Extra stuff for Everett that Whidbey doesn't use
+ info.AddValue("storedSurrogate", this.surrogateChar > 0 ? true : false);
+ info.AddValue("mustFlush", false); // Everett doesn't actually use this either, but it accidently serialized it!
+ }
+
+ public override void Reset()
+
+ {
+ this.surrogateChar = 0;
+ if (m_fallbackBuffer != null)
+ m_fallbackBuffer.Reset();
+ }
+
+ // Anything left in our encoder?
+ internal override bool HasState
+ {
+ get
+ {
+ return (this.surrogateChar != 0);
+ }
+ }
+ }
+
+ [Serializable]
+ private sealed class UTF8Decoder : DecoderNLS, ISerializable
+ {
+ // We'll need to remember the previous information. See the comments around definition
+ // of FinalByte for details.
+ internal int bits;
+
+ public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
+ {
+ // base calls reset
+ }
+
+ // Constructor called by serialization, have to handle deserializing from Everett
+ internal UTF8Decoder(SerializationInfo info, StreamingContext context)
+ {
+ // Any info?
+ if (info == null) throw new ArgumentNullException(nameof(info));
+ Contract.EndContractBlock();
+
+ // Get common info
+ this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
+
+ try
+ {
+ // Get whidbey version of bits
+ this.bits = (int)info.GetValue("wbits", typeof(int));
+ this.m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback));
+ }
+ catch (SerializationException)
+ {
+ // Everett calls bits bits instead of wbits, so this is Everett
+ this.bits = 0;
+ this.m_fallback = null;
+ }
+ }
+
+ // ISerializable implementation, get data for this object
+ void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
+ {
+ // Any info?
+ if (info == null) throw new ArgumentNullException(nameof(info));
+ Contract.EndContractBlock();
+
+ // Save new Whidbey data
+ info.AddValue("encoding", this.m_encoding);
+ info.AddValue("wbits", this.bits); // Special whidbey bits name
+ info.AddValue("m_fallback", this.m_fallback);
+
+ // Everett has extra stuff, we set it all to 0 in case this deserializes in Everett
+ info.AddValue("bits", (int)0);
+ info.AddValue("trailCount", (int)0);
+ info.AddValue("isSurrogate", false);
+ info.AddValue("byteSequence", (int)0);
+ }
+
+ public override void Reset()
+ {
+ this.bits = 0;
+ if (m_fallbackBuffer != null)
+ m_fallbackBuffer.Reset();
+ }
+
+ // Anything left in our decoder?
+ internal override bool HasState
+ {
+ get
+ {
+ return (this.bits != 0);
+ }
+ }
+ }
+ }
+}