diff options
Diffstat (limited to 'src/mscorlib/src/System/Text/ISO2022Encoding.cs')
-rw-r--r-- | src/mscorlib/src/System/Text/ISO2022Encoding.cs | 1983 |
1 files changed, 0 insertions, 1983 deletions
diff --git a/src/mscorlib/src/System/Text/ISO2022Encoding.cs b/src/mscorlib/src/System/Text/ISO2022Encoding.cs deleted file mode 100644 index fca579fe56..0000000000 --- a/src/mscorlib/src/System/Text/ISO2022Encoding.cs +++ /dev/null @@ -1,1983 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - - -// -// -// Notes: -// -// IsAlwaysNormalized ??? -// Regarding Normalization for ISO-2022-JP (50220, 50221, 50222), it's the same rules as EUCJP -// Forms KC & KD are precluded because of things like halfwidth Katakana that have compatibility mappings -// Form D is precluded because of 0x00a8, which changes to space + diaeresis. -// -// Note: I think that IsAlwaysNormalized should probably return true for form C for Japanese 20932 based CPs. -// -// For ISO-2022-KR -// Never normalized, C & D (& therefore KC & KD) are precluded because of Hangul syllables and combined characters. -// -// IsAlwaysNormalized ??? -// Regarding Normalization for ISO-2022-CN (50227, 50229) & HZ-GB2312 (52936) I think it is similar to the Japanese case. -// Forms KC & KD are precluded because of things like halfwidth Katakana that have compatibility mappings -// Form D is precluded because of 0x00a8, which changes to space + diaeresis. -// -// Note: I think that IsAlwaysNormalized should probably return true for form C for Chinese 20936 based CPs. 
-// -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncoding -namespace System.Text -{ - using System.Globalization; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Text; - using System.Runtime.InteropServices; - using System; - using System.Security; - using System.Runtime.CompilerServices; - using System.Runtime.Serialization; - - - /*=================================ISO2022Encoding============================ - ** - ** This is used to support ISO 2022 encodings that use shift/escape sequences. - ** - ==============================================================================*/ - - [Serializable] - internal class ISO2022Encoding : DBCSCodePageEncoding - { - const byte SHIFT_OUT = (byte)0x0E; - const byte SHIFT_IN = (byte)0x0F; - const byte ESCAPE = 0x1B; - const byte LEADBYTE_HALFWIDTH = 0x10; - - // We have to load the tables of a base DBCS code page (932, 949 or 936, looked up in tableBaseCodePages by codePage % 10), so impersonate that code page as our base - // This pretends to be other code pages as far as memory sections are concerned. - internal ISO2022Encoding(int codePage) : base(codePage, tableBaseCodePages[codePage % 10]) - { - this.m_bUseMlangTypeForSerialization = true; - } - - // Constructor called by serialization. 
- // Note: We use the base GetObjectData however - internal ISO2022Encoding(SerializationInfo info, StreamingContext context) : base(info, context) - { - // Actually this can't ever get called, CodePageEncoding is our proxy - Debug.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - - static int[] tableBaseCodePages = - { - 932, // 50220 ISO-2022-JP, No halfwidth Katakana, convert to full width - 932, // 50221 ISO-2022-JP, Use escape sequence for half width Katakana - 932, // 50222 ISO-2022-JP, Use shift-in/shift-out for half width Katakana - 0, - 0, - 949, // 50225 ISO-2022-KR, Korean - 936, // 52936 HZ-GB2312, 936 might be better source - 0, //20936, // 50227 ISO-2022-CN, Note: This is just the same as CP 936 in Everett. - 0, - // 50229 is currently unsupported, CP 20000 is currently not built in .nlp file - 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_1 - 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_2 - 0 // ModeASCII - }; - - internal enum ISO2022Modes - { - ModeHalfwidthKatakana = 0, - ModeJIS0208 = 1, - ModeKR = 5, - ModeHZ = 6, - ModeGB2312 = 7, - ModeCNS11643_1 = 9, - ModeCNS11643_2 = 10, - ModeASCII = 11, - - ModeIncompleteEscape = -1, - ModeInvalidEscape = -2, - ModeNOOP = -3 - } - - protected unsafe override String GetMemorySectionName() - { - int iUseCodePage = this.bFlagDataTable ? 
dataTableCodePage : CodePage; - - String strFormat; - - switch (this.CodePage) - { - case 50220: - case 50221: - case 50222: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022JP"; - break; - case 50225: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022KR"; - break; - case 52936: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_HZ"; - break; - default: - Debug.Assert(false, "[ISO2022Encoding.GetMemorySectionName] Don't expect to get here for code page " + this.CodePage); - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}"; - break; - } - - String strName = String.Format(CultureInfo.InvariantCulture, strFormat, - iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor, - this.pCodePage->VersionRevision, this.pCodePage->VersionBuild); - - return strName; - } - - // Clean up characters for ISO2022 code pages, etc. - // ISO2022 (50220, 50221, 50222) - // GB-HZ (52936) - protected override bool CleanUpBytes(ref int bytes) - { - switch (this.CodePage) - { - // 932 based code pages - case 50220: - case 50221: - case 50222: - { - if (bytes >= 0x100) - { - // map extended char (0xfa40-0xfc4b) to a special range - // (ported from mlang) - if (bytes >= 0xfa40 && bytes <= 0xfc4b) - { - if ( bytes >= 0xfa40 && bytes <= 0xfa5b ) - { - if ( bytes <= 0xfa49 ) - bytes = bytes - 0x0b51 ; - else if ( bytes >= 0xfa4a && bytes <= 0xfa53 ) - bytes = bytes - 0x072f6 ; - else if ( bytes >= 0xfa54 && bytes <= 0xfa57 ) - bytes = bytes - 0x0b5b ; - else if ( bytes == 0xfa58 ) - bytes = 0x878a ; - else if ( bytes == 0xfa59 ) - bytes = 0x8782 ; - else if ( bytes == 0xfa5a ) - bytes = 0x8784 ; - else if ( bytes == 0xfa5b ) - bytes = 0x879a ; - } - else if ( bytes >= 0xfa5c && bytes <= 0xfc4b ) - { - byte tc = unchecked((byte)bytes); - if ( tc < 0x5c ) - bytes = bytes - 0x0d5f; - else if ( tc >= 0x80 && tc <= 0x9B ) - bytes = bytes - 0x0d1d; - else - bytes = bytes - 0x0d1c; - } - } - - // Convert 932 code page to 20932 like code page range - // (also ported from mlang) - byte bLead = 
unchecked((byte)(bytes >> 8)); - byte bTrail = unchecked((byte)bytes); - - bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71); - bLead = (byte)((bLead << 1) + 1); - if (bTrail > (byte)0x9e) - { - bTrail -= (byte)0x7e; - bLead++; - } - else - { - if (bTrail > (byte)0x7e) - bTrail--; - bTrail -= (byte)0x1f; - } - - bytes = ((int)bLead) << 8 | (int)bTrail; - - // Don't step out of our allocated lead byte area. - // All DBCS lead and trail bytes should be >= 0x21 and <= 0x7e - // This is commented out because Everett/Mlang had illegal PUA - // mappings to ISO2022 code pages that we're maintaining. -// if ((bytes & 0xFF00) < 0x2100 || (bytes & 0xFF00) > 0x7e00 || - // (bytes & 0xFF) < 0x21 || (bytes & 0xFF) > 0x7e) - // return false; - } - else - { - // Adjust 1/2 Katakana - if (bytes >= 0xa1 && bytes <= 0xdf) - bytes += (LEADBYTE_HALFWIDTH << 8) - 0x80; - - // 0x81-0x9f and 0xe0-0xfc CP 932 - // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though) - // b0-df is 1/2 Katakana - if (bytes >= 0x81 && - (bytes <= 0x9f || - (bytes >= 0xe0 && bytes <= 0xfc))) - { - // Don't do lead bytes, we use escape sequences instead. - return false; - } - } - break; - } - case 50225: - { - // For 50225 since we don't rely on lead byte marks, return false and don't add them, - // esp. since we're only a 7 bit code page. - if (bytes >= 0x80 && bytes <= 0xff) - return false; - - // Ignore characters out of range (a1-7f) - if (bytes >= 0x100 && - ((bytes & 0xff) < 0xa1 || (bytes & 0xff) == 0xff || - (bytes & 0xff00) < 0xa100 || (bytes & 0xff00) == 0xff00)) - return false; - - // May as well get them into our 7 bit range - bytes &= 0x7f7f; - - break; - } - case 52936: - { - // Since we don't rely on lead byte marks for 52936, get rid of them so we - // don't end up with extra wierd fffe mappings. 
- if (bytes >= 0x81 && bytes <= 0xfe) - return false; - - break; - } - } - - return true; - } - - // GetByteCount - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(count >= 0, "[ISO2022Encoding.GetByteCount]count is negative"); - Debug.Assert(chars != null, "[ISO2022Encoding.GetByteCount]chars is null"); - - // Just call GetBytes with null byte* to get count - return GetBytes(chars, count, null, 0, baseEncoder); - } - - internal override unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS baseEncoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(chars != null, "[ISO2022Encoding.GetBytes]chars is null"); - Debug.Assert(byteCount >= 0, "[ISO2022Encoding.GetBytes]byteCount is negative"); - Debug.Assert(charCount >= 0, "[ISO2022Encoding.GetBytes]charCount is negative"); - - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[ISO2022Encoding.GetBytes]Attempting to use null encoder fallback"); - - // Fix our encoder - ISO2022Encoder encoder = (ISO2022Encoder)baseEncoder; - - // Our return value - int iCount = 0; - - switch(CodePage) - { - case 50220: - case 50221: - case 50222: - iCount = GetBytesCP5022xJP( chars, charCount, bytes, byteCount, encoder ); - break; - case 50225: - iCount = GetBytesCP50225KR( chars, charCount, bytes, byteCount, encoder ); - break; -// Everett had 50227 the same as 936 -/* case 50227: - iCount = GetBytesCP50227CN( chars, charCount, bytes, byteCount, encoder ); - break; -*/ - case 52936: - iCount = GetBytesCP52936( chars, charCount, bytes, byteCount, encoder ); - break; - } - - return iCount; - } - - // This is internal and called by something else, - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - // Just assert, we're called internally so these should be safe, checked already - Debug.Assert(bytes != null, "[ISO2022Encoding.GetCharCount]bytes is null"); - Debug.Assert(count >= 0, "[ISO2022Encoding.GetCharCount]byteCount is negative"); - - // Just call getChars with null char* to get count - return GetChars(bytes, count, null, 0, baseDecoder); - } - - internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS baseDecoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[ISO2022Encoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[ISO2022Encoding.GetChars]byteCount is negative"); - Debug.Assert(charCount >= 0, "[ISO2022Encoding.GetChars]charCount is negative"); - - // Fix our decoder - ISO2022Decoder decoder = (ISO2022Decoder)baseDecoder; - int iCount = 0; - - switch (CodePage) - { - case 50220: - case 50221: - case 50222: - iCount = GetCharsCP5022xJP( bytes, byteCount, chars, charCount, decoder); - break; - case 50225: - 
iCount = GetCharsCP50225KR( bytes, byteCount, chars, charCount, decoder); - break; - // Currently 50227 is the same as 936 -// case 50227: - // iCount = GetCharsCP50227CN( bytes, byteCount, chars, charCount, decoder); - // break; - case 52936: - iCount = GetCharsCP52936( bytes, byteCount, chars, charCount, decoder); - break; - default: - Debug.Assert(false, "[ISO2022Encoding.GetChars] had unexpected code page"); - break; - } - - return iCount; - } - - // ISO 2022 Code pages for JP. - // 50220 - No halfwidth Katakana, convert to full width - // 50221 - Use escape sequence for half width Katakana - // 50222 - Use shift-in/shift-out for half width Katakana - // - // These are the JIS code pages, superset of ISO-2022 / ISO-2022-JP-1 - // 0E Shift Out (following bytes are Katakana) - // 0F Shift In (back to "normal" behavior) - // 21-7E Byte ranges (1 or 2 bytes) - // <ESC> $ @ To Double Byte 0208 Mode (actually older code page, but subset of 0208) - // <ESC> $ B To Double Byte 0208 Mode (duplicate) - // <ESC> $ ( D To Double Byte 0212 Mode (previously we misinterpreted this) - // <ESC> ( I To half width Katakana - // <ESC> ( J To JIS-Roman - // <ESC> ( H To JIS-Roman (Swedish character set) - // <ESC> ( B To ASCII - // <ESC> & @ Alternate lead in to <ESC> $ B so just ignore it. - // - // So in Katakana mode we add 0x8e as a lead byte and use CP 20932 to convert it - // In ASCII mode we just spit out the single byte. - // In Roman mode we should change 0x5c (\) -> Yen sign and 0x7e (~) to Overline, however - // we didn't in mLang, otherwise roman is like ASCII. - // In 0208 double byte mode we have to |= with 0x8080 and use CP 20932 to convert it. - // In 0212 double byte mode we have to |= with 0x8000 and use CP 20932 to convert it. - // - // Note that JIS Shift In/Shift Out is different than the other ISO2022 encodings. For JIS - // Shift out always shifts to half-width Katakana. 
Chinese encodings use designator sequences - // instead of escape sequences and shift out to the designated sequence or back in to ASCII. - // - // When decoding JIS 0208, MLang used a '*' (0x2a) character in JIS 0208 mode to map the trailing byte - // to halfwidth katakana. I found no description of that behavior, however that block of 0208 is - // undefined, so we maintain that behavior when decoding. We will never generate characters using - // that technique, but the decoder will process them. - // - private unsafe int GetBytesCP5022xJP(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Get our mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode - ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that shift in will go back to (only used by CP 50222) - - // Check our encoder - if (encoder != null) - { - char charLeftOver = encoder.charLeftOver; - - currentMode = encoder.currentMode; - shiftInMode = encoder.shiftInOutMode; - - // We may have a left over character from last time, try and process it. 
- if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP5022xJP]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our char - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort iBytes = mapUnicodeToBytes[ch]; - - StartConvert: - // Check for halfwidth bytes - byte bLeadByte = (byte)(iBytes >> 8); - byte bTrailByte = (byte)(iBytes & 0xff); - - if (bLeadByte == LEADBYTE_HALFWIDTH) - { - // Its Halfwidth Katakana - if (CodePage == 50220) - { - // CodePage 50220 doesn't use halfwidth Katakana, convert to fullwidth - // See if its out of range, fallback if so, throws if recursive fallback - if (bTrailByte < 0x21 || bTrailByte >= 0x21 + HalfToFullWidthKanaTable.Length) - { - buffer.Fallback(ch); - continue; - } - - // Get the full width katakana char to use. - iBytes = unchecked((ushort)(HalfToFullWidthKanaTable[bTrailByte - 0x21] & 0x7F7F)); - - // May have to do all sorts of fun stuff for mode, go back to start convert - goto StartConvert; - } - - // Can use halfwidth Katakana, make sure we're in right mode - - // Make sure we're in right mode - if (currentMode != ISO2022Modes.ModeHalfwidthKatakana) - { - // 50222 or 50221, either shift in/out or escape to get to Katakana mode - if (CodePage == 50222) - { - // Shift Out - if (!buffer.AddByte(SHIFT_OUT)) - break; // convert out of space, stop - - // Don't change modes until after AddByte in case it fails for convert - // We get to shift out to Katakana, make sure we'll go back to the right mode - // (This ends up always being ASCII) - shiftInMode = currentMode; - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - } - else - { - // 50221 does halfwidth katakana by escape sequence - Debug.Assert(CodePage == 50221, "[ISO2022Encoding.GetBytesCP5022xJP]Expected Code Page 50221"); - - // Add our 
escape sequence - if (!buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'I'))) - break; // convert out of space, stop - - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - } - } - - // We know we're in Katakana mode now, so add it. - // Go ahead and add the Katakana byte. Our table tail bytes are 0x80 too big. - if (!buffer.AddByte(unchecked((byte)(bTrailByte & 0x7F)))) - break; // convert out of space, stop - - // Done with this one - continue; - } - else if (bLeadByte != 0) - { - // - // It's a double byte character. - // - - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift In - if (!buffer.AddByte(SHIFT_IN)) - break; // convert out of space, stop - - // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - } - - // Make sure we're in the right mode (JIS 0208 or JIS 0212) - // Note: Right now we don't use JIS 0212. Also this table'd be wrong - - // Its JIS extension 0208 - if (currentMode != ISO2022Modes.ModeJIS0208) - { - // Escape sequence, we can fail after this, mode will be correct for convert - if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)'B'))) - break; // Convert out of space, stop - - currentMode = ISO2022Modes.ModeJIS0208; - } - - // Add our double bytes - if (!buffer.AddByte(unchecked((byte)(bLeadByte)), unchecked((byte)(bTrailByte)))) - break; // Convert out of space, stop - continue; - } - else if (iBytes != 0 || ch == 0) - { - // Single byte Char - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift IN - if (!buffer.AddByte(SHIFT_IN)) - break; // convert ran out of room - - // Need to shift in from katakana. 
(Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - } - - // Its a single byte character, switch to ASCII if we have to - if (currentMode != ISO2022Modes.ModeASCII) - { - if (!buffer.AddByte(ESCAPE,unchecked((byte)'('), unchecked((byte)'B'))) - break; // convert ran out of room - - currentMode = ISO2022Modes.ModeASCII; - } - - // Add the ASCII char - if (!buffer.AddByte(bTrailByte)) - break; // convert had no room left - continue; - } - - // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - } - - // Switch back to ASCII if MustFlush or no encoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift IN, only shift mode if necessary. - if (buffer.AddByte(SHIFT_IN)) - // Need to shift in from katakana. 
(Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // switch back to ASCII to finish neatly - if (currentMode != ISO2022Modes.ModeASCII && - (CodePage != 50222 || currentMode != ISO2022Modes.ModeHalfwidthKatakana)) - { - // only shift if it was successful - if (buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'B'))) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - } - - // Remember our encoder state - if (bytes != null && encoder != null) - { - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - encoder.shiftInOutMode = shiftInMode; - - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // ISO 2022 Code pages for Korean - CP 50225 - // - // CP 50225 has Shift In/Shift Out codes, and a single designator sequence that is supposed - // to appear once in the file, at the beginning of a line, before any multibyte code points. - // So we stick the designator at the beginning of the output. - // - // These are the KR code page codes for ISO-2022-KR - // 0E Shift Out (following bytes are double byte) - // 0F Shift In (back to ASCII behavior) - // 21-7E Byte ranges (1 or 2 bytes) - // <ESC> $)C Double byte ISO-2022-KR designator - // - // Note that this encoding is a little different than other encodings. The <esc>$)C sequence - // should only appear once per file. (Actually I saw another spec/rfc that said at the beginning - // of each line, but it shouldn't really matter.) 
- // - // During decoding Mlang accepted ' ', '\t, and '\n' as their respective characters, even if - // it was in double byte mode. We maintain that behavior, although I couldn't find a reference or - // reason for that behavior. We never generate data using that shortcut. - // - // Also Mlang always assumed KR mode, even if the designator wasn't found yet, so we do that as - // well. So basically we just ignore <ESC>$)C when decoding. - // - private unsafe int GetBytesCP50225KR(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Get our mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode - ISO2022Modes shiftOutMode = ISO2022Modes.ModeASCII; // ModeKR if already stamped lead bytes - - // Check our encoder - if (encoder != null) - { - // May have leftover stuff - char charLeftOver = encoder.charLeftOver; - currentMode = encoder.currentMode; - shiftOutMode = encoder.shiftInOutMode; - - // We may have a l left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP50225KR]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our data - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort iBytes = mapUnicodeToBytes[ch]; - - // Check for double byte bytes - byte bLeadByte = (byte)(iBytes >> 8); - byte bTrailByte = (byte)(iBytes & 0xff); - - if (bLeadByte != 0) - { - // - // It's a double byte character. 
- // - - // If we haven't done our Korean designator, then do so, if we have any input - if (shiftOutMode != ISO2022Modes.ModeKR) - { - // Add our code page designator sequence - if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)')'), unchecked((byte)'C'))) - break; // No room during convert. - - shiftOutMode = ISO2022Modes.ModeKR; - } - - // May have to switch to ModeKR first - if (currentMode != ISO2022Modes.ModeKR) - { - if (!buffer.AddByte(SHIFT_OUT)) - break; // No convert room - - currentMode = ISO2022Modes.ModeKR; - } - - // Add the bytes - if (!buffer.AddByte(bLeadByte, bTrailByte)) - break; // no convert room - continue; - } - else if (iBytes != 0 || ch == 0) - { - // Its a single byte character, switch to ASCII if we have to - if (currentMode != ISO2022Modes.ModeASCII) - { - if (!buffer.AddByte(SHIFT_IN)) - break; - - currentMode = ISO2022Modes.ModeASCII; - } - - // Add the ASCII char - if (!buffer.AddByte(bTrailByte)) - break; - continue; - } - - // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - } - - // Switch back to ASCII if MustFlush or no encoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // Get back to ASCII to be safe. Only do it if it success. 
- if (buffer.AddByte(SHIFT_IN)) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // Remember our encoder state - if (bytes != null && encoder != null) - { - // If we didn't use the encoder, then there's no chars left over - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - - // We don't use shift out mode, but if we've flushed we need to reset it so it doesn't - // get output again. - if (!encoder.MustFlush || encoder.charLeftOver != (char)0) - { - // We should be not flushing or converting - Debug.Assert(!encoder.MustFlush || !encoder.m_throwOnOverflow, - "[ISO2022Encoding.GetBytesCP50225KR]Expected no left over data or not flushing or not converting"); - encoder.shiftInOutMode = shiftOutMode; - } - else - encoder.shiftInOutMode = ISO2022Modes.ModeASCII; - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // CP52936 is HZ Encoding - // HZ Encoding has 4 shift sequences: - // ~~ '~' (\u7e) - // ~} shift into 1 byte mode, - // ~{ shift into 2 byte GB 2312-80 - // ~<NL> Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters) - // (This is for mailers that restrict to 70 or 80 or whatever character lines) - // - // According to comment in mlang, lead & trail byte ranges are described in RFC 1843 - // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e - // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe - // - // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set. 
- // (all bytes <= 0x7f) - private unsafe int GetBytesCP52936(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; - - // Check our encoder - if (encoder != null) - { - char charLeftOver = encoder.charLeftOver; - currentMode = encoder.currentMode; - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP52936]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our char - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort sChar = mapUnicodeToBytes[ch]; - if (sChar == 0 && ch != 0) - { - // Wasn't a legal byte sequence, its a surrogate or fallback - // Throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - - // Done with our char, now process fallback - continue; - } - - // Check for halfwidth bytes - byte bLeadByte = (byte)(sChar >> 8); - byte bTrailByte = (byte)(sChar & 0xff); - - // If its a double byte, it has to fit in the lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe range - // (including the 0x8080 that our codepage or's to the value) - if ((bLeadByte != 0 && - (bLeadByte < 0xa1 || bLeadByte > 0xf7 || bTrailByte < 0xa1 || bTrailByte > 0xfe)) || - (bLeadByte == 0 && bTrailByte > 0x80 && bTrailByte != 0xff)) - { - // Illegal character, in 936 code page, but not in HZ subset, get fallback for it - buffer.Fallback(ch); - continue; - } - - // sChar is now either ASCII or has an 0x8080 mask - if (bLeadByte != 0) - { - // Its a double byte mode - if (currentMode != ISO2022Modes.ModeHZ) - { 
- // Need to add the double byte mode marker - if (!buffer.AddByte((byte)'~', (byte)'{', 2)) - break; // Stop if no buffer space in convert - - currentMode = ISO2022Modes.ModeHZ; - } - - // Go ahead and add the 2 bytes - if (!buffer.AddByte(unchecked((byte)(bLeadByte & 0x7f)), unchecked((byte)(bTrailByte & 0x7f)))) - break; // Stop if no buffer space in convert - } - else - { - // Its supposed to be ASCII - if (currentMode != ISO2022Modes.ModeASCII) - { - // Need to add the ASCII mode marker - // Will have 1 more byte (or 2 if ~) - if (!buffer.AddByte((byte)'~', (byte)'}', bTrailByte == '~' ? 2:1)) - break; - - currentMode = ISO2022Modes.ModeASCII; - } - - // If its a '~' we'll need an extra one - if (bTrailByte == '~') - { - // Need to add the extra ~ - if (!buffer.AddByte((byte)'~', 1)) - break; - } - - // Need to add the character - if (!buffer.AddByte(bTrailByte)) - break; - } - } - - // Add ASCII shift out if we're at end of decoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // Need to add the ASCII mode marker - // Only turn off other mode if this works - if (buffer.AddByte((byte)'~',(byte)'}')) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // Need to remember our mode - if (encoder != null && bytes != null) - { - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount, - char* chars, int charCount, ISO2022Decoder decoder) - { - // Get our info. 
- Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - // No mode information yet - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode - ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that we'll shift in to - byte[] escapeBytes = new byte[4]; - int escapeCount = 0; - - if (decoder != null) - { - currentMode = decoder.currentMode; - shiftInMode = decoder.shiftInOutMode; - - // See if we have leftover decoder buffer to use - // Load our bytesLeftOver - escapeCount = decoder.bytesLeftOverCount; - - // Don't want to mess up decoder if we're counting or throw an exception - for (int i = 0; i < escapeCount; i++) - escapeBytes[i] = decoder.bytesLeftOver[i]; - } - - // Do this until the end - while (buffer.MoreData || escapeCount > 0) - { - byte ch; - - if (escapeCount > 0) - { - // Get more escape sequences if necessary - if (escapeBytes[0] == ESCAPE) - { - // Stop if no more input - if (!buffer.MoreData) - { - if (decoder != null && !decoder.MustFlush) - break; - } - else - { - // Add it to the sequence we can check - escapeBytes[escapeCount++] = buffer.GetNextByte(); - - // We have an escape sequence - ISO2022Modes modeReturn = - CheckEscapeSequenceJP(escapeBytes, escapeCount); - - if (modeReturn != ISO2022Modes.ModeInvalidEscape) - { - if (modeReturn != ISO2022Modes.ModeIncompleteEscape) - { - // Processed escape correctly - escapeCount = 0; - - // We're now this mode - currentMode = shiftInMode = modeReturn; - } - - // Either way, continue to get next escape or real byte - continue; - } - } - - // If ModeInvalidEscape, or no input & must flush, then fall through to add escape. - } - - // Read next escape byte and move them down one. 
- ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - } - else - { - // Get our next byte - ch = buffer.GetNextByte(); - - if (ch == ESCAPE) - { - // We'll have an escape sequence, use it if we don't have one buffered already - if (escapeCount == 0) - { - // Start this new escape sequence - escapeBytes[0] = ch; - escapeCount = 1; - continue; - } - - // Flush the previous escape sequence, then reuse this escape byte - buffer.AdjustBytes(-1); - } - } - - if (ch == SHIFT_OUT) - { - shiftInMode = currentMode; - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - continue; - } - else if (ch == SHIFT_IN) - { - currentMode = shiftInMode; - continue; - } - - // Get our full character - ushort iBytes = ch; - bool b2Bytes = false; - - if (currentMode == ISO2022Modes.ModeJIS0208) - { - // - // To handle errors, we need to check: - // 1. if trailbyte is there - // 2. if code is valid - // - if (escapeCount > 0) - { - // Let another escape fall through - if (escapeBytes[0] != ESCAPE) - { - // Move them down one & get the next data - iBytes <<= 8; - iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - b2Bytes = true; - } - } - else if (buffer.MoreData) - { - iBytes <<= 8; - iBytes |= buffer.GetNextByte(); - b2Bytes = true; - } - else - { - // Not enough input, use decoder if possible - if (decoder == null || decoder.MustFlush) - { - // No decoder, do fallback for this byte - buffer.Fallback(ch); - break; - } - - // Stick it in the decoder if we're not counting - if (chars != null) - { - escapeBytes[0] = ch; - escapeCount = 1; - } - break; - } - - // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana - // escape, so use 0x8e00 as katakana lead byte and keep same trail byte. - // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have - // any wierd compatibility issues. 
- if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00)) - { - iBytes = (ushort)(iBytes & 0xff); - iBytes |= (LEADBYTE_HALFWIDTH << 8); // Put us in the halfwidth katakana range - } - } - else if (iBytes >= 0xA1 && iBytes <= 0xDF) - { - // Everett accidentally mapped Katakana like shift-jis (932), - // even though this is a 7 bit code page. We keep that mapping - iBytes |= (LEADBYTE_HALFWIDTH << 8); // Map to halfwidth katakana range - iBytes &= 0xff7f; // remove extra 0x80 - } - else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana ) - { - // Add 0x10 lead byte that our encoding expects for Katakana: - iBytes |= (LEADBYTE_HALFWIDTH << 8); - } - - // We have an iBytes to try to convert. - char c = mapBytesToUnicode[iBytes]; - - // See if it was unknown - if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - // Have to do fallback - if (b2Bytes) - { - if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes)) - break; - } - else - { - if (!buffer.Fallback(ch)) - break; - } - } - else - { - // If we were JIS 0208, then we consumed an extra byte - if (!buffer.AddChar(c, b2Bytes ? 
2:1))
                        break;
                }
            }

            // Make sure our decoder state matches our mode, if not counting
            if (chars != null && decoder != null)
            {
                // Remember it if we don't flush
                if (!decoder.MustFlush || escapeCount != 0)
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;
                    decoder.shiftInOutMode = shiftInMode;

                    // Remember escape buffer
                    decoder.bytesLeftOverCount = escapeCount;
                    decoder.bytesLeftOver = escapeBytes;
                }
                else
                {
                    // We flush, clear buffer
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                    decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
                    decoder.bytesLeftOverCount = 0;
                    // Slightly different if counting/not counting
                }

                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }

        // Classifies the escape sequence buffered in bytes[0 .. escapeCount).
        //
        // Returns:
        //   ModeInvalidEscape      - bytes[0] is not ESC, or the sequence is not one listed below
        //   ModeIncompleteEscape   - not enough bytes buffered yet to decide
        //   ModeASCII              - <esc>(B, <esc>(H, <esc>(J
        //   ModeHalfwidthKatakana  - <esc>(I
        //   ModeJIS0208            - <esc>$@, <esc>$B, <esc>$(D
        //   ModeNOOP               - <esc>&@
        private ISO2022Modes CheckEscapeSequenceJP(byte[] bytes, int escapeCount)
        {
            // Without a leading ESC there is nothing to classify
            if (bytes[0] != ESCAPE)
            {
                return ISO2022Modes.ModeInvalidEscape;
            }

            // Every sequence recognized here needs at least three bytes
            if (escapeCount < 3)
            {
                return ISO2022Modes.ModeIncompleteEscape;
            }

            switch (bytes[1])
            {
                case (byte)'(':
                    switch (bytes[2])
                    {
                        case (byte)'B':     // <esc>(B
                        case (byte)'H':     // <esc>(H : supposed to be Swedish, but treated like ASCII
                        case (byte)'J':     // <esc>(J : supposed to be Roman (2 characters differ), historically treated as ASCII
                            return ISO2022Modes.ModeASCII;

                        case (byte)'I':     // <esc>(I
                            return ISO2022Modes.ModeHalfwidthKatakana;
                    }
                    break;

                case (byte)'$':
                    {
                        byte charset = bytes[2];

                        if (charset == '@' || charset == 'B')   // <esc>$@ or <esc>$B
                        {
                            return ISO2022Modes.ModeJIS0208;
                        }

                        // The only other candidate is the four byte <esc>$(D, so a fourth byte is needed
                        if (escapeCount < 4)
                        {
                            return ISO2022Modes.ModeIncompleteEscape;
                        }

                        if (charset == '(' && bytes[3] == 'D')  // <esc>$(D
                        {
                            // Mlang treated 0208 like 0212 even though that's wrong
                            return ISO2022Modes.ModeJIS0208;
                        }
                    }
                    break;

                case (byte)'&':
                    if (bytes[2] == '@')    // <esc>&@
                    {
                        // Ignore ESC & @ (prefix to <esc>$B)
                        return ISO2022Modes.ModeNOOP;
                    }
                    break;
            }

            // Nothing matched: invalid/unknown escape sequence
            return ISO2022Modes.ModeInvalidEscape;
        }

        // Pops the first buffered byte: returns bytes[0], shifts the remaining bytes down by one,
        // zeroes the slot that was vacated and decrements count.
        private byte DecrementEscapeBytes(ref byte[] bytes, ref int count)
        {
            Debug.Assert(count > 0, "[ISO2022Encoding.DecrementEscapeBytes]count > 0");

            // One fewer byte is buffered from here on
            count--;

            // This is the byte handed back to the caller
            byte head = bytes[0];

            // Slide everything that is left toward the front
            int slot = 0;
            while (slot < count)
            {
                bytes[slot] = bytes[slot + 1];
                slot++;
            }

            // The old last position no longer holds data
            bytes[count] = 0;

            return head;
        }

        // Note that in DBCS mode mlang passed through ' ', '\t' and '\n' as SBCS characters
        // probably to allow mailer formatting without too much extra work.
        private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount,
                                               char* chars, int charCount, ISO2022Decoder decoder)
        {
            // Get our info.
Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
                this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode

            // Local scratch copy of any pending bytes. The decoder object itself is only written
            // back after the loop, and only when chars != null (i.e. when we are not just counting).
            byte[] escapeBytes = new byte[4];
            int escapeCount = 0;

            if (decoder != null)
            {
                currentMode = decoder.currentMode;

                // See if we have leftover decoder buffer to use
                // Load our bytesLeftOver
                escapeCount = decoder.bytesLeftOverCount;

                // Don't want to mess up decoder if we're counting or throw an exception
                for (int i = 0; i < escapeCount; i++)
                    escapeBytes[i] = decoder.bytesLeftOver[i];
            }

            // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
            while (buffer.MoreData || escapeCount > 0)
            {
                byte ch;

                if (escapeCount > 0)
                {
                    // Get more escape sequences if necessary
                    if (escapeBytes[0] == ESCAPE)
                    {
                        // Stop if no more input
                        if (!buffer.MoreData)
                        {
                            // With a decoder that is not flushing, keep the partial sequence for the next call
                            if (decoder != null && !decoder.MustFlush)
                                break;
                        }
                        else
                        {
                            // Add it to the sequence we can check
                            escapeBytes[escapeCount++] = buffer.GetNextByte();

                            // We have an escape sequence
                            ISO2022Modes modeReturn =
                                CheckEscapeSequenceKR(escapeBytes, escapeCount);

                            if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                            {
                                if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
                                {
                                    // Processed escape correctly, no effect (we know about KR mode)
                                    escapeCount = 0;
                                }

                                // Either way, continue to get next escape or real byte
                                continue;
                            }
                        }

                        // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
                    }

                    // Still have something left over in escape buffer
                    // Get it and move them down one
                    ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                }
                else
                {
                    // Get our next byte
                    ch = buffer.GetNextByte();

                    if (ch == ESCAPE)
                    {
                        // We'll have an escape sequence, use it if we don't have one buffered already
                        if (escapeCount == 0)
                        {
                            // Start this new escape sequence
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                            continue;
                        }

                        // Flush previous escape sequence, then reuse this escape byte
                        // NOTE(review): this else-branch is only entered when escapeCount > 0 is false,
                        // so the escapeCount == 0 test above looks like it always succeeds and this
                        // call appears unreachable - confirm before relying on it.
                        buffer.AdjustBytes(-1);
                    }
                }

                // SO selects the two byte KR mode, SI goes back to ASCII
                if (ch == SHIFT_OUT)
                {
                    currentMode = ISO2022Modes.ModeKR;
                    continue;
                }
                else if (ch == SHIFT_IN)
                {
                    currentMode = ISO2022Modes.ModeASCII;
                    continue;
                }

                // Get our full character
                ushort iBytes = ch;
                bool b2Bytes = false;

                // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I don't see that in the RFC.
                if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n')
                {
                    //
                    // To handle errors, we need to check:
                    //    1. if trailbyte is there
                    //    2. if code is valid
                    //
                    if (escapeCount > 0)
                    {
                        // Let another escape fall through
                        if (escapeBytes[0] != ESCAPE)
                        {
                            // Move them down one & get the next data
                            iBytes <<= 8;
                            iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                            b2Bytes = true;
                        }
                    }
                    else if (buffer.MoreData)
                    {
                        // Trail byte comes straight from the input
                        iBytes <<= 8;
                        iBytes |= buffer.GetNextByte();
                        b2Bytes = true;
                    }
                    else
                    {
                        // Not enough input, use decoder if possible
                        if (decoder == null || decoder.MustFlush)
                        {
                            // No decoder, do fallback for lonely 1st byte
                            buffer.Fallback(ch);
                            break;
                        }

                        // Stick it in the decoder if we're not counting
                        // (the lead byte is parked in escapeBytes and saved to the decoder after the loop)
                        if (chars != null)
                        {
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                        }
                        break;
                    }
                }

                // We have a iBytes to try to convert.
                char c = mapBytesToUnicode[iBytes];

                // See if it was unknown
                if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
                {
                    // Have to do fallback
                    if (b2Bytes)
                    {
                        if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
                            break;
                    }
                    else
                    {
                        if (!buffer.Fallback(ch))
                            break;
                    }
                }
                else
                {
                    // Second argument is the number of input bytes this character used
                    if (!buffer.AddChar(c, b2Bytes ? 2:1))
                        break;
                }
            }

            // Make sure our decoder state matches our mode, if not counting
            if (chars != null && decoder != null)
            {
                // Remember it if we don't flush
                if (!decoder.MustFlush || escapeCount != 0)
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;

                    // Remember escape buffer
                    decoder.bytesLeftOverCount = escapeCount;
                    decoder.bytesLeftOver = escapeBytes;
                }
                else
                {
                    // We flush, clear buffer
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                    decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
                    decoder.bytesLeftOverCount = 0;
                }

                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }

        // We know we have an escape sequence, so check it starting with the byte after the escape
        //
        // Returns ModeInvalidEscape when bytes[0] is not ESC, ModeIncompleteEscape while fewer than
        // four bytes are buffered, ModeKR for the exact sequence <esc>$)C, and ModeInvalidEscape for
        // any other four byte sequence.
        private ISO2022Modes CheckEscapeSequenceKR( byte[] bytes, int escapeCount )
        {
            // Have an escape sequence
            if (bytes[0] != ESCAPE)
                return ISO2022Modes.ModeInvalidEscape;

            if (escapeCount < 4)
                return ISO2022Modes.ModeIncompleteEscape;

            if (bytes[1] == '$' && bytes[2] == ')' && bytes[3] == 'C') // <esc>$)C
                return ISO2022Modes.ModeKR;

            // If we get here we fell through and have an invalid/unknown escape sequence
            return ISO2022Modes.ModeInvalidEscape;
        }

        // CP52936 is HZ Encoding
        // HZ Encoding has 4 shift sequences:
        // ~~ '~' (\u7e)
        // ~} shift into 1 byte mode,
        // ~{ shift into 2 byte GB 2312-80
        // ~<NL> Maintain 2 byte mode across new lines
// (ignore both ~ and <NL> characters)
        // (This is for mailers that restrict to 70 or 80 or whatever character lines)
        //
        // According to comment in mlang, lead & trail byte ranges are described in RFC 1843
        // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
        // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
        //
        // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set.
        // (all bytes <= 0x7f)
        //
        // Returns buffer.Count. When chars is null the decoder's left-over byte and mode are not
        // written back (the "counting" case referred to in the comments below).
        private unsafe int GetCharsCP52936(byte* bytes, int byteCount,
                                             char* chars, int charCount, ISO2022Decoder decoder)
        {
            Debug.Assert(byteCount >=0, "[ISO2022Encoding.GetCharsCP52936]count >=0");
            Debug.Assert(bytes!=null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null");

            // Get our info.
            Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
                this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;
            int byteLeftOver = -1;          // -1 means "no pending byte"
            bool bUsedDecoder = false;      // set once this call stores a byte in the decoder

            if (decoder != null)
            {
                currentMode = decoder.currentMode;
                // See if we have leftover decoder buffer to use
                // Don't want to mess up decoder if we're counting or throw an exception
                if (decoder.bytesLeftOverCount != 0 )
                {
                    // Load our bytesLeftOver
                    byteLeftOver = decoder.bytesLeftOver[0];
                }
            }

            // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
            while (buffer.MoreData || byteLeftOver >= 0)
            {
                byte ch;

                // May have a left over byte
                if (byteLeftOver >= 0)
                {
                    ch = (byte)byteLeftOver;
                    byteLeftOver = -1;
                }
                else
                {
                    ch = buffer.GetNextByte();
                }

                // We're in escape mode
                if (ch == '~')
                {
                    // Next char is type of switch
                    if (!buffer.MoreData)
                    {
                        // We don't have anything left, it'll be in decoder or a ?
                        // don't fail if we are allowing overflows
                        if (decoder == null || decoder.MustFlush)
                        {
                            // We'll be a '?'
                            buffer.Fallback(ch);
                            // Stop whether or not the fallback succeeded: there is no more input (!MoreData)
                            break;
                        }

                        // Stick it in decoder
                        // (decoder cannot be null here: the branch above already exited in that case)
                        // NOTE(review): ClearMustFlush() runs even when chars == null (counting) - confirm intended.
                        if (decoder != null)
                            decoder.ClearMustFlush();

                        if (chars != null)
                        {
                            decoder.bytesLeftOverCount = 1;
                            decoder.bytesLeftOver[0] = (byte)'~';
                            bUsedDecoder = true;
                        }
                        break;
                    }

                    // What type is it?, get 2nd byte
                    ch = buffer.GetNextByte();

                    if (ch == '~' && currentMode == ISO2022Modes.ModeASCII)
                    {
                        // It's just a ~~ replacement for ~, add it (two input bytes consumed)
                        if (!buffer.AddChar((char)ch, 2))
                            // Add failed, break for converting
                            break;

                        // Add succeeded, continue
                        continue;
                    }
                    else if (ch == '{')
                    {
                        // Switching to Double Byte mode
                        currentMode = ISO2022Modes.ModeHZ;
                        continue;
                    }
                    else if (ch == '}')
                    {
                        // Switching to ASCII mode
                        currentMode = ISO2022Modes.ModeASCII;
                        continue;
                    }
                    else if (ch == '\n')
                    {
                        // Ignore ~\n sequence
                        continue;
                    }
                    else
                    {
                        // Unknown escape, back up and try the '~' as a "normal" byte or lead byte
                        buffer.AdjustBytes(-1);
                        ch = (byte)'~';
                    }
                }

                // go ahead and add our data
                if (currentMode != ISO2022Modes.ModeASCII)
                {
                    // Should be ModeHZ
                    Debug.Assert(currentMode == ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ");
                    char cm;

                    // Everett allowed characters < 0x20 to be passed as if they were ASCII
                    if (ch < 0x20)
                    {
                        // Emit it as ASCII
                        goto STOREASCII;
                    }

                    // It's multibyte, should have another byte
                    if (!buffer.MoreData)
                    {
                        // No bytes left
                        // don't fail if we are allowing overflows
                        if (decoder == null || decoder.MustFlush)
                        {
                            // Not enough bytes, fallback lead byte
                            buffer.Fallback(ch);

                            // Break if we fail & break because !MoreData
                            break;
                        }

                        // (decoder cannot be null here: the branch above already exited in that case)
                        if (decoder != null)
                            decoder.ClearMustFlush();

                        // Stick it in decoder
                        if (chars != null)
                        {
                            decoder.bytesLeftOverCount = 1;
                            decoder.bytesLeftOver[0] = ch;
                            bUsedDecoder = true;
                        }
                        break;
                    }

                    // Everett uses space as an escape character for single SBCS bytes
                    byte ch2 = buffer.GetNextByte();
                    ushort iBytes = (ushort)(ch << 8 | ch2);

                    if (ch == ' ' && ch2 != 0)
                    {
                        // Get next char and treat it like ASCII (Everett treated space like an escape
                        // allowing the next char to be just ascii)
                        cm = (char)ch2;
                        goto STOREMULTIBYTE;
                    }

                    // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e
                    if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) &&
                        // Everett allowed high bit mappings for same characters (but only if both bits set)
                        (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe))
                    {
                        // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp)
                        if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d)
                        {
                            iBytes = 0x2121;
                            goto MULTIBYTE;
                        }

                        // Illegal char, use fallback. If lead byte is 0 have to do it special and do it first
                        if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
                            break;
                        continue;
                    }

                MULTIBYTE:
                    // Set both high bits to get the code point form used by the 936 table (see header comment)
                    iBytes |= 0x8080;
                    // Look up the multibyte char to stick it in our data

                    // We have a iBytes to try to convert.
                    cm = mapBytesToUnicode[iBytes];

                STOREMULTIBYTE:

                    // See if it was unknown
                    if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0)
                    {
                        // Fall back the unknown stuff
                        if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
                            break;
                        continue;
                    }

                    if (!buffer.AddChar(cm, 2))
                        break; // convert ran out of buffer, stop
                    continue;
                }

                // Just ASCII
                // We allow some chars > 7f because everett did, so we have to look them up.
            STOREASCII:
                char c = mapBytesToUnicode[ch];

                // Check if it was unknown
                if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0))
                {
                    // fallback the unknown bytes
                    if (!buffer.Fallback((byte)ch))
                        break;
                    continue;
                }

                // Go ahead and add our ASCII character
                if (!buffer.AddChar(c))
                    break; // convert ran out of buffer, stop
            }

            // Need to remember our state, IF we're not counting
            if (chars != null && decoder != null)
            {
                if (!bUsedDecoder)
                {
                    // If we didn't use it, clear the byte left over
                    decoder.bytesLeftOverCount = 0;
                }

                if (decoder.MustFlush && decoder.bytesLeftOverCount == 0)
                {
                    // Flushing with nothing pending: go back to the initial mode
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                }
                else
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;
                }
                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }

        // Note: These all end up with 1/2 bytes of average byte count, so unless we're 1 we're always
        // charCount/2 bytes too big.
        public override int GetMaxByteCount(int charCount)
        {
            if (charCount < 0)
                throw new ArgumentOutOfRangeException(nameof(charCount),
                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
            Contract.EndContractBlock();

            // Characters would be # of characters + 1 in case high surrogate is ? * max fallback
            long byteCount = (long)charCount + 1;

            if (EncoderFallback.MaxCharCount > 1)
                byteCount *= EncoderFallback.MaxCharCount;

            // Start with just generic DBCS values (sort of).
int perChar = 2;
            int extraStart = 0;
            int extraEnd = 0;

            switch (CodePage)
            {
                case 50220:
                case 50221:
                    // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
                    perChar = 5; // 5 max (4.5 average)
                    extraEnd = 3; // 3 bytes to shift back to ASCII
                    break;
                case 50222:
                    // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
                    perChar = 5; // 5 max (4.5 average)
                    extraEnd = 4; // 1 byte to shift from Katakana -> DBCS, 3 bytes to shift back to ASCII from DBCS
                    break;
                case 50225:
                    // 2 bytes per char + 1 byte SO, or 1 byte per char + 1 byte SI.
                    perChar = 3; // 3 max, (2.5 average)
                    extraStart = 4; // EUC-KR marker appears at beginning of file.
                    extraEnd = 1; // 1 byte to shift back to ascii if necessary.
                    break;
                case 52936:
                    // 2 bytes per char + 2 byte shift, or 1 byte + 1 byte shift
                    // Worst case: left over surrogate with no low surrogate is extra ?, could have to switch to ASCII, then could have HZ and flush to ASCII mode
                    perChar = 4; // 4 max, (3.5 average if every other char is HZ/ASCII)
                    extraEnd = 2; // 2 if we have to shift back to ASCII
                    break;
            }

            // Return our surrogate and End plus perChar for each char.
            byteCount *= perChar;
            byteCount += extraStart + extraEnd;

            if (byteCount > 0x7fffffff)
                throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));

            return (int)byteCount;
        }

        // Upper bound on the number of chars that decoding byteCount bytes can produce:
        // one char per byte, plus whatever a decoder may still be holding from an earlier call,
        // all multiplied by the decoder fallback's MaxCharCount when that is larger than 1.
        // Throws ArgumentOutOfRangeException for a negative byteCount or when the bound exceeds int.MaxValue.
        public override int GetMaxCharCount(int byteCount)
        {
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(byteCount),
                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
            Contract.EndContractBlock();

            // Worst case is all ASCII for every code page handled here
            const int charsPerByte = 1;

            // Chars that bytes held over in the decoder could still turn into
            int heldOver;
            switch (CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                case 50225:
                    // Could have left over 3 chars of 4 char escape sequence, that all become ?
                    heldOver = 3;
                    break;

                default:
                    // 52936: sequences are 2 chars, so if next one is illegal, then previous 1 could be ?
                    // (any other code page also ends up with 1)
                    heldOver = 1;
                    break;
            }

            long upperBound = ((long)byteCount * charsPerByte) + heldOver;

            // Just in case we have to fall back unknown ones.
            int fallbackChars = DecoderFallback.MaxCharCount;
            if (fallbackChars > 1)
            {
                upperBound *= fallbackChars;
            }

            if (upperBound > 0x7fffffff)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));
            }

            return (int)upperBound;
        }

        // Each call hands out a new encoder bound to this encoding
        public override Encoder GetEncoder() => new ISO2022Encoder(this);

        // Each call hands out a new decoder bound to this encoding
        public override Decoder GetDecoder() => new ISO2022Decoder(this);

        [Serializable]
        internal class ISO2022Encoder : System.Text.EncoderNLS
        {
            internal ISO2022Modes currentMode;
            internal ISO2022Modes shiftInOutMode;

            internal ISO2022Encoder(EncodingNLS encoding) : base(encoding)
            {
                // base calls reset
            }

            public override void Reset()
            {
                // Reset
                currentMode = ISO2022Modes.ModeASCII;
                shiftInOutMode = ISO2022Modes.ModeASCII;
                charLeftOver = (char)0;
                if (m_fallbackBuffer != null)
                    m_fallbackBuffer.Reset();
            }

            // Anything left in our encoder?
internal override bool HasState
            {
                get
                {
                    // Don't check shift-out mode, it may be ascii (JP) or not (KR)
                    return (this.charLeftOver != (char)0 ||
                            currentMode != ISO2022Modes.ModeASCII);
                }
            }
        }

        // Decoder state carried from one GetChars call to the next: up to four pending input bytes
        // (a partial escape sequence or a lone lead byte), the current mode, and the mode remembered
        // for shift-in (used by the JP decoder; the KR and HZ decoders only reset it).
        [Serializable]
        internal class ISO2022Decoder : System.Text.DecoderNLS
        {
            internal byte[] bytesLeftOver;              // pending bytes; only the first bytesLeftOverCount are meaningful
            internal int bytesLeftOverCount;            // number of valid entries in bytesLeftOver
            internal ISO2022Modes currentMode;          // mode in effect at the end of the last call
            internal ISO2022Modes shiftInOutMode;       // mode to go back to on shift-in

            internal ISO2022Decoder(EncodingNLS encoding) : base(encoding)
            {
                // base calls reset
            }

            // Drops any pending bytes, returns both modes to ASCII and resets the fallback buffer if there is one
            public override void Reset()
            {
                // Reset
                bytesLeftOverCount = 0;
                bytesLeftOver = new byte[4];
                currentMode = ISO2022Modes.ModeASCII;
                shiftInOutMode = ISO2022Modes.ModeASCII;
                if (m_fallbackBuffer != null)
                    m_fallbackBuffer.Reset();
            }

            // Anything left in our decoder?
            internal override bool HasState
            {
                get
                {
                    // If have bytes left over or not shifted back to ASCII then have problem
                    return (this.bytesLeftOverCount != 0 ||
                            currentMode != ISO2022Modes.ModeASCII);
                }
            }
        }

        // Full-width replacement values for the 63 halfwidth characters named in the per-entry
        // comments (0x8ea1 through 0x8edf), listed in that order.
        // NOTE(review): the lookup site is not visible in this part of the file - presumably the
        // table is indexed by (halfwidth value - 0x8ea1); confirm against the encoder.
        static ushort[] HalfToFullWidthKanaTable =
        {
            0xa1a3, // 0x8ea1 : Halfwidth Ideographic Period
            0xa1d6, // 0x8ea2 : Halfwidth Opening Corner Bracket
            0xa1d7, // 0x8ea3 : Halfwidth Closing Corner Bracket
            0xa1a2, // 0x8ea4 : Halfwidth Ideographic Comma
            0xa1a6, // 0x8ea5 : Halfwidth Katakana Middle Dot
            0xa5f2, // 0x8ea6 : Halfwidth Katakana Wo
            0xa5a1, // 0x8ea7 : Halfwidth Katakana Small A
            0xa5a3, // 0x8ea8 : Halfwidth Katakana Small I
            0xa5a5, // 0x8ea9 : Halfwidth Katakana Small U
            0xa5a7, // 0x8eaa : Halfwidth Katakana Small E
            0xa5a9, // 0x8eab : Halfwidth Katakana Small O
            0xa5e3, // 0x8eac : Halfwidth Katakana Small Ya
            0xa5e5, // 0x8ead : Halfwidth Katakana Small Yu
            0xa5e7, // 0x8eae : Halfwidth Katakana Small Yo
            0xa5c3, // 0x8eaf : Halfwidth Katakana Small Tu
            0xa1bc, // 0x8eb0 : Halfwidth Katakana-Hiragana Prolonged Sound Mark
            0xa5a2, // 0x8eb1 : Halfwidth Katakana A
            0xa5a4, // 0x8eb2 : Halfwidth Katakana I
            0xa5a6, // 0x8eb3 : Halfwidth Katakana U
            0xa5a8, // 0x8eb4 : Halfwidth Katakana E
            0xa5aa, // 0x8eb5 : Halfwidth Katakana O
            0xa5ab, // 0x8eb6 : Halfwidth Katakana Ka
            0xa5ad, // 0x8eb7 : Halfwidth Katakana Ki
            0xa5af, // 0x8eb8 : Halfwidth Katakana Ku
            0xa5b1, // 0x8eb9 : Halfwidth Katakana Ke
            0xa5b3, // 0x8eba : Halfwidth Katakana Ko
            0xa5b5, // 0x8ebb : Halfwidth Katakana Sa
            0xa5b7, // 0x8ebc : Halfwidth Katakana Si
            0xa5b9, // 0x8ebd : Halfwidth Katakana Su
            0xa5bb, // 0x8ebe : Halfwidth Katakana Se
            0xa5bd, // 0x8ebf : Halfwidth Katakana So
            0xa5bf, // 0x8ec0 : Halfwidth Katakana Ta
            0xa5c1, // 0x8ec1 : Halfwidth Katakana Ti
            0xa5c4, // 0x8ec2 : Halfwidth Katakana Tu
            0xa5c6, // 0x8ec3 : Halfwidth Katakana Te
            0xa5c8, // 0x8ec4 : Halfwidth Katakana To
            0xa5ca, // 0x8ec5 : Halfwidth Katakana Na
            0xa5cb, // 0x8ec6 : Halfwidth Katakana Ni
            0xa5cc, // 0x8ec7 : Halfwidth Katakana Nu
            0xa5cd, // 0x8ec8 : Halfwidth Katakana Ne
            0xa5ce, // 0x8ec9 : Halfwidth Katakana No
            0xa5cf, // 0x8eca : Halfwidth Katakana Ha
            0xa5d2, // 0x8ecb : Halfwidth Katakana Hi
            0xa5d5, // 0x8ecc : Halfwidth Katakana Hu
            0xa5d8, // 0x8ecd : Halfwidth Katakana He
            0xa5db, // 0x8ece : Halfwidth Katakana Ho
            0xa5de, // 0x8ecf : Halfwidth Katakana Ma
            0xa5df, // 0x8ed0 : Halfwidth Katakana Mi
            0xa5e0, // 0x8ed1 : Halfwidth Katakana Mu
            0xa5e1, // 0x8ed2 : Halfwidth Katakana Me
            0xa5e2, // 0x8ed3 : Halfwidth Katakana Mo
            0xa5e4, // 0x8ed4 : Halfwidth Katakana Ya
            0xa5e6, // 0x8ed5 : Halfwidth Katakana Yu
            0xa5e8, // 0x8ed6 : Halfwidth Katakana Yo
            0xa5e9, // 0x8ed7 : Halfwidth Katakana Ra
            0xa5ea, // 0x8ed8 : Halfwidth Katakana Ri
            0xa5eb, // 0x8ed9 : Halfwidth Katakana Ru
            0xa5ec, // 0x8eda : Halfwidth Katakana Re
            0xa5ed, // 0x8edb : Halfwidth Katakana Ro
            0xa5ef, // 0x8edc : Halfwidth Katakana Wa
            0xa5f3, // 0x8edd : Halfwidth Katakana N
            0xa1ab, // 0x8ede : Halfwidth Katakana Voiced Sound Mark
            0xa1ac  // 0x8edf : Halfwidth Katakana Semi-Voiced Sound Mark
        };
    }
}
#endif // FEATURE_CODEPAGES_FILE