diff options
author | Jiyoung Yun <jy910.yun@samsung.com> | 2016-11-23 19:09:09 +0900 |
---|---|---|
committer | Jiyoung Yun <jy910.yun@samsung.com> | 2016-11-23 19:09:09 +0900 |
commit | 4b4aad7217d3292650e77eec2cf4c198ea9c3b4b (patch) | |
tree | 98110734c91668dfdbb126fcc0e15ddbd93738ca /src/mscorlib/src/System/Text/ISO2022Encoding.cs | |
parent | fa45f57ed55137c75ac870356a1b8f76c84b229c (diff) | |
download | coreclr-4b4aad7217d3292650e77eec2cf4c198ea9c3b4b.tar.gz coreclr-4b4aad7217d3292650e77eec2cf4c198ea9c3b4b.tar.bz2 coreclr-4b4aad7217d3292650e77eec2cf4c198ea9c3b4b.zip |
Imported Upstream version 1.1.0upstream/1.1.0
Diffstat (limited to 'src/mscorlib/src/System/Text/ISO2022Encoding.cs')
-rw-r--r-- | src/mscorlib/src/System/Text/ISO2022Encoding.cs | 1995 |
1 files changed, 1995 insertions, 0 deletions
diff --git a/src/mscorlib/src/System/Text/ISO2022Encoding.cs b/src/mscorlib/src/System/Text/ISO2022Encoding.cs new file mode 100644 index 0000000000..fe57e7cc57 --- /dev/null +++ b/src/mscorlib/src/System/Text/ISO2022Encoding.cs @@ -0,0 +1,1995 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + +// +// +// Notes: +// +// IsAlwaysNormalized ??? +// Regarding Normalization for ISO-2022-JP (50220, 50221, 50222), it's the same rules as EUCJP +// Forms KC & KD are precluded because of things like halfwidth Katakana that have compatibility mappings +// Form D is precluded because of 0x00a8, which changes to space + diaeresis. +// +// Note: I think that IsAlwaysNormalized should probably return true for form C for Japanese 20932 based CPs. +// +// For ISO-2022-KR +// Never normalized, C & D (& therefore KC & KD) are precluded because of Hangul syllables and combined characters. +// +// IsAlwaysNormalized ??? +// Regarding Normalization for ISO-2022-CN (50227, 50229) & HZ-GB2312 (52936) I think it is similar to the Japanese case. +// Forms KC & KD are precluded because of things like halfwidth Katakana that have compatibility mappings +// Form D is precluded because of 0x00a8, which changes to space + diaeresis. +// +// Note: I think that IsAlwaysNormalized should probably return true for form C for Chinese 20936 based CPs.
//
#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncoding
namespace System.Text
{
    using System.Globalization;
    using System.Diagnostics.Contracts;
    using System.Text;
    using System.Runtime.InteropServices;
    using System;
    using System.Security;
    using System.Runtime.CompilerServices;
    using System.Runtime.Serialization;


    /*=================================ISO2022Encoding============================
    **
    ** This is used to support ISO 2022 encodings that use shift/escape sequences.
    **
    ==============================================================================*/

    [Serializable]
    internal class ISO2022Encoding : DBCSCodePageEncoding
    {
        // Shift Out: emitted by the 50222 encoder before half-width Katakana and by the
        // 50225 encoder before double-byte characters.
        const byte SHIFT_OUT = (byte)0x0E;
        // Shift In: emitted by those same encoders to leave the shifted-out mode again.
        const byte SHIFT_IN = (byte)0x0F;
        // First byte of every escape / designator sequence written by the encoders.
        const byte ESCAPE = 0x1B;
        // Lead byte that CleanUpBytes stamps onto single-byte half-width Katakana (0xa1-0xdf)
        // so that the 5022x encoder can recognize them when it reads mapUnicodeToBytes.
        const byte LEADBYTE_HALFWIDTH = 0x10;

        // Creates the encoding for one of the ISO 2022 style code pages.
        //
        // codePage - the code page being created. Its last decimal digit selects the entry of
        //            tableBaseCodePages that is handed to the base constructor as the code page
        //            whose data tables are loaded (932 for 50220/50221/50222, 949 for 50225,
        //            936 for 52936).
        //
        // This pretends to be other code pages as far as memory sections are concerned.
        [System.Security.SecurityCritical]  // auto-generated
        internal ISO2022Encoding(int codePage) : base(codePage, tableBaseCodePages[codePage % 10])
        {
            // NOTE(review): presumably routes serialization through the MLang proxy type;
            // the flag is defined in a base class that is not visible here - confirm there.
            this.m_bUseMlangTypeForSerialization = true;
        }

        // Constructor called by serialization.
// Note: We use the base GetObjectData however
        //
        // This constructor only exists to satisfy the serialization pattern: it asserts and then
        // always throws ArgumentException.
        [System.Security.SecurityCritical]  // auto-generated
        internal ISO2022Encoding(SerializationInfo info, StreamingContext context) : base(info, context)
        {
            // Actually this can't ever get called, CodePageEncoding is our proxy
            Contract.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor");
            throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException"));
        }

        // Code page whose data tables are loaded for each ISO 2022 code page.
        // The public constructor indexes this table with (codePage % 10); the row comments
        // also line up with the non-negative ISO2022Modes values. 0 means "no table".
        static int[] tableBaseCodePages =
        {
            932,    // 50220 ISO-2022-JP, No halfwidth Katakana, convert to full width
            932,    // 50221 ISO-2022-JP, Use escape sequence for half width Katakana
            932,    // 50222 ISO-2022-JP, Use shift-in/shift-out for half width Katakana
            0,
            0,
            949,    // 50225 ISO-2022-KR, Korean
            936,    // 52936 HZ-GB2312, 936 might be better source
            0, //20936,    // 50227 ISO-2022-CN, Note: This is just the same as CP 936 in Everett.
            0,
            // 50229 is currently unsupported, CP 20000 is currently not built in .nlp file
            0, //20000,    // 50229 ISO-2022-CN, ModeCNS11643_1
            0, //20000,    // 50229 ISO-2022-CN, ModeCNS11643_2
            0       // ModeASCII
        };

        // Shift state tracked by the encoders (and, presumably, by the decoders, which are
        // outside this view). The negative values are not character sets; judging by their
        // names they are results of escape-sequence parsing - confirm against the decoder code.
        internal enum ISO2022Modes
        {
            ModeHalfwidthKatakana = 0,
            ModeJIS0208 = 1,
            ModeKR = 5,
            ModeHZ = 6,
            ModeGB2312 = 7,
            ModeCNS11643_1 = 9,
            ModeCNS11643_2 = 10,
            ModeASCII = 11,

            ModeIncompleteEscape = -1,
            ModeInvalidEscape = -2,
            ModeNOOP = -3
        }

        // Builds the name of the memory section used for this code page's tables.
        //
        // The name is "CodePage_{code page}_{major}_{minor}_{revision}_{build}" followed by a
        // per-family suffix (_ISO2022JP, _ISO2022KR or _HZ). The code page number is
        // dataTableCodePage when bFlagDataTable is set, otherwise CodePage; the version fields
        // are read from pCodePage.
        [System.Security.SecurityCritical]  // auto-generated
        protected unsafe override String GetMemorySectionName()
        {
            int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage;

            String strFormat;

            switch (this.CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022JP";
                    break;
                case 50225:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022KR";
                    break;
                case 52936:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_HZ";
                    break;
                default:
                    Contract.Assert(false, "[ISO2022Encoding.GetMemorySectionName] Don't expect to get here for code page " + this.CodePage);
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}";
                    break;
            }

            String strName = String.Format(CultureInfo.InvariantCulture, strFormat,
                iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor,
                this.pCodePage->VersionRevision, this.pCodePage->VersionBuild);

            return strName;
        }

        // Clean up characters for ISO2022 code pages, etc.
        // ISO2022 (50220, 50221, 50222)
        // GB-HZ   (52936)
        //
        // bytes   - byte value from the base code page table; may be rewritten in place into
        //           the form the ISO 2022 encoders expect.
        // returns - false when the value must not be used for this code page, true otherwise.
        //           NOTE(review): how the base class reacts to false is not visible here;
        //           presumably the mapping is skipped - confirm in DBCSCodePageEncoding.
        protected override bool CleanUpBytes(ref int bytes)
        {
            switch (this.CodePage)
            {
                // 932 based code pages
                case 50220:
                case 50221:
                case 50222:
                    {
                        if (bytes >= 0x100)
                        {
                            // map extended char (0xfa40-0xfc4b) to a special range
                            // (ported from mlang)
                            if (bytes >= 0xfa40 && bytes <= 0xfc4b)
                            {
                                if ( bytes >= 0xfa40 && bytes <= 0xfa5b )
                                {
                                    if ( bytes <= 0xfa49 )
                                        bytes = bytes - 0x0b51 ;
                                    else if ( bytes >= 0xfa4a && bytes <= 0xfa53 )
                                        bytes = bytes - 0x072f6 ;
                                    else if ( bytes >= 0xfa54 && bytes <= 0xfa57 )
                                        bytes = bytes - 0x0b5b ;
                                    else if ( bytes == 0xfa58 )
                                        bytes = 0x878a ;
                                    else if ( bytes == 0xfa59 )
                                        bytes = 0x8782 ;
                                    else if ( bytes == 0xfa5a )
                                        bytes = 0x8784 ;
                                    else if ( bytes == 0xfa5b )
                                        bytes = 0x879a ;
                                }
                                else if ( bytes >= 0xfa5c && bytes <= 0xfc4b )
                                {
                                    // The adjustment depends on the trail byte only.
                                    byte tc = unchecked((byte)bytes);
                                    if ( tc < 0x5c )
                                        bytes = bytes - 0x0d5f;
                                    else if ( tc >= 0x80 && tc <= 0x9B )
                                        bytes = bytes - 0x0d1d;
                                    else
                                        bytes = bytes - 0x0d1c;
                                }
                            }

                            // Convert 932 code page to 20932 like code page range
                            // (also ported from mlang)
                            byte bLead = unchecked((byte)(bytes >> 8));
                            byte bTrail = unchecked((byte)bytes);

                            bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71);
                            bLead = (byte)((bLead << 1) + 1);
                            if (bTrail > (byte)0x9e)
                            {
                                bTrail -= (byte)0x7e;
                                bLead++;
                            }
                            else
                            {
                                if (bTrail > (byte)0x7e)
                                    bTrail--;
                                bTrail -= (byte)0x1f;
                            }

                            bytes = ((int)bLead) << 8 | (int)bTrail;

                            // Don't step out of our allocated lead byte area.
                            // All DBCS lead and trail bytes should be >= 0x21 and <= 0x7e
                            // This is commented out because Everett/Mlang had illegal PUA
                            // mappings to ISO2022 code pages that we're maintaining.
//                            if ((bytes & 0xFF00) < 0x2100 || (bytes & 0xFF00) > 0x7e00 ||
  //                            (bytes & 0xFF) < 0x21 || (bytes & 0xFF) > 0x7e)
    //                            return false;
                        }
                        else
                        {
                            // Adjust 1/2 Katakana: 0xa1-0xdf becomes LEADBYTE_HALFWIDTH + (value - 0x80)
                            if (bytes >= 0xa1 && bytes <= 0xdf)
                                bytes += (LEADBYTE_HALFWIDTH << 8) - 0x80;

                            // 0x81-0x9f and 0xe0-0xfc CP 932
                            // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though)
                            // a1-df is 1/2 Katakana (already moved out of the single byte range above)
                            if (bytes >= 0x81 &&
                                (bytes <= 0x9f ||
                                 (bytes >= 0xe0 && bytes <= 0xfc)))
                            {
                                // Don't do lead bytes, we use escape sequences instead.
                                return false;
                            }
                        }
                        break;
                    }
                case 50225:
                    {
                        // For 50225 since we don't rely on lead byte marks, return false and don't add them,
                        // esp. since we're only a 7 bit code page.
                        if (bytes >= 0x80 && bytes <= 0xff)
                            return false;

                        // Ignore double byte characters whose lead or trail byte is outside a1-fe
                        if (bytes >= 0x100 &&
                            ((bytes & 0xff) < 0xa1 || (bytes & 0xff) == 0xff ||
                             (bytes & 0xff00) < 0xa100 || (bytes & 0xff00) == 0xff00))
                            return false;

                        // May as well get them into our 7 bit range
                        bytes &= 0x7f7f;

                        break;
                    }
                case 52936:
                    {
                        // Since we don't rely on lead byte marks for 52936, get rid of them so we
                        // don't end up with extra weird fffe mappings.
                        if (bytes >= 0x81 && bytes <= 0xfe)
                            return false;

                        break;
                    }
            }

            return true;
        }

        // GetByteCount
        //
        // Counts the bytes needed to encode count chars by running GetBytes with a null output.
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(count >= 0, "[ISO2022Encoding.GetByteCount]count is negative");
            Contract.Assert(chars != null, "[ISO2022Encoding.GetByteCount]chars is null");

            // Just call GetBytes with null byte* to get count
            return GetBytes(chars, count, null, 0, baseEncoder);
        }

        // Encodes chars into bytes by dispatching to the encoder routine for this code page.
        //
        // bytes       - output buffer, or null when only counting.
        // baseEncoder - optional stateful encoder; cast to ISO2022Encoder.
        // returns     - the count reported by the code page specific routine (0 if the code
        //               page is not one this class handles, which is asserted against).
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetBytes(char* chars, int charCount,
                                              byte* bytes, int byteCount, EncoderNLS baseEncoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(chars != null, "[ISO2022Encoding.GetBytes]chars is null");
            Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetBytes]byteCount is negative");
            Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetBytes]charCount is negative");

            // Assert because we shouldn't be able to have a null encoder.
            Contract.Assert(encoderFallback != null, "[ISO2022Encoding.GetBytes]Attempting to use null encoder fallback");

            // Fix our encoder
            ISO2022Encoder encoder = (ISO2022Encoder)baseEncoder;

            // Our return value
            int iCount = 0;

            switch(CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    iCount = GetBytesCP5022xJP( chars, charCount, bytes, byteCount, encoder );
                    break;
                case 50225:
                    iCount = GetBytesCP50225KR( chars, charCount, bytes, byteCount, encoder );
                    break;
// Everett had 50227 the same as 936
/*              case 50227:
                    iCount = GetBytesCP50227CN( chars, charCount, bytes, byteCount, encoder );
                    break;
*/
                case 52936:
                    iCount = GetBytesCP52936( chars, charCount, bytes, byteCount, encoder );
                    break;
                default:
                    // Mirror GetChars: an unexpected code page is a programming error, not "0 bytes".
                    Contract.Assert(false, "[ISO2022Encoding.GetBytes] had unexpected code page");
                    break;
            }

            return iCount;
        }

        // This is internal and called by something else,
        //
        // Counts the chars produced by decoding count bytes by running GetChars with a null output.
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
        {
            // Just assert, we're called internally so these should be safe, checked already
            Contract.Assert(bytes != null, "[ISO2022Encoding.GetCharCount]bytes is null");
            Contract.Assert(count >= 0, "[ISO2022Encoding.GetCharCount]byteCount is negative");

            // Just call getChars with null char* to get count
            return GetChars(bytes, count, null, 0, baseDecoder);
        }

        // Decodes bytes into chars by dispatching to the decoder routine for this code page.
        //
        // chars       - output buffer, or null when only counting.
        // baseDecoder - optional stateful decoder; cast to ISO2022Decoder.
        // returns     - the count reported by the code page specific routine (0 after the
        //               assert if the code page is unexpected).
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetChars(byte* bytes, int byteCount,
                                              char* chars, int charCount, DecoderNLS baseDecoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(bytes != null, "[ISO2022Encoding.GetChars]bytes is null");
            Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetChars]byteCount is negative");
            Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetChars]charCount is negative");

            // Fix our decoder
            ISO2022Decoder decoder = (ISO2022Decoder)baseDecoder;
            int iCount = 0;

            switch (CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    iCount = GetCharsCP5022xJP( bytes, byteCount, chars, charCount, decoder);
                    break;
                case 50225:
                    iCount = GetCharsCP50225KR( bytes, byteCount, chars, charCount, decoder);
                    break;
                // Currently 50227 is the same as 936
//                case 50227:
  //                  iCount = GetCharsCP50227CN( bytes, byteCount, chars, charCount, decoder);
    //                break;
                case 52936:
                    iCount = GetCharsCP52936( bytes, byteCount, chars, charCount, decoder);
                    break;
                default:
                    Contract.Assert(false, "[ISO2022Encoding.GetChars] had unexpected code page");
                    break;
            }

            return iCount;
        }

        // ISO 2022 Code pages for JP.
        //   50220 - No halfwidth Katakana, convert to full width
        //   50221 - Use escape sequence for half width Katakana
        //   50222 - Use shift-in/shift-out for half width Katakana
        //
        // These are the JIS code pages, superset of ISO-2022 / ISO-2022-JP-1
        //  0E          Shift Out (following bytes are Katakana)
        //  0F          Shift In  (back to "normal" behavior)
        //  21-7E       Byte ranges (1 or 2 bytes)
        //  <ESC> $ @   To Double Byte 0208 Mode (actually older code page, but subset of 0208)
        //  <ESC> $ B   To Double Byte 0208 Mode (duplicate)
        //  <ESC> $ ( D To Double Byte 0212 Mode (previously we misinterpreted this)
        //  <ESC> ( I   To half width Katakana
        //  <ESC> ( J   To JIS-Roman
        //  <ESC> ( H   To JIS-Roman (swedish character set)
        //  <ESC> ( B   To ASCII
        //  <ESC> & @   Alternate lead in to <ESC> $ B so just ignore it.
        //
        // So in Katakana mode we add 0x8e as a lead byte and use CP 20932 to convert it
        // In ASCII mode we just spit out the single byte.
        // In Roman mode we should change 0x5c (\) -> Yen sign and 0x7e (~) to Overline, however
        //      we didn't in mLang, otherwise roman is like ASCII.
        // In 0208 double byte mode we have to |= with 0x8080 and use CP 20932 to convert it.
        // In 0212 double byte mode we have to |= with 0x8000 and use CP 20932 to convert it.
        //
        // Note that JIS Shift In/Shift Out is different than the other ISO2022 encodings.
// For JIS
        // Shift out always shifts to half-width Katakana. Chinese encodings use designator sequences
        // instead of escape sequences and shift out to the designated sequence or back in to ASCII.
        //
        // When decoding JIS 0208, MLang used a '*' (0x2a) character in JIS 0208 mode to map the trailing byte
        // to halfwidth katakana. I found no description of that behavior, however that block of 0208 is
        // undefined, so we maintain that behavior when decoding. We will never generate characters using
        // that technique, but the decoder will process them.
        //
        // Encoder for code pages 50220, 50221 and 50222.
        //
        // chars/charCount - input characters.
        // bytes/byteCount - output buffer; bytes is null when the caller only wants a count.
        // encoder         - optional stateful encoder. When present, the current mode and the
        //                   shift-in mode are read from it, and written back (together with
        //                   m_charsUsed) only when bytes is not null.
        // returns         - buffer.Count, i.e. the number of bytes produced/needed.
        //
        // Every mode variable is changed only after the corresponding AddByte succeeded, so that
        // a failed AddByte leaves the state matching what was actually written.
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetBytesCP5022xJP(char* chars, int charCount,
                                                  byte* bytes, int byteCount, ISO2022Encoder encoder)
        {
            // prepare our helpers
            Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
                this, encoder, bytes, byteCount, chars, charCount);

            // Get our mode
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Mode
            ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII;      // Mode that shift in will go back to (only used by CP 50222)

            // Check our encoder
            if (encoder != null)
            {
                char charLeftOver = encoder.charLeftOver;

                currentMode = encoder.currentMode;
                shiftInMode = encoder.shiftInOutMode;

                // We may have a left over character from last time, try and process it.
                if (charLeftOver > 0)
                {
                    Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP5022xJP]leftover character should be high surrogate");

                    // It has to be a high surrogate, which we don't support, so it has to be a fallback
                    buffer.Fallback(charLeftOver);
                }
            }

            while (buffer.MoreData)
            {
                // Get our char
                char ch = buffer.GetNextChar();

                // Get our bytes
                ushort iBytes = mapUnicodeToBytes[ch];

            StartConvert:
                // Split into lead/trail bytes; a lead byte of LEADBYTE_HALFWIDTH marks halfwidth Katakana
                byte bLeadByte = (byte)(iBytes >> 8);
                byte bTrailByte = (byte)(iBytes & 0xff);

                if (bLeadByte == LEADBYTE_HALFWIDTH)
                {
                    // Its Halfwidth Katakana
                    if (CodePage == 50220)
                    {
                        // CodePage 50220 doesn't use halfwidth Katakana, convert to fullwidth
                        // See if its out of range, fallback if so, throws if recursive fallback
                        if (bTrailByte < 0x21 || bTrailByte >= 0x21 + HalfToFullWidthKanaTable.Length)
                        {
                            buffer.Fallback(ch);
                            continue;
                        }

                        // Get the full width katakana char to use.
                        iBytes = unchecked((ushort)(HalfToFullWidthKanaTable[bTrailByte - 0x21] & 0x7F7F));

                        // May have to do all sorts of fun stuff for mode, go back to start convert
                        goto StartConvert;
                    }

                    // Can use halfwidth Katakana, make sure we're in right mode

                    // Make sure we're in right mode
                    if (currentMode != ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // 50222 or 50221, either shift in/out or escape to get to Katakana mode
                        if (CodePage == 50222)
                        {
                            // Shift Out
                            if (!buffer.AddByte(SHIFT_OUT))
                                break;  // convert out of space, stop

                            // Don't change modes until after AddByte in case it fails for convert
                            // We get to shift out to Katakana, make sure we'll go back to the right mode
                            // (This ends up always being ASCII)
                            shiftInMode = currentMode;
                            currentMode = ISO2022Modes.ModeHalfwidthKatakana;
                        }
                        else
                        {
                            // 50221 does halfwidth katakana by escape sequence
                            Contract.Assert(CodePage == 50221, "[ISO2022Encoding.GetBytesCP5022xJP]Expected Code Page 50221");

                            // Add our escape sequence (<ESC> ( I)
                            if (!buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'I')))
                                break;  // convert out of space, stop

                            currentMode = ISO2022Modes.ModeHalfwidthKatakana;
                        }
                    }

                    // We know we're in Katakana mode now, so add it.
                    // Go ahead and add the Katakana byte. Our table tail bytes are 0x80 too big.
                    if (!buffer.AddByte(unchecked((byte)(bTrailByte & 0x7F))))
                        break;  // convert out of space, stop

                    // Done with this one
                    continue;
                }
                else if (bLeadByte != 0)
                {
                    //
                    //  It's a double byte character.
                    //

                    // If we're CP 50222 we may have to shift in from Katakana mode first
                    if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // Shift In
                        if (!buffer.AddByte(SHIFT_IN))
                            break;    // convert out of space, stop

                        // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    }

                    // Make sure we're in the right mode (JIS 0208 or JIS 0212)
                    // Note: Right now we don't use JIS 0212. Also this table'd be wrong

                    // Its JIS extension 0208
                    if (currentMode != ISO2022Modes.ModeJIS0208)
                    {
                        // Escape sequence (<ESC> $ B), we can fail after this, mode will be correct for convert
                        if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)'B')))
                            break;  // Convert out of space, stop

                        currentMode = ISO2022Modes.ModeJIS0208;
                    }

                    // Add our double bytes
                    if (!buffer.AddByte(unchecked((byte)(bLeadByte)), unchecked((byte)(bTrailByte))))
                        break; // Convert out of space, stop
                    continue;
                }
                else if (iBytes != 0 || ch == 0)
                {
                    // Single byte Char (a zero mapping is only legal for the NUL character itself)
                    // If we're CP 50222 we may have to shift in from Katakana mode first
                    if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // Shift IN
                        if (!buffer.AddByte(SHIFT_IN))
                            break;  // convert ran out of room

                        // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    }

                    // Its a single byte character, switch to ASCII if we have to (<ESC> ( B)
                    if (currentMode != ISO2022Modes.ModeASCII)
                    {
                        if (!buffer.AddByte(ESCAPE,unchecked((byte)'('), unchecked((byte)'B')))
                            break; // convert ran out of room

                        currentMode = ISO2022Modes.ModeASCII;
                    }

                    // Add the ASCII char
                    if (!buffer.AddByte(bTrailByte))
                        break; // convert had no room left
                    continue;
                }

                // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
                buffer.Fallback(ch);
            }

            // Switch back to ASCII if MustFlush or no encoder
            if (currentMode != ISO2022Modes.ModeASCII &&
                (encoder == null || encoder.MustFlush))
            {
                // If we're CP 50222 we may have to shift in from Katakana mode first
                if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                {
                    // Shift IN, only shift mode if necessary.
                    if (buffer.AddByte(SHIFT_IN))
                        // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddByte will have decremented our char count, however we need it to remain the same
                        buffer.GetNextChar();
                }

                // switch back to ASCII to finish neatly
                // (skipped when the shift in above failed and we are still in 50222 Katakana mode)
                if (currentMode != ISO2022Modes.ModeASCII &&
                    (CodePage != 50222 || currentMode != ISO2022Modes.ModeHalfwidthKatakana))
                {
                    // only shift if it was successful
                    if (buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'B')))
                        currentMode = ISO2022Modes.ModeASCII;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddByte will have decremented our char count, however we need it to remain the same
                        buffer.GetNextChar();
                }
            }

            // Remember our encoder state (only when really converting, not when counting)
            if (bytes != null && encoder != null)
            {
                // This is ASCII if we had to flush
                encoder.currentMode = currentMode;
                encoder.shiftInOutMode = shiftInMode;

                if (!buffer.fallbackBuffer.bUsedEncoder)
                {
                    encoder.charLeftOver = (char)0;
                }

                encoder.m_charsUsed = buffer.CharsUsed;
            }

            // Return our length
            return buffer.Count;
        }

        // ISO 2022 Code pages for Korean - CP 50225
        //
        // CP 50225 has Shift In/Shift Out codes, and a single designator sequence that is supposed
        // to appear once in the file, at the beginning of a line, before any multibyte code points.
        // So we stick the designator at the beginning of the output.
        //
        // These are the KR code page codes for ISO-2022-KR
        //  0E          Shift Out (following bytes are double byte)
        //  0F          Shift In  (back to ASCII behavior)
        //  21-7E       Byte ranges (1 or 2 bytes)
        //  <ESC> $)C   Double byte ISO-2022-KR designator
        //
        // Note that this encoding is a little different than other encodings. The <esc>$)C sequence
        // should only appear once per file. (Actually I saw another spec/rfc that said at the beginning
        // of each line, but it shouldn't really matter.)
//
        // During decoding Mlang accepted ' ', '\t', and '\n' as their respective characters, even if
        // it was in double byte mode. We maintain that behavior, although I couldn't find a reference or
        // reason for that behavior. We never generate data using that shortcut.
        //
        // Also Mlang always assumed KR mode, even if the designator wasn't found yet, so we do that as
        // well. So basically we just ignore <ESC>$)C when decoding.
        //
        // Encoder for code page 50225.
        //
        // chars/charCount - input characters.
        // bytes/byteCount - output buffer; bytes is null when the caller only wants a count.
        // encoder         - optional stateful encoder. currentMode carries the shift state;
        //                   shiftInOutMode is ModeKR once the <ESC>$)C designator has been
        //                   written. State is written back only when bytes is not null.
        // returns         - buffer.Count, i.e. the number of bytes produced/needed.
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetBytesCP50225KR(char* chars, int charCount,
                                                    byte* bytes, int byteCount, ISO2022Encoder encoder)
        {
            // prepare our helpers
            Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
                this, encoder, bytes, byteCount, chars, charCount);

            // Get our mode
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Mode
            ISO2022Modes shiftOutMode = ISO2022Modes.ModeASCII;     // ModeKR if already stamped lead bytes

            // Check our encoder
            if (encoder != null)
            {
                // May have leftover stuff
                char charLeftOver = encoder.charLeftOver;
                currentMode = encoder.currentMode;
                shiftOutMode = encoder.shiftInOutMode;

                // We may have a left over character from last time, try and process it.
                if (charLeftOver > 0)
                {
                    Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP50225KR]leftover character should be high surrogate");

                    // It has to be a high surrogate, which we don't support, so it has to be a fallback
                    buffer.Fallback(charLeftOver);
                }
            }

            while (buffer.MoreData)
            {
                // Get our data
                char ch = buffer.GetNextChar();

                // Get our bytes
                ushort iBytes = mapUnicodeToBytes[ch];

                // Check for double byte bytes
                byte bLeadByte = (byte)(iBytes >> 8);
                byte bTrailByte = (byte)(iBytes & 0xff);

                if (bLeadByte != 0)
                {
                    //
                    //  It's a double byte character.
                    //

                    // If we haven't done our Korean designator, then do so, if we have any input
                    if (shiftOutMode != ISO2022Modes.ModeKR)
                    {
                        // Add our code page designator sequence
                        if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)')'), unchecked((byte)'C')))
                            break; // No room during convert.

                        shiftOutMode = ISO2022Modes.ModeKR;
                    }

                    // May have to switch to ModeKR first
                    if (currentMode != ISO2022Modes.ModeKR)
                    {
                        if (!buffer.AddByte(SHIFT_OUT))
                            break; // No convert room

                        currentMode = ISO2022Modes.ModeKR;
                    }

                    // Add the bytes
                    if (!buffer.AddByte(bLeadByte, bTrailByte))
                        break; // no convert room
                    continue;
                }
                else if (iBytes != 0 || ch == 0)
                {
                    // Its a single byte character, switch to ASCII if we have to
                    // (a zero mapping is only legal for the NUL character itself)
                    if (currentMode != ISO2022Modes.ModeASCII)
                    {
                        if (!buffer.AddByte(SHIFT_IN))
                            break;

                        currentMode = ISO2022Modes.ModeASCII;
                    }

                    // Add the ASCII char
                    if (!buffer.AddByte(bTrailByte))
                        break;
                    continue;
                }

                // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
                buffer.Fallback(ch);
            }

            // Switch back to ASCII if MustFlush or no encoder
            if (currentMode != ISO2022Modes.ModeASCII &&
                (encoder == null || encoder.MustFlush))
            {
                // Get back to ASCII to be safe. Only do it if it succeeds.
                if (buffer.AddByte(SHIFT_IN))
                    currentMode = ISO2022Modes.ModeASCII;
                else
                    // If not successful, convert will maintain state for next time, also
                    // AddByte will have decremented our char count, however we need it to remain the same
                    buffer.GetNextChar();
            }

            // Remember our encoder state (only when really converting, not when counting)
            if (bytes != null && encoder != null)
            {
                // If we didn't use the encoder, then there's no chars left over
                if (!buffer.fallbackBuffer.bUsedEncoder)
                {
                    encoder.charLeftOver = (char)0;
                }

                // This is ASCII if we had to flush
                encoder.currentMode = currentMode;

                // We don't use shift out mode, but if we've flushed we need to reset it so it doesn't
                // get output again.
                if (!encoder.MustFlush || encoder.charLeftOver != (char)0)
                {
                    // We should be not flushing or converting
                    Contract.Assert(!encoder.MustFlush || !encoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetBytesCP50225KR]Expected no left over data or not flushing or not converting");
                    encoder.shiftInOutMode = shiftOutMode;
                }
                else
                    // Flushed with nothing left over: forget that the designator was written
                    encoder.shiftInOutMode = ISO2022Modes.ModeASCII;

                encoder.m_charsUsed = buffer.CharsUsed;
            }

            // Return our length
            return buffer.Count;
        }

        // CP52936 is HZ Encoding
        // HZ Encoding has 4 shift sequences:
        // ~~       '~' (\u7e)
        // ~}       shift into 1 byte mode,
        // ~{       shift into 2 byte GB 2312-80
        // ~<NL>    Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters)
        //          (This is for mailers that restrict to 70 or 80 or whatever character lines)
        //
        // According to comment in mlang, lead & trail byte ranges are described in RFC 1843
        // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
        // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
        //
        // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set.
//  (all bytes <= 0x7f)
//
// Encodes the input chars as HZ bytes through an EncodingByteBuffer and returns buffer.Count.
// Encoder state (currentMode, charLeftOver, m_charsUsed) is only written back when both
// encoder and bytes are non-null.
[System.Security.SecurityCritical]  // auto-generated
private unsafe int GetBytesCP52936(char* chars, int charCount,
                                   byte* bytes, int byteCount, ISO2022Encoder encoder)
{
    // prepare our helpers
    Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
        this, encoder, bytes, byteCount, chars, charCount);

    // Mode
    ISO2022Modes currentMode = ISO2022Modes.ModeASCII;

    // Check our encoder
    if (encoder != null)
    {
        char charLeftOver = encoder.charLeftOver;
        currentMode = encoder.currentMode;

        // We may have a left over character from last time, try and process it.
        if (charLeftOver > 0)
        {
            Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP52936]leftover character should be high surrogate");

            // It has to be a high surrogate, which we don't support, so it has to be a fallback
            buffer.Fallback(charLeftOver);
        }
    }

    while (buffer.MoreData)
    {
        // Get our char
        char ch = buffer.GetNextChar();

        // Get our bytes
        ushort sChar = mapUnicodeToBytes[ch];
        if (sChar == 0 && ch != 0)
        {
            // Wasn't a legal byte sequence, it's a surrogate or fallback
            // Throws if recursive (knows because we called InternalGetNextChar)
            buffer.Fallback(ch);

            // Done with our char, now process fallback
            continue;
        }

        // Check for halfwidth bytes
        byte bLeadByte = (byte)(sChar >> 8);
        byte bTrailByte = (byte)(sChar & 0xff);

        // If it's a double byte, it has to fit in the lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe range
        // (including the 0x8080 that our codepage or's to the value)
        if ((bLeadByte != 0 &&
             (bLeadByte < 0xa1 || bLeadByte > 0xf7 || bTrailByte < 0xa1 || bTrailByte > 0xfe)) ||
            (bLeadByte == 0 && bTrailByte > 0x80 && bTrailByte != 0xff))
        {
            // Illegal character, in 936 code page, but not in HZ subset, get fallback for it
            buffer.Fallback(ch);
            continue;
        }

        // sChar is now either ASCII or has an 0x8080 mask
        if (bLeadByte != 0)
        {
            // It's a double byte mode
            if (currentMode != ISO2022Modes.ModeHZ)
            {
                // Need to add the double byte mode marker
                // (2 more bytes, the character itself, must also fit)
                if (!buffer.AddByte((byte)'~', (byte)'{', 2))
                    break;                                      // Stop if no buffer space in convert

                currentMode = ISO2022Modes.ModeHZ;
            }

            // Go ahead and add the 2 bytes (high bits stripped so every byte is <= 0x7f)
            if (!buffer.AddByte(unchecked((byte)(bLeadByte & 0x7f)), unchecked((byte)(bTrailByte & 0x7f))))
                break;                                      // Stop if no buffer space in convert
        }
        else
        {
            // It's supposed to be ASCII
            if (currentMode != ISO2022Modes.ModeASCII)
            {
                // Need to add the ASCII mode marker
                // Will have 1 more byte (or 2 if ~)
                if (!buffer.AddByte((byte)'~', (byte)'}', bTrailByte == '~' ? 2:1))
                    break;

                currentMode = ISO2022Modes.ModeASCII;
            }

            // If it's a '~' we'll need an extra one
            if (bTrailByte == '~')
            {
                // Need to add the extra ~
                if (!buffer.AddByte((byte)'~', 1))
                    break;
            }

            // Need to add the character
            if (!buffer.AddByte(bTrailByte))
                break;
        }
    }

    // Add ASCII shift out if we're at the end of the input (no encoder, or encoder must flush)
    if (currentMode != ISO2022Modes.ModeASCII &&
        (encoder == null || encoder.MustFlush))
    {
        // Need to add the ASCII mode marker
        // Only turn off other mode if this works
        if (buffer.AddByte((byte)'~',(byte)'}'))
            currentMode = ISO2022Modes.ModeASCII;
        else
            // If not successful, convert will maintain state for next time, also
            // AddByte will have decremented our char count, however we need it to remain the same
            buffer.GetNextChar();
    }

    // Need to remember our mode
    if (encoder != null && bytes != null)
    {
        // This is ASCII if we had to flush
        encoder.currentMode = currentMode;

        if (!buffer.fallbackBuffer.bUsedEncoder)
        {
            encoder.charLeftOver = (char)0;
        }

        encoder.m_charsUsed = buffer.CharsUsed;
    }

    // Return our length
    return buffer.Count;
}

// Decodes ISO-2022-JP style bytes (escape sequences plus SHIFT_OUT / SHIFT_IN) into chars
// through an EncodingCharBuffer and returns buffer.Count.
// Up to 4 pending bytes (a partial escape sequence or a lone lead byte) are carried in
// escapeBytes; they are loaded from and saved to the decoder. Decoder state is only written
// back when both chars and decoder are non-null.
[System.Security.SecurityCritical]  // auto-generated
private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount,
                                     char* chars, int charCount, ISO2022Decoder decoder)
{
    // Get our info.
    Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
        this, decoder, chars, charCount, bytes, byteCount);

    // No mode information yet
    ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Our current Mode
    ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII;      // Mode that we'll shift in to
    byte[] escapeBytes = new byte[4];
    int escapeCount = 0;

    if (decoder != null)
    {
        currentMode = decoder.currentMode;
        shiftInMode = decoder.shiftInOutMode;

        // See if we have leftover decoder buffer to use
        // Load our bytesLeftOver
        escapeCount = decoder.bytesLeftOverCount;

        // Don't want to mess up decoder if we're counting or throw an exception
        // (so work on a private copy of the left over bytes)
        for (int i = 0; i < escapeCount; i++)
            escapeBytes[i] = decoder.bytesLeftOver[i];
    }

    // Do this until the end
    while (buffer.MoreData || escapeCount > 0)
    {
        byte ch;

        if (escapeCount > 0)
        {
            // Get more escape sequences if necessary
            if (escapeBytes[0] == ESCAPE)
            {
                // Stop if no more input
                if (!buffer.MoreData)
                {
                    if (decoder != null && !decoder.MustFlush)
                        break;
                }
                else
                {
                    // Add it to the sequence we can check
                    escapeBytes[escapeCount++] = buffer.GetNextByte();

                    // We have an escape sequence
                    ISO2022Modes modeReturn =
                        CheckEscapeSequenceJP(escapeBytes, escapeCount);

                    if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                    {
                        if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
                        {
                            // Processed escape correctly
                            escapeCount = 0;

                            // ModeNOOP (<esc>&@) is only a prefix that is meant to be ignored:
                            // consume it but keep the modes we already have, so that ModeNOOP
                            // never becomes the current (or persisted decoder) mode.
                            if (modeReturn != ISO2022Modes.ModeNOOP)
                            {
                                // We're now this mode
                                currentMode = shiftInMode = modeReturn;
                            }
                        }

                        // Either way, continue to get next escape or real byte
                        continue;
                    }
                }

                // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
            }

            // Read next escape byte and move them down one.
            ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
        }
        else
        {
            // Get our next byte
            ch = buffer.GetNextByte();

            if (ch == ESCAPE)
            {
                // We'll have an escape sequence, use it if we don't have one buffered already
                if (escapeCount == 0)
                {
                    // Start this new escape sequence
                    escapeBytes[0] = ch;
                    escapeCount = 1;
                    continue;
                }

                // Flush the previous escape sequence, then reuse this escape byte
                buffer.AdjustBytes(-1);
            }
        }

        if (ch == SHIFT_OUT)
        {
            // Remember where to come back to, then switch to halfwidth katakana
            shiftInMode = currentMode;
            currentMode = ISO2022Modes.ModeHalfwidthKatakana;
            continue;
        }
        else if (ch == SHIFT_IN)
        {
            currentMode = shiftInMode;
            continue;
        }

        // Get our full character
        ushort iBytes = ch;
        bool b2Bytes = false;

        if (currentMode == ISO2022Modes.ModeJIS0208)
        {
            //
            //  To handle errors, we need to check:
            //    1. if trailbyte is there
            //    2. if code is valid
            //
            if (escapeCount > 0)
            {
                // Let another escape fall through
                if (escapeBytes[0] != ESCAPE)
                {
                    // Move them down one & get the next data
                    iBytes <<= 8;
                    iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                    b2Bytes = true;
                }
            }
            else if (buffer.MoreData)
            {
                iBytes <<= 8;
                iBytes |= buffer.GetNextByte();
                b2Bytes = true;
            }
            else
            {
                // Not enough input, use decoder if possible
                if (decoder == null || decoder.MustFlush)
                {
                    // No decoder, do fallback for this byte
                    buffer.Fallback(ch);
                    break;
                }

                // Stick it in the decoder if we're not counting
                if (chars != null)
                {
                    escapeBytes[0] = ch;
                    escapeCount = 1;
                }
                break;
            }

            // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana
            // escape, so use 0x8e00 as katakana lead byte and keep same trail byte.
            // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have
            // any weird compatibility issues.
            if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00))
            {
                iBytes = (ushort)(iBytes & 0xff);
                iBytes |= (LEADBYTE_HALFWIDTH << 8);   // Put us in the halfwidth katakana range
            }
        }
        else if (iBytes >= 0xA1 && iBytes <= 0xDF)
        {
            // Everett accidentally mapped Katakana like shift-jis (932),
            // even though this is a 7 bit code page.  We keep that mapping
            iBytes |= (LEADBYTE_HALFWIDTH << 8);    // Map to halfwidth katakana range
            iBytes &= 0xff7f;                       // remove extra 0x80
        }
        else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana )
        {
            // Add 0x10 lead byte that our encoding expects for Katakana:
            iBytes |= (LEADBYTE_HALFWIDTH << 8);
        }

        // We have an iBytes to try to convert.
        char c = mapBytesToUnicode[iBytes];

        // See if it was unknown
        if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
        {
            // Have to do fallback
            if (b2Bytes)
            {
                if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
                    break;
            }
            else
            {
                if (!buffer.Fallback(ch))
                    break;
            }
        }
        else
        {
            // If we were JIS 0208, then we consumed an extra byte
            if (!buffer.AddChar(c, b2Bytes ? 2:1))
                break;
        }
    }

    // Make sure our decoder state matches our mode, if not counting
    if (chars != null && decoder != null)
    {
        // Remember it if we don't flush
        if (!decoder.MustFlush || escapeCount != 0)
        {
            // Either not flushing or had state (from convert)
            Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing");

            decoder.currentMode = currentMode;
            decoder.shiftInOutMode = shiftInMode;

            // Remember escape buffer
            decoder.bytesLeftOverCount = escapeCount;
            decoder.bytesLeftOver = escapeBytes;
        }
        else
        {
            // We flush, clear buffer
            decoder.currentMode = ISO2022Modes.ModeASCII;
            decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
            decoder.bytesLeftOverCount = 0;
            // Slightly different if counting/not counting
        }

        decoder.m_bytesUsed = buffer.BytesUsed;
    }

    // Return # of characters we found
    return buffer.Count;
}

// We know we have an escape sequence, so check it starting with the byte after the escape.
// bytes holds the buffered sequence (bytes[0] should be ESCAPE) and escapeCount is how many
// of its entries are valid. Returns the mode the sequence selects, ModeNOOP for <esc>&@,
// ModeIncompleteEscape when more bytes are needed (fewer than 3, or fewer than 4 for a
// possible <esc>$(D), or ModeInvalidEscape otherwise.
private ISO2022Modes CheckEscapeSequenceJP( byte[] bytes, int escapeCount )
{
    // Have an escape sequence
    if (bytes[0] != ESCAPE)
        return ISO2022Modes.ModeInvalidEscape;

    if (escapeCount < 3)
        return ISO2022Modes.ModeIncompleteEscape;

    if (bytes[1] == '(')
    {
        if (bytes[2] == 'B')       // <esc>(B
        {
            return ISO2022Modes.ModeASCII;
        }
        else if (bytes[2] == 'H')  // <esc>(H
        {
            // Actually this is supposed to be Swedish
            // We treat it like ASCII though.
            return ISO2022Modes.ModeASCII;
        }
        else if (bytes[2] == 'J')  // <esc>(J
        {
            // Actually this is supposed to be Roman
            // 2 characters are different, but historically we treat it as ascii
            return ISO2022Modes.ModeASCII;
        }
        else if (bytes[2] == 'I')  // <esc>(I
        {
            return ISO2022Modes.ModeHalfwidthKatakana;
        }
    }
    else if (bytes[1] == '$')
    {
        if (bytes[2] == '@' ||   // <esc>$@
            bytes[2] == 'B')     // <esc>$B
        {
            return ISO2022Modes.ModeJIS0208;
        }
        else
        {
            // Looking for <esc>$(D
            if (escapeCount < 4)
                return ISO2022Modes.ModeIncompleteEscape;

            if (bytes[2] == '(' && bytes[3] == 'D') // <esc>$(D
            {
                // Mlang treated 0208 like 0212 even though that's wrong
                return ISO2022Modes.ModeJIS0208;
            }
        }
    }
    else if (bytes[1] == '&')
    {
        if (bytes[2] == '@')               // <esc>&@
        {
            // Ignore ESC & @ (prefix to <esc>$B)
            return ISO2022Modes.ModeNOOP;
        }
    }

    // If we get here we fell through and have an invalid/unknown escape sequence
    return ISO2022Modes.ModeInvalidEscape;
}

// Removes and returns the first buffered byte: the remaining bytes are shifted down one
// position, the vacated last slot is zeroed and count is decremented.
// Requires count > 0 (asserted).
private byte DecrementEscapeBytes(ref byte[] bytes, ref int count)
{
    Contract.Assert(count > 0, "[ISO2022Encoding.DecrementEscapeBytes]count > 0");

    // Decrement our count
    count--;

    // Remember the first one
    byte returnValue = bytes[0];

    // Move them down one.
    for (int i = 0; i < count; i++)
    {
        bytes[i] = bytes[i+1];
    }

    // Clear out the last byte
    bytes[count] = 0;

    // Return the old 1st byte
    return returnValue;
}

// Note that in DBCS mode mlang passed through ' ', '\t' and '\n' as SBCS characters
// probably to allow mailer formatting without too much extra work.
//
// Decodes ISO-2022-KR style bytes into chars through an EncodingCharBuffer and returns
// buffer.Count. SHIFT_OUT selects ModeKR (2 byte), SHIFT_IN selects ModeASCII; a recognized
// escape sequence is consumed without changing the mode. Decoder state is only written back
// when both chars and decoder are non-null.
[System.Security.SecurityCritical]  // auto-generated
private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount,
                                     char* chars, int charCount, ISO2022Decoder decoder)
{
    // Get our info.
    Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
        this, decoder, chars, charCount, bytes, byteCount);

    // No mode information yet
    ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Our current Mode

    byte[] escapeBytes = new byte[4];
    int escapeCount = 0;

    if (decoder != null)
    {
        currentMode = decoder.currentMode;

        // See if we have leftover decoder buffer to use
        // Load our bytesLeftOver
        escapeCount = decoder.bytesLeftOverCount;

        // Don't want to mess up decoder if we're counting or throw an exception
        // (so work on a private copy of the left over bytes)
        for (int i = 0; i < escapeCount; i++)
            escapeBytes[i] = decoder.bytesLeftOver[i];
    }

    // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
    while (buffer.MoreData || escapeCount > 0)
    {
        byte ch;

        if (escapeCount > 0)
        {
            // Get more escape sequences if necessary
            if (escapeBytes[0] == ESCAPE)
            {
                // Stop if no more input
                if (!buffer.MoreData)
                {
                    if (decoder != null && !decoder.MustFlush)
                        break;
                }
                else
                {
                    // Add it to the sequence we can check
                    escapeBytes[escapeCount++] = buffer.GetNextByte();

                    // We have an escape sequence
                    ISO2022Modes modeReturn =
                        CheckEscapeSequenceKR(escapeBytes, escapeCount);

                    if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                    {
                        if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
                        {
                            // Processed escape correctly, no effect (we know about KR mode)
                            escapeCount = 0;
                        }

                        // Either way, continue to get next escape or real byte
                        continue;
                    }
                }

                // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
            }

            // Still have something left over in escape buffer
            // Get it and move them down one
            ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
        }
        else
        {
            // Get our next byte
            ch = buffer.GetNextByte();

            if (ch == ESCAPE)
            {
                // We'll have an escape sequence, use it if we don't have one buffered already
                if (escapeCount == 0)
                {
                    // Start this new escape sequence
                    escapeBytes[0] = ch;
                    escapeCount = 1;
                    continue;
                }

                // Flush previous escape sequence, then reuse this escape byte
                buffer.AdjustBytes(-1);
            }
        }

        if (ch == SHIFT_OUT)
        {
            currentMode = ISO2022Modes.ModeKR;
            continue;
        }
        else if (ch == SHIFT_IN)
        {
            currentMode = ISO2022Modes.ModeASCII;
            continue;
        }

        // Get our full character
        ushort iBytes = ch;
        bool b2Bytes = false;

        // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I don't see that in the RFC.
        if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n')
        {
            //
            //  To handle errors, we need to check:
            //    1. if trailbyte is there
            //    2. if code is valid
            //
            if (escapeCount > 0)
            {
                // Let another escape fall through
                if (escapeBytes[0] != ESCAPE)
                {
                    // Move them down one & get the next data
                    iBytes <<= 8;
                    iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                    b2Bytes = true;
                }
            }
            else if (buffer.MoreData)
            {
                iBytes <<= 8;
                iBytes |= buffer.GetNextByte();
                b2Bytes = true;
            }
            else
            {
                // Not enough input, use decoder if possible
                if (decoder == null || decoder.MustFlush)
                {
                    // No decoder, do fallback for lonely 1st byte
                    buffer.Fallback(ch);
                    break;
                }

                // Stick it in the decoder if we're not counting
                if (chars != null)
                {
                    escapeBytes[0] = ch;
                    escapeCount = 1;
                }
                break;
            }
        }

        // We have a iBytes to try to convert.
        char c = mapBytesToUnicode[iBytes];

        // See if it was unknown
        if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
        {
            // Have to do fallback
            if (b2Bytes)
            {
                if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
                    break;
            }
            else
            {
                if (!buffer.Fallback(ch))
                    break;
            }
        }
        else
        {
            if (!buffer.AddChar(c, b2Bytes ? 2:1))
                break;
        }
    }

    // Make sure our decoder state matches our mode, if not counting
    if (chars != null && decoder != null)
    {
        // Remember it if we don't flush
        if (!decoder.MustFlush || escapeCount != 0)
        {
            // Either not flushing or had state (from convert)
            Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing");

            decoder.currentMode = currentMode;

            // Remember escape buffer
            decoder.bytesLeftOverCount = escapeCount;
            decoder.bytesLeftOver = escapeBytes;
        }
        else
        {
            // We flush, clear buffer
            decoder.currentMode = ISO2022Modes.ModeASCII;
            decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
            decoder.bytesLeftOverCount = 0;
        }

        decoder.m_bytesUsed = buffer.BytesUsed;
    }

    // Return # of characters we found
    return buffer.Count;
}

// We know we have an escape sequence, so check it starting with the byte after the escape.
// Returns ModeIncompleteEscape until 4 bytes are buffered, ModeKR for <esc>$)C and
// ModeInvalidEscape for anything else.
private ISO2022Modes CheckEscapeSequenceKR( byte[] bytes, int escapeCount )
{
    // Have an escape sequence
    if (bytes[0] != ESCAPE)
        return ISO2022Modes.ModeInvalidEscape;

    if (escapeCount < 4)
        return ISO2022Modes.ModeIncompleteEscape;

    if (bytes[1] == '$' && bytes[2] == ')' && bytes[3] == 'C') // <esc>$)C
        return ISO2022Modes.ModeKR;

    // If we get here we fell through and have an invalid/unknown escape sequence
    return ISO2022Modes.ModeInvalidEscape;
}

// CP52936 is HZ Encoding
// HZ Encoding has 4 shift sequences:
// ~~       '~' (\u7e)
// ~}       shift into 1 byte mode,
// ~{       shift into 2 byte GB 2312-80
// ~<NL>    Maintain 2 byte mode across new lines
//          (ignore both ~ and <NL> characters)
//          (This is for mailers that restrict to 70 or 80 or whatever character lines)
//
// According to comment in mlang, lead & trail byte ranges are described in RFC 1843
// RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
// Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
//
// This encoding is designed for transmission by e-mail and news.  No bytes should have high bit set.
// (all bytes <= 0x7f)
//
// Decodes HZ bytes into chars through an EncodingCharBuffer and returns buffer.Count.
// At most one pending byte (a lone '~' or a lone lead byte) is carried over in the decoder.
// The mode and m_bytesUsed are only written back when both chars and decoder are non-null.
[System.Security.SecurityCritical]  // auto-generated
private unsafe int GetCharsCP52936(byte* bytes, int byteCount,
                                   char* chars, int charCount, ISO2022Decoder decoder)
{
    Contract.Assert(byteCount >=0, "[ISO2022Encoding.GetCharsCP52936]count >=0");
    Contract.Assert(bytes!=null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null");

    // Get our info.
    Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
        this, decoder, chars, charCount, bytes, byteCount);

    // No mode information yet
    ISO2022Modes currentMode = ISO2022Modes.ModeASCII;
    int byteLeftOver = -1;          // -1 means "no left over byte"
    bool bUsedDecoder = false;      // true once we stored a left over byte in the decoder

    if (decoder != null)
    {
        currentMode = decoder.currentMode;
        // See if we have leftover decoder buffer to use
        // Don't want to mess up decoder if we're counting or throw an exception
        if (decoder.bytesLeftOverCount != 0 )
        {
            // Load our bytesLeftOver
            byteLeftOver = decoder.bytesLeftOver[0];
        }
    }

    // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
    while (buffer.MoreData || byteLeftOver >= 0)
    {
        byte ch;

        // May have a left over byte
        if (byteLeftOver >= 0)
        {
            ch = (byte)byteLeftOver;
            byteLeftOver = -1;
        }
        else
        {
            ch = buffer.GetNextByte();
        }

        // We're in escape mode
        if (ch == '~')
        {
            // Next char is type of switch
            if (!buffer.MoreData)
            {
                // We don't have anything left, it'll be in decoder or a ?
                // don't fail if we are allowing overflows
                if (decoder == null || decoder.MustFlush)
                {
                    // We'll be a '?'
                    buffer.Fallback(ch);
                    // break if we fail & break if we don't (because !MoreData)
                    // Add succeeded, continue
                    break;
                }

                // Stick it in decoder
                // (decoder is known to be non-null here, the check above handled null)
                decoder.ClearMustFlush();

                if (chars != null)
                {
                    decoder.bytesLeftOverCount = 1;
                    decoder.bytesLeftOver[0] = (byte)'~';
                    bUsedDecoder = true;
                }
                break;
            }

            // What type is it?, get 2nd byte
            ch = buffer.GetNextByte();

            if (ch == '~' && currentMode == ISO2022Modes.ModeASCII)
            {
                // It's just a ~~ replacement for ~, add it
                if (!buffer.AddChar((char)ch, 2))
                    // Add failed, break for converting
                    break;

                // Add succeeded, continue
                continue;
            }
            else if (ch == '{')
            {
                // Switching to Double Byte mode
                currentMode = ISO2022Modes.ModeHZ;
                continue;
            }
            else if (ch == '}')
            {
                // Switching to ASCII mode
                currentMode = ISO2022Modes.ModeASCII;
                continue;
            }
            else if (ch == '\n')
            {
                // Ignore ~\n sequence
                continue;
            }
            else
            {
                // Unknown escape, back up and try the '~' as a "normal" byte or lead byte
                buffer.AdjustBytes(-1);
                ch = (byte)'~';
            }
        }

        // go ahead and add our data
        if (currentMode != ISO2022Modes.ModeASCII)
        {
            // Should be ModeHZ
            Contract.Assert(currentMode == ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ");
            char cm;

            // Everett allowed characters < 0x20 to be passed as if they were ASCII
            if (ch < 0x20)
            {
                // Emit it as ASCII
                goto STOREASCII;
            }

            // It's multibyte, should have another byte
            if (!buffer.MoreData)
            {
                // No bytes left
                // don't fail if we are allowing overflows
                if (decoder == null || decoder.MustFlush)
                {
                    // Not enough bytes, fallback lead byte
                    buffer.Fallback(ch);

                    // Break if we fail & break because !MoreData
                    break;
                }

                // (decoder is known to be non-null here, the check above handled null)
                decoder.ClearMustFlush();

                // Stick it in decoder
                if (chars != null)
                {
                    decoder.bytesLeftOverCount = 1;
                    decoder.bytesLeftOver[0] = ch;
                    bUsedDecoder = true;
                }
                break;
            }

            // Everett uses space as an escape character for single SBCS bytes
            byte ch2 = buffer.GetNextByte();
            ushort iBytes = (ushort)(ch << 8 | ch2);

            if (ch == ' ' && ch2 != 0)
            {
                // Get next char and treat it like ASCII (Everett treated space like an escape
                // allowing the next char to be just ascii)
                cm = (char)ch2;
                goto STOREMULTIBYTE;
            }

            // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e
            if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) &&
            // Everett allowed high bit mappings for same characters (but only if both bits set)
                (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe))
            {
                // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp)
                if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d)
                {
                    iBytes = 0x2121;
                    goto MULTIBYTE;
                }

                // Illegal char, use fallback.  If lead byte is 0 have to do it special and do it first
                if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
                    break;
                continue;
            }

        MULTIBYTE:
            iBytes |= 0x8080;
            // Look up the multibyte char to stick it in our data

            // We have a iBytes to try to convert.
            cm = mapBytesToUnicode[iBytes];

        STOREMULTIBYTE:

            // See if it was unknown
            if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0)
            {
                // Fall back the unknown stuff
                if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
                    break;
                continue;
            }

            if (!buffer.AddChar(cm, 2))
                break;                              // convert ran out of buffer, stop
            continue;
        }

        // Just ASCII
        // We allow some chars > 7f because everett did, so we have to look them up.
    STOREASCII:
        char c = mapBytesToUnicode[ch];

        // Check if it was unknown
        if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0))
        {
            // fallback the unknown bytes
            if (!buffer.Fallback((byte)ch))
                break;
            continue;
        }

        // Go ahead and add our ASCII character
        if (!buffer.AddChar(c))
            break;                                  // convert ran out of buffer, stop
    }

    // Need to remember our state, IF we're not counting
    if (chars != null && decoder != null)
    {
        if (!bUsedDecoder)
        {
            // If we didn't use it, clear the byte left over
            decoder.bytesLeftOverCount = 0;
        }

        if (decoder.MustFlush && decoder.bytesLeftOverCount == 0)
        {
            decoder.currentMode = ISO2022Modes.ModeASCII;
        }
        else
        {
            // Either not flushing or had state (from convert)
            Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing");

            decoder.currentMode = currentMode;
        }
        decoder.m_bytesUsed = buffer.BytesUsed;
    }

    // Return # of characters we found
    return buffer.Count;
}

// Note: These all end up with 1/2 bytes of average byte count, so unless we're 1 we're always
//       charCount/2 bytes too big.
//
// Returns (charCount + 1), scaled by EncoderFallback.MaxCharCount when that is > 1, times the
// per-character worst case for CodePage, plus the code page's start/end overhead.
// Throws ArgumentOutOfRangeException if charCount is negative or the result exceeds 0x7fffffff.
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    Contract.EndContractBlock();

    // Characters would be # of characters + 1 in case high surrogate is ? * max fallback
    long byteCount = (long)charCount + 1;

    if (EncoderFallback.MaxCharCount > 1)
        byteCount *= EncoderFallback.MaxCharCount;

    // Start with just generic DBCS values (sort of).
    int perChar = 2;
    int extraStart = 0;
    int extraEnd = 0;

    switch (CodePage)
    {
        case 50220:
        case 50221:
            // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
            perChar = 5;        // 5 max (4.5 average)
            extraEnd = 3;       // 3 bytes to shift back to ASCII
            break;
        case 50222:
            // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
            perChar = 5;        // 5 max (4.5 average)
            extraEnd = 4;       // 1 byte to shift from Katakana -> DBCS, 3 bytes to shift back to ASCII from DBCS
            break;
        case 50225:
            // 2 bytes per char + 1 byte SO, or 1 byte per char + 1 byte SI.
            perChar = 3;        // 3 max, (2.5 average)
            extraStart = 4;     // EUC-KR marker appears at beginning of file.
            extraEnd = 1;       // 1 byte to shift back to ascii if necessary.
            break;
        case 52936:
            // 2 bytes per char + 2 byte shift, or 1 byte + 1 byte shift
            // Worst case: left over surrogate with no low surrogate is extra ?, could have to switch to ASCII, then could have HZ and flush to ASCII mode
            perChar = 4;        // 4 max, (3.5 average if every other char is HZ/ASCII)
            extraEnd = 2;       // 2 if we have to shift back to ASCII
            break;
    }

    // Return our surrogate and End plus perChar for each char.
    byteCount *= perChar;
    byteCount += extraStart + extraEnd;

    if (byteCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));

    return (int)byteCount;
}

// Returns (byteCount * perChar + extraDecoder), scaled by DecoderFallback.MaxCharCount when
// that is > 1; perChar is 1 for every code page handled here.
// Throws ArgumentOutOfRangeException if byteCount is negative or the result exceeds 0x7fffffff.
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    Contract.EndContractBlock();

    int perChar = 1;
    int extraDecoder = 1;

    switch (CodePage)
    {
        case 50220:
        case 50221:
        case 50222:
        case 50225:
            perChar = 1;        // Worst case all ASCII
            extraDecoder = 3;   // Could have left over 3 chars of 4 char escape sequence, that all become ?
            break;
        case 52936:
            perChar = 1;        // Worst case all ASCII
            extraDecoder = 1;   // sequences are 2 chars, so if next one is illegal, then previous 1 could be ?
            break;
    }

    // Figure out our length, perchar * char + whatever extra our decoder could do to us.
    long charCount = ((long)byteCount * perChar) + extraDecoder;

    // Just in case we have to fall back unknown ones.
    if (DecoderFallback.MaxCharCount > 1)
        charCount *= DecoderFallback.MaxCharCount;

    if (charCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));

    return (int)charCount;
}

// Returns a new ISO2022Encoder bound to this encoding.
public override Encoder GetEncoder()
{
    return new ISO2022Encoder(this);
}

// Returns a new ISO2022Decoder bound to this encoding.
public override Decoder GetDecoder()
{
    return new ISO2022Decoder(this);
}

// Encoder that, in addition to the base class state, carries the current mode and the
// shift in/out mode between calls.
[Serializable]
internal class ISO2022Encoder : System.Text.EncoderNLS
{
    internal ISO2022Modes currentMode;
    internal ISO2022Modes shiftInOutMode;

    internal ISO2022Encoder(EncodingNLS encoding) : base(encoding)
    {
        // base calls reset
    }

    public override void Reset()
    {
        // Reset
        currentMode = ISO2022Modes.ModeASCII;
        shiftInOutMode = ISO2022Modes.ModeASCII;
        charLeftOver = (char)0;
        if (m_fallbackBuffer != null)
            m_fallbackBuffer.Reset();
    }

    // Anything left in our encoder?
    internal override bool HasState
    {
        get
        {
            // Don't check shift-out mode, it may be ascii (JP) or not (KR)
            return (this.charLeftOver != (char)0 ||
                    currentMode != ISO2022Modes.ModeASCII);
        }
    }
}

// Decoder that carries up to 4 left over bytes, the current mode and the shift in/out mode
// between calls.
[Serializable]
internal class ISO2022Decoder : System.Text.DecoderNLS
{
    internal byte[] bytesLeftOver;
    internal int bytesLeftOverCount;
    internal ISO2022Modes currentMode;
    internal ISO2022Modes shiftInOutMode;

    internal ISO2022Decoder(EncodingNLS encoding) : base(encoding)
    {
        // base calls reset
    }

    public override void Reset()
    {
        // Reset
        bytesLeftOverCount = 0;
        bytesLeftOver = new byte[4];
        currentMode = ISO2022Modes.ModeASCII;
        shiftInOutMode = ISO2022Modes.ModeASCII;
        if (m_fallbackBuffer != null)
            m_fallbackBuffer.Reset();
    }

    // Anything left in our decoder?
    internal override bool HasState
    {
        get
        {
            // If have bytes left over or not shifted back to ASCII then have problem
            return (this.bytesLeftOverCount != 0 ||
                    currentMode != ISO2022Modes.ModeASCII);
        }
    }
}

// Maps a halfwidth katakana code (0x8ea1 - 0x8edf, index = trail byte - 0xa1) to the
// corresponding full width code. The table is never reassigned, so it is readonly.
private static readonly ushort[] HalfToFullWidthKanaTable =
{
    0xa1a3, // 0x8ea1 : Halfwidth Ideographic Period
    0xa1d6, // 0x8ea2 : Halfwidth Opening Corner Bracket
    0xa1d7, // 0x8ea3 : Halfwidth Closing Corner Bracket
    0xa1a2, // 0x8ea4 : Halfwidth Ideographic Comma
    0xa1a6, // 0x8ea5 : Halfwidth Katakana Middle Dot
    0xa5f2, // 0x8ea6 : Halfwidth Katakana Wo
    0xa5a1, // 0x8ea7 : Halfwidth Katakana Small A
    0xa5a3, // 0x8ea8 : Halfwidth Katakana Small I
    0xa5a5, // 0x8ea9 : Halfwidth Katakana Small U
    0xa5a7, // 0x8eaa : Halfwidth Katakana Small E
    0xa5a9, // 0x8eab : Halfwidth Katakana Small O
    0xa5e3, // 0x8eac : Halfwidth Katakana Small Ya
    0xa5e5, // 0x8ead : Halfwidth Katakana Small Yu
    0xa5e7, // 0x8eae : Halfwidth Katakana Small Yo
    0xa5c3, // 0x8eaf : Halfwidth Katakana Small Tu
    0xa1bc, // 0x8eb0 : Halfwidth Katakana-Hiragana Prolonged Sound Mark
    0xa5a2, // 0x8eb1 : Halfwidth Katakana A
    0xa5a4, // 0x8eb2 : Halfwidth Katakana I
    0xa5a6, // 0x8eb3 : Halfwidth Katakana U
    0xa5a8, // 0x8eb4 : Halfwidth Katakana E
    0xa5aa, // 0x8eb5 : Halfwidth Katakana O
    0xa5ab, // 0x8eb6 : Halfwidth Katakana Ka
    0xa5ad, // 0x8eb7 : Halfwidth Katakana Ki
    0xa5af, // 0x8eb8 : Halfwidth Katakana Ku
    0xa5b1, // 0x8eb9 : Halfwidth Katakana Ke
    0xa5b3, // 0x8eba : Halfwidth Katakana Ko
    0xa5b5, // 0x8ebb : Halfwidth Katakana Sa
    0xa5b7, // 0x8ebc : Halfwidth Katakana Si
    0xa5b9, // 0x8ebd : Halfwidth Katakana Su
    0xa5bb, // 0x8ebe : Halfwidth Katakana Se
    0xa5bd, // 0x8ebf : Halfwidth Katakana So
    0xa5bf, // 0x8ec0 : Halfwidth Katakana Ta
    0xa5c1, // 0x8ec1 : Halfwidth Katakana Ti
    0xa5c4, // 0x8ec2 : Halfwidth Katakana Tu
    0xa5c6, // 0x8ec3 : Halfwidth Katakana Te
    0xa5c8, // 0x8ec4 : Halfwidth Katakana To
    0xa5ca, // 0x8ec5 : Halfwidth Katakana Na
    0xa5cb, // 0x8ec6 : Halfwidth Katakana Ni
    0xa5cc, // 0x8ec7 : Halfwidth Katakana Nu
    0xa5cd, // 0x8ec8 : Halfwidth Katakana Ne
    0xa5ce, // 0x8ec9 : Halfwidth Katakana No
    0xa5cf, // 0x8eca : Halfwidth Katakana Ha
    0xa5d2, // 0x8ecb : Halfwidth Katakana Hi
    0xa5d5, // 0x8ecc : Halfwidth Katakana Hu
    0xa5d8, // 0x8ecd : Halfwidth Katakana He
    0xa5db, // 0x8ece : Halfwidth Katakana Ho
    0xa5de, // 0x8ecf : Halfwidth Katakana Ma
    0xa5df, // 0x8ed0 : Halfwidth Katakana Mi
    0xa5e0, // 0x8ed1 : Halfwidth Katakana Mu
    0xa5e1, // 0x8ed2 : Halfwidth Katakana Me
    0xa5e2, // 0x8ed3 : Halfwidth Katakana Mo
    0xa5e4, // 0x8ed4 : Halfwidth Katakana Ya
    0xa5e6, // 0x8ed5 : Halfwidth Katakana Yu
    0xa5e8, // 0x8ed6 : Halfwidth Katakana Yo
    0xa5e9, // 0x8ed7 : Halfwidth Katakana Ra
    0xa5ea, // 0x8ed8 : Halfwidth Katakana Ri
    0xa5eb, // 0x8ed9 : Halfwidth Katakana Ru
    0xa5ec, // 0x8eda : Halfwidth Katakana Re
    0xa5ed, // 0x8edb : Halfwidth Katakana Ro
    0xa5ef, // 0x8edc : Halfwidth Katakana Wa
    0xa5f3, // 0x8edd : Halfwidth Katakana N
    0xa1ab, // 0x8ede : Halfwidth Katakana Voiced Sound Mark
    0xa1ac  // 0x8edf : Halfwidth Katakana Semi-Voiced Sound Mark
};
}
}
#endif // FEATURE_CODEPAGES_FILE