diff options
Diffstat (limited to 'src/mscorlib/src/System/Text/EUCJPEncoding.cs')
-rw-r--r-- | src/mscorlib/src/System/Text/EUCJPEncoding.cs | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/src/mscorlib/src/System/Text/EUCJPEncoding.cs b/src/mscorlib/src/System/Text/EUCJPEncoding.cs new file mode 100644 index 0000000000..7c90caec0b --- /dev/null +++ b/src/mscorlib/src/System/Text/EUCJPEncoding.cs @@ -0,0 +1,186 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding +namespace System.Text +{ + using System.Text; + using System.Globalization; + + // EUCJPEncoding + // + // EUC-JP Encoding (51932) + // + // EUC-JP has the following code points: + // 00-7F - ASCII + // 80-8D & 90-9F - Control. (Like Unicode, except for 8e and 8f) + // A1-FE, A1-FE - 2 byte JIS X 0208 range. + // 8E, A1-DF - 2 byte half-width Katakana + // 8F, A1-FE, A1-FE - 3 byte JIX X 0212 range. WE DON'T USE JIS 0212!!! + // + // New thoughts: + // Fixing windows 20932 code page so that all characters can be looked up there. + // + // Old thoughts: + // Windows NLS uses a special CP20932 for EUC-JP, but it is not used by mlang. Windows + // Maps the 3 byte ranges to the 2 byte CP20932 by masking the 2nd byte with & 0x7F. + // MLang uses the native windows 932 code page, which is more reliable, however the code points + // don't line up as nicely as the 20932 code page, however it doesn't have JIS X 0212 support. + // + // So what we do is: + // 1. For ASCII, leave it alone + // 2. For half-width Katakana, use the leading byte and convert with 20936 code page. + // 3. For JIS X 0208, Use the leading & trailing bytes with 20936 code page + // 4. For JIS X 0212, Remove the lead byte, & 0xFF7F, and use the CP20936 table to convert. + // + // Regarding Normalization: + // Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings + // Form D is precluded because of 0x00a8, which changes to space + dierises. + // + // I think that IsAlwaysNormalized should probably return true for form C (but not certain) + // + // NOTE: We don't use JIS 0212 so we are basically a DBCS code page, we just have to modify + // the 932 table we're basing this on. + // + + using System; + + [Serializable] + internal class EUCJPEncoding : DBCSCodePageEncoding + { + // This pretends to be CP 932 as far as memory tables are concerned. + [System.Security.SecurityCritical] // auto-generated + public EUCJPEncoding() : base(51932, 932) + { + this.m_bUseMlangTypeForSerialization = true; + } + + [System.Security.SecurityCritical] // auto-generated + protected unsafe override String GetMemorySectionName() + { + int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage; + + String strName = String.Format(CultureInfo.InvariantCulture, "CodePage_{0}_{1}_{2}_{3}_{4}_EUCJP", + iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor, + this.pCodePage->VersionRevision, this.pCodePage->VersionBuild); + + return strName; + } + + // Clean up characters for EUC-JP code pages, etc. + protected override bool CleanUpBytes(ref int bytes) + { + if (bytes >= 0x100) + { + // map extended char (0xfa40-0xfc4b) to a special range + // (ported from mlang) + if (bytes >= 0xfa40 && bytes <= 0xfc4b) + { + if ( bytes >= 0xfa40 && bytes <= 0xfa5b ) + { + if ( bytes <= 0xfa49 ) + bytes = bytes - 0x0b51 ; + else if ( bytes >= 0xfa4a && bytes <= 0xfa53 ) + bytes = bytes - 0x072f6 ; + else if ( bytes >= 0xfa54 && bytes <= 0xfa57 ) + bytes = bytes - 0x0b5b ; + else if ( bytes == 0xfa58 ) + bytes = 0x878a ; + else if ( bytes == 0xfa59 ) + bytes = 0x8782 ; + else if ( bytes == 0xfa5a ) + bytes = 0x8784 ; + else if ( bytes == 0xfa5b ) + bytes = 0x879a ; + } + else if ( bytes >= 0xfa5c && bytes <= 0xfc4b ) + { + byte tc = unchecked((byte)bytes); + if ( tc < 0x5c ) + bytes = bytes - 0x0d5f; + else if ( tc >= 0x80 && tc <= 0x9B ) + bytes = bytes - 0x0d1d; + else + bytes = bytes - 0x0d1c; + } + } + + // Convert 932 code page to 20932 like code page range + // (also ported from mlang) + byte bLead = unchecked((byte)(bytes >> 8)); + byte bTrail = unchecked((byte)bytes); + + bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71); + bLead = (byte)((bLead << 1) + 1); + if (bTrail > (byte)0x9e) + { + bTrail -= (byte)0x7e; + bLead++; + } + else + { + if (bTrail > (byte)0x7e) + bTrail--; + bTrail -= (byte)0x1f; + } + + bytes = ((int)bLead) << 8 | (int)bTrail | 0x8080; + + // // Don't step on our katakana special plane, if katakana space return false. + // if (bytes >= 0x8E00 && bytes <= 0x8EFF) + // return false; + + // Don't step out of our allocated lead byte area. + // All DBCS lead and trail bytes should be >= 0xa1 and <= 0xfe + if ((bytes & 0xFF00) < 0xa100 || (bytes & 0xFF00) > 0xfe00 || + (bytes & 0xFF) < 0xa1 || (bytes & 0xFF) > 0xfe) + return false; + + // WARNING: Our funky mapping allows illegal values, which we continue to use + // so that we're compatible with Everett. + } + else + { + // For 51932 1/2 Katakana gets a 0x8E lead byte + // Adjust 1/2 Katakana + if (bytes >= 0xa1 && bytes <= 0xdf) + { + bytes |= 0x8E00; + return true; + } + + // 0x81-0x9f and 0xe0-0xfc CP 932 + // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though) + // b0-df is 1/2 Katakana + // So 81-9f & e0-fc are 932 lead bytes, a1-fe are our lead bytes + // so ignore everything above 0x80 except 0xa0 and 0xff + if (bytes >= 0x81 && bytes != 0xa0 && bytes != 0xff) + { + // We set diffent lead bytes later, so just return false + return false; + } + } + + return true; + } + + [System.Security.SecurityCritical] // auto-generated + protected override unsafe void CleanUpEndBytes(char* chars) + { + // Need to special case CP 51932 + // 0x81-0x9f and 0xe0-0xfc CP 932 + // 0x8e and 0xa1-0xfe CP 20932 + // 0x10 and 0x21-0x9? Us (remapping 932) + // b0-df is 1/2 Katakana (trail byte) + + // A1-FE are DBCS code points + for (int i = 0xA1; i <= 0xFE; i++) + chars[i] = LEAD_BYTE_CHAR; + + // And 8E is lead byte for Katakana (already set) + chars[0x8e] = LEAD_BYTE_CHAR; + } + } +} +#endif // FEATURE_CODEPAGES_FILE |