summaryrefslogtreecommitdiff
path: root/src/mscorlib/src/System/Text/EUCJPEncoding.cs
blob: 7c90caec0bb9939a8dd7433aa1469969b5e86907 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncoding
namespace System.Text
{
    using System.Text;
    using System.Globalization;

    // EUCJPEncoding
    //
    // EUC-JP Encoding (51932)
    //
    // EUC-JP has the following code points:
    //  00-7F            - ASCII
    //  80-8D & 90-9F    - Control.  (Like Unicode, except for 8e and 8f)
    //  A1-FE, A1-FE     - 2 byte JIS X 0208 range.
    //  8E, A1-DF        - 2 byte half-width Katakana
    //  8F, A1-FE, A1-FE - 3 byte JIS X 0212 range. WE DON'T USE JIS 0212!!!
    //
    // New thoughts:
    //  Fixing windows 20932 code page so that all characters can be looked up there.
    //
    // Old thoughts:
    // Windows NLS uses a special CP20932 for EUC-JP, but it is not used by mlang.  Windows
    // Maps the 3 byte ranges to the 2 byte CP20932 by masking the 2nd byte with & 0x7F.
    // MLang uses the native windows 932 code page, which is more reliable; however, its code points
    // don't line up as nicely as the 20932 code page, and it doesn't have JIS X 0212 support.
    //
    // So what we do is:
    //  1.  For ASCII, leave it alone
    //  2.  For half-width Katakana, use the leading byte and convert with 20932 code page.
    //  3.  For JIS X 0208, Use the leading & trailing bytes with 20932 code page
    //  4.  For JIS X 0212, Remove the lead byte, & 0xFF7F, and use the CP20932 table to convert.
    //
    // Regarding Normalization:
    //  Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
    //  Form D is precluded because of 0x00a8, which changes to space + diaeresis.
    //
    // I think that IsAlwaysNormalized should probably return true for form C (but not certain)
    //
    // NOTE: We don't use JIS 0212 so we are basically a DBCS code page, we just have to modify
    //       the 932 table we're basing this on.
    //

    using System;

    [Serializable]
    internal class EUCJPEncoding : DBCSCodePageEncoding
    {
        // Code page 51932 (EUC-JP) is constructed on top of the code page 932 data:
        // the base class is handed 51932 as the code page and 932 as the table source.
        [System.Security.SecurityCritical]  // auto-generated
        public EUCJPEncoding()
            : base(51932, 932)
        {
            m_bUseMlangTypeForSerialization = true;
        }

        // Produces the memory section name for this encoding.
        //
        // The name is made of the code page number (dataTableCodePage when
        // bFlagDataTable is set, CodePage otherwise), the four version fields read
        // through pCodePage, and a trailing "_EUCJP" marker.
        [System.Security.SecurityCritical]  // auto-generated
        protected unsafe override String GetMemorySectionName()
        {
            int sectionCodePage;
            if (bFlagDataTable)
            {
                sectionCodePage = dataTableCodePage;
            }
            else
            {
                sectionCodePage = CodePage;
            }

            return String.Format(
                CultureInfo.InvariantCulture,
                "CodePage_{0}_{1}_{2}_{3}_{4}_EUCJP",
                sectionCodePage,
                pCodePage->VersionMajor,
                pCodePage->VersionMinor,
                pCodePage->VersionRevision,
                pCodePage->VersionBuild);
        }

        // Rewrites a code page 932 byte value into its EUC-JP (51932) form.
        //
        //  bytes   - in: the 932 value (one byte, or lead byte << 8 | trail byte);
        //            out: the adjusted value.  Single-byte values other than
        //            half-width katakana are left untouched.
        //  returns - false for single-byte values of 0x81 and above (other than the
        //            katakana range, 0xA0 and 0xFF) and for two-byte values whose
        //            converted lead or trail byte falls outside 0xA1-0xFE;
        //            true in every other case.
        protected override bool CleanUpBytes(ref int bytes)
        {
            if (bytes < 0x100)
            {
                // Half-width katakana (0xA1-0xDF) gets an 0x8E lead byte in 51932.
                if (bytes >= 0xa1 && bytes <= 0xdf)
                {
                    bytes |= 0x8E00;
                    return true;
                }

                // 0x81-0x9F and 0xE0-0xFC are 932 lead bytes, and 0xA1-0xFE are the
                // lead bytes of this encoding (assigned separately), so of the
                // values from 0x81 up only 0xA0 and 0xFF are accepted here.
                if (bytes < 0x81 || bytes == 0xa0 || bytes == 0xff)
                {
                    return true;
                }

                return false;
            }

            // The extended characters 0xFA40-0xFC4B are first moved to a special range.
            if (bytes >= 0xfa40 && bytes <= 0xfc4b)
            {
                bytes = MapExtendedCharToSpecialRange(bytes);
            }

            bytes = Convert932ValueTo20932Layout(bytes);

            // Stay inside the allocated area: both the lead and the trail byte must
            // be within 0xA1-0xFE.  A converted lead byte of 0x8E (the half-width
            // katakana plane) is below 0xA1 and is therefore rejected here as well.
            //
            // WARNING: this mapping lets some illegal values through; that is kept
            // for compatibility with Everett.
            int leadPart = bytes & 0xFF00;
            int trailPart = bytes & 0xFF;

            return leadPart >= 0xa100 && leadPart <= 0xfe00 &&
                   trailPart >= 0xa1 && trailPart <= 0xfe;
        }

        // Moves an extended character to its special range (ported from mlang).
        // The caller guarantees 0xFA40 <= value <= 0xFC4B.
        private static int MapExtendedCharToSpecialRange(int value)
        {
            if (value <= 0xfa5b)
            {
                // Four code points have individual targets.
                switch (value)
                {
                    case 0xfa58:
                        return 0x878a;
                    case 0xfa59:
                        return 0x8782;
                    case 0xfa5a:
                        return 0x8784;
                    case 0xfa5b:
                        return 0x879a;
                }

                // The remaining values, 0xFA40-0xFA57, are shifted in three runs.
                if (value <= 0xfa49)
                {
                    return value - 0x0b51;
                }

                if (value <= 0xfa53)
                {
                    return value - 0x072f6;
                }

                return value - 0x0b5b;
            }

            // 0xFA5C-0xFC4B: the offset is selected by the low byte.
            byte lowByte = unchecked((byte)value);

            if (lowByte < 0x5c)
            {
                return value - 0x0d5f;
            }

            if (lowByte >= 0x80 && lowByte <= 0x9B)
            {
                return value - 0x0d1d;
            }

            return value - 0x0d1c;
        }

        // Converts a two-byte 932 value into the 20932-like layout (ported from
        // mlang).  All arithmetic is done on bytes and relies on wrap-around; the
        // result always has the high bit set in both of its bytes.
        private static int Convert932ValueTo20932Layout(int value)
        {
            byte hi = unchecked((byte)(value >> 8));
            byte lo = unchecked((byte)value);

            // Fold the lead byte down; which offset applies depends on whether it
            // is above 0x9F.
            hi -= (hi > 0x9f) ? (byte)0xb1 : (byte)0x71;
            hi = (byte)((hi << 1) + 1);

            if (lo > 0x9e)
            {
                // Trail bytes above 0x9E select the next lead byte.
                hi++;
                lo -= (byte)0x7e;
            }
            else
            {
                // Trail bytes above 0x7E are moved down one extra position
                // (0x20 in total) compared with the rest (0x1F).
                lo -= (lo > 0x7e) ? (byte)0x20 : (byte)0x1f;
            }

            return (hi << 8) | lo | 0x8080;
        }

        // Flags the lead byte positions of 51932 in the table passed in:
        // entries 0xA1-0xFE (the DBCS lead bytes) and entry 0x8E (the katakana
        // lead byte) are all set to LEAD_BYTE_CHAR.
        // NOTE(review): chars is presumably a table indexed by byte value with at
        // least 0xFF entries - confirm against the caller.
        [System.Security.SecurityCritical]  // auto-generated
        protected override unsafe void CleanUpEndBytes(char* chars)
        {
            // Katakana lead byte.
            chars[0x8e] = LEAD_BYTE_CHAR;

            // DBCS lead bytes.
            int index = 0xA1;
            while (index <= 0xFE)
            {
                chars[index] = LEAD_BYTE_CHAR;
                index++;
            }
        }
    }
}
#endif // FEATURE_CODEPAGES_FILE