diff options
Diffstat (limited to 'src/mscorlib/src/System/Text')
30 files changed, 20 insertions, 9390 deletions
diff --git a/src/mscorlib/src/System/Text/ASCIIEncoding.cs b/src/mscorlib/src/System/Text/ASCIIEncoding.cs index fc7589f2d8..07b7f3e890 100644 --- a/src/mscorlib/src/System/Text/ASCIIEncoding.cs +++ b/src/mscorlib/src/System/Text/ASCIIEncoding.cs @@ -6,7 +6,6 @@ namespace System.Text { using System; using System.Runtime.Serialization; - using System.Security.Permissions; using System.Diagnostics; using System.Diagnostics.Contracts; @@ -22,7 +21,6 @@ namespace System.Text // [Serializable] -[System.Runtime.InteropServices.ComVisible(true)] public class ASCIIEncoding : Encoding { // Used by Encoding.ASCII for lazy initialization @@ -72,7 +70,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetByteCount(char* chars, int count) { return EncodingForwarder.GetByteCount(this, chars, count); @@ -100,7 +97,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { return EncodingForwarder.GetBytes(this, chars, charCount, bytes, byteCount); @@ -115,7 +111,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetCharCount(byte* bytes, int count) { return EncodingForwarder.GetCharCount(this, bytes, count); @@ -128,7 +123,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { return EncodingForwarder.GetChars(this, bytes, byteCount, chars, charCount); @@ -741,7 +735,6 @@ namespace System.Text // True if and only if the encoding only uses single byte code points. 
(Ie, ASCII, 1252, etc) - [System.Runtime.InteropServices.ComVisible(false)] public override bool IsSingleByte { get @@ -750,14 +743,12 @@ namespace System.Text } } - [System.Runtime.InteropServices.ComVisible(false)] public override Decoder GetDecoder() { return new DecoderNLS(this); } - [System.Runtime.InteropServices.ComVisible(false)] public override Encoder GetEncoder() { return new EncoderNLS(this); diff --git a/src/mscorlib/src/System/Text/BaseCodePageEncoding.cs b/src/mscorlib/src/System/Text/BaseCodePageEncoding.cs deleted file mode 100644 index 0a42237dc1..0000000000 --- a/src/mscorlib/src/System/Text/BaseCodePageEncoding.cs +++ /dev/null @@ -1,332 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -#if FEATURE_CODEPAGES_FILE -namespace System.Text -{ - using System; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Globalization; - using System.Runtime.InteropServices; - using System.Security; - using System.Collections; - using System.Runtime.CompilerServices; - using System.Runtime.Serialization; - using System.Runtime.Versioning; - using System.Security.Permissions; - using Microsoft.Win32.SafeHandles; - - // Our input file data structures look like: - // - // Header Structure Looks Like: - // struct NLSPlusHeader - // { - // WORD[16] filename; // 32 bytes - // WORD[4] version; // 8 bytes = 40 // I.e: 3, 2, 0, 0 - // WORD count; // 2 bytes = 42 // Number of code page index's that'll follow - // } - // - // Each code page section looks like: - // struct NLSCodePageIndex - // { - // WORD[16] codePageName; // 32 bytes - // WORD codePage; // +2 bytes = 34 - // WORD byteCount; // +2 bytes = 36 - // DWORD offset; // +4 bytes = 40 // Bytes from beginning of FILE. 
- // } - // - // Each code page then has its own header - // struct NLSCodePage - // { - // WORD[16] codePageName; // 32 bytes - // WORD[4] version; // 8 bytes = 40 // I.e: 3.2.0.0 - // WORD codePage; // 2 bytes = 42 - // WORD byteCount; // 2 bytes = 44 // 1 or 2 byte code page (SBCS or DBCS) - // WORD unicodeReplace; // 2 bytes = 46 // default replacement unicode character - // WORD byteReplace; // 2 bytes = 48 // default replacement byte(s) - // BYTE[] data; // data section - // } - - [Serializable] - internal abstract class BaseCodePageEncoding : EncodingNLS, ISerializable - { - // Static & Const stuff - internal const String CODE_PAGE_DATA_FILE_NAME = "codepages.nlp"; - [NonSerialized] - protected int dataTableCodePage; - - // Variables to help us allocate/mark our memory section correctly - [NonSerialized] - protected bool bFlagDataTable = true; - [NonSerialized] - protected int iExtraBytes = 0; - - // Our private unicode to bytes best fit array and visa versa. - [NonSerialized] - protected char[] arrayUnicodeBestFit = null; - [NonSerialized] - protected char[] arrayBytesBestFit = null; - - // This is used to help ISCII, EUCJP and ISO2022 figure out they're MlangEncodings - [NonSerialized] - protected bool m_bUseMlangTypeForSerialization = false; - - static BaseCodePageEncoding() - { - } - - // - // This is the header for the native data table that we load from CODE_PAGE_DATA_FILE_NAME. - // - // Explicit layout is used here since a syntax like char[16] can not be used in sequential layout. - [StructLayout(LayoutKind.Explicit)] - internal unsafe struct CodePageDataFileHeader - { - [FieldOffset(0)] - internal char TableName; // WORD[16] - [FieldOffset(0x20)] - internal ushort Version; // WORD[4] - [FieldOffset(0x28)] - internal short CodePageCount; // WORD - [FieldOffset(0x2A)] - internal short unused1; // Add a unused WORD so that CodePages is aligned with DWORD boundary. - // Otherwise, 64-bit version will fail. 
- [FieldOffset(0x2C)] - internal CodePageIndex CodePages; // Start of code page index - } - - [StructLayout(LayoutKind.Explicit, Pack=2)] - internal unsafe struct CodePageIndex - { - [FieldOffset(0)] - internal char CodePageName; // WORD[16] - [FieldOffset(0x20)] - internal short CodePage; // WORD - [FieldOffset(0x22)] - internal short ByteCount; // WORD - [FieldOffset(0x24)] - internal int Offset; // DWORD - } - - [StructLayout(LayoutKind.Explicit)] - internal unsafe struct CodePageHeader - { - [FieldOffset(0)] - internal char CodePageName; // WORD[16] - [FieldOffset(0x20)] - internal ushort VersionMajor; // WORD - [FieldOffset(0x22)] - internal ushort VersionMinor; // WORD - [FieldOffset(0x24)] - internal ushort VersionRevision;// WORD - [FieldOffset(0x26)] - internal ushort VersionBuild; // WORD - [FieldOffset(0x28)] - internal short CodePage; // WORD - [FieldOffset(0x2a)] - internal short ByteCount; // WORD // 1 or 2 byte code page (SBCS or DBCS) - [FieldOffset(0x2c)] - internal char UnicodeReplace; // WORD // default replacement unicode character - [FieldOffset(0x2e)] - internal ushort ByteReplace; // WORD // default replacement bytes - [FieldOffset(0x30)] - internal short FirstDataWord; // WORD[] - } - - // Initialize our global stuff - unsafe static CodePageDataFileHeader* m_pCodePageFileHeader = - (CodePageDataFileHeader*)GlobalizationAssembly.GetGlobalizationResourceBytePtr( - typeof(CharUnicodeInfo).Assembly, CODE_PAGE_DATA_FILE_NAME); - - // Real variables - [NonSerialized] - unsafe protected CodePageHeader* pCodePage = null; - - // Safe handle wrapper around section map view - [NonSerialized] - protected SafeViewOfFileHandle safeMemorySectionHandle = null; - - // Safe handle wrapper around mapped file handle - [NonSerialized] - protected SafeFileMappingHandle safeFileMappingHandle = null; - - internal BaseCodePageEncoding(int codepage) : this(codepage, codepage) - { - } - - internal BaseCodePageEncoding(int codepage, int dataCodePage) : - base(codepage 
== 0? Microsoft.Win32.Win32Native.GetACP(): codepage) - { - // Remember number of code page that we'll be using the table for. - dataTableCodePage = dataCodePage; - LoadCodePageTables(); - } - - // Constructor called by serialization. - internal BaseCodePageEncoding(SerializationInfo info, StreamingContext context) : base(0) - { - // We cannot ever call this, we've proxied ourselved to CodePageEncoding - throw new ArgumentNullException("this"); - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // Make sure to get the base stuff too This throws if info is null - SerializeEncoding(info, context); - Debug.Assert(info!=null, "[BaseCodePageEncoding.GetObjectData] Expected null info to throw"); - - // Just need Everett maxCharSize (BaseCodePageEncoding) or m_maxByteSize (MLangBaseCodePageEncoding) - info.AddValue(m_bUseMlangTypeForSerialization ? "m_maxByteSize" : "maxCharSize", - this.IsSingleByte ? 1 : 2); - - // Use this class or MLangBaseCodePageEncoding as our deserializer. - info.SetType(m_bUseMlangTypeForSerialization ? typeof(MLangCodePageEncoding) : - typeof(CodePageEncoding)); - } - - // We need to load tables for our code page - private unsafe void LoadCodePageTables() - { - CodePageHeader* pCodePage = FindCodePage(dataTableCodePage); - - // Make sure we have one - if (pCodePage == null) - { - // Didn't have one - throw new NotSupportedException( - Environment.GetResourceString("NotSupported_NoCodepageData", CodePage)); - } - - // Remember our code page - this.pCodePage = pCodePage; - - // We had it, so load it - LoadManagedCodePage(); - } - - // Look up the code page pointer - private static unsafe CodePageHeader* FindCodePage(int codePage) - { - // We'll have to loop through all of the m_pCodePageIndex[] items to find our code page, this isn't - // binary or anything so its not monsterously fast. 
- for (int i = 0; i < m_pCodePageFileHeader->CodePageCount; i++) - { - CodePageIndex* pCodePageIndex = (&(m_pCodePageFileHeader->CodePages)) + i; - - if (pCodePageIndex->CodePage == codePage) - { - // Found it! - CodePageHeader* pCodePage = - (CodePageHeader*)((byte*)m_pCodePageFileHeader + pCodePageIndex->Offset); - return pCodePage; - } - } - - // Couldn't find it - return null; - } - - // Get our code page byte count - internal static unsafe int GetCodePageByteSize(int codePage) - { - // Get our code page info - CodePageHeader* pCodePage = FindCodePage(codePage); - - // If null return 0 - if (pCodePage == null) - return 0; - - Debug.Assert(pCodePage->ByteCount == 1 || pCodePage->ByteCount == 2, - "[BaseCodePageEncoding] Code page (" + codePage + ") has invalid byte size (" + pCodePage->ByteCount + ") in table"); - // Return what it says for byte count - return pCodePage->ByteCount; - } - - // We have a managed code page entry, so load our tables - protected abstract unsafe void LoadManagedCodePage(); - - // Allocate memory to load our code page - protected unsafe byte* GetSharedMemory(int iSize) - { - // Build our name - String strName = GetMemorySectionName(); - - IntPtr mappedFileHandle; - - // This gets shared memory for our map. If its can't, it gives us clean memory. - Byte *pMemorySection = EncodingTable.nativeCreateOpenFileMapping(strName, iSize, out mappedFileHandle); - Debug.Assert(pMemorySection != null, - "[BaseCodePageEncoding.GetSharedMemory] Expected non-null memory section to be opened"); - - // If that failed, we have to die. - if (pMemorySection == null) - throw new OutOfMemoryException( - Environment.GetResourceString("Arg_OutOfMemoryException")); - - // if we have null file handle. this means memory was allocated after - // failing to open the mapped file. 
- - if (mappedFileHandle != IntPtr.Zero) - { - safeMemorySectionHandle = new SafeViewOfFileHandle((IntPtr) pMemorySection, true); - safeFileMappingHandle = new SafeFileMappingHandle(mappedFileHandle, true); - } - - return pMemorySection; - } - - protected unsafe virtual String GetMemorySectionName() - { - int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage; - - String strName = String.Format(CultureInfo.InvariantCulture, "NLS_CodePage_{0}_{1}_{2}_{3}_{4}", - iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor, - this.pCodePage->VersionRevision, this.pCodePage->VersionBuild); - - return strName; - } - - protected abstract unsafe void ReadBestFitTable(); - - internal override char[] GetBestFitUnicodeToBytesData() - { - // Read in our best fit table if necessary - if (arrayUnicodeBestFit == null) ReadBestFitTable(); - - Debug.Assert(arrayUnicodeBestFit != null, - "[BaseCodePageEncoding.GetBestFitUnicodeToBytesData]Expected non-null arrayUnicodeBestFit"); - - // Normally we don't have any best fit data. - return arrayUnicodeBestFit; - } - - internal override char[] GetBestFitBytesToUnicodeData() - { - // Read in our best fit table if necessary - if (arrayBytesBestFit == null) ReadBestFitTable(); - - Debug.Assert(arrayBytesBestFit != null, - "[BaseCodePageEncoding.GetBestFitBytesToUnicodeData]Expected non-null arrayBytesBestFit"); - - // Normally we don't have any best fit data. - return arrayBytesBestFit; - } - - // During the AppDomain shutdown the Encoding class may already finalized and the memory section - // is invalid. so we detect that by validating the memory section handle then re-initialize the memory - // section by calling LoadManagedCodePage() method and eventually the mapped file handle and - // the memory section pointer will get finalized one more time. 
- internal unsafe void CheckMemorySection() - { - if (safeMemorySectionHandle != null && safeMemorySectionHandle.DangerousGetHandle() == IntPtr.Zero) - { - LoadManagedCodePage(); - } - } - } -} - -#endif // FEATURE_CODEPAGES_FILE diff --git a/src/mscorlib/src/System/Text/CodePageEncoding.cs b/src/mscorlib/src/System/Text/CodePageEncoding.cs deleted file mode 100644 index 7805c6580a..0000000000 --- a/src/mscorlib/src/System/Text/CodePageEncoding.cs +++ /dev/null @@ -1,136 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - - -// WARNING: -// -// This is just an IObjectReference proxy for the Code Page Encodings. -namespace System.Text -{ - using System; - using System.Runtime.Serialization; - using System.Security.Permissions; - using System.Diagnostics; - using System.Diagnostics.Contracts; - - /*=================================CodePageEncoding================================== - ** This class is here only to deserialize the Code Page classes from Everett (V1.1) into - ** Appropriate Whidbey (V2.0) objects. We also serialize the Whidbey classes - ** using this proxy since we pretty much need one anyway and that solves Whidbey - ** to Everett compatibility as well. - ==============================================================================*/ - - [Serializable] - internal sealed class CodePageEncoding : IObjectReference, ISerializable - { - // Temp stuff - [NonSerialized] - private int m_codePage; - [NonSerialized] - private bool m_isReadOnly; - [NonSerialized] - private bool m_deserializedFromEverett = false; - - [NonSerialized] - private EncoderFallback encoderFallback = null; - [NonSerialized] - private DecoderFallback decoderFallback = null; - - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization. 
- internal CodePageEncoding(SerializationInfo info, StreamingContext context) - { - // Any info? - if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - // All versions have a code page - this.m_codePage = (int)info.GetValue("m_codePage", typeof(int)); - - // See if we have a code page - try - { - // - // Try Whidbey V2.0 Fields - // - this.m_isReadOnly = (bool)info.GetValue("m_isReadOnly", typeof(bool)); - - this.encoderFallback = (EncoderFallback)info.GetValue("encoderFallback", typeof(EncoderFallback)); - this.decoderFallback = (DecoderFallback)info.GetValue("decoderFallback", typeof(DecoderFallback)); - } - catch (SerializationException) - { - // - // Didn't have Whidbey things, must be Everett - // - this.m_deserializedFromEverett = true; - - // May as well be read only - this.m_isReadOnly = true; - } - } - - // Just get it from GetEncoding - public Object GetRealObject(StreamingContext context) - { - // Get our encoding (Note: This has default fallbacks for readonly and everett cases) - this.realEncoding = Encoding.GetEncoding(this.m_codePage); - - // If its read only then it uses default fallbacks, otherwise pick up the new ones - // Otherwise we want to leave the new one read only - if (!this.m_deserializedFromEverett && !this.m_isReadOnly) - { - this.realEncoding = (Encoding)this.realEncoding.Clone(); - this.realEncoding.EncoderFallback = this.encoderFallback; - this.realEncoding.DecoderFallback = this.decoderFallback; - } - - return this.realEncoding; - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. 
- Debug.Assert(false, "Didn't expect to make it to CodePageEncoding ISerializable.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - - // Same problem with the Decoder, this only happens with Everett Decoders - [Serializable] - internal sealed class Decoder : IObjectReference, ISerializable - { - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization, have to handle deserializing from Everett - internal Decoder(SerializationInfo info, StreamingContext context) - { - // Any info? - if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - this.realEncoding = (Encoding)info.GetValue("encoding", typeof(Encoding)); - } - - // Just get it from GetDecider - public Object GetRealObject(StreamingContext context) - { - return this.realEncoding.GetDecoder(); - } - - // ISerializable implementation, get data for this object - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. - Debug.Assert(false, "Didn't expect to make it to CodePageEncoding.Decoder.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - } - } -} diff --git a/src/mscorlib/src/System/Text/DBCSCodePageEncoding.cs b/src/mscorlib/src/System/Text/DBCSCodePageEncoding.cs deleted file mode 100644 index 28b85d591e..0000000000 --- a/src/mscorlib/src/System/Text/DBCSCodePageEncoding.cs +++ /dev/null @@ -1,1194 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding -namespace System.Text -{ - using System; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Text; - using System.Threading; - using System.Runtime.Serialization; - using System.Security; - using System.Security.Permissions; - - // DBCSCodePageEncoding - // - [Serializable] - internal class DBCSCodePageEncoding : BaseCodePageEncoding, ISerializable - { - // Pointers to our memory section parts - [NonSerialized] - protected unsafe char* mapBytesToUnicode = null; // char 65536 - [NonSerialized] - protected unsafe ushort* mapUnicodeToBytes = null; // byte 65536 - [NonSerialized] - protected unsafe int* mapCodePageCached = null; // to remember which CP is cached - - [NonSerialized] - protected const char UNKNOWN_CHAR_FLAG=(char)0x0; - [NonSerialized] - protected const char UNICODE_REPLACEMENT_CHAR=(char)0xFFFD; - [NonSerialized] - protected const char LEAD_BYTE_CHAR=(char)0xFFFE; // For lead bytes - - // Note that even though we provide bytesUnknown and byteCountUnknown, - // They aren't actually used because of the fallback mechanism. (char is though) - [NonSerialized] - ushort bytesUnknown; - [NonSerialized] - int byteCountUnknown; - [NonSerialized] - protected char charUnknown = (char)0; - - public DBCSCodePageEncoding(int codePage) : this(codePage, codePage) - { - } - - internal DBCSCodePageEncoding(int codePage, int dataCodePage) : base(codePage, dataCodePage) - { - } - - // Constructor called by serialization. - // Note: We use the base GetObjectData however - internal DBCSCodePageEncoding(SerializationInfo info, StreamingContext context) : base(0) - { - // Actually this can't ever get called, CodePageEncoding is our proxy - Debug.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor"); - throw new ArgumentNullException("this"); - } - - // MBCS data section: - // - // We treat each multibyte pattern as 2 bytes in our table. 
If its a single byte, then the high byte - // for that position will be 0. When the table is loaded, leading bytes are flagged with 0xFFFE, so - // when reading the table look up with each byte. If the result is 0xFFFE, then use 2 bytes to read - // further data. FFFF is a special value indicating that the unicode code is the same as the - // character code (this helps us support code points < 0x20). FFFD is used as replacement character. - // - // Normal table: - // WCHAR* - Starting with MB code point 0. - // FFFF indicates we are to use the multibyte value for our code point. - // FFFE is the lead byte mark. (This should only appear in positions < 0x100) - // FFFD is the replacement (unknown character) mark. - // 2-20 means to advance the pointer 2-0x20 characters. - // 1 means that to advance to the multibyte position contained in the next char. - // 0 nothing special (I don't think its possible.) - // - // Table ends when multibyte position has advanced to 0xFFFF. - // - // Bytes->Unicode Best Fit table: - // WCHAR* - Same as normal table, except first wchar is byte position to start at. - // - // Unicode->Bytes Best Fit Table: - // WCHAR* - Same as normal table, except first wchar is char position to start at and - // we loop through unicode code points and the table has the byte points that - // corrospond to those unicode code points. 
- // We have a managed code page entry, so load our tables - // - protected override unsafe void LoadManagedCodePage() - { - // Should be loading OUR code page - Debug.Assert(pCodePage->CodePage == this.dataTableCodePage, - "[DBCSCodePageEncoding.LoadManagedCodePage]Expected to load data table code page"); - - // Make sure we're really a 1 byte code page - if (pCodePage->ByteCount != 2) - throw new NotSupportedException( - Environment.GetResourceString("NotSupported_NoCodepageData", CodePage)); - // Remember our unknown bytes & chars - bytesUnknown = pCodePage->ByteReplace; - charUnknown = pCodePage->UnicodeReplace; - - // Need to make sure the fallback buffer's fallback char is correct - if (this.DecoderFallback.IsMicrosoftBestFitFallback) - { - ((InternalDecoderBestFitFallback)(this.DecoderFallback)).cReplacement = charUnknown; - } - - // Is our replacement bytesUnknown a single or double byte character? - byteCountUnknown = 1; - if (bytesUnknown > 0xff) - byteCountUnknown++; - - // We use fallback encoder, which uses ?, which so far all of our tables do as well - Debug.Assert(bytesUnknown == 0x3f, - "[DBCSCodePageEncoding.LoadManagedCodePage]Expected 0x3f (?) as unknown byte character"); - - // Get our mapped section (bytes to allocate = 2 bytes per 65536 Unicode chars + 2 bytes per 65536 DBCS chars) - // Plus 4 byte to remember CP # when done loading it. 
(Don't want to get IA64 or anything out of alignment) - byte *pMemorySection = GetSharedMemory(65536 * 2 * 2 + 4 + this.iExtraBytes); - - mapBytesToUnicode = (char*)pMemorySection; - mapUnicodeToBytes = (ushort*)(pMemorySection + 65536 * 2); - mapCodePageCached = (int*)(pMemorySection + 65536 * 2 * 2 + this.iExtraBytes); - - // If its cached (& filled in) we don't have to do anything else - if (*mapCodePageCached != 0) - { - Debug.Assert(((*mapCodePageCached == this.dataTableCodePage && this.bFlagDataTable) || - (*mapCodePageCached == this.CodePage && !this.bFlagDataTable)), - "[DBCSCodePageEncoding.LoadManagedCodePage]Expected mapped section cached page flag to be set to data table or regular code page."); - - // Special case for GB18030 because it mangles its own code page after this function - if ((*mapCodePageCached != this.dataTableCodePage && this.bFlagDataTable) || - (*mapCodePageCached != this.CodePage && !this.bFlagDataTable)) - throw new OutOfMemoryException( - Environment.GetResourceString("Arg_OutOfMemoryException")); - - // If its cached (& filled in) we don't have to do anything else - return; - } - - // Need to read our data file and fill in our section. - // WARNING: Multiple code pieces could do this at once (so we don't have to lock machine-wide) - // so be careful here. Only stick legal values in here, don't stick temporary values. 
- - // Move to the beginning of the data section - char* pData = (char*)&(pCodePage->FirstDataWord); - - // We start at bytes position 0 - int bytePosition = 0; - int useBytes = 0; - - while (bytePosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - bytePosition = (int)(*pData); - pData++; - continue; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - bytePosition += input; - continue; - } - else if (input == 0xFFFF) - { - // Same as our bytePosition - useBytes = bytePosition; - input = unchecked((char)bytePosition); - } - else if (input == LEAD_BYTE_CHAR) // 0xfffe - { - // Lead byte mark - Debug.Assert(bytePosition < 0x100, "[DBCSCodePageEncoding.LoadManagedCodePage]expected lead byte to be < 0x100"); - useBytes = bytePosition; - // input stays 0xFFFE - } - else if (input == UNICODE_REPLACEMENT_CHAR) - { - // Replacement char is already done - bytePosition++; - continue; - } - else - { - // Use this character - useBytes = bytePosition; - // input == input; - } - - // We may need to clean up the selected character & position - if (CleanUpBytes(ref useBytes)) - { - // Use this selected character at the selected position, don't do this if not supposed to. - if (input != LEAD_BYTE_CHAR) - { - // Don't do this for lead byte marks. - mapUnicodeToBytes[input] = unchecked((ushort)useBytes); - } - mapBytesToUnicode[useBytes] = input; - } - bytePosition++; - } - - // See if we have any clean up junk to do - CleanUpEndBytes(mapBytesToUnicode); - - // We're done with our mapped section, set our flag so others don't have to rebuild table. 
- // We only do this if we're flagging(using) the data table as our primary mechanism - if (this.bFlagDataTable) - *mapCodePageCached = this.dataTableCodePage; - } - - // Any special processing for this code page - protected virtual bool CleanUpBytes(ref int bytes) - { - return true; - } - - // Any special processing for this code page - protected virtual unsafe void CleanUpEndBytes(char* chars) - { - } - - // Private object for locking instead of locking on a public type for SQL reliability work. - private static Object s_InternalSyncObject; - private static Object InternalSyncObject - { - get - { - if (s_InternalSyncObject == null) - { - Object o = new Object(); - Interlocked.CompareExchange<Object>(ref s_InternalSyncObject, o, null); - } - return s_InternalSyncObject; - } - } - - // Read in our best fit table - protected unsafe override void ReadBestFitTable() - { - // Lock so we don't confuse ourselves. - lock(InternalSyncObject) - { - // If we got a best fit array already then don't do this - if (arrayUnicodeBestFit == null) - { - // - // Read in Best Fit table. 
- // - - // First we have to advance past original character mapping table - // Move to the beginning of the data section - char* pData = (char*)&(pCodePage->FirstDataWord); - - // We start at bytes position 0 - int bytesPosition = 0; - - while (bytesPosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - bytesPosition = (int)(*pData); - pData++; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - bytesPosition += input; - } - else - { - // All other cases add 1 to bytes position - bytesPosition++; - } - } - - // Now bytesPosition is at start of bytes->unicode best fit table - char* pBytes2Unicode = pData; - - // Now pData should be pointing to first word of bytes -> unicode best fit table - // (which we're also not using at the moment) - int iBestFitCount = 0; - bytesPosition = *pData; - pData++; - - while (bytesPosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - bytesPosition = (int)(*pData); - pData++; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - bytesPosition += input; - } - else - { - // Use this character (unless its unknown, unk just skips 1) - if (input != UNICODE_REPLACEMENT_CHAR) - { - int correctedChar = bytesPosition; - if (CleanUpBytes(ref correctedChar)) - { - // Sometimes correction makes them same as no best fit, skip those. - if (mapBytesToUnicode[correctedChar] != input) - { - iBestFitCount++; - } - } - } - - // Position gets incremented in any case. 
- bytesPosition++; - } - - } - - // Now we know how big the best fit table has to be - char[] arrayTemp = new char[iBestFitCount * 2]; - - // Now we know how many best fits we have, so go back & read them in - iBestFitCount = 0; - pData = pBytes2Unicode; - bytesPosition = *pData; - pData++; - bool bOutOfOrder = false; - - // Read it all in again - while (bytesPosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - bytesPosition = (int)(*pData); - pData++; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - bytesPosition += input; - } - else - { - // Use this character (unless its unknown, unk just skips 1) - if (input != UNICODE_REPLACEMENT_CHAR) - { - int correctedChar = bytesPosition; - if (CleanUpBytes(ref correctedChar)) - { - // Sometimes correction makes them same as no best fit, skip those. - if (mapBytesToUnicode[correctedChar] != input) - { - if (correctedChar != bytesPosition) - bOutOfOrder = true; - - arrayTemp[iBestFitCount++] = unchecked((char)correctedChar); - arrayTemp[iBestFitCount++] = input; - } - } - } - - // Position gets incremented in any case. - bytesPosition++; - } - } - - // If they're out of order we need to sort them. 
- if (bOutOfOrder) - { - Debug.Assert((arrayTemp.Length / 2) < 20, - "[DBCSCodePageEncoding.ReadBestFitTable]Expected small best fit table < 20 for code page " + CodePage + ", not " + arrayTemp.Length / 2); - - for (int i = 0; i < arrayTemp.Length - 2; i+=2) - { - int iSmallest = i; - char cSmallest = arrayTemp[i]; - - for (int j = i + 2; j < arrayTemp.Length; j+=2) - { - // Find smallest one for front - if (cSmallest > arrayTemp[j]) - { - cSmallest = arrayTemp[j]; - iSmallest = j; - } - } - - // If smallest one is something else, switch them - if (iSmallest != i) - { - char temp = arrayTemp[iSmallest]; - arrayTemp[iSmallest] = arrayTemp[i]; - arrayTemp[i] = temp; - temp = arrayTemp[iSmallest+1]; - arrayTemp[iSmallest+1] = arrayTemp[i+1]; - arrayTemp[i+1] = temp; - } - } - } - - // Remember our array - arrayBytesBestFit = arrayTemp; - - // Now were at beginning of Unicode -> Bytes best fit table, need to count them - char* pUnicode2Bytes = pData; - int unicodePosition = *(pData++); - iBestFitCount = 0; - - while (unicodePosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - unicodePosition = (int)*pData; - pData++; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - unicodePosition += input; - } - else - { - // Same as our unicodePosition or use this character - if (input > 0) - iBestFitCount++; - unicodePosition++; - } - } - - // Allocate our table - arrayTemp = new char[iBestFitCount*2]; - - // Now do it again to fill the array with real values - pData = pUnicode2Bytes; - unicodePosition = *(pData++); - iBestFitCount = 0; - - while (unicodePosition < 0x10000) - { - // Get the next byte - char input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next data as our byte position - unicodePosition = (int)*pData; - pData++; - } - else if (input < 0x20 && input > 0) - { - // Advance input characters - 
unicodePosition += input; - } - else - { - if (input > 0) - { - // Use this character, may need to clean it up - int correctedChar = (int)input; - if (CleanUpBytes(ref correctedChar)) - { - arrayTemp[iBestFitCount++] = unchecked((char)unicodePosition); - // Have to map it to Unicode because best fit will need unicode value of best fit char. - arrayTemp[iBestFitCount++] = mapBytesToUnicode[correctedChar]; - - // This won't work if it won't round trip. - // We can't do this assert for CP 51932 & 50220 because they aren't - // calling CleanUpBytes() for best fit. All the string stuff here - // also makes this assert slow. - // Debug.Assert(arrayTemp[iBestFitCount-1] != (char)0xFFFD, String.Format( - // "[DBCSCodePageEncoding.ReadBestFitTable] No valid Unicode value {0:X4} for round trip bytes {1:X4}, encoding {2}", - // (int)mapBytesToUnicode[input], (int)input, CodePage)); - } - } - unicodePosition++; - } - } - - // Remember our array - arrayUnicodeBestFit = arrayTemp; - } - - } - } - - // GetByteCount - // Note: We start by assuming that the output will be the same as count. Having - // an encoder or fallback may change that assumption - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(count >= 0, "[DBCSCodePageEncoding.GetByteCount]count is negative"); - Debug.Assert(chars != null, "[DBCSCodePageEncoding.GetByteCount]chars is null"); - - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[DBCSCodePageEncoding.GetByteCount]Attempting to use null fallback"); - - CheckMemorySection(); - - // Get any left over characters - char charLeftOver = (char)0; - if (encoder != null) - { - charLeftOver = encoder.charLeftOver; - - // Only count if encoder.m_throwOnOverflow - if (encoder.InternalHasFallbackBuffer && encoder.FallbackBuffer.Remaining > 0) - throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty", - this.EncodingName, encoder.Fallback.GetType())); - } - - // prepare our end - int byteCount = 0; - char* charEnd = chars + count; - - // For fallback we will need a fallback buffer - EncoderFallbackBuffer fallbackBuffer = null; - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[DBCSCodePageEncoding.GetByteCount]leftover character should be high surrogate"); - Debug.Assert(encoder != null, - "[DBCSCodePageEncoding.GetByteCount]Expect to have encoder if we have a charLeftOver"); - - // Since left over char was a surrogate, it'll have to be fallen back. - // Get Fallback - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); - // This will fallback a pair if *chars is a low surrogate - fallbackBuffer.InternalFallback(charLeftOver, ref chars); - } - - // Now we may have fallback char[] already (from the encoder) - - // We have to use fallback method. - char ch; - while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) - { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // get byte for this char - ushort sTemp = mapUnicodeToBytes[ch]; - - // Check for fallback, this'll catch surrogate pairs too. 
- if (sTemp == 0 && ch != (char)0) - { - if (fallbackBuffer == null) - { - // Initialize the buffer - if (encoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(charEnd - count, charEnd, encoder, false); - } - - // Get Fallback - fallbackBuffer.InternalFallback(ch, ref chars); - continue; - } - - // We'll use this one - byteCount++; - if (sTemp >= 0x100) - byteCount++; - } - - return (int)byteCount; - } - - internal override unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS encoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[DBCSCodePageEncoding.GetBytes]bytes is null"); - Debug.Assert(byteCount >= 0, "[DBCSCodePageEncoding.GetBytes]byteCount is negative"); - Debug.Assert(chars != null, "[DBCSCodePageEncoding.GetBytes]chars is null"); - Debug.Assert(charCount >= 0, "[DBCSCodePageEncoding.GetBytes]charCount is negative"); - - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[DBCSCodePageEncoding.GetBytes]Attempting to use null encoder fallback"); - - CheckMemorySection(); - - // For fallback we will need a fallback buffer - EncoderFallbackBuffer fallbackBuffer = null; - - // prepare our end - char* charEnd = chars + charCount; - char* charStart = chars; - byte* byteStart = bytes; - byte* byteEnd = bytes + byteCount; - - // Get any left over characters - char charLeftOver = (char)0; - if (encoder != null) - { - charLeftOver = encoder.charLeftOver; - Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), - "[DBCSCodePageEncoding.GetBytes]leftover character should be high surrogate"); - - // Go ahead and get the fallback buffer (need leftover fallback if converting) - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, true); - - // If we're not converting we must not have a fallback buffer - if (encoder.m_throwOnOverflow && fallbackBuffer.Remaining > 0) - throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty", - this.EncodingName, encoder.Fallback.GetType())); - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(encoder != null, - "[DBCSCodePageEncoding.GetBytes]Expect to have encoder if we have a charLeftOver"); - - // Since left over char was a surrogate, it'll have to be fallen back. - // Get Fallback - fallbackBuffer.InternalFallback(charLeftOver, ref chars); - } - } - - // Now we may have fallback char[] already from the encoder - - // Go ahead and do it, including the fallback. - char ch; - while ((ch = (fallbackBuffer == null) ? 
'\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) - { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // get byte for this char - ushort sTemp = mapUnicodeToBytes[ch]; - - // Check for fallback, this'll catch surrogate pairs too. - if (sTemp == 0 && ch != (char)0) - { - if (fallbackBuffer == null) - { - // Initialize the buffer - Debug.Assert(encoder == null, - "[DBCSCodePageEncoding.GetBytes]Expected delayed create fallback only if no encoder."); - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, true); - } - - // Get Fallback - fallbackBuffer.InternalFallback(ch, ref chars); - continue; - } - - // We'll use this one (or two) - // Bounds check - - // Go ahead and add it, lead byte 1st if necessary - if (sTemp >= 0x100) - { - if (bytes + 1 >= byteEnd) - { - // didn't use this char, we'll throw or use buffer - if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) - { - Debug.Assert(chars > charStart, - "[DBCSCodePageEncoding.GetBytes]Expected chars to have advanced (double byte case)"); - chars--; // don't use last char - } - else - fallbackBuffer.MovePrevious(); // don't use last fallback - ThrowBytesOverflow(encoder, chars == charStart); // throw ? - break; // don't throw, stop - } - - *bytes = unchecked((byte)(sTemp >> 8)); - bytes++; - } - // Single byte - else if (bytes >= byteEnd) - { - // didn't use this char, we'll throw or use buffer - if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) - { - Debug.Assert(chars > charStart, - "[DBCSCodePageEncoding.GetBytes]Expected chars to have advanced (single byte case)"); - chars--; // don't use last char - } - else - fallbackBuffer.MovePrevious(); // don't use last fallback - ThrowBytesOverflow(encoder, chars == charStart); // throw ? 
- break; // don't throw, stop - } - - *bytes = unchecked((byte)(sTemp & 0xff)); - bytes++; - } - - // encoder stuff if we have one - if (encoder != null) - { - // Fallback stuck it in encoder if necessary, but we have to clear MustFlush cases - if (fallbackBuffer != null && !fallbackBuffer.bUsedEncoder) - // Clear it in case of MustFlush - encoder.charLeftOver = (char)0; - - // Set our chars used count - encoder.m_charsUsed = (int)(chars - charStart); - } - - // If we're not converting we must not have a fallback buffer - // (We don't really have a way to clear none-encoder using fallbacks however) -// Debug.Assert((encoder == null || encoder.m_throwOnOverflow) && -// (fallbackBuffer == null || fallbackBuffer.Remaining == 0), -// "[DBCSEncoding.GetBytes]Expected empty fallback buffer at end if not converting"); - - return (int)(bytes - byteStart); - } - - // This is internal and called by something else, - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - // Just assert, we're called internally so these should be safe, checked already - Debug.Assert(bytes != null, "[DBCSCodePageEncoding.GetCharCount]bytes is null"); - Debug.Assert(count >= 0, "[DBCSCodePageEncoding.GetCharCount]byteCount is negative"); - - CheckMemorySection(); - - // Fix our decoder - DBCSDecoder decoder = (DBCSDecoder)baseDecoder; - - // Get our fallback - DecoderFallbackBuffer fallbackBuffer = null; - - // We'll need to know where the end is - byte* byteEnd = bytes + count; - int charCount = count; // Assume 1 char / byte - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check m_throwOnOverflow for count) - Debug.Assert(decoder == null || - !decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[DBCSCodePageEncoding.GetCharCount]Expected empty fallback buffer at start"); - - // If we have a left over byte, use it - if (decoder != null && decoder.bLeftOver > 0) - { - // We have a left over 
byte? - if (count == 0) - { - // No input though - if (!decoder.MustFlush) - { - // Don't have to flush - return 0; - } - - - Debug.Assert(fallbackBuffer == null, - "[DBCSCodePageEncoding.GetCharCount]Expected empty fallback buffer"); - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(bytes, null); - - byte[] byteBuffer = new byte[] { unchecked((byte)decoder.bLeftOver) }; - return fallbackBuffer.InternalFallback(byteBuffer, bytes); - } - - // Get our full info - int iBytes = decoder.bLeftOver << 8; - iBytes |= (*bytes); - bytes++; - - // This is either 1 known char or fallback - // Already counted 1 char - // Look up our bytes - char cDecoder = mapBytesToUnicode[iBytes]; - if (cDecoder == 0 && iBytes != 0) - { - // Deallocate preallocated one - charCount--; - - // We'll need a fallback - Debug.Assert(fallbackBuffer == null, - "[DBCSCodePageEncoding.GetCharCount]Expected empty fallback buffer for unknown pair"); - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - count, null); - - // Do fallback, we know there're 2 bytes - byte[] byteBuffer = new byte[] { unchecked((byte)(iBytes >> 8)), unchecked((byte)iBytes) }; - charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); - } - // else we already reserved space for this one. 
- } - - // Loop, watch out for fallbacks - while (bytes < byteEnd) - { - // Faster if don't use *bytes++; - int iBytes = *bytes; - bytes++; - char c = mapBytesToUnicode[iBytes]; - - // See if it was a double byte character - if (c == LEAD_BYTE_CHAR) - { - // Its a lead byte - charCount--; // deallocate preallocated lead byte - if (bytes < byteEnd) - { - // Have another to use, so use it - iBytes <<= 8; - iBytes |= *bytes; - bytes++; - c = mapBytesToUnicode[iBytes]; - } - else - { - // No input left - if (decoder == null || decoder.MustFlush) - { - // have to flush anyway, set to unknown so we use fallback in a 'sec - charCount++; // reallocate deallocated lead byte - c = UNKNOWN_CHAR_FLAG; - } - else - { - // We'll stick it in decoder - break; - } - } - } - - // See if it was unknown. - // Unknown and known chars already allocated, but fallbacks aren't - if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - if (fallbackBuffer == null) - { - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - count, null); - } - - // Do fallback - charCount--; // Get rid of preallocated extra char - byte[] byteBuffer = null; - if (iBytes < 0x100) - byteBuffer = new byte[] { unchecked((byte)iBytes) }; - else - byteBuffer = new byte[] { unchecked((byte)(iBytes >> 8)), unchecked((byte)iBytes) }; - charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); - } - } - - // Shouldn't have anything in fallback buffer for GetChars - Debug.Assert(decoder == null || !decoder.m_throwOnOverflow || - !decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[DBCSCodePageEncoding.GetCharCount]Expected empty fallback buffer at end"); - - // Return our count - return charCount; - } - - internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS baseDecoder) - { - // Just need to ASSERT, this is called by 
something else internal that checked parameters already - Debug.Assert(bytes != null, "[DBCSCodePageEncoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[DBCSCodePageEncoding.GetChars]byteCount is negative"); - Debug.Assert(chars != null, "[DBCSCodePageEncoding.GetChars]chars is null"); - Debug.Assert(charCount >= 0, "[DBCSCodePageEncoding.GetChars]charCount is negative"); - - CheckMemorySection(); - - // Fix our decoder - DBCSDecoder decoder = (DBCSDecoder)baseDecoder; - - // We'll need to know where the end is - byte* byteStart = bytes; - byte* byteEnd = bytes + byteCount; - char* charStart = chars; - char* charEnd = chars + charCount; - bool bUsedDecoder = false; - - // Get our fallback - DecoderFallbackBuffer fallbackBuffer = null; - - // Shouldn't have anything in fallback buffer for GetChars - Debug.Assert(decoder == null || !decoder.m_throwOnOverflow || - !decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[DBCSCodePageEncoding.GetChars]Expected empty fallback buffer at start"); - - // If we have a left over byte, use it - if (decoder != null && decoder.bLeftOver > 0) - { - // We have a left over byte? - if (byteCount == 0) - { - // No input though - if (!decoder.MustFlush) - { - // Don't have to flush - return 0; - } - - // Well, we're flushing, so use '?' 
or fallback - // fallback leftover byte - Debug.Assert(fallbackBuffer == null, - "[DBCSCodePageEncoding.GetChars]Expected empty fallback"); - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(bytes, charEnd); - - // If no room its hopeless, this was 1st fallback - byte[] byteBuffer = new byte[] { unchecked((byte)decoder.bLeftOver) }; - if (!fallbackBuffer.InternalFallback(byteBuffer, bytes, ref chars)) - ThrowCharsOverflow(decoder, true); - - decoder.bLeftOver = 0; - - // Done, return it - return (int)(chars-charStart); - } - - // Get our full info - int iBytes = decoder.bLeftOver << 8; - iBytes |= (*bytes); - bytes++; - - // Look up our bytes - char cDecoder = mapBytesToUnicode[iBytes]; - if (cDecoder == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - Debug.Assert(fallbackBuffer == null, - "[DBCSCodePageEncoding.GetChars]Expected empty fallback for two bytes"); - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - byteCount, charEnd); - - byte[] byteBuffer = new byte[] { unchecked((byte)(iBytes >> 8)), unchecked((byte)iBytes) }; - if (!fallbackBuffer.InternalFallback(byteBuffer, bytes, ref chars)) - ThrowCharsOverflow(decoder, true); - } - else - { - // Do we have output room?, hopeless if not, this is first char - if (chars >= charEnd) - ThrowCharsOverflow(decoder, true); - - *(chars++) = cDecoder; - } - } - - // Loop, paying attention to our fallbacks. 
- while (bytes < byteEnd) - { - // Faster if don't use *bytes++; - int iBytes = *bytes; - bytes++; - char c = mapBytesToUnicode[iBytes]; - - // See if it was a double byte character - if (c == LEAD_BYTE_CHAR) - { - // Its a lead byte - if (bytes < byteEnd) - { - // Have another to use, so use it - iBytes <<= 8; - iBytes |= *bytes; - bytes++; - c = mapBytesToUnicode[iBytes]; - } - else - { - // No input left - if (decoder == null || decoder.MustFlush) - { - // have to flush anyway, set to unknown so we use fallback in a 'sec - c = UNKNOWN_CHAR_FLAG; - } - else - { - // Stick it in decoder - bUsedDecoder = true; - decoder.bLeftOver = (byte)iBytes; - break; - } - } - } - - // See if it was unknown - if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - if (fallbackBuffer == null) - { - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - byteCount, charEnd); - } - - // Do fallback - byte[] byteBuffer = null; - if (iBytes < 0x100) - byteBuffer = new byte[] { unchecked((byte)iBytes) }; - else - byteBuffer = new byte[] { unchecked((byte)(iBytes >> 8)), unchecked((byte)iBytes) }; - if (!fallbackBuffer.InternalFallback(byteBuffer, bytes, ref chars)) - { - // May or may not throw, but we didn't get these byte(s) - Debug.Assert(bytes >= byteStart + byteBuffer.Length, - "[DBCSCodePageEncoding.GetChars]Expected bytes to have advanced for fallback"); - bytes-=byteBuffer.Length; // didn't use these byte(s) - fallbackBuffer.InternalReset(); // Didn't fall this back - ThrowCharsOverflow(decoder, bytes == byteStart); // throw? - break; // don't throw, but stop loop - } - } - else - { - // Do we have buffer room? 
- if (chars >= charEnd) - { - // May or may not throw, but we didn't get these byte(s) - Debug.Assert(bytes > byteStart, - "[DBCSCodePageEncoding.GetChars]Expected bytes to have advanced for lead byte"); - bytes--; // unused byte - if (iBytes >= 0x100) - { - Debug.Assert(bytes > byteStart, - "[DBCSCodePageEncoding.GetChars]Expected bytes to have advanced for trail byte"); - bytes--; // 2nd unused byte - } - ThrowCharsOverflow(decoder, bytes == byteStart); // throw? - break; // don't throw, but stop loop - } - - *(chars++) = c; - } - } - - // We already stuck it in encoder if necessary, but we have to clear cases where nothing new got into decoder - if (decoder != null) - { - // Clear it in case of MustFlush - if (bUsedDecoder == false) - { - decoder.bLeftOver = 0; - } - - // Remember our count - decoder.m_bytesUsed = (int)(bytes - byteStart); - } - - // Shouldn't have anything in fallback buffer for GetChars - Debug.Assert(decoder == null || !decoder.m_throwOnOverflow || - !decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0, - "[DBCSCodePageEncoding.GetChars]Expected empty fallback buffer at end"); - - // Return length of our output - return (int)(chars - charStart); - } - - public override int GetMaxByteCount(int charCount) - { - if (charCount < 0) - throw new ArgumentOutOfRangeException(nameof(charCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Characters would be # of characters + 1 in case high surrogate is ? * max fallback - long byteCount = (long)charCount + 1; - - if (EncoderFallback.MaxCharCount > 1) - byteCount *= EncoderFallback.MaxCharCount; - - // 2 to 1 is worst case. 
Already considered surrogate fallback - byteCount *= 2; - - if (byteCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow")); - - return (int)byteCount; - } - - public override int GetMaxCharCount(int byteCount) - { - if (byteCount < 0) - throw new ArgumentOutOfRangeException(nameof(byteCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // DBCS is pretty much the same, but could have hanging high byte making extra ? and fallback for unknown - long charCount = ((long)byteCount + 1); - - // 1 to 1 for most characters. Only surrogates with fallbacks have less, unknown fallbacks could be longer. - if (DecoderFallback.MaxCharCount > 1) - charCount *= DecoderFallback.MaxCharCount; - - if (charCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow")); - - return (int)charCount; - } - - public override Decoder GetDecoder() - { - return new DBCSDecoder(this); - } - - [Serializable] - internal class DBCSDecoder : DecoderNLS - { - // Need a place for the last left over byte - internal byte bLeftOver = 0; - - public DBCSDecoder(DBCSCodePageEncoding encoding) : base(encoding) - { - // Base calls reset - } - - public override void Reset() - { - this.bLeftOver = 0; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our decoder? 
- internal override bool HasState - { - get - { - return (this.bLeftOver != 0); - } - } - } - } -} -#endif // FEATURE_CODEPAGES_FILE - diff --git a/src/mscorlib/src/System/Text/Decoder.cs b/src/mscorlib/src/System/Text/Decoder.cs index 0ebbacddcf..a9fea82a39 100644 --- a/src/mscorlib/src/System/Text/Decoder.cs +++ b/src/mscorlib/src/System/Text/Decoder.cs @@ -20,7 +20,6 @@ namespace System.Text // class are typically obtained through calls to the GetDecoder method // of Encoding objects. // - [System.Runtime.InteropServices.ComVisible(true)] [Serializable] public abstract class Decoder { @@ -39,7 +38,6 @@ namespace System.Text // We don't call default reset because default reset probably isn't good if we aren't initialized. } - [System.Runtime.InteropServices.ComVisible(false)] public DecoderFallback Fallback { get @@ -65,7 +63,6 @@ namespace System.Text // Note: we don't test for threading here because async access to Encoders and Decoders // doesn't work anyway. - [System.Runtime.InteropServices.ComVisible(false)] public DecoderFallbackBuffer FallbackBuffer { get @@ -99,7 +96,6 @@ namespace System.Text // // Virtual implimentation has to call GetChars with flush and a big enough buffer to clear a 0 byte string // We avoid GetMaxCharCount() because a) we can't call the base encoder and b) it might be really big. 
- [System.Runtime.InteropServices.ComVisible(false)] public virtual void Reset() { byte[] byteTemp = Array.Empty<byte>(); @@ -117,7 +113,6 @@ namespace System.Text // public abstract int GetCharCount(byte[] bytes, int index, int count); - [System.Runtime.InteropServices.ComVisible(false)] public virtual int GetCharCount(byte[] bytes, int index, int count, bool flush) { return GetCharCount(bytes, index, count); @@ -126,7 +121,6 @@ namespace System.Text // We expect this to be the workhorse for NLS Encodings, but for existing // ones we need a working (if slow) default implimentation) [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetCharCount(byte* bytes, int count, bool flush) { // Validate input parameters @@ -190,7 +184,6 @@ namespace System.Text // could easily overflow our output buffer. Therefore we do an extra test // when we copy the buffer so that we don't overflow charCount either. [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount, bool flush) { @@ -248,7 +241,6 @@ namespace System.Text // Note that if all of the input bytes are not consumed, then we'll do a /2, which means // that its likely that we didn't consume as many bytes as we could have. For some // applications this could be slow. (Like trying to exactly fill an output buffer from a bigger stream) - [System.Runtime.InteropServices.ComVisible(false)] public virtual void Convert(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, int charCount, bool flush, out int bytesUsed, out int charsUsed, out bool completed) @@ -306,7 +298,6 @@ namespace System.Text // that its likely that we didn't consume as many bytes as we could have. For some // applications this could be slow. 
(Like trying to exactly fill an output buffer from a bigger stream) [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe void Convert(byte* bytes, int byteCount, char* chars, int charCount, bool flush, out int bytesUsed, out int charsUsed, out bool completed) diff --git a/src/mscorlib/src/System/Text/DecoderFallback.cs b/src/mscorlib/src/System/Text/DecoderFallback.cs index 42483a724d..bfd4a2852d 100644 --- a/src/mscorlib/src/System/Text/DecoderFallback.cs +++ b/src/mscorlib/src/System/Text/DecoderFallback.cs @@ -75,14 +75,6 @@ namespace System.Text // Maximum number of characters that this instance of this fallback could return public abstract int MaxCharCount { get; } - - internal bool IsMicrosoftBestFitFallback - { - get - { - return bIsMicrosoftBestFitFallback; - } - } } diff --git a/src/mscorlib/src/System/Text/DecoderNLS.cs b/src/mscorlib/src/System/Text/DecoderNLS.cs index e44c43adef..79474f8d8c 100644 --- a/src/mscorlib/src/System/Text/DecoderNLS.cs +++ b/src/mscorlib/src/System/Text/DecoderNLS.cs @@ -5,7 +5,6 @@ namespace System.Text { using System.Runtime.Serialization; - using System.Security.Permissions; using System.Text; using System; using System.Diagnostics.Contracts; @@ -98,7 +97,7 @@ namespace System.Text bytes = new byte[1]; // Just call pointer version - fixed (byte* pBytes = bytes) + fixed (byte* pBytes = &bytes[0]) return GetCharCount(pBytes + index, count, flush); } @@ -159,8 +158,8 @@ namespace System.Text chars = new char[1]; // Just call pointer version - fixed (byte* pBytes = bytes) - fixed (char* pChars = chars) + fixed (byte* pBytes = &bytes[0]) + fixed (char* pChars = &chars[0]) // Remember that charCount is # to decode, not size of array return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush); @@ -223,9 +222,9 @@ namespace System.Text chars = new char[1]; // Just call the pointer version (public overrides can't do this) - fixed (byte* pBytes = bytes) + fixed 
(byte* pBytes = &bytes[0]) { - fixed (char* pChars = chars) + fixed (char* pChars = &chars[0]) { Convert(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush, out bytesUsed, out charsUsed, out completed); diff --git a/src/mscorlib/src/System/Text/EUCJPEncoding.cs b/src/mscorlib/src/System/Text/EUCJPEncoding.cs deleted file mode 100644 index 44345b22b9..0000000000 --- a/src/mscorlib/src/System/Text/EUCJPEncoding.cs +++ /dev/null @@ -1,183 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding -namespace System.Text -{ - using System.Text; - using System.Globalization; - - // EUCJPEncoding - // - // EUC-JP Encoding (51932) - // - // EUC-JP has the following code points: - // 00-7F - ASCII - // 80-8D & 90-9F - Control. (Like Unicode, except for 8e and 8f) - // A1-FE, A1-FE - 2 byte JIS X 0208 range. - // 8E, A1-DF - 2 byte half-width Katakana - // 8F, A1-FE, A1-FE - 3 byte JIX X 0212 range. WE DON'T USE JIS 0212!!! - // - // New thoughts: - // Fixing windows 20932 code page so that all characters can be looked up there. - // - // Old thoughts: - // Windows NLS uses a special CP20932 for EUC-JP, but it is not used by mlang. Windows - // Maps the 3 byte ranges to the 2 byte CP20932 by masking the 2nd byte with & 0x7F. - // MLang uses the native windows 932 code page, which is more reliable, however the code points - // don't line up as nicely as the 20932 code page, however it doesn't have JIS X 0212 support. - // - // So what we do is: - // 1. For ASCII, leave it alone - // 2. For half-width Katakana, use the leading byte and convert with 20936 code page. - // 3. For JIS X 0208, Use the leading & trailing bytes with 20936 code page - // 4. For JIS X 0212, Remove the lead byte, & 0xFF7F, and use the CP20936 table to convert. 
- // - // Regarding Normalization: - // Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings - // Form D is precluded because of 0x00a8, which changes to space + dierises. - // - // I think that IsAlwaysNormalized should probably return true for form C (but not certain) - // - // NOTE: We don't use JIS 0212 so we are basically a DBCS code page, we just have to modify - // the 932 table we're basing this on. - // - - using System; - - [Serializable] - internal class EUCJPEncoding : DBCSCodePageEncoding - { - // This pretends to be CP 932 as far as memory tables are concerned. - public EUCJPEncoding() : base(51932, 932) - { - this.m_bUseMlangTypeForSerialization = true; - } - - protected unsafe override String GetMemorySectionName() - { - int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage; - - String strName = String.Format(CultureInfo.InvariantCulture, "CodePage_{0}_{1}_{2}_{3}_{4}_EUCJP", - iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor, - this.pCodePage->VersionRevision, this.pCodePage->VersionBuild); - - return strName; - } - - // Clean up characters for EUC-JP code pages, etc. 
- protected override bool CleanUpBytes(ref int bytes) - { - if (bytes >= 0x100) - { - // map extended char (0xfa40-0xfc4b) to a special range - // (ported from mlang) - if (bytes >= 0xfa40 && bytes <= 0xfc4b) - { - if ( bytes >= 0xfa40 && bytes <= 0xfa5b ) - { - if ( bytes <= 0xfa49 ) - bytes = bytes - 0x0b51 ; - else if ( bytes >= 0xfa4a && bytes <= 0xfa53 ) - bytes = bytes - 0x072f6 ; - else if ( bytes >= 0xfa54 && bytes <= 0xfa57 ) - bytes = bytes - 0x0b5b ; - else if ( bytes == 0xfa58 ) - bytes = 0x878a ; - else if ( bytes == 0xfa59 ) - bytes = 0x8782 ; - else if ( bytes == 0xfa5a ) - bytes = 0x8784 ; - else if ( bytes == 0xfa5b ) - bytes = 0x879a ; - } - else if ( bytes >= 0xfa5c && bytes <= 0xfc4b ) - { - byte tc = unchecked((byte)bytes); - if ( tc < 0x5c ) - bytes = bytes - 0x0d5f; - else if ( tc >= 0x80 && tc <= 0x9B ) - bytes = bytes - 0x0d1d; - else - bytes = bytes - 0x0d1c; - } - } - - // Convert 932 code page to 20932 like code page range - // (also ported from mlang) - byte bLead = unchecked((byte)(bytes >> 8)); - byte bTrail = unchecked((byte)bytes); - - bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71); - bLead = (byte)((bLead << 1) + 1); - if (bTrail > (byte)0x9e) - { - bTrail -= (byte)0x7e; - bLead++; - } - else - { - if (bTrail > (byte)0x7e) - bTrail--; - bTrail -= (byte)0x1f; - } - - bytes = ((int)bLead) << 8 | (int)bTrail | 0x8080; - - // // Don't step on our katakana special plane, if katakana space return false. - // if (bytes >= 0x8E00 && bytes <= 0x8EFF) - // return false; - - // Don't step out of our allocated lead byte area. - // All DBCS lead and trail bytes should be >= 0xa1 and <= 0xfe - if ((bytes & 0xFF00) < 0xa100 || (bytes & 0xFF00) > 0xfe00 || - (bytes & 0xFF) < 0xa1 || (bytes & 0xFF) > 0xfe) - return false; - - // WARNING: Our funky mapping allows illegal values, which we continue to use - // so that we're compatible with Everett. 
- } - else - { - // For 51932 1/2 Katakana gets a 0x8E lead byte - // Adjust 1/2 Katakana - if (bytes >= 0xa1 && bytes <= 0xdf) - { - bytes |= 0x8E00; - return true; - } - - // 0x81-0x9f and 0xe0-0xfc CP 932 - // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though) - // b0-df is 1/2 Katakana - // So 81-9f & e0-fc are 932 lead bytes, a1-fe are our lead bytes - // so ignore everything above 0x80 except 0xa0 and 0xff - if (bytes >= 0x81 && bytes != 0xa0 && bytes != 0xff) - { - // We set diffent lead bytes later, so just return false - return false; - } - } - - return true; - } - - protected override unsafe void CleanUpEndBytes(char* chars) - { - // Need to special case CP 51932 - // 0x81-0x9f and 0xe0-0xfc CP 932 - // 0x8e and 0xa1-0xfe CP 20932 - // 0x10 and 0x21-0x9? Us (remapping 932) - // b0-df is 1/2 Katakana (trail byte) - - // A1-FE are DBCS code points - for (int i = 0xA1; i <= 0xFE; i++) - chars[i] = LEAD_BYTE_CHAR; - - // And 8E is lead byte for Katakana (already set) - chars[0x8e] = LEAD_BYTE_CHAR; - } - } -} -#endif // FEATURE_CODEPAGES_FILE diff --git a/src/mscorlib/src/System/Text/Encoder.cs b/src/mscorlib/src/System/Text/Encoder.cs index b9d4581276..f766f98142 100644 --- a/src/mscorlib/src/System/Text/Encoder.cs +++ b/src/mscorlib/src/System/Text/Encoder.cs @@ -20,7 +20,6 @@ namespace System.Text // class are typically obtained through calls to the GetEncoder method // of Encoding objects. // - [System.Runtime.InteropServices.ComVisible(true)] [Serializable] public abstract class Encoder { @@ -39,7 +38,6 @@ namespace System.Text // We don't call default reset because default reset probably isn't good if we aren't initialized. } - [System.Runtime.InteropServices.ComVisible(false)] public EncoderFallback Fallback { get @@ -65,7 +63,6 @@ namespace System.Text // Note: we don't test for threading here because async access to Encoders and Decoders // doesn't work anyway. 
- [System.Runtime.InteropServices.ComVisible(false)] public EncoderFallbackBuffer FallbackBuffer { get @@ -99,7 +96,6 @@ namespace System.Text // // Virtual implimentation has to call GetBytes with flush and a big enough buffer to clear a 0 char string // We avoid GetMaxByteCount() because a) we can't call the base encoder and b) it might be really big. - [System.Runtime.InteropServices.ComVisible(false)] public virtual void Reset() { char[] charTemp = {}; @@ -122,7 +118,6 @@ namespace System.Text // unfortunately for existing overrides, it has to call the [] version, // which is really slow, so avoid this method if you might be calling external encodings. [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetByteCount(char* chars, int count, bool flush) { // Validate input parameters @@ -183,7 +178,6 @@ namespace System.Text // could easily overflow our output buffer. Therefore we do an extra test // when we copy the buffer so that we don't overflow byteCount either. [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush) { @@ -240,7 +234,6 @@ namespace System.Text // Note that if all of the input chars are not consumed, then we'll do a /2, which means // that its likely that we didn't consume as many chars as we could have. For some // applications this could be slow. (Like trying to exactly fill an output buffer from a bigger stream) - [System.Runtime.InteropServices.ComVisible(false)] public virtual void Convert(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, int byteCount, bool flush, out int charsUsed, out int bytesUsed, out bool completed) @@ -299,7 +292,6 @@ namespace System.Text // that its likely that we didn't consume as many chars as we could have. For some // applications this could be slow. 
(Like trying to exactly fill an output buffer from a bigger stream) [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe void Convert(char* chars, int charCount, byte* bytes, int byteCount, bool flush, out int charsUsed, out int bytesUsed, out bool completed) diff --git a/src/mscorlib/src/System/Text/EncoderBestFitFallback.cs b/src/mscorlib/src/System/Text/EncoderBestFitFallback.cs index c5f82a299b..9be095bbd8 100644 --- a/src/mscorlib/src/System/Text/EncoderBestFitFallback.cs +++ b/src/mscorlib/src/System/Text/EncoderBestFitFallback.cs @@ -123,7 +123,7 @@ namespace System.Text 0xD800, 0xDBFF)); if (!Char.IsLowSurrogate(charUnknownLow)) - throw new ArgumentOutOfRangeException("CharUnknownLow", + throw new ArgumentOutOfRangeException(nameof(charUnknownLow), Environment.GetResourceString("ArgumentOutOfRange_Range", 0xDC00, 0xDFFF)); Contract.EndContractBlock(); diff --git a/src/mscorlib/src/System/Text/EncoderExceptionFallback.cs b/src/mscorlib/src/System/Text/EncoderExceptionFallback.cs index 051f50ac7c..6735e7a5f8 100644 --- a/src/mscorlib/src/System/Text/EncoderExceptionFallback.cs +++ b/src/mscorlib/src/System/Text/EncoderExceptionFallback.cs @@ -68,7 +68,7 @@ namespace System.Text } if (!Char.IsLowSurrogate(charUnknownLow)) { - throw new ArgumentOutOfRangeException("CharUnknownLow", + throw new ArgumentOutOfRangeException(nameof(charUnknownLow), Environment.GetResourceString("ArgumentOutOfRange_Range", 0xDC00, 0xDFFF)); } diff --git a/src/mscorlib/src/System/Text/EncoderNLS.cs b/src/mscorlib/src/System/Text/EncoderNLS.cs index 2add017d68..95901e01f4 100644 --- a/src/mscorlib/src/System/Text/EncoderNLS.cs +++ b/src/mscorlib/src/System/Text/EncoderNLS.cs @@ -5,7 +5,6 @@ namespace System.Text { using System.Runtime.Serialization; - using System.Security.Permissions; using System.Text; using System; using System.Diagnostics.Contracts; @@ -98,7 +97,7 @@ namespace System.Text // Just call the pointer version int result = -1; 
- fixed (char* pChars = chars) + fixed (char* pChars = &chars[0]) { result = GetByteCount(pChars + index, count, flush); } @@ -151,8 +150,8 @@ namespace System.Text bytes = new byte[1]; // Just call pointer version - fixed (char* pChars = chars) - fixed (byte* pBytes = bytes) + fixed (char* pChars = &chars[0]) + fixed (byte* pBytes = &bytes[0]) // Remember that charCount is # to decode, not size of array. return GetBytes(pChars + charIndex, charCount, @@ -212,9 +211,9 @@ namespace System.Text bytes = new byte[1]; // Just call the pointer version (can't do this for non-msft encoders) - fixed (char* pChars = chars) + fixed (char* pChars = &chars[0]) { - fixed (byte* pBytes = bytes) + fixed (byte* pBytes = &bytes[0]) { Convert(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, flush, out charsUsed, out bytesUsed, out completed); diff --git a/src/mscorlib/src/System/Text/EncoderReplacementFallback.cs b/src/mscorlib/src/System/Text/EncoderReplacementFallback.cs index 604cddf9bb..b0657ff18d 100644 --- a/src/mscorlib/src/System/Text/EncoderReplacementFallback.cs +++ b/src/mscorlib/src/System/Text/EncoderReplacementFallback.cs @@ -153,7 +153,7 @@ namespace System.Text 0xD800, 0xDBFF)); if (!Char.IsLowSurrogate(charUnknownLow)) - throw new ArgumentOutOfRangeException("CharUnknownLow", + throw new ArgumentOutOfRangeException(nameof(charUnknownLow), Environment.GetResourceString("ArgumentOutOfRange_Range", 0xDC00, 0xDFFF)); Contract.EndContractBlock(); diff --git a/src/mscorlib/src/System/Text/Encoding.cs b/src/mscorlib/src/System/Text/Encoding.cs index 658bdbb133..8cb01e41fa 100644 --- a/src/mscorlib/src/System/Text/Encoding.cs +++ b/src/mscorlib/src/System/Text/Encoding.cs @@ -12,7 +12,6 @@ namespace System.Text using System.Runtime.Serialization; using System.Globalization; using System.Security; - using System.Security.Permissions; using System.Threading; using System.Text; using System.Diagnostics; @@ -83,7 +82,6 @@ namespace System.Text // generally executes 
faster. // - [System.Runtime.InteropServices.ComVisible(true)] [Serializable] public abstract class Encoding : ICloneable { @@ -372,23 +370,6 @@ namespace System.Text return dstEncoding.GetBytes(srcEncoding.GetChars(bytes, index, count)); } -#if FEATURE_CODEPAGES_FILE - // Private object for locking instead of locking on a public type for SQL reliability work. - private static Object s_InternalSyncObject; - private static Object InternalSyncObject { - get { - if (s_InternalSyncObject == null) { - Object o = new Object(); - Interlocked.CompareExchange<Object>(ref s_InternalSyncObject, o, null); - } - return s_InternalSyncObject; - } - } - - // On Desktop, encoding instances that aren't cached in a static field are cached in - // a hash table by codepage. - private static volatile Hashtable encodings; -#endif public static void RegisterProvider(EncodingProvider provider) { @@ -441,45 +422,6 @@ namespace System.Text "Argument_CodepageNotSupported", codepage), nameof(codepage)); } -#if FEATURE_CODEPAGES_FILE - object key = codepage; // Box once - - // See if we have a hash table with our encoding in it already. - if (encodings != null) { - result = (Encoding)encodings[key]; - } - - if (result == null) - { - // Don't conflict with ourselves - lock (InternalSyncObject) - { - // Need a new hash table - // in case another thread beat us to creating the Dictionary - if (encodings == null) { - encodings = new Hashtable(); - } - - // Double check that we don't have one in the table (in case another thread beat us here) - if ((result = (Encoding)encodings[key]) != null) - return result; - - if (codepage == CodePageWindows1252) - { - result = new SBCSCodePageEncoding(codepage); - } - else - { - result = GetEncodingCodePage(codepage) ?? GetEncodingRare(codepage); - } - - Debug.Assert(result != null, "result != null"); - - encodings.Add(key, result); - } - } - return result; -#else // Is it a valid code page? 
if (EncodingTable.GetCodePageDataItem(codepage) == null) { @@ -488,7 +430,6 @@ namespace System.Text } return UTF8; -#endif // FEATURE_CODEPAGES_FILE } [Pure] @@ -510,86 +451,6 @@ namespace System.Text return fallbackEncoding; } -#if FEATURE_CODEPAGES_FILE - private static Encoding GetEncodingRare(int codepage) - { - Debug.Assert(codepage != 0 && codepage != 1200 && codepage != 1201 && codepage != 65001, - "[Encoding.GetEncodingRare]This code page (" + codepage + ") isn't supported by GetEncodingRare!"); - Encoding result; - switch (codepage) - { - case ISCIIAssemese: - case ISCIIBengali: - case ISCIIDevanagari: - case ISCIIGujarathi: - case ISCIIKannada: - case ISCIIMalayalam: - case ISCIIOriya: - case ISCIIPanjabi: - case ISCIITamil: - case ISCIITelugu: - result = new ISCIIEncoding(codepage); - break; - // GB2312-80 uses same code page for 20936 and mac 10008 - case CodePageMacGB2312: - // case CodePageGB2312: - // result = new DBCSCodePageEncoding(codepage, EUCCN); - result = new DBCSCodePageEncoding(CodePageMacGB2312, CodePageGB2312); - break; - - // Mac Korean 10003 and 20949 are the same - case CodePageMacKorean: - result = new DBCSCodePageEncoding(CodePageMacKorean, CodePageDLLKorean); - break; - // GB18030 Code Pages - case GB18030: - result = new GB18030Encoding(); - break; - // ISO2022 Code Pages - case ISOKorean: - // case ISOSimplifiedCN - case ChineseHZ: - case ISO2022JP: // JIS JP, full-width Katakana mode (no half-width Katakana) - case ISO2022JPESC: // JIS JP, esc sequence to do Katakana. 
- case ISO2022JPSISO: // JIS JP with Shift In/ Shift Out Katakana support - result = new ISO2022Encoding(codepage); - break; - // Duplicate EUC-CN (51936) just calls a base code page 936, - // so does ISOSimplifiedCN (50227), which's gotta be broken - case DuplicateEUCCN: - case ISOSimplifiedCN: - result = new DBCSCodePageEncoding(codepage, EUCCN); // Just maps to 936 - break; - case EUCJP: - result = new EUCJPEncoding(); - break; - case EUCKR: - result = new DBCSCodePageEncoding(codepage, CodePageDLLKorean); // Maps to 20949 - break; - case ENC50229: - throw new NotSupportedException(Environment.GetResourceString("NotSupported_CodePage50229")); - case ISO_8859_8I: - result = new SBCSCodePageEncoding(codepage, ISO_8859_8_Visual); // Hebrew maps to a different code page - break; - default: - // Not found, already tried codepage table code pages in GetEncoding() - throw new NotSupportedException( - Environment.GetResourceString("NotSupported_NoCodepageData", codepage)); - } - return result; - } - - private static Encoding GetEncodingCodePage(int CodePage) - { - // Single Byte or Double Byte Code Page? (0 if not found) - int i = BaseCodePageEncoding.GetCodePageByteSize(CodePage); - if (i == 1) return new SBCSCodePageEncoding(CodePage); - else if (i == 2) return new DBCSCodePageEncoding(CodePage); - - // Return null if we didn't find one. - return null; - } -#endif // FEATURE_CODEPAGES_FILE // Returns an Encoding object for a given name or a given code page value. // [Pure] @@ -764,7 +625,6 @@ namespace System.Text // True if and only if the encoding only uses single byte code points. 
(Ie, ASCII, 1252, etc) - [System.Runtime.InteropServices.ComVisible(false)] public virtual bool IsSingleByte { get @@ -774,7 +634,6 @@ namespace System.Text } - [System.Runtime.InteropServices.ComVisible(false)] public EncoderFallback EncoderFallback { get @@ -796,7 +655,6 @@ namespace System.Text } - [System.Runtime.InteropServices.ComVisible(false)] public DecoderFallback DecoderFallback { get @@ -818,7 +676,6 @@ namespace System.Text } - [System.Runtime.InteropServices.ComVisible(false)] public virtual Object Clone() { Encoding newEncoding = (Encoding)this.MemberwiseClone(); @@ -829,7 +686,6 @@ namespace System.Text } - [System.Runtime.InteropServices.ComVisible(false)] public bool IsReadOnly { get @@ -918,7 +774,6 @@ namespace System.Text // a 3rd party encoding. [Pure] [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetByteCount(char* chars, int count) { // Validate input parameters @@ -1080,7 +935,6 @@ namespace System.Text // when we copy the buffer so that we don't overflow byteCount either. [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { @@ -1149,7 +1003,6 @@ namespace System.Text // ones we need a working (if slow) default implimentation) [Pure] [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetCharCount(byte* bytes, int count) { // Validate input parameters @@ -1236,7 +1089,6 @@ namespace System.Text // when we copy the buffer so that we don't overflow charCount either. 
[CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public virtual unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { @@ -1291,7 +1143,6 @@ namespace System.Text [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public unsafe string GetString(byte* bytes, int byteCount) { if (bytes == null) @@ -1320,18 +1171,12 @@ namespace System.Text // IsAlwaysNormalized // Returns true if the encoding is always normalized for the specified encoding form [Pure] - [System.Runtime.InteropServices.ComVisible(false)] public bool IsAlwaysNormalized() { -#if !FEATURE_NORM_IDNA_ONLY return this.IsAlwaysNormalized(NormalizationForm.FormC); -#else - return this.IsAlwaysNormalized((NormalizationForm)ExtendedNormalizationForms.FormIdna); -#endif } [Pure] - [System.Runtime.InteropServices.ComVisible(false)] public virtual bool IsAlwaysNormalized(NormalizationForm form) { // Assume false unless the encoding knows otherwise @@ -1364,23 +1209,10 @@ namespace System.Text Encoding enc; -#if FEATURE_CODEPAGES_FILE - int codePage = Win32Native.GetACP(); - - // For US English, we can save some startup working set by not calling - // GetEncoding(int codePage) since JITting GetEncoding will force us to load - // all the Encoding classes for ASCII, UTF7 & UTF8, & UnicodeEncoding. - - if (codePage == 1252) - enc = new SBCSCodePageEncoding(codePage); - else - enc = GetEncoding(codePage); -#else // FEATURE_CODEPAGES_FILE // For silverlight we use UTF8 since ANSI isn't available enc = UTF8; -#endif // FEATURE_CODEPAGES_FILE // This method should only ever return one Encoding instance return Interlocked.CompareExchange(ref defaultEncoding, enc, null) ?? 
enc; @@ -1882,20 +1714,6 @@ namespace System.Text return AddChar(ch,1); } - - internal unsafe bool AddChar(char ch1, char ch2, int numBytes) - { - // Need room for 2 chars - if (chars >= charEnd - 1) - { - // Throw maybe - bytes-=numBytes; // Didn't encode these bytes - enc.ThrowCharsOverflow(decoder, bytes <= byteStart); // Throw? - return false; // No throw, but no store either - } - return AddChar(ch1, numBytes) && AddChar(ch2, numBytes); - } - internal unsafe void AdjustBytes(int count) { bytes += count; @@ -1909,12 +1727,6 @@ namespace System.Text } } - // Do we have count more bytes? - internal unsafe bool EvenMoreData(int count) - { - return (bytes <= byteEnd - count); - } - // GetNextByte shouldn't be called unless the caller's already checked more data or even more data, // but we'll double check just to make sure. internal unsafe byte GetNextByte() @@ -1942,24 +1754,6 @@ namespace System.Text return Fallback(byteBuffer); } - internal unsafe bool Fallback(byte byte1, byte byte2) - { - // Build our buffer - byte[] byteBuffer = new byte[] { byte1, byte2 }; - - // Do the fallback and add the data. - return Fallback(byteBuffer); - } - - internal unsafe bool Fallback(byte byte1, byte byte2, byte byte3, byte byte4) - { - // Build our buffer - byte[] byteBuffer = new byte[] { byte1, byte2, byte3, byte4 }; - - // Do the fallback and add the data. - return Fallback(byteBuffer); - } - internal unsafe bool Fallback(byte[] byteBuffer) { // Do the fallback and add the data. 
@@ -2067,26 +1861,6 @@ namespace System.Text return (AddByte(b1, 1 + moreBytesExpected) && AddByte(b2, moreBytesExpected)); } - internal unsafe bool AddByte(byte b1, byte b2, byte b3) - { - return AddByte(b1, b2, b3, (int)0); - } - - internal unsafe bool AddByte(byte b1, byte b2, byte b3, int moreBytesExpected) - { - return (AddByte(b1, 2 + moreBytesExpected) && - AddByte(b2, 1 + moreBytesExpected) && - AddByte(b3, moreBytesExpected)); - } - - internal unsafe bool AddByte(byte b1, byte b2, byte b3, byte b4) - { - return (AddByte(b1, 3) && - AddByte(b2, 2) && - AddByte(b3, 1) && - AddByte(b4, 0)); - } - internal unsafe void MovePrevious(bool bThrow) { if (fallbackBuffer.bFallingBack) @@ -2104,12 +1878,6 @@ namespace System.Text enc.ThrowBytesOverflow(encoder, bytes == byteStart); // Throw? (and reset fallback if not converting) } - internal unsafe bool Fallback(char charFallback) - { - // Do the fallback - return fallbackBuffer.InternalFallback(charFallback, ref chars); - } - internal unsafe bool MoreData { get diff --git a/src/mscorlib/src/System/Text/EncodingForwarder.cs b/src/mscorlib/src/System/Text/EncodingForwarder.cs index 9a8dd26627..50ccbd9333 100644 --- a/src/mscorlib/src/System/Text/EncodingForwarder.cs +++ b/src/mscorlib/src/System/Text/EncodingForwarder.cs @@ -130,7 +130,7 @@ namespace System.Text if (bytes.Length == 0) bytes = new byte[1]; - fixed (char* pChars = s) fixed (byte* pBytes = bytes) + fixed (char* pChars = s) fixed (byte* pBytes = &bytes[0]) { return encoding.GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, encoder: null); } @@ -170,7 +170,7 @@ namespace System.Text bytes = new byte[1]; // Just call the (internal) pointer version - fixed (char* pChars = chars) fixed (byte* pBytes = bytes) + fixed (char* pChars = chars) fixed (byte* pBytes = &bytes[0]) { return encoding.GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, encoder: null); } @@ -266,7 +266,7 @@ namespace System.Text if (chars.Length == 
0) chars = new char[1]; - fixed (byte* pBytes = bytes) fixed (char* pChars = chars) + fixed (byte* pBytes = bytes) fixed (char* pChars = &chars[0]) { return encoding.GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, decoder: null); } diff --git a/src/mscorlib/src/System/Text/EncodingNLS.cs b/src/mscorlib/src/System/Text/EncodingNLS.cs index fbddf37e88..cb6ed8a52c 100644 --- a/src/mscorlib/src/System/Text/EncodingNLS.cs +++ b/src/mscorlib/src/System/Text/EncodingNLS.cs @@ -15,7 +15,6 @@ namespace System.Text // This class overrides Encoding with the things we need for our NLS Encodings - [System.Runtime.InteropServices.ComVisible(true)] [Serializable] internal abstract class EncodingNLS : Encoding { diff --git a/src/mscorlib/src/System/Text/EncodingProvider.cs b/src/mscorlib/src/System/Text/EncodingProvider.cs index a7f745a753..734d1ac761 100644 --- a/src/mscorlib/src/System/Text/EncodingProvider.cs +++ b/src/mscorlib/src/System/Text/EncodingProvider.cs @@ -8,7 +8,6 @@ namespace System.Text using System.Collections; using System.Collections.Generic; - [System.Runtime.InteropServices.ComVisible(true)] public abstract class EncodingProvider { public EncodingProvider() { } diff --git a/src/mscorlib/src/System/Text/GB18030Encoding.cs b/src/mscorlib/src/System/Text/GB18030Encoding.cs deleted file mode 100644 index 8ed52a6ab8..0000000000 --- a/src/mscorlib/src/System/Text/GB18030Encoding.cs +++ /dev/null @@ -1,1365 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - - -// -// Ported to managed code from c_gb18030.c and related gb18030 dll files -// -// -// Abstract: -// -// Managed implementation of GB18030-2000 (code page 54936) ported from implimentation in c_g18030.dll -// This file contains functions to convert GB18030-2000 (code page 54936) into Unicode, and vice versa. 
-// -// Notes: -// GB18030-2000 (aka GBK2K) is designed to be mostly compatible with GBK (codepage 936), -// while supports the full range of Unicode code points (BMP + 16 supplementary planes). -// -// The structure for GB18030 is: -// * Single byte: -// 0x00 ~ 0x7f -// * Two-byte: -// 0x81 ~ 0xfe, 0x40 ~ 0x7e (leading byte, trailing byte) -// 0x81 ~ 0xfe, 0x80 ~ 0xfe (leading byte, trailing byte) -// * Four-byte: -// 0x81 ~ 0xfe, 0x30 ~ 0x39, 0x81 ~ 0xfe, 0x30 ~ 0x39. -// The surrogare pair will be encoded from 0x90, 0x30, 0x81, 0x30 -// -// The BMP range is fully supported in GB18030 using 1-byte, 2-byte and 4-byte sequences. -// In valid 4-byte GB18030, there are two gaps that can not be mapped to Unicode characters. -// 0x84, 0x31, 0xa5, 0x30 (just after the GB18030 bytes for U+FFFF(*)) ~ 0x8f, 0x39, 0xfe, 0x39 (just before the first GB18030 bytes for U+D800,U+DC00) -// 0xe3, 0x32, 0x9a, 0x36 (just after the GB18030 bytes for U+DBFF U+DFFF(**)) ~ 0xfe, 0x39, 0xfe, 0x39 -// -// -// Note1: U+FFFF = 0x84, 0x31, 0xa4, 0x39 -// Note2: U+DBFF U+DFFF = 0xe3, 0x32, 0x9a, 0x35 -// -// Tables used in GB18030Encoding: -// -// Our data is similar to the 936 Code Page, so we start from there to build our tables. We build the -// normal double byte mapUnicodeToBytes and mapBytesToUnicode tables by applying differences from 936. -// We also build a map4BytesToUnicode table and a mapUnicodeTo4BytesFlags -// -// * mapUnicodeTo4BytesFlags -// This is an array of bytes, so we have to do a / 8 and << %8 to check the appropriate bit (see Is4Byte()) -// If the bit is set its true. -// -// true - If set/true this is a 4 byte code. The value in mapUnicodeToBytes will be the 4 byte offset -// false - If cleared/false this is a 1 or 2 byte code. The value in mapUnicodeToBytes will be the 2 bytes. 
-// -// * mapUnicodeToBytes -// Contains either the 2 byte value of double byte GB18030 or the 4 byte offset for 4 byte GB18030, -// depending on the value of the flag in mapUnicodeTo4BytesFlags -// -// * mapBytesToUnicode -// mapBytesToUnicode maps 2 byte GB 18030 to Unicode like other DBCS code pages. -// -// * map4BytesToUnicode -// map4BytesToUnicode is indexed by the 4 byte offset and contains the unicode value for each 4 byte offset -// -// -// 4 Byte sequences -// We generally use the offset for the 4 byte sequence, such as: -// -// The index value is the offset of the 4-byte GB18030. -// -// 4-byte GB18030 Index value -// ============== =========== -// 81,30,81,30 0 -// 81,30,81,31 1 -// 81,30,81,32 2 -// ... ... -// -// The value of map4BytesToUnicode cotains the Unicode codepoint for the offset of the -// corresponding 4-byte GB18030. -// -// E.g. map4BytesToUnicode[0] = 0x0080. This means that GB18030 0x81, 0x30, 0x81, 0x30 will be converted to Unicode U+0800. -// -// 4 Byte Surrogate Sequences -// Those work similarly to the normal 4 byte sequences, but start at a different offset -// -// We don't override IsAlwaysNormalized because GB18030 covers all of the unicode space, so isn't guaranteed to be normal. -// -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding -namespace System.Text -{ - using System; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Text; - using System.Runtime.InteropServices; - using System.Security; - using System.Runtime.CompilerServices; - using System.Runtime.Serialization; - using System.Runtime.Versioning; - using System.Security.Permissions; - using System.Globalization; - - /*=================================GB18030Encoding============================ - ** - ** This is used to support GB18030-2000 encoding (code page 54936). 
- ** - ==============================================================================*/ - - [Serializable] - internal sealed class GB18030Encoding : DBCSCodePageEncoding, ISerializable - { - // This is the table of 4 byte conversions. - private const int GBLast4ByteCode = 0x99FB; - [NonSerialized] - unsafe internal char* map4BytesToUnicode = null; // new char[GBLast4ByteCode + 1]; // Need to map all 4 byte sequences to Unicode - [NonSerialized] - unsafe internal byte* mapUnicodeTo4BytesFlags = null; // new byte[0x10000 / 8]; // Need 1 bit for each code point to say if its 4 byte or not - - private const int GB18030 = 54936; - - // First and last character of surrogate range as offset from 4 byte GB18030 GB81308130 - private const int GBSurrogateOffset = 0x2E248; // GB90308130 - private const int GBLastSurrogateOffset = 0x12E247; // GBE3329A35 - - // We have to load the 936 code page tables, so impersonate 936 as our base - internal GB18030Encoding() : base(GB18030, 936) - { - } - - // Constructor called by serialization. - internal GB18030Encoding(SerializationInfo info, StreamingContext context) : - base(GB18030, 936) - { - // Set up our base, also throws if info was empty - DeserializeEncoding(info, context); - Debug.Assert(info!=null, "[GB18030Encoding(Serialization...)] Expected null info to throw"); - - // Already build our code page, fallbacks & read only, so we're good to go! - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // Make sure to get the base stuff too This throws if info is null - SerializeEncoding(info, context); - Debug.Assert(info!=null, "[GB18030.GetObjectData] Expected null info to throw"); - - // Everett doesn't need more than the basics - } - - // This loads our base 936 code page and then applys the changes from the tableUnicodeToGBDiffs table. - // See table comments for table format. 
- protected override unsafe void LoadManagedCodePage() - { - // Use base code page loading algorithm. - // We need to use our main CP as our flag. - this.bFlagDataTable = false; - this.iExtraBytes = (GBLast4ByteCode + 1) * 2 + 0x10000 / 8; - - // Load most of our code page - base.LoadManagedCodePage(); - - // Point to our new data sections - byte *pMemorySection = (byte *) safeMemorySectionHandle.DangerousGetHandle(); - mapUnicodeTo4BytesFlags = pMemorySection + 65536 * 2 * 2; - map4BytesToUnicode = (char*)(pMemorySection + 65536 * 2 * 2 + 0x10000 / 8); - - // Need to check our pointer to see if we're loaded, return if we're built already - if (*mapCodePageCached == this.CodePage) - return; - - // Once we've done our base LoadManagedCodePage, we'll have to add our fixes - char unicodeCount = (char)0; - ushort count4Byte = 0; - for (int index = 0; index < tableUnicodeToGBDiffs.Length; index++) - { - ushort data = tableUnicodeToGBDiffs[index]; - - // Check high bit - if ((data & 0x8000) != 0) - { - // Make be exact value - if (data > 0x9000 && data != 0xD1A6) - { - // It was an exact value (gb18040[data] = unicode) - mapBytesToUnicode[data] = unicodeCount; - mapUnicodeToBytes[unicodeCount] = data; - unicodeCount++; - } - else - { - // It was a CP 936 compatible data, that table's already loaded, just increment our pointer - unicodeCount += unchecked((char)(data & 0x7FFF)); - } - } - else - { - // It was GB 18030 4 byte data, next <data> characters are 4 byte sequences. 
- while (data > 0) - { - Debug.Assert(count4Byte <= GBLast4ByteCode, - "[GB18030Encoding.LoadManagedCodePage] Found too many 4 byte codes in data table."); - - // Set the 4 byte -> Unicode value - map4BytesToUnicode[count4Byte] = unicodeCount; - // Set the unicode -> 4 bytes value, including flag that its a 4 byte sequence - mapUnicodeToBytes[unicodeCount] = count4Byte; - // Set the flag saying its a 4 byte sequence - mapUnicodeTo4BytesFlags[unicodeCount / 8] |= unchecked((byte)(1 << (unicodeCount % 8))); - unicodeCount++; - count4Byte++; - data--; - } - - } - } - - // unicodeCount should've wrapped back to 0 - Debug.Assert(unicodeCount == 0, - "[GB18030Encoding.LoadManagedCodePage] Expected unicodeCount to wrap around to 0 as all chars were processed"); - - // We should've read in GBLast4ByteCode 4 byte sequences - Debug.Assert(count4Byte == GBLast4ByteCode + 1, - "[GB18030Encoding.LoadManagedCodePage] Expected 0x99FB to be last 4 byte offset, found 0x" + count4Byte.ToString("X4", CultureInfo.InvariantCulture)); - - // Need to flag ourselves saying we've built this CP. - *mapCodePageCached = this.CodePage; - } - - internal override void SetDefaultFallbacks() - { - // For GB18030Encoding just use default replacement fallbacks because its only for bad surrogates - this.encoderFallback = EncoderFallback.ReplacementFallback; - this.decoderFallback = DecoderFallback.ReplacementFallback; - } - - // Is4Byte - // Checks the 4 byte table and returns true if this is a 4 byte code. 
- // Its a 4 byte code if the flag is set in mapUnicodeTo4BytesFlags - internal unsafe bool Is4Byte(char charTest) - { - // See what kind it is - byte b4Byte = mapUnicodeTo4BytesFlags[charTest / 8]; - return (b4Byte != 0 && (b4Byte & (1 << (charTest % 8))) != 0); - } - - // GetByteCount - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder) - { - // Just call GetBytes() with null bytes - return GetBytes(chars, count, null, 0, encoder); - } - - internal override unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS encoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - // We'll allow null bytes as a count -// Debug.Assert(bytes != null, "[GB18030Encoding.GetBytes]bytes is null"); - Debug.Assert(byteCount >= 0, "[GB18030Encoding.GetBytes]byteCount is negative"); - Debug.Assert(chars != null, "[GB18030Encoding.GetBytes]chars is null"); - Debug.Assert(charCount >= 0, "[GB18030Encoding.GetBytes]charCount is negative"); - - // Assert because we shouldn't be able to have a null encoder. - Debug.Assert(encoderFallback != null, "[GB18030Encoding.GetBytes]Attempting to use null encoder fallback"); - - // Get any left over characters - char charLeftOver = (char)0; - if (encoder != null) - charLeftOver = encoder.charLeftOver; - - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Try again if we were MustFlush - TryAgain: - - // Go ahead and do it, including the fallback. 
- while (buffer.MoreData) - { - // Get next char - char ch = buffer.GetNextChar(); - - // Have to check for charLeftOver - if (charLeftOver != 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), - "[GB18030Encoding.GetBytes] leftover character should be high surrogate, not 0x" + ((int)charLeftOver).ToString("X4", CultureInfo.InvariantCulture)); - - // If our next char isn't a low surrogate, then we need to do fallback. - if (!Char.IsLowSurrogate(ch)) - { - // No low surrogate, fallback high surrogate & try this one again - buffer.MovePrevious(false); // (Ignoring this character, don't thow) - if (!buffer.Fallback(charLeftOver)) - { - charLeftOver = (char)0; - break; - } - charLeftOver = (char)0; - continue; - } - else - { - // Next is a surrogate, add it as surrogate pair - - // Need 4 bytes for surrogates - // Get our offset - int offset = ((charLeftOver - 0xd800) << 10) + (ch - 0xdc00); - - byte byte4 = (byte)((offset % 0x0a) + 0x30); - offset /= 0x0a; - byte byte3 = (byte)((offset % 0x7e) + 0x81); - offset /= 0x7e; - byte byte2 = (byte)((offset % 0x0a) + 0x30); - offset /= 0x0a; - Debug.Assert(offset < 0x6f, - "[GB18030Encoding.GetBytes](1) Expected offset < 0x6f, not 0x" + offset.ToString("X2", CultureInfo.InvariantCulture)); - - charLeftOver = (char)0; - if (!buffer.AddByte((byte)(offset + 0x90),byte2,byte3,byte4)) - { - // Didn't work, need to back up for both surrogates (AddByte already backed up one) - buffer.MovePrevious(false); // (don't throw) - break; - } - } - charLeftOver = '\0'; - } - // ASCII's easiest - else if (ch <= 0x7f) - { - // Need a byte - if (!buffer.AddByte((byte)ch)) - break; - } - // See if its a surrogate pair - else if (Char.IsHighSurrogate(ch)) - { - // Remember it for next time - charLeftOver = ch; - } - else if (Char.IsLowSurrogate(ch)) - { - // Low surrogates should've been found already - if (!buffer.Fallback(ch)) - break; - } - else - { - // Not surrogate or ASCII, get value - ushort iBytes = mapUnicodeToBytes[ch]; - - // 
See what kind it is - if (Is4Byte(ch)) - { - // - // This Unicode character will be converted to four-byte GB18030. - // - // Need 4 bytes - byte byte4 = (byte)((iBytes % 0x0a) + 0x30); - iBytes /= 0x0a; - byte byte3 = (byte)((iBytes % 0x7e) + 0x81); - iBytes /= 0x7e; - byte byte2 = (byte)((iBytes % 0x0a) + 0x30); - iBytes /= 0x0a; - Debug.Assert(iBytes < 0x7e, - "[GB18030Encoding.GetBytes]Expected iBytes < 0x7e, not 0x" + iBytes.ToString("X2", CultureInfo.InvariantCulture)); - if (!buffer.AddByte((byte)(iBytes + 0x81), byte2, byte3, byte4)) - break; - } - else - { - // Its 2 byte, use it - if (!buffer.AddByte(unchecked((byte)(iBytes >> 8)), unchecked((byte)(iBytes & 0xff)))) - break; - } - } - } - - // Do we need to flush our charLeftOver? - if ((encoder == null || encoder.MustFlush) && (charLeftOver > 0)) - { - // Fall it back - buffer.Fallback(charLeftOver); - charLeftOver = (char)0; - goto TryAgain; - } - - // Fallback stuck it in encoder if necessary, but we have to clear MustFlash cases - // (Check bytes != null, don't clear it if we're just counting) - if (encoder != null) - { - // Remember our charLeftOver - if (bytes != null) - encoder.charLeftOver = charLeftOver; - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // Helper methods - internal bool IsGBLeadByte(short ch) - { - // return true if we're in the lead byte range - return ((ch) >= 0x81 && (ch) <= 0xfe); - } - - internal bool IsGBTwoByteTrailing(short ch) - { - // Return true if we are in range for the trailing byte of a 2 byte sequence - return (((ch) >= 0x40 && (ch) <= 0x7e) || - ((ch) >= 0x80 && (ch) <= 0xfe)); - } - - internal bool IsGBFourByteTrailing(short ch) - { - // Return true if we are in range for the trailing byte of a 4 byte sequence - return ((ch) >= 0x30 && (ch) <= 0x39); - } - - internal int GetFourBytesOffset(short offset1, short offset2, short offset3, short offset4) - { - return ((offset1 - 0x81) * 0x0a * 0x7e * 0x0a + - 
(offset2 - 0x30) * 0x7e * 0x0a + - (offset3 - 0x81) * 0x0a + - offset4 - 0x30); - } - - // This is internal and called by something else, - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - // Just call GetChars() with null chars to count - return GetChars(bytes, count, null, 0, baseDecoder); - } - - internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS baseDecoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - // We'll allow null chars as a count - Debug.Assert(bytes != null, "[GB18030Encoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[GB18030Encoding.GetChars]byteCount is negative"); -// Debug.Assert(chars != null, "[GB18030Encoding.GetChars]chars is null"); - Debug.Assert(charCount >= 0, "[GB18030Encoding.GetChars]charCount is negative"); - - // Fix our decoder - GB18030Decoder decoder = (GB18030Decoder)baseDecoder; - - // Get our info. - Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - // Need temp bytes because we can't muss up decoder - short byte1 = -1; - short byte2 = -1; - short byte3 = -1; - short byte4 = -1; - - // See if there was anything to get out of the decoder - if (decoder != null && decoder.bLeftOver1 != -1) - { - // Need temp bytes because we can't muss up decoder - byte1 = decoder.bLeftOver1; - byte2 = decoder.bLeftOver2; - byte3 = decoder.bLeftOver3; - byte4 = decoder.bLeftOver4; - - // Loop because we might have too many in buffer - // This could happen if we are working on a 4 byte sequence, but it isn't valid. - while (byte1 != -1) - { - // If its not a lead byte, use ? or its value, then scoot them down & try again - // This could happen if we previously had a bad 4 byte sequence and this is a trail byte - if (!IsGBLeadByte(byte1)) - { - // This is either a ? 
or ASCII, need 1 char output - if (byte1 <= 0x7f) - { - if (!buffer.AddChar((char)byte1)) // Its ASCII - break; - } - else - { - if (!buffer.Fallback((byte)byte1)) // Not a valid byte - break; - } - - byte1 = byte2; - byte2 = byte3; - byte3 = byte4; - byte4 = -1; - continue; - } - - // Read in more bytes as needed - while (byte2 == -1 || - (IsGBFourByteTrailing(byte2) && byte4 == -1)) - { - // Do we have room? - if (!buffer.MoreData) - { - // No input left to read, do we have to flush? - if (!decoder.MustFlush) - { - // Don't stick stuff in decoder when counting - if (chars != null) - { - // Don't have to flush, won't have any chars - // Decoder is correct, just return - decoder.bLeftOver1 = byte1; - decoder.bLeftOver2 = byte2; - decoder.bLeftOver3 = byte3; - decoder.bLeftOver4 = byte4; - } - - decoder.m_bytesUsed = buffer.BytesUsed; - return buffer.Count; - } - - // We'll have to flush, add a ? and scoot them down to try again - // We could be trying for a 4 byte sequence but byte 3 could be ascii and should be spit out - // Breaking will do this because we have zeros - break; - } - - // Read them in - if (byte2 == -1) byte2 = buffer.GetNextByte(); - else if (byte3 == -1) byte3 = buffer.GetNextByte(); - else byte4 = buffer.GetNextByte(); - } - - // Now we have our 2 or 4 bytes - if (IsGBTwoByteTrailing(byte2)) - { - // - // The trailing byte is a GB18030 two-byte sequence trailing byte. - // - int iTwoBytes = byte1 << 8; - iTwoBytes |= unchecked((byte)byte2); - if (!buffer.AddChar(this.mapBytesToUnicode[iTwoBytes], 2)) - break; - - // We're done with it - byte1 = -1; - byte2 = -1; - } - else if (IsGBFourByteTrailing(byte2) && - IsGBLeadByte(byte3) && - IsGBFourByteTrailing(byte4)) - { - // - // Four-byte GB18030 - // - - int sFourBytesOffset = GetFourBytesOffset( - byte1, byte2, byte3, byte4); - - // What kind is it? - if (sFourBytesOffset <= GBLast4ByteCode) - { - // - // The Unicode will be in the BMP range. 
- // - if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset], 4)) - break; - } - else if (sFourBytesOffset >= GBSurrogateOffset && - sFourBytesOffset <= GBLastSurrogateOffset) - { - // - // This will be converted to a surrogate pair, need another char - // - - // Use our surrogate - sFourBytesOffset -= GBSurrogateOffset; - if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))), - unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))), 4)) - break; - } - else - { - // Real GB18030 codepoint, but can't be mapped to unicode - // We already checked our buffer space. - // Do fallback here if we impliment decoderfallbacks. - if (!buffer.Fallback((byte)byte1, (byte)byte2, (byte)byte3, (byte)byte4)) - break; - } - - // We're done with this one - byte1 = -1; - byte2 = -1; - byte3 = -1; - byte4 = -1; - } - else - { - // Not a valid sequence, use '?' for 1st byte & scoot them all down 1 - if (!buffer.Fallback((byte)byte1)) - break; - - // Move all bytes down 1 - byte1 = byte2; - byte2 = byte3; - byte3 = byte4; - byte4 = -1; - } - } - } - - // Loop, just do '?' replacement because we don't have fallbacks for decodings. - while (buffer.MoreData) - { - byte ch = buffer.GetNextByte(); - - // ASCII case is easy - if (ch <= 0x7f) - { - // ASCII, have room? - if (!buffer.AddChar((char)ch)) - break; // No room in convert buffer, so stop - } - // See if its a lead byte - else if (IsGBLeadByte(ch)) - { - // ch is a lead byte, have room for more? - if (buffer.MoreData) - { - byte ch2 = buffer.GetNextByte(); - if (IsGBTwoByteTrailing(ch2)) - { - // - // The trailing byte is a GB18030 two-byte sequence trailing byte. - // - - // - // Two-byte GB18030 - // - int iTwoBytes = ch << 8; - iTwoBytes |= ch2; - if (!buffer.AddChar(this.mapBytesToUnicode[iTwoBytes], 2)) - break; - } - else if (IsGBFourByteTrailing(ch2)) - { - // Do we have room for Four Byte Sequence? (already have 1 byte) - if (buffer.EvenMoreData(2)) - { - // Is it a valid 4 byte sequence? 
- byte ch3 = buffer.GetNextByte(); - byte ch4 = buffer.GetNextByte(); - if (IsGBLeadByte(ch3) && - IsGBFourByteTrailing(ch4)) - { - // - // Four-byte GB18030 - // - int sFourBytesOffset = GetFourBytesOffset(ch, ch2, ch3, ch4); - - // What kind is it? - // We'll be at least 1 BMP char or a '?' char. - - if (sFourBytesOffset <= GBLast4ByteCode) - { - // - // The Unicode will be in the BMP range. - // - if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset],4)) - break; - } - else if (sFourBytesOffset >= GBSurrogateOffset && - sFourBytesOffset <= GBLastSurrogateOffset) - { - // - // This will be converted to a surrogate pair, need another char - // - - // Use our surrogate - sFourBytesOffset -= GBSurrogateOffset; - if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))), - unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))),4)) - break; - } - else - { - // Real GB18030 codepoint, but can't be mapped to unicode - if (!buffer.Fallback(ch, ch2, ch3, ch4)) - break; - } - } - else - { - // Not a valid 2 or 4 byte sequence, use '?' for ch and try other 3 again - buffer.AdjustBytes(-3); - if (!buffer.Fallback(ch)) - break; - } - } - else - { - // No room for 4 bytes, have 2 already, may be one more - // Lead byte but no place to stick it - if (decoder != null && !decoder.MustFlush) - { - // (make sure not to set decoder if counting, so check chars) - if (chars != null) - { - // We'll be able to stick the remainder in the decoder - byte1 = ch; - byte2 = ch2; - - if (buffer.MoreData) - byte3 = buffer.GetNextByte(); - else - byte3 = -1; - - byte4=-1; - } - break; - } - - // Won't go in decoder, we'll use '?' for it. 
- if (!buffer.Fallback(ch, ch2)) - break; - } - } - else - { - // Unknown byte sequence, fall back lead byte and try 2nd one again - buffer.AdjustBytes(-1); - if (!buffer.Fallback(ch)) - break; - } - } - else - { - // Lead byte but don't know about trail byte - // (make sure not to set decoder if counting, so check bytes) - if (decoder != null && !decoder.MustFlush) - { - // We'll be able to stick it in the decoder - // (don't actually do it when counting though) - if (chars != null) - { - byte1 = ch; - byte2 = -1; - byte3 = -1; - byte4 = -1; - } - break; - } - - if (!buffer.Fallback(ch)) - break; - } - } - else - { - // Not ASCII and not a lead byte, we'll use '?' for it if we have room - if (!buffer.Fallback(ch)) - break; - } - } - - // Need to flush the decoder if necessary - // (make sure not to set decoder if counting, so check bytes) - if (decoder != null) - { - if (chars != null) - { - decoder.bLeftOver1 = byte1; - decoder.bLeftOver2 = byte2; - decoder.bLeftOver3 = byte3; - decoder.bLeftOver4 = byte4; - } - decoder.m_bytesUsed = buffer.BytesUsed; - } - - // Return the # of characters we found - return buffer.Count; - } - - public override int GetMaxByteCount(int charCount) - { - if (charCount < 0) - throw new ArgumentOutOfRangeException(nameof(charCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Characters would be # of characters + 1 in case high surrogate is ? * max fallback - long byteCount = (long)charCount + 1; - - if (EncoderFallback.MaxCharCount > 1) - byteCount *= EncoderFallback.MaxCharCount; - - // We could have 4 bytes for each char, no extra for surrogates because 18030 can do whole unicode range. 
- byteCount *= 4; - - if (byteCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow")); - - return (int)byteCount; - } - - public override int GetMaxCharCount(int byteCount) - { - if (byteCount < 0) - throw new ArgumentOutOfRangeException(nameof(byteCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Just return length, we could have a single char for each byte + whatever extra our decoder could do to us. - // If decoder is messed up it could spit out 3 ?s. - long charCount = ((long)byteCount) + 3; - - // Take fallback size into consideration - if (DecoderFallback.MaxCharCount > 1) - charCount *= DecoderFallback.MaxCharCount; - - if (charCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow")); - - return (int)charCount; - } - - public override Decoder GetDecoder() - { - return new GB18030Decoder(this); - } - - [Serializable] - internal sealed class GB18030Decoder : System.Text.DecoderNLS, ISerializable - { - internal short bLeftOver1 = -1; - internal short bLeftOver2 = -1; - internal short bLeftOver3 = -1; - internal short bLeftOver4 = -1; - - internal GB18030Decoder(EncodingNLS encoding) : base(encoding) - { - // DecoderNLS Calls reset - } - - // Constructor called by serialization, have to handle deserializing from Everett - internal GB18030Decoder(SerializationInfo info, StreamingContext context) - { - // Any info? 
- if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - try - { - // - // Try Whidbey V2.0 Fields - // - this.m_encoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); - this.m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback)); - this.bLeftOver1 = (short)info.GetValue("bLeftOver1", typeof(short)); - this.bLeftOver2 = (short)info.GetValue("bLeftOver2", typeof(short)); - this.bLeftOver3 = (short)info.GetValue("bLeftOver3", typeof(short)); - this.bLeftOver4 = (short)info.GetValue("bLeftOver4", typeof(short)); - } - catch (SerializationException) - { - // Didn't have Whidbey stuff, try Everett (DecoderNLS already called Reset()) - this.m_encoding = new GB18030Encoding(); - } - } - - // ISerializable implementation, get data for this object - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // Any info? - if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - // Save Whidbey data - // Just need Everett maxCharSize (BaseCodePageEncoding) or m_maxByteSize (MLangBaseCodePageEncoding) - info.AddValue("m_encoding", this.m_encoding); - info.AddValue("m_fallback", this.m_fallback); - info.AddValue("bLeftOver1", this.bLeftOver1); - info.AddValue("bLeftOver2", this.bLeftOver2); - info.AddValue("bLeftOver3", this.bLeftOver3); - info.AddValue("bLeftOver4", this.bLeftOver4); - - // Everett needs different data (this is just empty for Everett) - info.AddValue("m_leftOverBytes", (int)0); - info.AddValue("leftOver", new byte[8]); - } - - public override void Reset() - { - bLeftOver1 = -1; - bLeftOver2 = -1; - bLeftOver3 = -1; - bLeftOver4 = -1; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our decoder? 
- internal override bool HasState - { - get - { - return (this.bLeftOver1 >= 0); - } - } - } - - // tableUnicodeToGBDiffs - // - // This compressed data enumerates the differences between gb18030 and the 936 code page as follows: - // - // <count> & 0x8000 == 0x8000 The next count <count> characters are identical to 936 characters. - // <count> & 0x8000 == 0x0000 The next count <count> characters are 4 byte gb18030 characters. - // Except for: - // <count> >= 0x9000 && <count> != 0xD1A6 This character is this 2 byte GB18030 value. - // - readonly ushort[] tableUnicodeToGBDiffs = - { - 0x8080, // U+0000 - U+007F ( 128 chars) use CP 936 conversion. - 0x0024, // U+0080 - U+00A3 ( 36 chars) are GB18030 81 30 81 30 - 81 30 84 35 (offset 0000 - 0023) - 0x8001, // U+00A4 - U+00A4 ( 1 chars) use CP 936 conversion. - 0x0002, // U+00A5 - U+00A6 ( 2 chars) are GB18030 81 30 84 36 - 81 30 84 37 (offset 0024 - 0025) - 0x8002, // U+00A7 - U+00A8 ( 2 chars) use CP 936 conversion. - 0x0007, // U+00A9 - U+00AF ( 7 chars) are GB18030 81 30 84 38 - 81 30 85 34 (offset 0026 - 002C) - 0x8002, // U+00B0 - U+00B1 ( 2 chars) use CP 936 conversion. - 0x0005, // U+00B2 - U+00B6 ( 5 chars) are GB18030 81 30 85 35 - 81 30 85 39 (offset 002D - 0031) - 0x8001, // U+00B7 - U+00B7 ( 1 chars) use CP 936 conversion. - 0x001F, // U+00B8 - U+00D6 ( 31 chars) are GB18030 81 30 86 30 - 81 30 89 30 (offset 0032 - 0050) - 0x8001, // U+00D7 - U+00D7 ( 1 chars) use CP 936 conversion. - 0x0008, // U+00D8 - U+00DF ( 8 chars) are GB18030 81 30 89 31 - 81 30 89 38 (offset 0051 - 0058) - 0x8002, // U+00E0 - U+00E1 ( 2 chars) use CP 936 conversion. - 0x0006, // U+00E2 - U+00E7 ( 6 chars) are GB18030 81 30 89 39 - 81 30 8A 34 (offset 0059 - 005E) - 0x8003, // U+00E8 - U+00EA ( 3 chars) use CP 936 conversion. - 0x0001, // U+00EB - U+00EB ( 1 chars) are GB18030 81 30 8A 35 - 81 30 8A 35 (offset 005F - 005F) - 0x8002, // U+00EC - U+00ED ( 2 chars) use CP 936 conversion. 
- 0x0004, // U+00EE - U+00F1 ( 4 chars) are GB18030 81 30 8A 36 - 81 30 8A 39 (offset 0060 - 0063) - 0x8002, // U+00F2 - U+00F3 ( 2 chars) use CP 936 conversion. - 0x0003, // U+00F4 - U+00F6 ( 3 chars) are GB18030 81 30 8B 30 - 81 30 8B 32 (offset 0064 - 0066) - 0x8001, // U+00F7 - U+00F7 ( 1 chars) use CP 936 conversion. - 0x0001, // U+00F8 - U+00F8 ( 1 chars) are GB18030 81 30 8B 33 - 81 30 8B 33 (offset 0067 - 0067) - 0x8002, // U+00F9 - U+00FA ( 2 chars) use CP 936 conversion. - 0x0001, // U+00FB - U+00FB ( 1 chars) are GB18030 81 30 8B 34 - 81 30 8B 34 (offset 0068 - 0068) - 0x8001, // U+00FC - U+00FC ( 1 chars) use CP 936 conversion. - 0x0004, // U+00FD - U+0100 ( 4 chars) are GB18030 81 30 8B 35 - 81 30 8B 38 (offset 0069 - 006C) - 0x8001, // U+0101 - U+0101 ( 1 chars) use CP 936 conversion. - 0x0011, // U+0102 - U+0112 ( 17 chars) are GB18030 81 30 8B 39 - 81 30 8D 35 (offset 006D - 007D) - 0x8001, // U+0113 - U+0113 ( 1 chars) use CP 936 conversion. - 0x0007, // U+0114 - U+011A ( 7 chars) are GB18030 81 30 8D 36 - 81 30 8E 32 (offset 007E - 0084) - 0x8001, // U+011B - U+011B ( 1 chars) use CP 936 conversion. - 0x000F, // U+011C - U+012A ( 15 chars) are GB18030 81 30 8E 33 - 81 30 8F 37 (offset 0085 - 0093) - 0x8001, // U+012B - U+012B ( 1 chars) use CP 936 conversion. - 0x0018, // U+012C - U+0143 ( 24 chars) are GB18030 81 30 8F 38 - 81 30 92 31 (offset 0094 - 00AB) - 0x8001, // U+0144 - U+0144 ( 1 chars) use CP 936 conversion. - 0x0003, // U+0145 - U+0147 ( 3 chars) are GB18030 81 30 92 32 - 81 30 92 34 (offset 00AC - 00AE) - 0x8001, // U+0148 - U+0148 ( 1 chars) use CP 936 conversion. - 0x0004, // U+0149 - U+014C ( 4 chars) are GB18030 81 30 92 35 - 81 30 92 38 (offset 00AF - 00B2) - 0x8001, // U+014D - U+014D ( 1 chars) use CP 936 conversion. - 0x001D, // U+014E - U+016A ( 29 chars) are GB18030 81 30 92 39 - 81 30 95 37 (offset 00B3 - 00CF) - 0x8001, // U+016B - U+016B ( 1 chars) use CP 936 conversion. 
- 0x0062, // U+016C - U+01CD ( 98 chars) are GB18030 81 30 95 38 - 81 30 9F 35 (offset 00D0 - 0131) - 0x8001, // U+01CE - U+01CE ( 1 chars) use CP 936 conversion. - 0x0001, // U+01CF - U+01CF ( 1 chars) are GB18030 81 30 9F 36 - 81 30 9F 36 (offset 0132 - 0132) - 0x8001, // U+01D0 - U+01D0 ( 1 chars) use CP 936 conversion. - 0x0001, // U+01D1 - U+01D1 ( 1 chars) are GB18030 81 30 9F 37 - 81 30 9F 37 (offset 0133 - 0133) - 0x8001, // U+01D2 - U+01D2 ( 1 chars) use CP 936 conversion. - 0x0001, // U+01D3 - U+01D3 ( 1 chars) are GB18030 81 30 9F 38 - 81 30 9F 38 (offset 0134 - 0134) - 0x8001, // U+01D4 - U+01D4 ( 1 chars) use CP 936 conversion. - 0x0001, // U+01D5 - U+01D5 ( 1 chars) are GB18030 81 30 9F 39 - 81 30 9F 39 (offset 0135 - 0135) - 0x8001, // U+01D6 - U+01D6 ( 1 chars) use CP 936 conversion. - 0x0001, // U+01D7 - U+01D7 ( 1 chars) are GB18030 81 30 A0 30 - 81 30 A0 30 (offset 0136 - 0136) - 0x8001, // U+01D8 - U+01D8 ( 1 chars) use CP 936 conversion. - 0x0001, // U+01D9 - U+01D9 ( 1 chars) are GB18030 81 30 A0 31 - 81 30 A0 31 (offset 0137 - 0137) - 0x8001, // U+01DA - U+01DA ( 1 chars) use CP 936 conversion. - 0x0001, // U+01DB - U+01DB ( 1 chars) are GB18030 81 30 A0 32 - 81 30 A0 32 (offset 0138 - 0138) - 0x8001, // U+01DC - U+01DC ( 1 chars) use CP 936 conversion. - 0x001C, // U+01DD - U+01F8 ( 28 chars) are GB18030 81 30 A0 33 - 81 30 A3 30 (offset 0139 - 0154) - 0xA8BF, // U+01F9 is non-936 GB18030 value A8 BF. - 0x0057, // U+01FA - U+0250 ( 87 chars) are GB18030 81 30 A3 31 - 81 30 AB 37 (offset 0155 - 01AB) - 0x8001, // U+0251 - U+0251 ( 1 chars) use CP 936 conversion. - 0x000F, // U+0252 - U+0260 ( 15 chars) are GB18030 81 30 AB 38 - 81 30 AD 32 (offset 01AC - 01BA) - 0x8001, // U+0261 - U+0261 ( 1 chars) use CP 936 conversion. - 0x0065, // U+0262 - U+02C6 ( 101 chars) are GB18030 81 30 AD 33 - 81 30 B7 33 (offset 01BB - 021F) - 0x8001, // U+02C7 - U+02C7 ( 1 chars) use CP 936 conversion. 
- 0x0001, // U+02C8 - U+02C8 ( 1 chars) are GB18030 81 30 B7 34 - 81 30 B7 34 (offset 0220 - 0220) - 0x8003, // U+02C9 - U+02CB ( 3 chars) use CP 936 conversion. - 0x000D, // U+02CC - U+02D8 ( 13 chars) are GB18030 81 30 B7 35 - 81 30 B8 37 (offset 0221 - 022D) - 0x8001, // U+02D9 - U+02D9 ( 1 chars) use CP 936 conversion. - 0x00B7, // U+02DA - U+0390 ( 183 chars) are GB18030 81 30 B8 38 - 81 30 CB 30 (offset 022E - 02E4) - 0x8011, // U+0391 - U+03A1 ( 17 chars) use CP 936 conversion. - 0x0001, // U+03A2 - U+03A2 ( 1 chars) are GB18030 81 30 CB 31 - 81 30 CB 31 (offset 02E5 - 02E5) - 0x8007, // U+03A3 - U+03A9 ( 7 chars) use CP 936 conversion. - 0x0007, // U+03AA - U+03B0 ( 7 chars) are GB18030 81 30 CB 32 - 81 30 CB 38 (offset 02E6 - 02EC) - 0x8011, // U+03B1 - U+03C1 ( 17 chars) use CP 936 conversion. - 0x0001, // U+03C2 - U+03C2 ( 1 chars) are GB18030 81 30 CB 39 - 81 30 CB 39 (offset 02ED - 02ED) - 0x8007, // U+03C3 - U+03C9 ( 7 chars) use CP 936 conversion. - 0x0037, // U+03CA - U+0400 ( 55 chars) are GB18030 81 30 CC 30 - 81 30 D1 34 (offset 02EE - 0324) - 0x8001, // U+0401 - U+0401 ( 1 chars) use CP 936 conversion. - 0x000E, // U+0402 - U+040F ( 14 chars) are GB18030 81 30 D1 35 - 81 30 D2 38 (offset 0325 - 0332) - 0x8040, // U+0410 - U+044F ( 64 chars) use CP 936 conversion. - 0x0001, // U+0450 - U+0450 ( 1 chars) are GB18030 81 30 D2 39 - 81 30 D2 39 (offset 0333 - 0333) - 0x8001, // U+0451 - U+0451 ( 1 chars) use CP 936 conversion. - 0x1BBE, // U+0452 - U+200F ( 7102 chars) are GB18030 81 30 D3 30 - 81 36 A5 31 (offset 0334 - 1EF1) - 0x8001, // U+2010 - U+2010 ( 1 chars) use CP 936 conversion. - 0x0002, // U+2011 - U+2012 ( 2 chars) are GB18030 81 36 A5 32 - 81 36 A5 33 (offset 1EF2 - 1EF3) - 0x8004, // U+2013 - U+2016 ( 4 chars) use CP 936 conversion. - 0x0001, // U+2017 - U+2017 ( 1 chars) are GB18030 81 36 A5 34 - 81 36 A5 34 (offset 1EF4 - 1EF4) - 0x8002, // U+2018 - U+2019 ( 2 chars) use CP 936 conversion. 
- 0x0002, // U+201A - U+201B ( 2 chars) are GB18030 81 36 A5 35 - 81 36 A5 36 (offset 1EF5 - 1EF6) - 0x8002, // U+201C - U+201D ( 2 chars) use CP 936 conversion. - 0x0007, // U+201E - U+2024 ( 7 chars) are GB18030 81 36 A5 37 - 81 36 A6 33 (offset 1EF7 - 1EFD) - 0x8002, // U+2025 - U+2026 ( 2 chars) use CP 936 conversion. - 0x0009, // U+2027 - U+202F ( 9 chars) are GB18030 81 36 A6 34 - 81 36 A7 32 (offset 1EFE - 1F06) - 0x8001, // U+2030 - U+2030 ( 1 chars) use CP 936 conversion. - 0x0001, // U+2031 - U+2031 ( 1 chars) are GB18030 81 36 A7 33 - 81 36 A7 33 (offset 1F07 - 1F07) - 0x8002, // U+2032 - U+2033 ( 2 chars) use CP 936 conversion. - 0x0001, // U+2034 - U+2034 ( 1 chars) are GB18030 81 36 A7 34 - 81 36 A7 34 (offset 1F08 - 1F08) - 0x8001, // U+2035 - U+2035 ( 1 chars) use CP 936 conversion. - 0x0005, // U+2036 - U+203A ( 5 chars) are GB18030 81 36 A7 35 - 81 36 A7 39 (offset 1F09 - 1F0D) - 0x8001, // U+203B - U+203B ( 1 chars) use CP 936 conversion. - 0x0070, // U+203C - U+20AB ( 112 chars) are GB18030 81 36 A8 30 - 81 36 B3 31 (offset 1F0E - 1F7D) - 0xA2E3, // U+20AC is non-936 GB18030 value A2 E3. - 0x0056, // U+20AD - U+2102 ( 86 chars) are GB18030 81 36 B3 32 - 81 36 BB 37 (offset 1F7E - 1FD3) - 0x8001, // U+2103 - U+2103 ( 1 chars) use CP 936 conversion. - 0x0001, // U+2104 - U+2104 ( 1 chars) are GB18030 81 36 BB 38 - 81 36 BB 38 (offset 1FD4 - 1FD4) - 0x8001, // U+2105 - U+2105 ( 1 chars) use CP 936 conversion. - 0x0003, // U+2106 - U+2108 ( 3 chars) are GB18030 81 36 BB 39 - 81 36 BC 31 (offset 1FD5 - 1FD7) - 0x8001, // U+2109 - U+2109 ( 1 chars) use CP 936 conversion. - 0x000C, // U+210A - U+2115 ( 12 chars) are GB18030 81 36 BC 32 - 81 36 BD 33 (offset 1FD8 - 1FE3) - 0x8001, // U+2116 - U+2116 ( 1 chars) use CP 936 conversion. - 0x000A, // U+2117 - U+2120 ( 10 chars) are GB18030 81 36 BD 34 - 81 36 BE 33 (offset 1FE4 - 1FED) - 0x8001, // U+2121 - U+2121 ( 1 chars) use CP 936 conversion. 
- 0x003E, // U+2122 - U+215F ( 62 chars) are GB18030 81 36 BE 34 - 81 36 C4 35 (offset 1FEE - 202B) - 0x800C, // U+2160 - U+216B ( 12 chars) use CP 936 conversion. - 0x0004, // U+216C - U+216F ( 4 chars) are GB18030 81 36 C4 36 - 81 36 C4 39 (offset 202C - 202F) - 0x800A, // U+2170 - U+2179 ( 10 chars) use CP 936 conversion. - 0x0016, // U+217A - U+218F ( 22 chars) are GB18030 81 36 C5 30 - 81 36 C7 31 (offset 2030 - 2045) - 0x8004, // U+2190 - U+2193 ( 4 chars) use CP 936 conversion. - 0x0002, // U+2194 - U+2195 ( 2 chars) are GB18030 81 36 C7 32 - 81 36 C7 33 (offset 2046 - 2047) - 0x8004, // U+2196 - U+2199 ( 4 chars) use CP 936 conversion. - 0x006E, // U+219A - U+2207 ( 110 chars) are GB18030 81 36 C7 34 - 81 36 D2 33 (offset 2048 - 20B5) - 0x8001, // U+2208 - U+2208 ( 1 chars) use CP 936 conversion. - 0x0006, // U+2209 - U+220E ( 6 chars) are GB18030 81 36 D2 34 - 81 36 D2 39 (offset 20B6 - 20BB) - 0x8001, // U+220F - U+220F ( 1 chars) use CP 936 conversion. - 0x0001, // U+2210 - U+2210 ( 1 chars) are GB18030 81 36 D3 30 - 81 36 D3 30 (offset 20BC - 20BC) - 0x8001, // U+2211 - U+2211 ( 1 chars) use CP 936 conversion. - 0x0003, // U+2212 - U+2214 ( 3 chars) are GB18030 81 36 D3 31 - 81 36 D3 33 (offset 20BD - 20BF) - 0x8001, // U+2215 - U+2215 ( 1 chars) use CP 936 conversion. - 0x0004, // U+2216 - U+2219 ( 4 chars) are GB18030 81 36 D3 34 - 81 36 D3 37 (offset 20C0 - 20C3) - 0x8001, // U+221A - U+221A ( 1 chars) use CP 936 conversion. - 0x0002, // U+221B - U+221C ( 2 chars) are GB18030 81 36 D3 38 - 81 36 D3 39 (offset 20C4 - 20C5) - 0x8004, // U+221D - U+2220 ( 4 chars) use CP 936 conversion. - 0x0002, // U+2221 - U+2222 ( 2 chars) are GB18030 81 36 D4 30 - 81 36 D4 31 (offset 20C6 - 20C7) - 0x8001, // U+2223 - U+2223 ( 1 chars) use CP 936 conversion. - 0x0001, // U+2224 - U+2224 ( 1 chars) are GB18030 81 36 D4 32 - 81 36 D4 32 (offset 20C8 - 20C8) - 0x8001, // U+2225 - U+2225 ( 1 chars) use CP 936 conversion. 
- 0x0001, // U+2226 - U+2226 ( 1 chars) are GB18030 81 36 D4 33 - 81 36 D4 33 (offset 20C9 - 20C9) - 0x8005, // U+2227 - U+222B ( 5 chars) use CP 936 conversion. - 0x0002, // U+222C - U+222D ( 2 chars) are GB18030 81 36 D4 34 - 81 36 D4 35 (offset 20CA - 20CB) - 0x8001, // U+222E - U+222E ( 1 chars) use CP 936 conversion. - 0x0005, // U+222F - U+2233 ( 5 chars) are GB18030 81 36 D4 36 - 81 36 D5 30 (offset 20CC - 20D0) - 0x8004, // U+2234 - U+2237 ( 4 chars) use CP 936 conversion. - 0x0005, // U+2238 - U+223C ( 5 chars) are GB18030 81 36 D5 31 - 81 36 D5 35 (offset 20D1 - 20D5) - 0x8001, // U+223D - U+223D ( 1 chars) use CP 936 conversion. - 0x000A, // U+223E - U+2247 ( 10 chars) are GB18030 81 36 D5 36 - 81 36 D6 35 (offset 20D6 - 20DF) - 0x8001, // U+2248 - U+2248 ( 1 chars) use CP 936 conversion. - 0x0003, // U+2249 - U+224B ( 3 chars) are GB18030 81 36 D6 36 - 81 36 D6 38 (offset 20E0 - 20E2) - 0x8001, // U+224C - U+224C ( 1 chars) use CP 936 conversion. - 0x0005, // U+224D - U+2251 ( 5 chars) are GB18030 81 36 D6 39 - 81 36 D7 33 (offset 20E3 - 20E7) - 0x8001, // U+2252 - U+2252 ( 1 chars) use CP 936 conversion. - 0x000D, // U+2253 - U+225F ( 13 chars) are GB18030 81 36 D7 34 - 81 36 D8 36 (offset 20E8 - 20F4) - 0x8002, // U+2260 - U+2261 ( 2 chars) use CP 936 conversion. - 0x0002, // U+2262 - U+2263 ( 2 chars) are GB18030 81 36 D8 37 - 81 36 D8 38 (offset 20F5 - 20F6) - 0x8004, // U+2264 - U+2267 ( 4 chars) use CP 936 conversion. - 0x0006, // U+2268 - U+226D ( 6 chars) are GB18030 81 36 D8 39 - 81 36 D9 34 (offset 20F7 - 20FC) - 0x8002, // U+226E - U+226F ( 2 chars) use CP 936 conversion. - 0x0025, // U+2270 - U+2294 ( 37 chars) are GB18030 81 36 D9 35 - 81 36 DD 31 (offset 20FD - 2121) - 0x8001, // U+2295 - U+2295 ( 1 chars) use CP 936 conversion. - 0x0003, // U+2296 - U+2298 ( 3 chars) are GB18030 81 36 DD 32 - 81 36 DD 34 (offset 2122 - 2124) - 0x8001, // U+2299 - U+2299 ( 1 chars) use CP 936 conversion. 
- 0x000B, // U+229A - U+22A4 ( 11 chars) are GB18030 81 36 DD 35 - 81 36 DE 35 (offset 2125 - 212F) - 0x8001, // U+22A5 - U+22A5 ( 1 chars) use CP 936 conversion. - 0x0019, // U+22A6 - U+22BE ( 25 chars) are GB18030 81 36 DE 36 - 81 36 E1 30 (offset 2130 - 2148) - 0x8001, // U+22BF - U+22BF ( 1 chars) use CP 936 conversion. - 0x0052, // U+22C0 - U+2311 ( 82 chars) are GB18030 81 36 E1 31 - 81 36 E9 32 (offset 2149 - 219A) - 0x8001, // U+2312 - U+2312 ( 1 chars) use CP 936 conversion. - 0x014D, // U+2313 - U+245F ( 333 chars) are GB18030 81 36 E9 33 - 81 37 8C 35 (offset 219B - 22E7) - 0x800A, // U+2460 - U+2469 ( 10 chars) use CP 936 conversion. - 0x000A, // U+246A - U+2473 ( 10 chars) are GB18030 81 37 8C 36 - 81 37 8D 35 (offset 22E8 - 22F1) - 0x8028, // U+2474 - U+249B ( 40 chars) use CP 936 conversion. - 0x0064, // U+249C - U+24FF ( 100 chars) are GB18030 81 37 8D 36 - 81 37 97 35 (offset 22F2 - 2355) - 0x804C, // U+2500 - U+254B ( 76 chars) use CP 936 conversion. - 0x0004, // U+254C - U+254F ( 4 chars) are GB18030 81 37 97 36 - 81 37 97 39 (offset 2356 - 2359) - 0x8024, // U+2550 - U+2573 ( 36 chars) use CP 936 conversion. - 0x000D, // U+2574 - U+2580 ( 13 chars) are GB18030 81 37 98 30 - 81 37 99 32 (offset 235A - 2366) - 0x800F, // U+2581 - U+258F ( 15 chars) use CP 936 conversion. - 0x0003, // U+2590 - U+2592 ( 3 chars) are GB18030 81 37 99 33 - 81 37 99 35 (offset 2367 - 2369) - 0x8003, // U+2593 - U+2595 ( 3 chars) use CP 936 conversion. - 0x000A, // U+2596 - U+259F ( 10 chars) are GB18030 81 37 99 36 - 81 37 9A 35 (offset 236A - 2373) - 0x8002, // U+25A0 - U+25A1 ( 2 chars) use CP 936 conversion. - 0x0010, // U+25A2 - U+25B1 ( 16 chars) are GB18030 81 37 9A 36 - 81 37 9C 31 (offset 2374 - 2383) - 0x8002, // U+25B2 - U+25B3 ( 2 chars) use CP 936 conversion. - 0x0008, // U+25B4 - U+25BB ( 8 chars) are GB18030 81 37 9C 32 - 81 37 9C 39 (offset 2384 - 238B) - 0x8002, // U+25BC - U+25BD ( 2 chars) use CP 936 conversion. 
- 0x0008, // U+25BE - U+25C5 ( 8 chars) are GB18030 81 37 9D 30 - 81 37 9D 37 (offset 238C - 2393) - 0x8002, // U+25C6 - U+25C7 ( 2 chars) use CP 936 conversion. - 0x0003, // U+25C8 - U+25CA ( 3 chars) are GB18030 81 37 9D 38 - 81 37 9E 30 (offset 2394 - 2396) - 0x8001, // U+25CB - U+25CB ( 1 chars) use CP 936 conversion. - 0x0002, // U+25CC - U+25CD ( 2 chars) are GB18030 81 37 9E 31 - 81 37 9E 32 (offset 2397 - 2398) - 0x8002, // U+25CE - U+25CF ( 2 chars) use CP 936 conversion. - 0x0012, // U+25D0 - U+25E1 ( 18 chars) are GB18030 81 37 9E 33 - 81 37 A0 30 (offset 2399 - 23AA) - 0x8004, // U+25E2 - U+25E5 ( 4 chars) use CP 936 conversion. - 0x001F, // U+25E6 - U+2604 ( 31 chars) are GB18030 81 37 A0 31 - 81 37 A3 31 (offset 23AB - 23C9) - 0x8002, // U+2605 - U+2606 ( 2 chars) use CP 936 conversion. - 0x0002, // U+2607 - U+2608 ( 2 chars) are GB18030 81 37 A3 32 - 81 37 A3 33 (offset 23CA - 23CB) - 0x8001, // U+2609 - U+2609 ( 1 chars) use CP 936 conversion. - 0x0036, // U+260A - U+263F ( 54 chars) are GB18030 81 37 A3 34 - 81 37 A8 37 (offset 23CC - 2401) - 0x8001, // U+2640 - U+2640 ( 1 chars) use CP 936 conversion. - 0x0001, // U+2641 - U+2641 ( 1 chars) are GB18030 81 37 A8 38 - 81 37 A8 38 (offset 2402 - 2402) - 0x8001, // U+2642 - U+2642 ( 1 chars) use CP 936 conversion. - 0x083E, // U+2643 - U+2E80 ( 2110 chars) are GB18030 81 37 A8 39 - 81 38 FD 38 (offset 2403 - 2C40) - 0xFE50, // U+2E81 is non-936 GB18030 value FE 50. - 0x0002, // U+2E82 - U+2E83 ( 2 chars) are GB18030 81 38 FD 39 - 81 38 FE 30 (offset 2C41 - 2C42) - 0xFE54, // U+2E84 is non-936 GB18030 value FE 54. - 0x0003, // U+2E85 - U+2E87 ( 3 chars) are GB18030 81 38 FE 31 - 81 38 FE 33 (offset 2C43 - 2C45) - 0xFE57, // U+2E88 is non-936 GB18030 value FE 57. - 0x0002, // U+2E89 - U+2E8A ( 2 chars) are GB18030 81 38 FE 34 - 81 38 FE 35 (offset 2C46 - 2C47) - 0xFE58, // U+2E8B is non-936 GB18030 value FE 58. - 0xFE5D, // U+2E8C is non-936 GB18030 value FE 5D. 
- 0x000A, // U+2E8D - U+2E96 ( 10 chars) are GB18030 81 38 FE 36 - 81 39 81 35 (offset 2C48 - 2C51) - 0xFE5E, // U+2E97 is non-936 GB18030 value FE 5E. - 0x000F, // U+2E98 - U+2EA6 ( 15 chars) are GB18030 81 39 81 36 - 81 39 83 30 (offset 2C52 - 2C60) - 0xFE6B, // U+2EA7 is non-936 GB18030 value FE 6B. - 0x0002, // U+2EA8 - U+2EA9 ( 2 chars) are GB18030 81 39 83 31 - 81 39 83 32 (offset 2C61 - 2C62) - 0xFE6E, // U+2EAA is non-936 GB18030 value FE 6E. - 0x0003, // U+2EAB - U+2EAD ( 3 chars) are GB18030 81 39 83 33 - 81 39 83 35 (offset 2C63 - 2C65) - 0xFE71, // U+2EAE is non-936 GB18030 value FE 71. - 0x0004, // U+2EAF - U+2EB2 ( 4 chars) are GB18030 81 39 83 36 - 81 39 83 39 (offset 2C66 - 2C69) - 0xFE73, // U+2EB3 is non-936 GB18030 value FE 73. - 0x0002, // U+2EB4 - U+2EB5 ( 2 chars) are GB18030 81 39 84 30 - 81 39 84 31 (offset 2C6A - 2C6B) - 0xFE74, // U+2EB6 is non-936 GB18030 value FE 74. - 0xFE75, // U+2EB7 is non-936 GB18030 value FE 75. - 0x0003, // U+2EB8 - U+2EBA ( 3 chars) are GB18030 81 39 84 32 - 81 39 84 34 (offset 2C6C - 2C6E) - 0xFE79, // U+2EBB is non-936 GB18030 value FE 79. - 0x000E, // U+2EBC - U+2EC9 ( 14 chars) are GB18030 81 39 84 35 - 81 39 85 38 (offset 2C6F - 2C7C) - 0xFE84, // U+2ECA is non-936 GB18030 value FE 84. - 0x0125, // U+2ECB - U+2FEF ( 293 chars) are GB18030 81 39 85 39 - 81 39 A3 31 (offset 2C7D - 2DA1) - 0xA98A, // U+2FF0 is non-936 GB18030 value A9 8A. - 0xA98B, // U+2FF1 is non-936 GB18030 value A9 8B. - 0xA98C, // U+2FF2 is non-936 GB18030 value A9 8C. - 0xA98D, // U+2FF3 is non-936 GB18030 value A9 8D. - 0xA98E, // U+2FF4 is non-936 GB18030 value A9 8E. - 0xA98F, // U+2FF5 is non-936 GB18030 value A9 8F. - 0xA990, // U+2FF6 is non-936 GB18030 value A9 90. - 0xA991, // U+2FF7 is non-936 GB18030 value A9 91. - 0xA992, // U+2FF8 is non-936 GB18030 value A9 92. - 0xA993, // U+2FF9 is non-936 GB18030 value A9 93. - 0xA994, // U+2FFA is non-936 GB18030 value A9 94. - 0xA995, // U+2FFB is non-936 GB18030 value A9 95. 
- 0x0004, // U+2FFC - U+2FFF ( 4 chars) are GB18030 81 39 A3 32 - 81 39 A3 35 (offset 2DA2 - 2DA5) - 0x8004, // U+3000 - U+3003 ( 4 chars) use CP 936 conversion. - 0x0001, // U+3004 - U+3004 ( 1 chars) are GB18030 81 39 A3 36 - 81 39 A3 36 (offset 2DA6 - 2DA6) - 0x8013, // U+3005 - U+3017 ( 19 chars) use CP 936 conversion. - 0x0005, // U+3018 - U+301C ( 5 chars) are GB18030 81 39 A3 37 - 81 39 A4 31 (offset 2DA7 - 2DAB) - 0x8002, // U+301D - U+301E ( 2 chars) use CP 936 conversion. - 0x0002, // U+301F - U+3020 ( 2 chars) are GB18030 81 39 A4 32 - 81 39 A4 33 (offset 2DAC - 2DAD) - 0x8009, // U+3021 - U+3029 ( 9 chars) use CP 936 conversion. - 0x0014, // U+302A - U+303D ( 20 chars) are GB18030 81 39 A4 34 - 81 39 A6 33 (offset 2DAE - 2DC1) - 0xA989, // U+303E is non-936 GB18030 value A9 89. - 0x0002, // U+303F - U+3040 ( 2 chars) are GB18030 81 39 A6 34 - 81 39 A6 35 (offset 2DC2 - 2DC3) - 0x8053, // U+3041 - U+3093 ( 83 chars) use CP 936 conversion. - 0x0007, // U+3094 - U+309A ( 7 chars) are GB18030 81 39 A6 36 - 81 39 A7 32 (offset 2DC4 - 2DCA) - 0x8004, // U+309B - U+309E ( 4 chars) use CP 936 conversion. - 0x0002, // U+309F - U+30A0 ( 2 chars) are GB18030 81 39 A7 33 - 81 39 A7 34 (offset 2DCB - 2DCC) - 0x8056, // U+30A1 - U+30F6 ( 86 chars) use CP 936 conversion. - 0x0005, // U+30F7 - U+30FB ( 5 chars) are GB18030 81 39 A7 35 - 81 39 A7 39 (offset 2DCD - 2DD1) - 0x8003, // U+30FC - U+30FE ( 3 chars) use CP 936 conversion. - 0x0006, // U+30FF - U+3104 ( 6 chars) are GB18030 81 39 A8 30 - 81 39 A8 35 (offset 2DD2 - 2DD7) - 0x8025, // U+3105 - U+3129 ( 37 chars) use CP 936 conversion. - 0x00F6, // U+312A - U+321F ( 246 chars) are GB18030 81 39 A8 36 - 81 39 C1 31 (offset 2DD8 - 2ECD) - 0x800A, // U+3220 - U+3229 ( 10 chars) use CP 936 conversion. - 0x0007, // U+322A - U+3230 ( 7 chars) are GB18030 81 39 C1 32 - 81 39 C1 38 (offset 2ECE - 2ED4) - 0x8001, // U+3231 - U+3231 ( 1 chars) use CP 936 conversion. 
- 0x0071, // U+3232 - U+32A2 ( 113 chars) are GB18030 81 39 C1 39 - 81 39 CD 31 (offset 2ED5 - 2F45) - 0x8001, // U+32A3 - U+32A3 ( 1 chars) use CP 936 conversion. - 0x00EA, // U+32A4 - U+338D ( 234 chars) are GB18030 81 39 CD 32 - 81 39 E4 35 (offset 2F46 - 302F) - 0x8002, // U+338E - U+338F ( 2 chars) use CP 936 conversion. - 0x000C, // U+3390 - U+339B ( 12 chars) are GB18030 81 39 E4 36 - 81 39 E5 37 (offset 3030 - 303B) - 0x8003, // U+339C - U+339E ( 3 chars) use CP 936 conversion. - 0x0002, // U+339F - U+33A0 ( 2 chars) are GB18030 81 39 E5 38 - 81 39 E5 39 (offset 303C - 303D) - 0x8001, // U+33A1 - U+33A1 ( 1 chars) use CP 936 conversion. - 0x0022, // U+33A2 - U+33C3 ( 34 chars) are GB18030 81 39 E6 30 - 81 39 E9 33 (offset 303E - 305F) - 0x8001, // U+33C4 - U+33C4 ( 1 chars) use CP 936 conversion. - 0x0009, // U+33C5 - U+33CD ( 9 chars) are GB18030 81 39 E9 34 - 81 39 EA 32 (offset 3060 - 3068) - 0x8001, // U+33CE - U+33CE ( 1 chars) use CP 936 conversion. - 0x0002, // U+33CF - U+33D0 ( 2 chars) are GB18030 81 39 EA 33 - 81 39 EA 34 (offset 3069 - 306A) - 0x8002, // U+33D1 - U+33D2 ( 2 chars) use CP 936 conversion. - 0x0002, // U+33D3 - U+33D4 ( 2 chars) are GB18030 81 39 EA 35 - 81 39 EA 36 (offset 306B - 306C) - 0x8001, // U+33D5 - U+33D5 ( 1 chars) use CP 936 conversion. - 0x0071, // U+33D6 - U+3446 ( 113 chars) are GB18030 81 39 EA 37 - 81 39 F5 39 (offset 306D - 30DD) - 0xFE56, // U+3447 is non-936 GB18030 value FE 56. - 0x002B, // U+3448 - U+3472 ( 43 chars) are GB18030 81 39 F6 30 - 81 39 FA 32 (offset 30DE - 3108) - 0xFE55, // U+3473 is non-936 GB18030 value FE 55. - 0x012A, // U+3474 - U+359D ( 298 chars) are GB18030 81 39 FA 33 - 82 30 9A 30 (offset 3109 - 3232) - 0xFE5A, // U+359E is non-936 GB18030 value FE 5A. - 0x006F, // U+359F - U+360D ( 111 chars) are GB18030 82 30 9A 31 - 82 30 A5 31 (offset 3233 - 32A1) - 0xFE5C, // U+360E is non-936 GB18030 value FE 5C. 
- 0x000B, // U+360F - U+3619 ( 11 chars) are GB18030 82 30 A5 32 - 82 30 A6 32 (offset 32A2 - 32AC) - 0xFE5B, // U+361A is non-936 GB18030 value FE 5B. - 0x02FD, // U+361B - U+3917 ( 765 chars) are GB18030 82 30 A6 33 - 82 30 F2 37 (offset 32AD - 35A9) - 0xFE60, // U+3918 is non-936 GB18030 value FE 60. - 0x0055, // U+3919 - U+396D ( 85 chars) are GB18030 82 30 F2 38 - 82 30 FB 32 (offset 35AA - 35FE) - 0xFE5F, // U+396E is non-936 GB18030 value FE 5F. - 0x0060, // U+396F - U+39CE ( 96 chars) are GB18030 82 30 FB 33 - 82 31 86 38 (offset 35FF - 365E) - 0xFE62, // U+39CF is non-936 GB18030 value FE 62. - 0xFE65, // U+39D0 is non-936 GB18030 value FE 65. - 0x000E, // U+39D1 - U+39DE ( 14 chars) are GB18030 82 31 86 39 - 82 31 88 32 (offset 365F - 366C) - 0xFE63, // U+39DF is non-936 GB18030 value FE 63. - 0x0093, // U+39E0 - U+3A72 ( 147 chars) are GB18030 82 31 88 33 - 82 31 96 39 (offset 366D - 36FF) - 0xFE64, // U+3A73 is non-936 GB18030 value FE 64. - 0x00DA, // U+3A74 - U+3B4D ( 218 chars) are GB18030 82 31 97 30 - 82 31 AC 37 (offset 3700 - 37D9) - 0xFE68, // U+3B4E is non-936 GB18030 value FE 68. - 0x011F, // U+3B4F - U+3C6D ( 287 chars) are GB18030 82 31 AC 38 - 82 31 C9 34 (offset 37DA - 38F8) - 0xFE69, // U+3C6E is non-936 GB18030 value FE 69. - 0x0071, // U+3C6F - U+3CDF ( 113 chars) are GB18030 82 31 C9 35 - 82 31 D4 37 (offset 38F9 - 3969) - 0xFE6A, // U+3CE0 is non-936 GB18030 value FE 6A. - 0x0375, // U+3CE1 - U+4055 ( 885 chars) are GB18030 82 31 D4 38 - 82 32 AF 32 (offset 396A - 3CDE) - 0xFE6F, // U+4056 is non-936 GB18030 value FE 6F. - 0x0108, // U+4057 - U+415E ( 264 chars) are GB18030 82 32 AF 33 - 82 32 C9 36 (offset 3CDF - 3DE6) - 0xFE70, // U+415F is non-936 GB18030 value FE 70. - 0x01D7, // U+4160 - U+4336 ( 471 chars) are GB18030 82 32 C9 37 - 82 32 F8 37 (offset 3DE7 - 3FBD) - 0xFE72, // U+4337 is non-936 GB18030 value FE 72. 
- 0x0074, // U+4338 - U+43AB ( 116 chars) are GB18030 82 32 F8 38 - 82 33 86 33 (offset 3FBE - 4031) - 0xFE78, // U+43AC is non-936 GB18030 value FE 78. - 0x0004, // U+43AD - U+43B0 ( 4 chars) are GB18030 82 33 86 34 - 82 33 86 37 (offset 4032 - 4035) - 0xFE77, // U+43B1 is non-936 GB18030 value FE 77. - 0x002B, // U+43B2 - U+43DC ( 43 chars) are GB18030 82 33 86 38 - 82 33 8B 30 (offset 4036 - 4060) - 0xFE7A, // U+43DD is non-936 GB18030 value FE 7A. - 0x00F8, // U+43DE - U+44D5 ( 248 chars) are GB18030 82 33 8B 31 - 82 33 A3 38 (offset 4061 - 4158) - 0xFE7B, // U+44D6 is non-936 GB18030 value FE 7B. - 0x0175, // U+44D7 - U+464B ( 373 chars) are GB18030 82 33 A3 39 - 82 33 C9 31 (offset 4159 - 42CD) - 0xFE7D, // U+464C is non-936 GB18030 value FE 7D. - 0x0014, // U+464D - U+4660 ( 20 chars) are GB18030 82 33 C9 32 - 82 33 CB 31 (offset 42CE - 42E1) - 0xFE7C, // U+4661 is non-936 GB18030 value FE 7C. - 0x00C1, // U+4662 - U+4722 ( 193 chars) are GB18030 82 33 CB 32 - 82 33 DE 34 (offset 42E2 - 43A2) - 0xFE80, // U+4723 is non-936 GB18030 value FE 80. - 0x0005, // U+4724 - U+4728 ( 5 chars) are GB18030 82 33 DE 35 - 82 33 DE 39 (offset 43A3 - 43A7) - 0xFE81, // U+4729 is non-936 GB18030 value FE 81. - 0x0052, // U+472A - U+477B ( 82 chars) are GB18030 82 33 DF 30 - 82 33 E7 31 (offset 43A8 - 43F9) - 0xFE82, // U+477C is non-936 GB18030 value FE 82. - 0x0010, // U+477D - U+478C ( 16 chars) are GB18030 82 33 E7 32 - 82 33 E8 37 (offset 43FA - 4409) - 0xFE83, // U+478D is non-936 GB18030 value FE 83. - 0x01B9, // U+478E - U+4946 ( 441 chars) are GB18030 82 33 E8 38 - 82 34 96 38 (offset 440A - 45C2) - 0xFE85, // U+4947 is non-936 GB18030 value FE 85. - 0x0032, // U+4948 - U+4979 ( 50 chars) are GB18030 82 34 96 39 - 82 34 9B 38 (offset 45C3 - 45F4) - 0xFE86, // U+497A is non-936 GB18030 value FE 86. - 0x0002, // U+497B - U+497C ( 2 chars) are GB18030 82 34 9B 39 - 82 34 9C 30 (offset 45F5 - 45F6) - 0xFE87, // U+497D is non-936 GB18030 value FE 87. 
- 0x0004, // U+497E - U+4981 ( 4 chars) are GB18030 82 34 9C 31 - 82 34 9C 34 (offset 45F7 - 45FA) - 0xFE88, // U+4982 is non-936 GB18030 value FE 88. - 0xFE89, // U+4983 is non-936 GB18030 value FE 89. - 0x0001, // U+4984 - U+4984 ( 1 chars) are GB18030 82 34 9C 35 - 82 34 9C 35 (offset 45FB - 45FB) - 0xFE8A, // U+4985 is non-936 GB18030 value FE 8A. - 0xFE8B, // U+4986 is non-936 GB18030 value FE 8B. - 0x0014, // U+4987 - U+499A ( 20 chars) are GB18030 82 34 9C 36 - 82 34 9E 35 (offset 45FC - 460F) - 0xFE8D, // U+499B is non-936 GB18030 value FE 8D. - 0x0003, // U+499C - U+499E ( 3 chars) are GB18030 82 34 9E 36 - 82 34 9E 38 (offset 4610 - 4612) - 0xFE8C, // U+499F is non-936 GB18030 value FE 8C. - 0x0016, // U+49A0 - U+49B5 ( 22 chars) are GB18030 82 34 9E 39 - 82 34 A1 30 (offset 4613 - 4628) - 0xFE8F, // U+49B6 is non-936 GB18030 value FE 8F. - 0xFE8E, // U+49B7 is non-936 GB18030 value FE 8E. - 0x02BF, // U+49B8 - U+4C76 ( 703 chars) are GB18030 82 34 A1 31 - 82 34 E7 33 (offset 4629 - 48E7) - 0xFE96, // U+4C77 is non-936 GB18030 value FE 96. - 0x0027, // U+4C78 - U+4C9E ( 39 chars) are GB18030 82 34 E7 34 - 82 34 EB 32 (offset 48E8 - 490E) - 0xFE93, // U+4C9F is non-936 GB18030 value FE 93. - 0xFE94, // U+4CA0 is non-936 GB18030 value FE 94. - 0xFE95, // U+4CA1 is non-936 GB18030 value FE 95. - 0xFE97, // U+4CA2 is non-936 GB18030 value FE 97. - 0xFE92, // U+4CA3 is non-936 GB18030 value FE 92. - 0x006F, // U+4CA4 - U+4D12 ( 111 chars) are GB18030 82 34 EB 33 - 82 34 F6 33 (offset 490F - 497D) - 0xFE98, // U+4D13 is non-936 GB18030 value FE 98. - 0xFE99, // U+4D14 is non-936 GB18030 value FE 99. - 0xFE9A, // U+4D15 is non-936 GB18030 value FE 9A. - 0xFE9B, // U+4D16 is non-936 GB18030 value FE 9B. - 0xFE9C, // U+4D17 is non-936 GB18030 value FE 9C. - 0xFE9D, // U+4D18 is non-936 GB18030 value FE 9D. - 0xFE9E, // U+4D19 is non-936 GB18030 value FE 9E. 
- 0x0094, // U+4D1A - U+4DAD ( 148 chars) are GB18030 82 34 F6 34 - 82 35 87 31 (offset 497E - 4A11) - 0xFE9F, // U+4DAE is non-936 GB18030 value FE 9F. - 0x0051, // U+4DAF - U+4DFF ( 81 chars) are GB18030 82 35 87 32 - 82 35 8F 32 (offset 4A12 - 4A62) - 0xD1A6, // U+4E00 - U+9FA5 (20902 chars) use CP 936 conversion. - 0x385A, // U+9FA6 - U+D7FF (14426 chars) are GB18030 82 35 8F 33 - 83 36 C7 38 (offset 4A63 - 82BC) - 0x8F6C, // U+D800 - U+E76B ( 3948 chars) use CP 936 conversion. - 0x0001, // U+E76C - U+E76C ( 1 chars) are GB18030 83 36 C7 39 - 83 36 C7 39 (offset 82BD - 82BD) - 0x805B, // U+E76D - U+E7C7 ( 91 chars) use CP 936 conversion. - 0x0001, // U+E7C8 - U+E7C8 ( 1 chars) are GB18030 83 36 C8 30 - 83 36 C8 30 (offset 82BE - 82BE) - 0x801E, // U+E7C9 - U+E7E6 ( 30 chars) use CP 936 conversion. - 0x000D, // U+E7E7 - U+E7F3 ( 13 chars) are GB18030 83 36 C8 31 - 83 36 C9 33 (offset 82BF - 82CB) - 0x8021, // U+E7F4 - U+E814 ( 33 chars) use CP 936 conversion. - 0x0001, // U+E815 - U+E815 ( 1 chars) are GB18030 83 36 C9 34 - 83 36 C9 34 (offset 82CC - 82CC) - 0x8003, // U+E816 - U+E818 ( 3 chars) use CP 936 conversion. - 0x0005, // U+E819 - U+E81D ( 5 chars) are GB18030 83 36 C9 35 - 83 36 C9 39 (offset 82CD - 82D1) - 0x8001, // U+E81E - U+E81E ( 1 chars) use CP 936 conversion. - 0x0007, // U+E81F - U+E825 ( 7 chars) are GB18030 83 36 CA 30 - 83 36 CA 36 (offset 82D2 - 82D8) - 0x8001, // U+E826 - U+E826 ( 1 chars) use CP 936 conversion. - 0x0004, // U+E827 - U+E82A ( 4 chars) are GB18030 83 36 CA 37 - 83 36 CB 30 (offset 82D9 - 82DC) - 0x8002, // U+E82B - U+E82C ( 2 chars) use CP 936 conversion. - 0x0004, // U+E82D - U+E830 ( 4 chars) are GB18030 83 36 CB 31 - 83 36 CB 34 (offset 82DD - 82E0) - 0x8002, // U+E831 - U+E832 ( 2 chars) use CP 936 conversion. - 0x0008, // U+E833 - U+E83A ( 8 chars) are GB18030 83 36 CB 35 - 83 36 CC 32 (offset 82E1 - 82E8) - 0x8001, // U+E83B - U+E83B ( 1 chars) use CP 936 conversion. 
- 0x0007, // U+E83C - U+E842 ( 7 chars) are GB18030 83 36 CC 33 - 83 36 CC 39 (offset 82E9 - 82EF) - 0x8001, // U+E843 - U+E843 ( 1 chars) use CP 936 conversion. - 0x0010, // U+E844 - U+E853 ( 16 chars) are GB18030 83 36 CD 30 - 83 36 CE 35 (offset 82F0 - 82FF) - 0x8002, // U+E854 - U+E855 ( 2 chars) use CP 936 conversion. - 0x000E, // U+E856 - U+E863 ( 14 chars) are GB18030 83 36 CE 36 - 83 36 CF 39 (offset 8300 - 830D) - 0x8001, // U+E864 - U+E864 ( 1 chars) use CP 936 conversion. - 0x10C7, // U+E865 - U+F92B ( 4295 chars) are GB18030 83 36 D0 30 - 84 30 85 34 (offset 830E - 93D4) - 0x8001, // U+F92C - U+F92C ( 1 chars) use CP 936 conversion. - 0x004C, // U+F92D - U+F978 ( 76 chars) are GB18030 84 30 85 35 - 84 30 8D 30 (offset 93D5 - 9420) - 0x8001, // U+F979 - U+F979 ( 1 chars) use CP 936 conversion. - 0x001B, // U+F97A - U+F994 ( 27 chars) are GB18030 84 30 8D 31 - 84 30 8F 37 (offset 9421 - 943B) - 0x8001, // U+F995 - U+F995 ( 1 chars) use CP 936 conversion. - 0x0051, // U+F996 - U+F9E6 ( 81 chars) are GB18030 84 30 8F 38 - 84 30 97 38 (offset 943C - 948C) - 0x8001, // U+F9E7 - U+F9E7 ( 1 chars) use CP 936 conversion. - 0x0009, // U+F9E8 - U+F9F0 ( 9 chars) are GB18030 84 30 97 39 - 84 30 98 37 (offset 948D - 9495) - 0x8001, // U+F9F1 - U+F9F1 ( 1 chars) use CP 936 conversion. - 0x001A, // U+F9F2 - U+FA0B ( 26 chars) are GB18030 84 30 98 38 - 84 30 9B 33 (offset 9496 - 94AF) - 0x8004, // U+FA0C - U+FA0F ( 4 chars) use CP 936 conversion. - 0x0001, // U+FA10 - U+FA10 ( 1 chars) are GB18030 84 30 9B 34 - 84 30 9B 34 (offset 94B0 - 94B0) - 0x8001, // U+FA11 - U+FA11 ( 1 chars) use CP 936 conversion. - 0x0001, // U+FA12 - U+FA12 ( 1 chars) are GB18030 84 30 9B 35 - 84 30 9B 35 (offset 94B1 - 94B1) - 0x8002, // U+FA13 - U+FA14 ( 2 chars) use CP 936 conversion. - 0x0003, // U+FA15 - U+FA17 ( 3 chars) are GB18030 84 30 9B 36 - 84 30 9B 38 (offset 94B2 - 94B4) - 0x8001, // U+FA18 - U+FA18 ( 1 chars) use CP 936 conversion. 
- 0x0006, // U+FA19 - U+FA1E ( 6 chars) are GB18030 84 30 9B 39 - 84 30 9C 34 (offset 94B5 - 94BA) - 0x8003, // U+FA1F - U+FA21 ( 3 chars) use CP 936 conversion. - 0x0001, // U+FA22 - U+FA22 ( 1 chars) are GB18030 84 30 9C 35 - 84 30 9C 35 (offset 94BB - 94BB) - 0x8002, // U+FA23 - U+FA24 ( 2 chars) use CP 936 conversion. - 0x0002, // U+FA25 - U+FA26 ( 2 chars) are GB18030 84 30 9C 36 - 84 30 9C 37 (offset 94BC - 94BD) - 0x8003, // U+FA27 - U+FA29 ( 3 chars) use CP 936 conversion. - 0x0406, // U+FA2A - U+FE2F ( 1030 chars) are GB18030 84 30 9C 38 - 84 31 85 37 (offset 94BE - 98C3) - 0x8002, // U+FE30 - U+FE31 ( 2 chars) use CP 936 conversion. - 0x0001, // U+FE32 - U+FE32 ( 1 chars) are GB18030 84 31 85 38 - 84 31 85 38 (offset 98C4 - 98C4) - 0x8012, // U+FE33 - U+FE44 ( 18 chars) use CP 936 conversion. - 0x0004, // U+FE45 - U+FE48 ( 4 chars) are GB18030 84 31 85 39 - 84 31 86 32 (offset 98C5 - 98C8) - 0x800A, // U+FE49 - U+FE52 ( 10 chars) use CP 936 conversion. - 0x0001, // U+FE53 - U+FE53 ( 1 chars) are GB18030 84 31 86 33 - 84 31 86 33 (offset 98C9 - 98C9) - 0x8004, // U+FE54 - U+FE57 ( 4 chars) use CP 936 conversion. - 0x0001, // U+FE58 - U+FE58 ( 1 chars) are GB18030 84 31 86 34 - 84 31 86 34 (offset 98CA - 98CA) - 0x800E, // U+FE59 - U+FE66 ( 14 chars) use CP 936 conversion. - 0x0001, // U+FE67 - U+FE67 ( 1 chars) are GB18030 84 31 86 35 - 84 31 86 35 (offset 98CB - 98CB) - 0x8004, // U+FE68 - U+FE6B ( 4 chars) use CP 936 conversion. - 0x0095, // U+FE6C - U+FF00 ( 149 chars) are GB18030 84 31 86 36 - 84 31 95 34 (offset 98CC - 9960) - 0x805E, // U+FF01 - U+FF5E ( 94 chars) use CP 936 conversion. - 0x0081, // U+FF5F - U+FFDF ( 129 chars) are GB18030 84 31 95 35 - 84 31 A2 33 (offset 9961 - 99E1) - 0x8006, // U+FFE0 - U+FFE5 ( 6 chars) use CP 936 conversion. 
- 0x001A, // U+FFE6 - U+FFFF ( 26 chars) are GB18030 84 31 A2 34 - 84 31 A4 39 (offset 99E2 - 99FB) - }; - } -} -#endif // FEATURE_CODEPAGES_FILE - diff --git a/src/mscorlib/src/System/Text/ISCIIEncoding.cs b/src/mscorlib/src/System/Text/ISCIIEncoding.cs deleted file mode 100644 index 751b8217c0..0000000000 --- a/src/mscorlib/src/System/Text/ISCIIEncoding.cs +++ /dev/null @@ -1,2621 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -// ISCIIEncoding -// -// Ported from windows c_iscii. If you find bugs here, there're likely similar -// bugs in the windows version -namespace System.Text -{ - using System; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Globalization; - using System.Runtime.Serialization; - using System.Security.Permissions; - - // Encodes text into and out of the ISCII encodings. - // ISCII contains characters to encode indic scripts by mapping indic scripts - // to the same code page. This works because they are all related scripts. - // ISCII provides a "font" selection method to switch between the appropriate - // fonts to display the other scripts. All ISCII characters are above the - // ASCII range to provide ASCII compatibility. - // - // IsAlwaysNormalized() isn't overridden - // We don't override IsAlwaysNormalized() because it is false for all forms (like base implimentation) - // Forms C & KC have things like 0933 + 093C == composed 0934, so they aren't normalized - // Forms D & KD have things like 0934, which decomposes to 0933 + 093C, so not normal. 
- // Form IDNA has the above problems plus case mapping, so false (like most encodings) - // - - [Serializable] - internal class ISCIIEncoding : EncodingNLS, ISerializable - { - // Constants - private const int CodeDefault = 0; // 0x40 Default - private const int CodeRoman = 1; // 0x41 Roman Transliteration (not supported) - private const int CodeDevanagari = 2; // 0x42 57002 - private const int CodeBengali = 3; // 0x43 57003 - private const int CodeTamil = 4; // 0x44 57004 - private const int CodeTelugu = 5; // 0x45 57005 - private const int CodeAssamese = 6; // 0x46 57006 Assamese (Bengali) - private const int CodeOriya = 7; // 0x47 57007 - private const int CodeKannada = 8; // 0x48 57008 - private const int CodeMalayalam = 9; // 0x49 57009 - private const int CodeGujarati = 10; // 0x4a 57010 - private const int CodePunjabi = 11; // 0x4b 57011 Punjabi (Gurmukhi) - - // Ranges - private const int MultiByteBegin = 0xa0; // Beginning of MultiByte space in ISCII - private const int IndicBegin = 0x0901; // Beginining of Unicode Indic script code points - private const int IndicEnd = 0x0d6f; // End of Unicode Indic Script code points - - // ISCII Control Values - private const byte ControlATR = 0xef; // Attribute (ATR) code - private const byte ControlCodePageStart = 0x40; // Start of code page range - - // Interesting ISCII characters - private const byte Virama = 0xe8; - private const byte Nukta = 0xe9; - private const byte DevenagariExt = 0xf0; - - // Interesting Unicode characters - private const char ZWNJ = (char)0x200c; - private const char ZWJ = (char)0x200d; - - // Code Page - private int defaultCodePage; - - public ISCIIEncoding(int codePage) : base(codePage) - { - // Set our code page (subtracting windows code page # offset) - defaultCodePage = codePage - 57000; - - // Legal windows code pages are between Devanagari and Punjabi - Debug.Assert(defaultCodePage >= CodeDevanagari && defaultCodePage <= CodePunjabi, - "[ISCIIEncoding] Code page (" + codePage + " 
isn't supported by ISCIIEncoding!"); - - // This shouldn't really be possible - if (defaultCodePage < CodeDevanagari || defaultCodePage > CodePunjabi) - throw new ArgumentException(Environment.GetResourceString( - "Argument_CodepageNotSupported", codePage), nameof(codePage)); - } - - // Constructor called by serialization. - internal ISCIIEncoding(SerializationInfo info, StreamingContext context) : base(0) - { - // Actually this can't ever get called, MLangCodePageEncoding is our proxy - // (In Everett this was done by MLang) - Debug.Assert(false, "Didn't expect to make it to ISCIIEncoding serialization constructor"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // Make sure to get the base stuff too This throws if info is null - SerializeEncoding(info, context); - Debug.Assert(info!=null, "[ISCIIEncoding.GetObjectData] Expected null info to throw"); - - // Just need Everett MLangCodePageEncoding maxCharSize - info.AddValue("m_maxByteSize", 2); - - // Always have this as our helper - info.SetType(typeof(MLangCodePageEncoding)); - } - - // Our MaxByteCount is 4 times the input size. That could be because - // the first input character could be in the wrong code page ("font") and - // then that character could also be encoded in 2 code points - public override int GetMaxByteCount(int charCount) - { - if (charCount < 0) - throw new ArgumentOutOfRangeException(nameof(charCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Characters would be # of characters + 1 in case high surrogate is ? 
* max fallback - long byteCount = (long)charCount + 1; - - if (EncoderFallback.MaxCharCount > 1) - byteCount *= EncoderFallback.MaxCharCount; - - // 4 Time input because 1st input could require code page change and also that char could require 2 code points - byteCount *= 4; - - if (byteCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow")); - - return (int)byteCount; - } - - // Our MaxCharCount is the same as the byteCount. There are a few sequences - // where 2 (or more) bytes could become 2 chars, but thats still 1 to 1. - public override int GetMaxCharCount(int byteCount) - { - if (byteCount < 0) - throw new ArgumentOutOfRangeException(nameof(byteCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Our MaxCharCount is the same as the byteCount. There are a few sequences - // where 2 (or more) bytes could become 2 chars, but thats still 1 to 1. - // Also could have 1 in decoder if we're waiting to see if next char's a nukta. - long charCount = ((long)byteCount + 1); - - // Some code points are undefined so we could fall back. 
- if (DecoderFallback.MaxCharCount > 1) - charCount *= DecoderFallback.MaxCharCount; - - if (charCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow")); - - return (int)charCount; - } - - // Our workhorse version - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) - { - // Use null pointer to ask GetBytes for count - return GetBytes(chars, count, null, 0, baseEncoder); - } - - // Workhorse - internal override unsafe int GetBytes(char *chars, int charCount, - byte* bytes, int byteCount, EncoderNLS baseEncoder) - { - // Allow null bytes for counting - Debug.Assert(chars != null, "[ISCIIEncoding.GetBytes]chars!=null"); -// Debug.Assert(bytes != null, "[ISCIIEncoding.GetBytes]bytes!=null"); - Debug.Assert(charCount >=0, "[ISCIIEncoding.GetBytes]charCount >=0"); - Debug.Assert(byteCount >=0, "[ISCIIEncoding.GetBytes]byteCount >=0"); - - // Need the ISCII Encoder - ISCIIEncoder encoder = (ISCIIEncoder) baseEncoder; - - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - int currentCodePage = this.defaultCodePage; - bool bLastVirama = false; - - // Use encoder info if available - if (encoder != null) - { - // Remember our old state - currentCodePage = encoder.currentCodePage; - bLastVirama = encoder.bLastVirama; - - // If we have a high surrogate left over, then fall it back - if (encoder.charLeftOver > 0) - { - buffer.Fallback(encoder.charLeftOver); - bLastVirama = false; // Redundant - } - } - - while (buffer.MoreData) - { - // Get our data - char ch = buffer.GetNextChar(); - - // See if its a Multi Byte Character - if (ch < MultiByteBegin) - { - // Its a boring low character, add it. 
- if (!buffer.AddByte((byte)ch)) - break; - bLastVirama = false; - continue; - } - - // See if its outside of the Indic script Range range - if ((ch < IndicBegin) || (ch > IndicEnd)) - { - // See if its a ZWJ or ZWNJ and if we has bLastVirama; - if (bLastVirama && (ch == ZWNJ || ch == ZWJ)) - { - // It was a bLastVirama and ZWNJ || ZWJ - if (ch == ZWNJ) - { - if (!buffer.AddByte(Virama)) - break; - } - else // ZWJ - { - if (!buffer.AddByte(Nukta)) - break; - } - - // bLastVirama now counts as false - bLastVirama = false; - continue; - } - - // Have to do our fallback - // - // Note that this will fallback 2 chars if this is a high surrogate. - // Throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - bLastVirama = false; - continue; - } - - // Its in the Unicode Indic script range - int indicInfo = UnicodeToIndicChar[ch - IndicBegin]; - byte byteIndic = (byte)indicInfo; - int indicScript = (0x000f & (indicInfo >> 8)); - int indicTwoBytes = (0xf000 & indicInfo); - - // If IndicInfo is 0 then have to do fallback - if (indicInfo == 0) - { - // Its some Unicode character we don't have indic for. - // Have to do our fallback - // Add Fallback Count - // Note that chars was preincremented, and GetEncoderFallbackString might add an extra - // if chars != charEnd and there's a surrogate. 
- // Throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - - bLastVirama = false; - continue; - } - - // See if our code page ("font" in ISCII spec) has to change - // (This if doesn't add character, just changes character set) - Debug.Assert(indicScript!=0, "[ISCIIEncoding.GetBytes]expected an indic script value"); - if (indicScript != currentCodePage) - { - // It changed, spit out the ATR - if (!buffer.AddByte(ControlATR, (byte)(indicScript | ControlCodePageStart))) - break; - - // Now spit out the new code page (& remember it) (do this afterwards in case AddByte failed) - currentCodePage = indicScript; - - // We only know how to map from Unicode to pages from Devanagari to Punjabi (2 to 11) - Debug.Assert(currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi, - "[ISCIIEncoding.GetBytes]Code page (" + currentCodePage + " shouldn't appear in ISCII from Unicode table!"); - } - - // Safe to add our byte now - if (!buffer.AddByte(byteIndic, indicTwoBytes != 0 ? 1:0)) - break; - - // Remember if this one was a Virama - bLastVirama = (byteIndic == Virama); - - // Some characters need extra bytes - if (indicTwoBytes != 0) - { - // This one needs another byte - Debug.Assert((indicTwoBytes >> 12) > 0 && (indicTwoBytes >> 12) <= 3, - "[ISCIIEncoding.GetBytes]Expected indicTwoBytes from 1-3, not " + (indicTwoBytes >> 12)); - - // Already did buffer checking, but... 
- if (!buffer.AddByte(SecondIndicByte[indicTwoBytes >> 12])) - break; - } - } - - // May need to switch back to our default code page - if (currentCodePage != defaultCodePage && (encoder == null || encoder.MustFlush)) - { - // It changed, spit out the ATR - if (buffer.AddByte(ControlATR, (byte)(defaultCodePage | ControlCodePageStart))) - currentCodePage = defaultCodePage; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - bLastVirama = false; - } - - // Make sure we remember our state if necessary - // Note that we don't care about flush because Virama and code page - // changes are legal at the end. - // Don't set encoder if we're just counting - if (encoder != null && bytes != null) - { - // Clear Encoder if necessary. - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - // Remember our code page/virama state - encoder.currentCodePage = currentCodePage; - encoder.bLastVirama = bLastVirama; - - // How many chars were used? - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // Workhorse - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - // Just call GetChars with null chars saying we want count - return GetChars(bytes, count, null, 0, baseDecoder); - } - - // For decoding, the following interesting rules apply: - // Virama followed by another Virama or Nukta becomes Virama + ZWNJ or Virama + ZWJ - // ATR is followed by a byte to switch code pages ("fonts") - // Devenagari F0, B8 -> \u0952 - // Devenagari F0, BF -> \u0970 - // Some characters followed by E9 become a different character instead. 
// GetChars: decodes a run of ISCII bytes into UTF-16 chars, or only counts them when chars is null.
//   bytes / byteCount : input bytes and their count (asserted non-null / non-negative; the caller has already validated them).
//   chars / charCount : output buffer and its capacity; chars may be null when only counting.
//   baseDecoder       : optional ISCIIDecoder that carries state between calls; may be null.
//   Returns buffer.Count, the number of chars written (or counted).
// Per-byte handling inside the loop (the loop stops early whenever buffer.AddChar or buffer.Fallback returns false):
//   * a byte below MultiByteBegin is emitted unchanged as a char;
//   * ControlATR followed by (0x40 | code page) in the Devanagari..Punjabi range switches the current code page;
//     ControlATR followed by 0x40 or 0x41 switches back to the default code page; any other follower sends
//     ControlATR to the fallback and the follower is then processed as an ordinary byte;
//   * Virama is emitted at once; a Virama directly after it emits ZWNJ and a Nukta directly after it emits ZWJ;
//   * a byte whose alternate mapping cAlt is non-zero with (cAlt & 0xF000) == 0 is held back: if the next byte is
//     Nukta the alternate char is emitted, otherwise the normal char is emitted and the next byte is processed;
//   * the remaining case (asserted to be DevenagariExt in the Devanagari code page) emits U+0952 when followed by
//     0xb8 and U+0970 when followed by 0xbf; otherwise DevenagariExt is sent to the fallback;
//   * a byte with no mapping (ch == 0) is sent to the fallback.
// After the loop, if there is no decoder or decoder.MustFlush is set, a pending ATR / DevenagariExt is sent to the
// fallback and a held-back char is emitted. When decoder and chars are both non-null, the remaining state (or the
// defaults, when flushing left nothing pending) and m_bytesUsed are stored on the decoder.
- internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS baseDecoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - // Allow null chars for counting - Debug.Assert(bytes != null, "[ISCIIEncoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[ISCIIEncoding.GetChars]byteCount is negative"); -// Debug.Assert(chars != null, "[ISCIIEncoding.GetChars]chars is null"); - Debug.Assert(charCount >= 0, "[ISCIIEncoding.GetChars]charCount is negative"); - - // Need the ISCII Decoder - ISCIIDecoder decoder = (ISCIIDecoder) baseDecoder; - - // Get our info. - Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - int currentCodePage = this.defaultCodePage; - bool bLastATR = false; - bool bLastVirama = false; - bool bLastDevenagariStressAbbr = false; - char cLastCharForNextNukta = '\0'; - char cLastCharForNoNextNukta = '\0'; - - // See if there's anything in our decoder - if (decoder != null) - { - currentCodePage = decoder.currentCodePage; - bLastATR = decoder.bLastATR; - bLastVirama = decoder.bLastVirama; - bLastDevenagariStressAbbr = decoder.bLastDevenagariStressAbbr; - cLastCharForNextNukta = decoder.cLastCharForNextNukta; - cLastCharForNoNextNukta = decoder.cLastCharForNoNextNukta; - } - - bool bLastSpecial = bLastVirama | bLastATR | bLastDevenagariStressAbbr | - (cLastCharForNextNukta != '\0'); - - // Get our current code page index (some code pages are dups) - int currentCodePageIndex = -1; - Debug.Assert(currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi, - "[ISCIIEncoding.GetChars]Decoder code page must be >= Devanagari and <= Punjabi, not " + currentCodePage); - - if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi) - { - currentCodePageIndex = IndicMappingIndex[currentCodePage]; - } - - // Loop through our input - while (buffer.MoreData) - 
{ - byte b = buffer.GetNextByte(); - - // See if last one was special - if (bLastSpecial) - { - // Now it won't be - bLastSpecial = false; - - // One and only one of our flags should be set - Debug.Assert(((bLastVirama ? 1 : 0) + (bLastATR ? 1 : 0) + - (bLastDevenagariStressAbbr ? 1 : 0) + - ((cLastCharForNextNukta > 0) ? 1 : 0)) == 1, - String.Format(CultureInfo.InvariantCulture, - "[ISCIIEncoding.GetChars]Special cases require 1 and only 1 special case flag: LastATR {0} Dev. {1} Nukta {2}", - bLastATR, bLastDevenagariStressAbbr, cLastCharForNextNukta)); - // If the last one was an ATR, then we'll have to do ATR stuff - if (bLastATR) - { - // We only support Devanagari - Punjabi - if (b >= (0x40 | CodeDevanagari) && b <= (0x40 | CodePunjabi)) - { - // Remember the code page - currentCodePage = b & 0xf; - currentCodePageIndex = IndicMappingIndex[currentCodePage]; - // No longer last ATR - bLastATR = false; - continue; - } - - // Change back to default? - if (b == 0x40) - { - currentCodePage = this.defaultCodePage; - currentCodePageIndex = -1; - - if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi) - { - currentCodePageIndex = IndicMappingIndex[currentCodePage]; - } - // No longer last ATR - bLastATR = false; - continue; - } - - // We don't support Roman - if (b == 0x41) - { - currentCodePage = this.defaultCodePage; - currentCodePageIndex = -1; - - if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi) - { - currentCodePageIndex = IndicMappingIndex[currentCodePage]; - } - - // Even though we don't know how to support Roman, windows didn't add a ? so we don't either. - // No longer last ATR - bLastATR = false; - continue; - } - - // Other code pages & ATR codes not supported, fallback the ATR - // If fails, decrements the buffer, which is OK, we remember ATR state. 
- if (!buffer.Fallback(ControlATR)) - break; - - // No longer last ATR (fell back) - bLastATR = false; - - // we know we can't have any of these other modes - Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in bLastATR mode"); - Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in bLastATR mode"); - Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastATR mode"); - Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastATR mode"); - - // Keep processing this byte - } - else if (bLastVirama) - { - // If last was Virama, then we might need ZWNJ or ZWJ instead - if (b == Virama) - { - // If no room, then stop - if (!buffer.AddChar(ZWNJ)) - break; - bLastVirama = false; - continue; - } - if (b == Nukta) - { - // If no room, then stop - if (!buffer.AddChar(ZWJ)) - break; - bLastVirama = false; - continue; - } - - // No longer in this mode, fall through to handle character - // (Virama itself was added when flag was set last iteration) - bLastVirama = false; - - // We know we can't have any of these other modes - Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in bLastVirama mode"); - Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in bLastVirama mode"); - Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastVirama mode"); - Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastVirama mode"); - } - else if (bLastDevenagariStressAbbr) - { - // Last byte was an 0xf0 (ext). - // If current is b8 or bf, then we have 952 or 970. 
Otherwise fallback - if (b == 0xb8) - { - // It was a 0xb8 - if (!buffer.AddChar('\x0952')) // Devanagari stress sign anudatta - break; - bLastDevenagariStressAbbr = false; - continue; - } - - if (b == 0xbf) - { - // It was a 0xbf - if (!buffer.AddChar('\x0970')) // Devanagari abbr. sign - break; - bLastDevenagariStressAbbr = false; - continue; - } - - // Wasn't an expected pattern, do fallback for f0 (ext) - // if fails, fallback will back up our buffer - if (!buffer.Fallback(DevenagariExt)) - break; - - // Keep processing this byte (turn off mode) - // (last character was added when mode was set) - bLastDevenagariStressAbbr = false; - - Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in bLastDevenagariStressAbbr mode"); - Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in bLastDevenagariStressAbbr mode"); - Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastDevenagariStressAbbr mode"); - Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastDevenagariStressAbbr mode"); - } - else - { - // We were checking for next char being a nukta - Debug.Assert(cLastCharForNextNukta > 0 && cLastCharForNoNextNukta > 0, - "[ISCIIEncoding.GetChars]No other special case found, but cLastCharFor(No)NextNukta variable(s) aren't set."); - - // We'll either add combined char or last char - if (b == Nukta) - { - // We combine nukta with previous char - if (!buffer.AddChar(cLastCharForNextNukta)) - break; - - // Done already - cLastCharForNextNukta = cLastCharForNoNextNukta = '\0'; - continue; - } - - // No Nukta, just add last character and keep processing current byte - if (!buffer.AddChar(cLastCharForNoNextNukta)) - break; - - // Keep processing this byte, turn off mode. 
- cLastCharForNextNukta = cLastCharForNoNextNukta = '\0'; - - Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in cLastCharForNextNukta mode"); - Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in cLastCharForNextNukta mode"); - Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in cLastCharForNextNukta mode"); - } - } - - // Now bLastSpecial should be false and all flags false. - Debug.Assert (!bLastSpecial && !bLastDevenagariStressAbbr && !bLastVirama && !bLastATR && - cLastCharForNextNukta == '\0', - "[ISCIIEncoding.GetChars]No special state for last code point should exist at this point."); - - // If its a simple byte, just add it - if (b < MultiByteBegin) - { - if (!buffer.AddChar((char)b)) - break; - continue; - } - - // See if its an ATR marker - if (b == ControlATR) - { - bLastATR = bLastSpecial = true; - continue; - } - - Debug.Assert (currentCodePageIndex != -1, "[ISCIIEncoding.GetChars]Expected valid currentCodePageIndex != -1"); - char ch = IndicMapping[currentCodePageIndex, 0, b - MultiByteBegin]; - char cAlt = IndicMapping[currentCodePageIndex, 1, b - MultiByteBegin]; - - // If no 2nd char, just add it, also lonely Nuktas get added as well. - if (cAlt == 0 || b == Nukta) - { - // If it was an unknown character do fallback - - // ? if not known. - if (ch == 0) - { - // Fallback the unknown byte - if (!buffer.Fallback(b)) - break; - } - else - { - // Add the known character - if (!buffer.AddChar(ch)) - break; - } - continue; - } - - // if b == Virama set last Virama so we can do ZWJ or ZWNJ next time if needed. 
- if (b == Virama) - { - // Add Virama - if (!buffer.AddChar(ch)) - break; - bLastVirama = bLastSpecial = true; - continue; - } - - // See if its one that changes with a Nukta - if ((cAlt & 0xF000) == 0) - { - // It could change if next char is a nukta - bLastSpecial = true; - cLastCharForNextNukta = cAlt; - cLastCharForNoNextNukta = ch; - continue; - } - - // We must be the Devenagari special case for F0, B8 & F0, BF - Debug.Assert(currentCodePage == CodeDevanagari && b == DevenagariExt, - String.Format(CultureInfo.InvariantCulture, - "[ISCIIEncoding.GetChars] Devenagari special case must {0} not {1} or in Devanagari code page {2} not {3}.", - DevenagariExt, b, CodeDevanagari, currentCodePage)); - bLastDevenagariStressAbbr = bLastSpecial = true; - - } - - // If we don't have a decoder, or if we had to flush, then we need to get rid - // of last ATR, LastNoNextNukta and LastDevenagariExt. - if (decoder == null || decoder.MustFlush) - { - // If these fail (because of Convert with insufficient buffer), then they'll turn off MustFlush as well. 
- if (bLastATR) - { - // Have to add ATR fallback - if (buffer.Fallback(ControlATR)) - bLastATR = false; - else - // If not successful, convert will maintain state for next time, also - // AddChar will have decremented our byte count, however we need it to remain the same - buffer.GetNextByte(); - } - else if (bLastDevenagariStressAbbr) - { - // Have to do fallback for DevenagariExt - if (buffer.Fallback(DevenagariExt)) - bLastDevenagariStressAbbr = false; - else - // If not successful, convert will maintain state for next time, also - // AddChar will have decremented our byte count, however we need it to remain the same - buffer.GetNextByte(); - } - else if (cLastCharForNoNextNukta != '\0') - { - // Have to add our last char because there was no next nukta - if (buffer.AddChar(cLastCharForNoNextNukta)) - cLastCharForNoNextNukta = cLastCharForNextNukta = '\0'; - else - // If not successful, convert will maintain state for next time, also - // AddChar will have decremented our byte count, however we need it to remain the same - buffer.GetNextByte(); - } - // LastVirama is unimportant for flushing decoder. 
// (The Virama char itself was already added to the buffer when bLastVirama was set, so no output is pending for it.)
- } - - // Remember any left over stuff - // (only remember if we aren't counting) - if (decoder != null && chars != null) - { - // If not flushing or have state (from convert) then need to remember state - if (!decoder.MustFlush || - cLastCharForNoNextNukta != '\0' || bLastATR || bLastDevenagariStressAbbr) - { - // Either not flushing or had state (from convert) - Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, - "[ISCIIEncoding.GetChars]Expected no state or not converting or not flushing"); - decoder.currentCodePage = currentCodePage; - decoder.bLastVirama = bLastVirama; - decoder.bLastATR = bLastATR; - decoder.bLastDevenagariStressAbbr = bLastDevenagariStressAbbr; - decoder.cLastCharForNextNukta = cLastCharForNextNukta; - decoder.cLastCharForNoNextNukta = cLastCharForNoNextNukta; - } - else - { - decoder.currentCodePage = this.defaultCodePage; - decoder.bLastVirama = false; - decoder.bLastATR = false; - decoder.bLastDevenagariStressAbbr = false; - decoder.cLastCharForNextNukta = '\0'; - decoder.cLastCharForNoNextNukta = '\0'; - } - decoder.m_bytesUsed = buffer.BytesUsed; - } - // Otherwise we already did fallback and added extra things - - // Return the # of characters we found - return buffer.Count; - } - - public override Decoder GetDecoder() - { - return new ISCIIDecoder(this); - } - - public override Encoder GetEncoder() - { - return new ISCIIEncoder(this); - } - - public override int GetHashCode() - { - //Not great distribution, but this is relatively unlikely to be used as the key in a hashtable. - return defaultCodePage + this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode(); - } - - [Serializable] - internal class ISCIIEncoder : EncoderNLS - { - // Need to remember the default code page (for HasState) - internal int defaultCodePage = 0; - - // Need a place for the current code page - internal int currentCodePage = 0; - - // Was the last character a virama? 
(Because ZWJ and ZWNJ are different then) - internal bool bLastVirama = false; - - public ISCIIEncoder(Encoding encoding) : base(encoding) - { - this.currentCodePage = this.defaultCodePage = encoding.CodePage - 57000; - - // base calls reset - } - - // Warning: If you're decoding mixed encoding files or something, this could be confusing - // We don't always force back to base encoding mapping, so if you reset where do you restart? - public override void Reset() - { - bLastVirama = false; - charLeftOver = (char)0; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our encoder? - // Encoder not only has to get rid of left over characters, but it has to switch back to the current code page. - internal override bool HasState - { - get - { - return (this.charLeftOver != (char)0 || this.currentCodePage != this.defaultCodePage); - } - } - } - - [Serializable] - internal class ISCIIDecoder : DecoderNLS - { - // Need a place to store any our current code page and last ATR flag - internal int currentCodePage = 0; - internal bool bLastATR = false; - internal bool bLastVirama = false; - internal bool bLastDevenagariStressAbbr = false; - internal char cLastCharForNextNukta = '\0'; - internal char cLastCharForNoNextNukta = '\0'; - - public ISCIIDecoder(Encoding encoding) : base(encoding) - { - this.currentCodePage = encoding.CodePage - 57000; - - // base calls reset - } - - // Warning: If you're decoding mixed encoding files or something, this could be confusing - // We don't always force back to base encoding mapping, so if you reset where do you restart? - public override void Reset() - { - bLastATR = false; - bLastVirama = false; - bLastDevenagariStressAbbr = false; - cLastCharForNextNukta = '\0'; - cLastCharForNoNextNukta = '\0'; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our decoder? 
// (State here means a held-back Nukta-candidate char, a pending ATR byte, or a pending Devanagari extension byte; bLastVirama and currentCodePage are not part of the check.)
- internal override bool HasState - { - get - { - return (this.cLastCharForNextNukta != '\0' || this.cLastCharForNoNextNukta != '\0' || - this.bLastATR || this.bLastDevenagariStressAbbr); - } - } - } - - // - // ISCII Tables - // - // From Windows ISCII\tables.c - // - - //////////////////////////////////////////////////////////////////////////// - // - // Char to Byte - // - // 0xXYZZ Where Y is the code page "font" part and ZZ is the byte character - // The high X bits also reference the SecondIndicByte table if an - // extra byte is needed. - // 0x0000 For undefined characters - // - // This is valid for values IndicBegin to IndicEnd - // - // WARNING: When this was copied from windows, the ? characters (0x003F) were - // searched/replaced with 0x0000. - // - //////////////////////////////////////////////////////////////////////////// - - static int[] UnicodeToIndicChar = - { - 0x02a1, // U+0901 : Devanagari Sign Candrabindu - 0x02a2, // U+0902 : Devanagari Sign Anusvara - 0x02a3, // U+0903 : Devanagari Sign Visarga - 0x0000, // U+0904 : Undefined - 0x02a4, // U+0905 : Devanagari Letter A - 0x02a5, // U+0906 : Devanagari Letter Aa - 0x02a6, // U+0907 : Devanagari Letter I - 0x02a7, // U+0908 : Devanagari Letter Ii - 0x02a8, // U+0909 : Devanagari Letter U - 0x02a9, // U+090a : Devanagari Letter Uu - 0x02aa, // U+090b : Devanagari Letter Vocalic R - 0x12a6, // U+090c : Devanagari Letter Vocalic L - 0x02ae, // U+090d : Devanagari Letter Candra E - 0x02ab, // U+090e : Devanagari Letter Short E - 0x02ac, // U+090f : Devanagari Letter E - 0x02ad, // U+0910 : Devanagari Letter Ai - 0x02b2, // U+0911 : Devanagari Letter Candra O - 0x02af, // U+0912 : Devanagari Letter Short O - 0x02b0, // U+0913 : Devanagari Letter O - 0x02b1, // U+0914 : Devanagari Letter Au - 0x02b3, // U+0915 : Devanagari Letter Ka - 0x02b4, // U+0916 : Devanagari Letter Kha - 0x02b5, // U+0917 : Devanagari Letter Ga - 0x02b6, // U+0918 : Devanagari Letter Gha - 0x02b7, // U+0919 : Devanagari Letter 
Nga - 0x02b8, // U+091a : Devanagari Letter Ca - 0x02b9, // U+091b : Devanagari Letter Cha - 0x02ba, // U+091c : Devanagari Letter Ja - 0x02bb, // U+091d : Devanagari Letter Jha - 0x02bc, // U+091e : Devanagari Letter Nya - 0x02bd, // U+091f : Devanagari Letter Tta - 0x02be, // U+0920 : Devanagari Letter Ttha - 0x02bf, // U+0921 : Devanagari Letter Dda - 0x02c0, // U+0922 : Devanagari Letter Ddha - 0x02c1, // U+0923 : Devanagari Letter Nna - 0x02c2, // U+0924 : Devanagari Letter Ta - 0x02c3, // U+0925 : Devanagari Letter Tha - 0x02c4, // U+0926 : Devanagari Letter Da - 0x02c5, // U+0927 : Devanagari Letter Dha - 0x02c6, // U+0928 : Devanagari Letter Na - 0x02c7, // U+0929 : Devanagari Letter Nnna - 0x02c8, // U+092a : Devanagari Letter Pa - 0x02c9, // U+092b : Devanagari Letter Pha - 0x02ca, // U+092c : Devanagari Letter Ba - 0x02cb, // U+092d : Devanagari Letter Bha - 0x02cc, // U+092e : Devanagari Letter Ma - 0x02cd, // U+092f : Devanagari Letter Ya - 0x02cf, // U+0930 : Devanagari Letter Ra - 0x02d0, // U+0931 : Devanagari Letter Rra - 0x02d1, // U+0932 : Devanagari Letter La - 0x02d2, // U+0933 : Devanagari Letter Lla - 0x02d3, // U+0934 : Devanagari Letter Llla - 0x02d4, // U+0935 : Devanagari Letter Va - 0x02d5, // U+0936 : Devanagari Letter Sha - 0x02d6, // U+0937 : Devanagari Letter Ssa - 0x02d7, // U+0938 : Devanagari Letter Sa - 0x02d8, // U+0939 : Devanagari Letter Ha - 0x0000, // U+093a : Undefined - 0x0000, // U+093b : Undefined - 0x02e9, // U+093c : Devanagari Sign Nukta - 0x12ea, // U+093d : Devanagari Sign Avagraha - 0x02da, // U+093e : Devanagari Vowel Sign Aa - 0x02db, // U+093f : Devanagari Vowel Sign I - 0x02dc, // U+0940 : Devanagari Vowel Sign Ii - 0x02dd, // U+0941 : Devanagari Vowel Sign U - 0x02de, // U+0942 : Devanagari Vowel Sign Uu - 0x02df, // U+0943 : Devanagari Vowel Sign Vocalic R - 0x12df, // U+0944 : Devanagari Vowel Sign Vocalic Rr - 0x02e3, // U+0945 : Devanagari Vowel Sign Candra E - 0x02e0, // U+0946 : Devanagari Vowel Sign 
Short E - 0x02e1, // U+0947 : Devanagari Vowel Sign E - 0x02e2, // U+0948 : Devanagari Vowel Sign Ai - 0x02e7, // U+0949 : Devanagari Vowel Sign Candra O - 0x02e4, // U+094a : Devanagari Vowel Sign Short O - 0x02e5, // U+094b : Devanagari Vowel Sign O - 0x02e6, // U+094c : Devanagari Vowel Sign Au - 0x02e8, // U+094d : Devanagari Sign Virama - 0x0000, // U+094e : Undefined - 0x0000, // U+094f : Undefined - 0x12a1, // U+0950 : Devanagari Om - 0x0000, // U+0951 : Devanagari Stress Sign Udatta - 0x22f0, // U+0952 : Devanagari Stress Sign Anudatta - 0x0000, // U+0953 : Devanagari Grave Accent - 0x0000, // U+0954 : Devanagari Acute Accent - 0x0000, // U+0955 : Undefined - 0x0000, // U+0956 : Undefined - 0x0000, // U+0957 : Undefined - 0x12b3, // U+0958 : Devanagari Letter Qa - 0x12b4, // U+0959 : Devanagari Letter Khha - 0x12b5, // U+095a : Devanagari Letter Ghha - 0x12ba, // U+095b : Devanagari Letter Za - 0x12bf, // U+095c : Devanagari Letter Dddha - 0x12c0, // U+095d : Devanagari Letter Rha - 0x12c9, // U+095e : Devanagari Letter Fa - 0x02ce, // U+095f : Devanagari Letter Yya - 0x12aa, // U+0960 : Devanagari Letter Vocalic Rr - 0x12a7, // U+0961 : Devanagari Letter Vocalic Ll - 0x12db, // U+0962 : Devanagari Vowel Sign Vocalic L - 0x12dc, // U+0963 : Devanagari Vowel Sign Vocalic Ll - 0x02ea, // U+0964 : Devanagari Danda - 0x0000, // U+0965 : Devanagari Double Danda - 0x02f1, // U+0966 : Devanagari Digit Zero - 0x02f2, // U+0967 : Devanagari Digit One - 0x02f3, // U+0968 : Devanagari Digit Two - 0x02f4, // U+0969 : Devanagari Digit Three - 0x02f5, // U+096a : Devanagari Digit Four - 0x02f6, // U+096b : Devanagari Digit Five - 0x02f7, // U+096c : Devanagari Digit Six - 0x02f8, // U+096d : Devanagari Digit Seven - 0x02f9, // U+096e : Devanagari Digit Eight - 0x02fa, // U+096f : Devanagari Digit Nine - 0x32f0, // U+0970 : Devanagari Abbreviation Sign - 0x0000, // U+0971 : Undefined - 0x0000, // U+0972 : Undefined - 0x0000, // U+0973 : Undefined - 0x0000, // U+0974 : 
Undefined - 0x0000, // U+0975 : Undefined - 0x0000, // U+0976 : Undefined - 0x0000, // U+0977 : Undefined - 0x0000, // U+0978 : Undefined - 0x0000, // U+0979 : Undefined - 0x0000, // U+097a : Undefined - 0x0000, // U+097b : Undefined - 0x0000, // U+097c : Undefined - 0x0000, // U+097d : Undefined - 0x0000, // U+097e : Undefined - 0x0000, // U+097f : Undefined - 0x0000, // U+0980 : Undefined - 0x03a1, // U+0981 : Bengali Sign Candrabindu - 0x03a2, // U+0982 : Bengali Sign Anusvara - 0x03a3, // U+0983 : Bengali Sign Visarga - 0x0000, // U+0984 : Undefined - 0x03a4, // U+0985 : Bengali Letter A - 0x03a5, // U+0986 : Bengali Letter Aa - 0x03a6, // U+0987 : Bengali Letter I - 0x03a7, // U+0988 : Bengali Letter Ii - 0x03a8, // U+0989 : Bengali Letter U - 0x03a9, // U+098a : Bengali Letter Uu - 0x03aa, // U+098b : Bengali Letter Vocalic R - 0x13a6, // U+098c : Bengali Letter Vocalic L - 0x0000, // U+098d : Undefined - 0x0000, // U+098e : Undefined - 0x03ab, // U+098f : Bengali Letter E - 0x03ad, // U+0990 : Bengali Letter Ai - 0x0000, // U+0991 : Undefined - 0x0000, // U+0992 : Undefined - 0x03af, // U+0993 : Bengali Letter O - 0x03b1, // U+0994 : Bengali Letter Au - 0x03b3, // U+0995 : Bengali Letter Ka - 0x03b4, // U+0996 : Bengali Letter Kha - 0x03b5, // U+0997 : Bengali Letter Ga - 0x03b6, // U+0998 : Bengali Letter Gha - 0x03b7, // U+0999 : Bengali Letter Nga - 0x03b8, // U+099a : Bengali Letter Ca - 0x03b9, // U+099b : Bengali Letter Cha - 0x03ba, // U+099c : Bengali Letter Ja - 0x03bb, // U+099d : Bengali Letter Jha - 0x03bc, // U+099e : Bengali Letter Nya - 0x03bd, // U+099f : Bengali Letter Tta - 0x03be, // U+09a0 : Bengali Letter Ttha - 0x03bf, // U+09a1 : Bengali Letter Dda - 0x03c0, // U+09a2 : Bengali Letter Ddha - 0x03c1, // U+09a3 : Bengali Letter Nna - 0x03c2, // U+09a4 : Bengali Letter Ta - 0x03c3, // U+09a5 : Bengali Letter Tha - 0x03c4, // U+09a6 : Bengali Letter Da - 0x03c5, // U+09a7 : Bengali Letter Dha - 0x03c6, // U+09a8 : Bengali Letter Na - 
0x0000, // U+09a9 : Undefined - 0x03c8, // U+09aa : Bengali Letter Pa - 0x03c9, // U+09ab : Bengali Letter Pha - 0x03ca, // U+09ac : Bengali Letter Ba - 0x03cb, // U+09ad : Bengali Letter Bha - 0x03cc, // U+09ae : Bengali Letter Ma - 0x03cd, // U+09af : Bengali Letter Ya - 0x03cf, // U+09b0 : Bengali Letter Ra - 0x0000, // U+09b1 : Undefined - 0x03d1, // U+09b2 : Bengali Letter La - 0x0000, // U+09b3 : Undefined - 0x0000, // U+09b4 : Undefined - 0x0000, // U+09b5 : Undefined - 0x03d5, // U+09b6 : Bengali Letter Sha - 0x03d6, // U+09b7 : Bengali Letter Ssa - 0x03d7, // U+09b8 : Bengali Letter Sa - 0x03d8, // U+09b9 : Bengali Letter Ha - 0x0000, // U+09ba : Undefined - 0x0000, // U+09bb : Undefined - 0x03e9, // U+09bc : Bengali Sign Nukta - 0x0000, // U+09bd : Undefined - 0x03da, // U+09be : Bengali Vowel Sign Aa - 0x03db, // U+09bf : Bengali Vowel Sign I - 0x03dc, // U+09c0 : Bengali Vowel Sign Ii - 0x03dd, // U+09c1 : Bengali Vowel Sign U - 0x03de, // U+09c2 : Bengali Vowel Sign Uu - 0x03df, // U+09c3 : Bengali Vowel Sign Vocalic R - 0x13df, // U+09c4 : Bengali Vowel Sign Vocalic Rr - 0x0000, // U+09c5 : Undefined - 0x0000, // U+09c6 : Undefined - 0x03e0, // U+09c7 : Bengali Vowel Sign E - 0x03e2, // U+09c8 : Bengali Vowel Sign Ai - 0x0000, // U+09c9 : Undefined - 0x0000, // U+09ca : Undefined - 0x03e4, // U+09cb : Bengali Vowel Sign O - 0x03e6, // U+09cc : Bengali Vowel Sign Au - 0x03e8, // U+09cd : Bengali Sign Virama - 0x0000, // U+09ce : Undefined - 0x0000, // U+09cf : Undefined - 0x0000, // U+09d0 : Undefined - 0x0000, // U+09d1 : Undefined - 0x0000, // U+09d2 : Undefined - 0x0000, // U+09d3 : Undefined - 0x0000, // U+09d4 : Undefined - 0x0000, // U+09d5 : Undefined - 0x0000, // U+09d6 : Undefined - 0x0000, // U+09d7 : Bengali Au Length Mark - 0x0000, // U+09d8 : Undefined - 0x0000, // U+09d9 : Undefined - 0x0000, // U+09da : Undefined - 0x0000, // U+09db : Undefined - 0x13bf, // U+09dc : Bengali Letter Rra - 0x13c0, // U+09dd : Bengali Letter Rha - 0x0000, // 
U+09de : Undefined - 0x03ce, // U+09df : Bengali Letter Yya - 0x13aa, // U+09e0 : Bengali Letter Vocalic Rr - 0x13a7, // U+09e1 : Bengali Letter Vocalic Ll - 0x13db, // U+09e2 : Bengali Vowel Sign Vocalic L - 0x13dc, // U+09e3 : Bengali Vowel Sign Vocalic Ll - 0x0000, // U+09e4 : Undefined - 0x0000, // U+09e5 : Undefined - 0x03f1, // U+09e6 : Bengali Digit Zero - 0x03f2, // U+09e7 : Bengali Digit One - 0x03f3, // U+09e8 : Bengali Digit Two - 0x03f4, // U+09e9 : Bengali Digit Three - 0x03f5, // U+09ea : Bengali Digit Four - 0x03f6, // U+09eb : Bengali Digit Five - 0x03f7, // U+09ec : Bengali Digit Six - 0x03f8, // U+09ed : Bengali Digit Seven - 0x03f9, // U+09ee : Bengali Digit Eight - 0x03fa, // U+09ef : Bengali Digit Nine - 0x0000, // U+09f0 : Bengali Letter Ra With Middle Diagonal - 0x0000, // U+09f1 : Bengali Letter Ra With Lower Diagonal - 0x0000, // U+09f2 : Bengali Rupee Mark - 0x0000, // U+09f3 : Bengali Rupee Sign - 0x0000, // U+09f4 : Bengali Currency Numerator One - 0x0000, // U+09f5 : Bengali Currency Numerator Two - 0x0000, // U+09f6 : Bengali Currency Numerator Three - 0x0000, // U+09f7 : Bengali Currency Numerator Four - 0x0000, // U+09f8 : Bengali Currency Numerator One Less Than The Denominator - 0x0000, // U+09f9 : Bengali Currency Denominator Sixteen - 0x0000, // U+09fa : Bengali Isshar - 0x0000, // U+09fb : Undefined - 0x0000, // U+09fc : Undefined - 0x0000, // U+09fd : Undefined - 0x0000, // U+09fe : Undefined - 0x0000, // U+09ff : Undefined - 0x0000, // U+0a00 : Undefined - 0x0000, // U+0a01 : Undefined - 0x0ba2, // U+0a02 : Gurmukhi Sign Bindi - 0x0000, // U+0a03 : Undefined - 0x0000, // U+0a04 : Undefined - 0x0ba4, // U+0a05 : Gurmukhi Letter A - 0x0ba5, // U+0a06 : Gurmukhi Letter Aa - 0x0ba6, // U+0a07 : Gurmukhi Letter I - 0x0ba7, // U+0a08 : Gurmukhi Letter Ii - 0x0ba8, // U+0a09 : Gurmukhi Letter U - 0x0ba9, // U+0a0a : Gurmukhi Letter Uu - 0x0000, // U+0a0b : Undefined - 0x0000, // U+0a0c : Undefined - 0x0000, // U+0a0d : Undefined - 
0x0000, // U+0a0e : Undefined - 0x0bab, // U+0a0f : Gurmukhi Letter Ee - 0x0bad, // U+0a10 : Gurmukhi Letter Ai - 0x0000, // U+0a11 : Undefined - 0x0000, // U+0a12 : Undefined - 0x0bb0, // U+0a13 : Gurmukhi Letter Oo - 0x0bb1, // U+0a14 : Gurmukhi Letter Au - 0x0bb3, // U+0a15 : Gurmukhi Letter Ka - 0x0bb4, // U+0a16 : Gurmukhi Letter Kha - 0x0bb5, // U+0a17 : Gurmukhi Letter Ga - 0x0bb6, // U+0a18 : Gurmukhi Letter Gha - 0x0bb7, // U+0a19 : Gurmukhi Letter Nga - 0x0bb8, // U+0a1a : Gurmukhi Letter Ca - 0x0bb9, // U+0a1b : Gurmukhi Letter Cha - 0x0bba, // U+0a1c : Gurmukhi Letter Ja - 0x0bbb, // U+0a1d : Gurmukhi Letter Jha - 0x0bbc, // U+0a1e : Gurmukhi Letter Nya - 0x0bbd, // U+0a1f : Gurmukhi Letter Tta - 0x0bbe, // U+0a20 : Gurmukhi Letter Ttha - 0x0bbf, // U+0a21 : Gurmukhi Letter Dda - 0x0bc0, // U+0a22 : Gurmukhi Letter Ddha - 0x0bc1, // U+0a23 : Gurmukhi Letter Nna - 0x0bc2, // U+0a24 : Gurmukhi Letter Ta - 0x0bc3, // U+0a25 : Gurmukhi Letter Tha - 0x0bc4, // U+0a26 : Gurmukhi Letter Da - 0x0bc5, // U+0a27 : Gurmukhi Letter Dha - 0x0bc6, // U+0a28 : Gurmukhi Letter Na - 0x0000, // U+0a29 : Undefined - 0x0bc8, // U+0a2a : Gurmukhi Letter Pa - 0x0bc9, // U+0a2b : Gurmukhi Letter Pha - 0x0bca, // U+0a2c : Gurmukhi Letter Ba - 0x0bcb, // U+0a2d : Gurmukhi Letter Bha - 0x0bcc, // U+0a2e : Gurmukhi Letter Ma - 0x0bcd, // U+0a2f : Gurmukhi Letter Ya - 0x0bcf, // U+0a30 : Gurmukhi Letter Ra - 0x0000, // U+0a31 : Undefined - 0x0bd1, // U+0a32 : Gurmukhi Letter La - 0x0bd2, // U+0a33 : Gurmukhi Letter Lla - 0x0000, // U+0a34 : Undefined - 0x0bd4, // U+0a35 : Gurmukhi Letter Va - 0x0bd5, // U+0a36 : Gurmukhi Letter Sha - 0x0000, // U+0a37 : Undefined - 0x0bd7, // U+0a38 : Gurmukhi Letter Sa - 0x0bd8, // U+0a39 : Gurmukhi Letter Ha - 0x0000, // U+0a3a : Undefined - 0x0000, // U+0a3b : Undefined - 0x0be9, // U+0a3c : Gurmukhi Sign Nukta - 0x0000, // U+0a3d : Undefined - 0x0bda, // U+0a3e : Gurmukhi Vowel Sign Aa - 0x0bdb, // U+0a3f : Gurmukhi Vowel Sign I - 0x0bdc, // 
U+0a40 : Gurmukhi Vowel Sign Ii - 0x0bdd, // U+0a41 : Gurmukhi Vowel Sign U - 0x0bde, // U+0a42 : Gurmukhi Vowel Sign Uu - 0x0000, // U+0a43 : Undefined - 0x0000, // U+0a44 : Undefined - 0x0000, // U+0a45 : Undefined - 0x0000, // U+0a46 : Undefined - 0x0be0, // U+0a47 : Gurmukhi Vowel Sign Ee - 0x0be2, // U+0a48 : Gurmukhi Vowel Sign Ai - 0x0000, // U+0a49 : Undefined - 0x0000, // U+0a4a : Undefined - 0x0be4, // U+0a4b : Gurmukhi Vowel Sign Oo - 0x0be6, // U+0a4c : Gurmukhi Vowel Sign Au - 0x0be8, // U+0a4d : Gurmukhi Sign Virama - 0x0000, // U+0a4e : Undefined - 0x0000, // U+0a4f : Undefined - 0x0000, // U+0a50 : Undefined - 0x0000, // U+0a51 : Undefined - 0x0000, // U+0a52 : Undefined - 0x0000, // U+0a53 : Undefined - 0x0000, // U+0a54 : Undefined - 0x0000, // U+0a55 : Undefined - 0x0000, // U+0a56 : Undefined - 0x0000, // U+0a57 : Undefined - 0x0000, // U+0a58 : Undefined - 0x1bb4, // U+0a59 : Gurmukhi Letter Khha - 0x1bb5, // U+0a5a : Gurmukhi Letter Ghha - 0x1bba, // U+0a5b : Gurmukhi Letter Za - 0x1bc0, // U+0a5c : Gurmukhi Letter Rra - 0x0000, // U+0a5d : Undefined - 0x1bc9, // U+0a5e : Gurmukhi Letter Fa - 0x0000, // U+0a5f : Undefined - 0x0000, // U+0a60 : Undefined - 0x0000, // U+0a61 : Undefined - 0x0000, // U+0a62 : Undefined - 0x0000, // U+0a63 : Undefined - 0x0000, // U+0a64 : Undefined - 0x0000, // U+0a65 : Undefined - 0x0bf1, // U+0a66 : Gurmukhi Digit Zero - 0x0bf2, // U+0a67 : Gurmukhi Digit One - 0x0bf3, // U+0a68 : Gurmukhi Digit Two - 0x0bf4, // U+0a69 : Gurmukhi Digit Three - 0x0bf5, // U+0a6a : Gurmukhi Digit Four - 0x0bf6, // U+0a6b : Gurmukhi Digit Five - 0x0bf7, // U+0a6c : Gurmukhi Digit Six - 0x0bf8, // U+0a6d : Gurmukhi Digit Seven - 0x0bf9, // U+0a6e : Gurmukhi Digit Eight - 0x0bfa, // U+0a6f : Gurmukhi Digit Nine - 0x0000, // U+0a70 : Gurmukhi Tippi - 0x0000, // U+0a71 : Gurmukhi Addak - 0x0000, // U+0a72 : Gurmukhi Iri - 0x0000, // U+0a73 : Gurmukhi Ura - 0x0000, // U+0a74 : Gurmukhi Ek Onkar - 0x0000, // U+0a75 : Undefined - 0x0000, 
// U+0a76 : Undefined - 0x0000, // U+0a77 : Undefined - 0x0000, // U+0a78 : Undefined - 0x0000, // U+0a79 : Undefined - 0x0000, // U+0a7a : Undefined - 0x0000, // U+0a7b : Undefined - 0x0000, // U+0a7c : Undefined - 0x0000, // U+0a7d : Undefined - 0x0000, // U+0a7e : Undefined - 0x0000, // U+0a7f : Undefined - 0x0000, // U+0a80 : Undefined - 0x0aa1, // U+0a81 : Gujarati Sign Candrabindu - 0x0aa2, // U+0a82 : Gujarati Sign Anusvara - 0x0aa3, // U+0a83 : Gujarati Sign Visarga - 0x0000, // U+0a84 : Undefined - 0x0aa4, // U+0a85 : Gujarati Letter A - 0x0aa5, // U+0a86 : Gujarati Letter Aa - 0x0aa6, // U+0a87 : Gujarati Letter I - 0x0aa7, // U+0a88 : Gujarati Letter Ii - 0x0aa8, // U+0a89 : Gujarati Letter U - 0x0aa9, // U+0a8a : Gujarati Letter Uu - 0x0aaa, // U+0a8b : Gujarati Letter Vocalic R - 0x0000, // U+0a8c : Undefined - 0x0aae, // U+0a8d : Gujarati Vowel Candra E - 0x0000, // U+0a8e : Undefined - 0x0aab, // U+0a8f : Gujarati Letter E - 0x0aad, // U+0a90 : Gujarati Letter Ai - 0x0ab2, // U+0a91 : Gujarati Vowel Candra O - 0x0000, // U+0a92 : Undefined - 0x0ab0, // U+0a93 : Gujarati Letter O - 0x0ab1, // U+0a94 : Gujarati Letter Au - 0x0ab3, // U+0a95 : Gujarati Letter Ka - 0x0ab4, // U+0a96 : Gujarati Letter Kha - 0x0ab5, // U+0a97 : Gujarati Letter Ga - 0x0ab6, // U+0a98 : Gujarati Letter Gha - 0x0ab7, // U+0a99 : Gujarati Letter Nga - 0x0ab8, // U+0a9a : Gujarati Letter Ca - 0x0ab9, // U+0a9b : Gujarati Letter Cha - 0x0aba, // U+0a9c : Gujarati Letter Ja - 0x0abb, // U+0a9d : Gujarati Letter Jha - 0x0abc, // U+0a9e : Gujarati Letter Nya - 0x0abd, // U+0a9f : Gujarati Letter Tta - 0x0abe, // U+0aa0 : Gujarati Letter Ttha - 0x0abf, // U+0aa1 : Gujarati Letter Dda - 0x0ac0, // U+0aa2 : Gujarati Letter Ddha - 0x0ac1, // U+0aa3 : Gujarati Letter Nna - 0x0ac2, // U+0aa4 : Gujarati Letter Ta - 0x0ac3, // U+0aa5 : Gujarati Letter Tha - 0x0ac4, // U+0aa6 : Gujarati Letter Da - 0x0ac5, // U+0aa7 : Gujarati Letter Dha - 0x0ac6, // U+0aa8 : Gujarati Letter Na - 0x0000, // 
U+0aa9 : Undefined - 0x0ac8, // U+0aaa : Gujarati Letter Pa - 0x0ac9, // U+0aab : Gujarati Letter Pha - 0x0aca, // U+0aac : Gujarati Letter Ba - 0x0acb, // U+0aad : Gujarati Letter Bha - 0x0acc, // U+0aae : Gujarati Letter Ma - 0x0acd, // U+0aaf : Gujarati Letter Ya - 0x0acf, // U+0ab0 : Gujarati Letter Ra - 0x0000, // U+0ab1 : Undefined - 0x0ad1, // U+0ab2 : Gujarati Letter La - 0x0ad2, // U+0ab3 : Gujarati Letter Lla - 0x0000, // U+0ab4 : Undefined - 0x0ad4, // U+0ab5 : Gujarati Letter Va - 0x0ad5, // U+0ab6 : Gujarati Letter Sha - 0x0ad6, // U+0ab7 : Gujarati Letter Ssa - 0x0ad7, // U+0ab8 : Gujarati Letter Sa - 0x0ad8, // U+0ab9 : Gujarati Letter Ha - 0x0000, // U+0aba : Undefined - 0x0000, // U+0abb : Undefined - 0x0ae9, // U+0abc : Gujarati Sign Nukta - 0x1aea, // U+0abd : Gujarati Sign Avagraha - 0x0ada, // U+0abe : Gujarati Vowel Sign Aa - 0x0adb, // U+0abf : Gujarati Vowel Sign I - 0x0adc, // U+0ac0 : Gujarati Vowel Sign Ii - 0x0add, // U+0ac1 : Gujarati Vowel Sign U - 0x0ade, // U+0ac2 : Gujarati Vowel Sign Uu - 0x0adf, // U+0ac3 : Gujarati Vowel Sign Vocalic R - 0x1adf, // U+0ac4 : Gujarati Vowel Sign Vocalic Rr - 0x0ae3, // U+0ac5 : Gujarati Vowel Sign Candra E - 0x0000, // U+0ac6 : Undefined - 0x0ae0, // U+0ac7 : Gujarati Vowel Sign E - 0x0ae2, // U+0ac8 : Gujarati Vowel Sign Ai - 0x0ae7, // U+0ac9 : Gujarati Vowel Sign Candra O - 0x0000, // U+0aca : Undefined - 0x0ae4, // U+0acb : Gujarati Vowel Sign O - 0x0ae6, // U+0acc : Gujarati Vowel Sign Au - 0x0ae8, // U+0acd : Gujarati Sign Virama - 0x0000, // U+0ace : Undefined - 0x0000, // U+0acf : Undefined - 0x1aa1, // U+0ad0 : Gujarati Om - 0x0000, // U+0ad1 : Undefined - 0x0000, // U+0ad2 : Undefined - 0x0000, // U+0ad3 : Undefined - 0x0000, // U+0ad4 : Undefined - 0x0000, // U+0ad5 : Undefined - 0x0000, // U+0ad6 : Undefined - 0x0000, // U+0ad7 : Undefined - 0x0000, // U+0ad8 : Undefined - 0x0000, // U+0ad9 : Undefined - 0x0000, // U+0ada : Undefined - 0x0000, // U+0adb : Undefined - 0x0000, // U+0adc : 
Undefined - 0x0000, // U+0add : Undefined - 0x0000, // U+0ade : Undefined - 0x0000, // U+0adf : Undefined - 0x1aaa, // U+0ae0 : Gujarati Letter Vocalic Rr - 0x0000, // U+0ae1 : Undefined - 0x0000, // U+0ae2 : Undefined - 0x0000, // U+0ae3 : Undefined - 0x0000, // U+0ae4 : Undefined - 0x0000, // U+0ae5 : Undefined - 0x0af1, // U+0ae6 : Gujarati Digit Zero - 0x0af2, // U+0ae7 : Gujarati Digit One - 0x0af3, // U+0ae8 : Gujarati Digit Two - 0x0af4, // U+0ae9 : Gujarati Digit Three - 0x0af5, // U+0aea : Gujarati Digit Four - 0x0af6, // U+0aeb : Gujarati Digit Five - 0x0af7, // U+0aec : Gujarati Digit Six - 0x0af8, // U+0aed : Gujarati Digit Seven - 0x0af9, // U+0aee : Gujarati Digit Eight - 0x0afa, // U+0aef : Gujarati Digit Nine - 0x0000, // U+0af0 : Undefined - 0x0000, // U+0af1 : Undefined - 0x0000, // U+0af2 : Undefined - 0x0000, // U+0af3 : Undefined - 0x0000, // U+0af4 : Undefined - 0x0000, // U+0af5 : Undefined - 0x0000, // U+0af6 : Undefined - 0x0000, // U+0af7 : Undefined - 0x0000, // U+0af8 : Undefined - 0x0000, // U+0af9 : Undefined - 0x0000, // U+0afa : Undefined - 0x0000, // U+0afb : Undefined - 0x0000, // U+0afc : Undefined - 0x0000, // U+0afd : Undefined - 0x0000, // U+0afe : Undefined - 0x0000, // U+0aff : Undefined - 0x0000, // U+0b00 : Undefined - 0x07a1, // U+0b01 : Oriya Sign Candrabindu - 0x07a2, // U+0b02 : Oriya Sign Anusvara - 0x07a3, // U+0b03 : Oriya Sign Visarga - 0x0000, // U+0b04 : Undefined - 0x07a4, // U+0b05 : Oriya Letter A - 0x07a5, // U+0b06 : Oriya Letter Aa - 0x07a6, // U+0b07 : Oriya Letter I - 0x07a7, // U+0b08 : Oriya Letter Ii - 0x07a8, // U+0b09 : Oriya Letter U - 0x07a9, // U+0b0a : Oriya Letter Uu - 0x07aa, // U+0b0b : Oriya Letter Vocalic R - 0x17a6, // U+0b0c : Oriya Letter Vocalic L - 0x0000, // U+0b0d : Undefined - 0x0000, // U+0b0e : Undefined - 0x07ab, // U+0b0f : Oriya Letter E - 0x07ad, // U+0b10 : Oriya Letter Ai - 0x0000, // U+0b11 : Undefined - 0x0000, // U+0b12 : Undefined - 0x07b0, // U+0b13 : Oriya Letter O - 
0x07b1, // U+0b14 : Oriya Letter Au - 0x07b3, // U+0b15 : Oriya Letter Ka - 0x07b4, // U+0b16 : Oriya Letter Kha - 0x07b5, // U+0b17 : Oriya Letter Ga - 0x07b6, // U+0b18 : Oriya Letter Gha - 0x07b7, // U+0b19 : Oriya Letter Nga - 0x07b8, // U+0b1a : Oriya Letter Ca - 0x07b9, // U+0b1b : Oriya Letter Cha - 0x07ba, // U+0b1c : Oriya Letter Ja - 0x07bb, // U+0b1d : Oriya Letter Jha - 0x07bc, // U+0b1e : Oriya Letter Nya - 0x07bd, // U+0b1f : Oriya Letter Tta - 0x07be, // U+0b20 : Oriya Letter Ttha - 0x07bf, // U+0b21 : Oriya Letter Dda - 0x07c0, // U+0b22 : Oriya Letter Ddha - 0x07c1, // U+0b23 : Oriya Letter Nna - 0x07c2, // U+0b24 : Oriya Letter Ta - 0x07c3, // U+0b25 : Oriya Letter Tha - 0x07c4, // U+0b26 : Oriya Letter Da - 0x07c5, // U+0b27 : Oriya Letter Dha - 0x07c6, // U+0b28 : Oriya Letter Na - 0x0000, // U+0b29 : Undefined - 0x07c8, // U+0b2a : Oriya Letter Pa - 0x07c9, // U+0b2b : Oriya Letter Pha - 0x07ca, // U+0b2c : Oriya Letter Ba - 0x07cb, // U+0b2d : Oriya Letter Bha - 0x07cc, // U+0b2e : Oriya Letter Ma - 0x07cd, // U+0b2f : Oriya Letter Ya - 0x07cf, // U+0b30 : Oriya Letter Ra - 0x0000, // U+0b31 : Undefined - 0x07d1, // U+0b32 : Oriya Letter La - 0x07d2, // U+0b33 : Oriya Letter Lla - 0x0000, // U+0b34 : Undefined - 0x0000, // U+0b35 : Undefined - 0x07d5, // U+0b36 : Oriya Letter Sha - 0x07d6, // U+0b37 : Oriya Letter Ssa - 0x07d7, // U+0b38 : Oriya Letter Sa - 0x07d8, // U+0b39 : Oriya Letter Ha - 0x0000, // U+0b3a : Undefined - 0x0000, // U+0b3b : Undefined - 0x07e9, // U+0b3c : Oriya Sign Nukta - 0x17ea, // U+0b3d : Oriya Sign Avagraha - 0x07da, // U+0b3e : Oriya Vowel Sign Aa - 0x07db, // U+0b3f : Oriya Vowel Sign I - 0x07dc, // U+0b40 : Oriya Vowel Sign Ii - 0x07dd, // U+0b41 : Oriya Vowel Sign U - 0x07de, // U+0b42 : Oriya Vowel Sign Uu - 0x07df, // U+0b43 : Oriya Vowel Sign Vocalic R - 0x0000, // U+0b44 : Undefined - 0x0000, // U+0b45 : Undefined - 0x0000, // U+0b46 : Undefined - 0x07e0, // U+0b47 : Oriya Vowel Sign E - 0x07e2, // U+0b48 : 
Oriya Vowel Sign Ai - 0x0000, // U+0b49 : Undefined - 0x0000, // U+0b4a : Undefined - 0x07e4, // U+0b4b : Oriya Vowel Sign O - 0x07e6, // U+0b4c : Oriya Vowel Sign Au - 0x07e8, // U+0b4d : Oriya Sign Virama - 0x0000, // U+0b4e : Undefined - 0x0000, // U+0b4f : Undefined - 0x0000, // U+0b50 : Undefined - 0x0000, // U+0b51 : Undefined - 0x0000, // U+0b52 : Undefined - 0x0000, // U+0b53 : Undefined - 0x0000, // U+0b54 : Undefined - 0x0000, // U+0b55 : Undefined - 0x0000, // U+0b56 : Oriya Ai Length Mark - 0x0000, // U+0b57 : Oriya Au Length Mark - 0x0000, // U+0b58 : Undefined - 0x0000, // U+0b59 : Undefined - 0x0000, // U+0b5a : Undefined - 0x0000, // U+0b5b : Undefined - 0x17bf, // U+0b5c : Oriya Letter Rra - 0x17c0, // U+0b5d : Oriya Letter Rha - 0x0000, // U+0b5e : Undefined - 0x07ce, // U+0b5f : Oriya Letter Yya - 0x17aa, // U+0b60 : Oriya Letter Vocalic Rr - 0x17a7, // U+0b61 : Oriya Letter Vocalic Ll - 0x0000, // U+0b62 : Undefined - 0x0000, // U+0b63 : Undefined - 0x0000, // U+0b64 : Undefined - 0x0000, // U+0b65 : Undefined - 0x07f1, // U+0b66 : Oriya Digit Zero - 0x07f2, // U+0b67 : Oriya Digit One - 0x07f3, // U+0b68 : Oriya Digit Two - 0x07f4, // U+0b69 : Oriya Digit Three - 0x07f5, // U+0b6a : Oriya Digit Four - 0x07f6, // U+0b6b : Oriya Digit Five - 0x07f7, // U+0b6c : Oriya Digit Six - 0x07f8, // U+0b6d : Oriya Digit Seven - 0x07f9, // U+0b6e : Oriya Digit Eight - 0x07fa, // U+0b6f : Oriya Digit Nine - 0x0000, // U+0b70 : Oriya Isshar - 0x0000, // U+0b71 : Undefined - 0x0000, // U+0b72 : Undefined - 0x0000, // U+0b73 : Undefined - 0x0000, // U+0b74 : Undefined - 0x0000, // U+0b75 : Undefined - 0x0000, // U+0b76 : Undefined - 0x0000, // U+0b77 : Undefined - 0x0000, // U+0b78 : Undefined - 0x0000, // U+0b79 : Undefined - 0x0000, // U+0b7a : Undefined - 0x0000, // U+0b7b : Undefined - 0x0000, // U+0b7c : Undefined - 0x0000, // U+0b7d : Undefined - 0x0000, // U+0b7e : Undefined - 0x0000, // U+0b7f : Undefined - 0x0000, // U+0b80 : Undefined - 0x0000, // 
U+0b81 : Undefined - 0x04a2, // U+0b82 : Tamil Sign Anusvara - 0x04a3, // U+0b83 : Tamil Sign Visarga - 0x0000, // U+0b84 : Undefined - 0x04a4, // U+0b85 : Tamil Letter A - 0x04a5, // U+0b86 : Tamil Letter Aa - 0x04a6, // U+0b87 : Tamil Letter I - 0x04a7, // U+0b88 : Tamil Letter Ii - 0x04a8, // U+0b89 : Tamil Letter U - 0x04a9, // U+0b8a : Tamil Letter Uu - 0x0000, // U+0b8b : Undefined - 0x0000, // U+0b8c : Undefined - 0x0000, // U+0b8d : Undefined - 0x0000, // U+0b8e : Tamil Letter E - 0x04ab, // U+0b8f : Tamil Letter Ee - 0x04ad, // U+0b90 : Tamil Letter Ai - 0x0000, // U+0b91 : Undefined - 0x04af, // U+0b92 : Tamil Letter O - 0x04b0, // U+0b93 : Tamil Letter Oo - 0x04b1, // U+0b94 : Tamil Letter Au - 0x04b3, // U+0b95 : Tamil Letter Ka - 0x0000, // U+0b96 : Undefined - 0x0000, // U+0b97 : Undefined - 0x0000, // U+0b98 : Undefined - 0x04b7, // U+0b99 : Tamil Letter Nga - 0x04b8, // U+0b9a : Tamil Letter Ca - 0x0000, // U+0b9b : Undefined - 0x04ba, // U+0b9c : Tamil Letter Ja - 0x0000, // U+0b9d : Undefined - 0x04bc, // U+0b9e : Tamil Letter Nya - 0x04bd, // U+0b9f : Tamil Letter Tta - 0x0000, // U+0ba0 : Undefined - 0x0000, // U+0ba1 : Undefined - 0x0000, // U+0ba2 : Undefined - 0x04c1, // U+0ba3 : Tamil Letter Nna - 0x04c2, // U+0ba4 : Tamil Letter Ta - 0x0000, // U+0ba5 : Undefined - 0x0000, // U+0ba6 : Undefined - 0x0000, // U+0ba7 : Undefined - 0x04c6, // U+0ba8 : Tamil Letter Na - 0x04c7, // U+0ba9 : Tamil Letter Nnna - 0x04c8, // U+0baa : Tamil Letter Pa - 0x0000, // U+0bab : Undefined - 0x0000, // U+0bac : Undefined - 0x0000, // U+0bad : Undefined - 0x04cc, // U+0bae : Tamil Letter Ma - 0x04cd, // U+0baf : Tamil Letter Ya - 0x04cf, // U+0bb0 : Tamil Letter Ra - 0x04d0, // U+0bb1 : Tamil Letter Rra - 0x04d1, // U+0bb2 : Tamil Letter La - 0x04d2, // U+0bb3 : Tamil Letter Lla - 0x04d3, // U+0bb4 : Tamil Letter Llla - 0x04d4, // U+0bb5 : Tamil Letter Va - 0x0000, // U+0bb6 : Undefined - 0x04d5, // U+0bb7 : Tamil Letter Ssa - 0x04d7, // U+0bb8 : Tamil Letter 
Sa - 0x04d8, // U+0bb9 : Tamil Letter Ha - 0x0000, // U+0bba : Undefined - 0x0000, // U+0bbb : Undefined - 0x0000, // U+0bbc : Undefined - 0x0000, // U+0bbd : Undefined - 0x04da, // U+0bbe : Tamil Vowel Sign Aa - 0x04db, // U+0bbf : Tamil Vowel Sign I - 0x04dc, // U+0bc0 : Tamil Vowel Sign Ii - 0x04dd, // U+0bc1 : Tamil Vowel Sign U - 0x04de, // U+0bc2 : Tamil Vowel Sign Uu - 0x0000, // U+0bc3 : Undefined - 0x0000, // U+0bc4 : Undefined - 0x0000, // U+0bc5 : Undefined - 0x04e0, // U+0bc6 : Tamil Vowel Sign E - 0x04e1, // U+0bc7 : Tamil Vowel Sign Ee - 0x04e2, // U+0bc8 : Tamil Vowel Sign Ai - 0x0000, // U+0bc9 : Undefined - 0x04e4, // U+0bca : Tamil Vowel Sign O - 0x04e5, // U+0bcb : Tamil Vowel Sign Oo - 0x04e6, // U+0bcc : Tamil Vowel Sign Au - 0x04e8, // U+0bcd : Tamil Sign Virama - 0x0000, // U+0bce : Undefined - 0x0000, // U+0bcf : Undefined - 0x0000, // U+0bd0 : Undefined - 0x0000, // U+0bd1 : Undefined - 0x0000, // U+0bd2 : Undefined - 0x0000, // U+0bd3 : Undefined - 0x0000, // U+0bd4 : Undefined - 0x0000, // U+0bd5 : Undefined - 0x0000, // U+0bd6 : Undefined - 0x0000, // U+0bd7 : Tamil Au Length Mark - 0x0000, // U+0bd8 : Undefined - 0x0000, // U+0bd9 : Undefined - 0x0000, // U+0bda : Undefined - 0x0000, // U+0bdb : Undefined - 0x0000, // U+0bdc : Undefined - 0x0000, // U+0bdd : Undefined - 0x0000, // U+0bde : Undefined - 0x0000, // U+0bdf : Undefined - 0x0000, // U+0be0 : Undefined - 0x0000, // U+0be1 : Undefined - 0x0000, // U+0be2 : Undefined - 0x0000, // U+0be3 : Undefined - 0x0000, // U+0be4 : Undefined - 0x0000, // U+0be5 : Undefined - 0x0000, // U+0be6 : Undefined - 0x04f2, // U+0be7 : Tamil Digit One - 0x04f3, // U+0be8 : Tamil Digit Two - 0x04f4, // U+0be9 : Tamil Digit Three - 0x04f5, // U+0bea : Tamil Digit Four - 0x04f6, // U+0beb : Tamil Digit Five - 0x04f7, // U+0bec : Tamil Digit Six - 0x04f8, // U+0bed : Tamil Digit Seven - 0x04f9, // U+0bee : Tamil Digit Eight - 0x04fa, // U+0bef : Tamil Digit Nine - 0x0000, // U+0bf0 : Tamil Number Ten - 
0x0000, // U+0bf1 : Tamil Number One Hundred - 0x0000, // U+0bf2 : Tamil Number One Thousand - 0x0000, // U+0bf3 : Undefined - 0x0000, // U+0bf4 : Undefined - 0x0000, // U+0bf5 : Undefined - 0x0000, // U+0bf6 : Undefined - 0x0000, // U+0bf7 : Undefined - 0x0000, // U+0bf8 : Undefined - 0x0000, // U+0bf9 : Undefined - 0x0000, // U+0bfa : Undefined - 0x0000, // U+0bfb : Undefined - 0x0000, // U+0bfc : Undefined - 0x0000, // U+0bfd : Undefined - 0x0000, // U+0bfe : Undefined - 0x0000, // U+0bff : Undefined - 0x0000, // U+0c00 : Undefined - 0x05a1, // U+0c01 : Telugu Sign Candrabindu - 0x05a2, // U+0c02 : Telugu Sign Anusvara - 0x05a3, // U+0c03 : Telugu Sign Visarga - 0x0000, // U+0c04 : Undefined - 0x05a4, // U+0c05 : Telugu Letter A - 0x05a5, // U+0c06 : Telugu Letter Aa - 0x05a6, // U+0c07 : Telugu Letter I - 0x05a7, // U+0c08 : Telugu Letter Ii - 0x05a8, // U+0c09 : Telugu Letter U - 0x05a9, // U+0c0a : Telugu Letter Uu - 0x05aa, // U+0c0b : Telugu Letter Vocalic R - 0x15a6, // U+0c0c : Telugu Letter Vocalic L - 0x0000, // U+0c0d : Undefined - 0x05ab, // U+0c0e : Telugu Letter E - 0x05ac, // U+0c0f : Telugu Letter Ee - 0x05ad, // U+0c10 : Telugu Letter Ai - 0x0000, // U+0c11 : Undefined - 0x05af, // U+0c12 : Telugu Letter O - 0x05b0, // U+0c13 : Telugu Letter Oo - 0x05b1, // U+0c14 : Telugu Letter Au - 0x05b3, // U+0c15 : Telugu Letter Ka - 0x05b4, // U+0c16 : Telugu Letter Kha - 0x05b5, // U+0c17 : Telugu Letter Ga - 0x05b6, // U+0c18 : Telugu Letter Gha - 0x05b7, // U+0c19 : Telugu Letter Nga - 0x05b8, // U+0c1a : Telugu Letter Ca - 0x05b9, // U+0c1b : Telugu Letter Cha - 0x05ba, // U+0c1c : Telugu Letter Ja - 0x05bb, // U+0c1d : Telugu Letter Jha - 0x05bc, // U+0c1e : Telugu Letter Nya - 0x05bd, // U+0c1f : Telugu Letter Tta - 0x05be, // U+0c20 : Telugu Letter Ttha - 0x05bf, // U+0c21 : Telugu Letter Dda - 0x05c0, // U+0c22 : Telugu Letter Ddha - 0x05c1, // U+0c23 : Telugu Letter Nna - 0x05c2, // U+0c24 : Telugu Letter Ta - 0x05c3, // U+0c25 : Telugu Letter Tha 
- 0x05c4, // U+0c26 : Telugu Letter Da - 0x05c5, // U+0c27 : Telugu Letter Dha - 0x05c6, // U+0c28 : Telugu Letter Na - 0x0000, // U+0c29 : Undefined - 0x05c8, // U+0c2a : Telugu Letter Pa - 0x05c9, // U+0c2b : Telugu Letter Pha - 0x05ca, // U+0c2c : Telugu Letter Ba - 0x05cb, // U+0c2d : Telugu Letter Bha - 0x05cc, // U+0c2e : Telugu Letter Ma - 0x05cd, // U+0c2f : Telugu Letter Ya - 0x05cf, // U+0c30 : Telugu Letter Ra - 0x05d0, // U+0c31 : Telugu Letter Rra - 0x05d1, // U+0c32 : Telugu Letter La - 0x05d2, // U+0c33 : Telugu Letter Lla - 0x0000, // U+0c34 : Undefined - 0x05d4, // U+0c35 : Telugu Letter Va - 0x05d5, // U+0c36 : Telugu Letter Sha - 0x05d6, // U+0c37 : Telugu Letter Ssa - 0x05d7, // U+0c38 : Telugu Letter Sa - 0x05d8, // U+0c39 : Telugu Letter Ha - 0x0000, // U+0c3a : Undefined - 0x0000, // U+0c3b : Undefined - 0x0000, // U+0c3c : Undefined - 0x0000, // U+0c3d : Undefined - 0x05da, // U+0c3e : Telugu Vowel Sign Aa - 0x05db, // U+0c3f : Telugu Vowel Sign I - 0x05dc, // U+0c40 : Telugu Vowel Sign Ii - 0x05dd, // U+0c41 : Telugu Vowel Sign U - 0x05de, // U+0c42 : Telugu Vowel Sign Uu - 0x05df, // U+0c43 : Telugu Vowel Sign Vocalic R - 0x15df, // U+0c44 : Telugu Vowel Sign Vocalic Rr - 0x0000, // U+0c45 : Undefined - 0x05e0, // U+0c46 : Telugu Vowel Sign E - 0x05e1, // U+0c47 : Telugu Vowel Sign Ee - 0x05e2, // U+0c48 : Telugu Vowel Sign Ai - 0x0000, // U+0c49 : Undefined - 0x05e4, // U+0c4a : Telugu Vowel Sign O - 0x05e5, // U+0c4b : Telugu Vowel Sign Oo - 0x05e6, // U+0c4c : Telugu Vowel Sign Au - 0x05e8, // U+0c4d : Telugu Sign Virama - 0x0000, // U+0c4e : Undefined - 0x0000, // U+0c4f : Undefined - 0x0000, // U+0c50 : Undefined - 0x0000, // U+0c51 : Undefined - 0x0000, // U+0c52 : Undefined - 0x0000, // U+0c53 : Undefined - 0x0000, // U+0c54 : Undefined - 0x0000, // U+0c55 : Telugu Length Mark - 0x0000, // U+0c56 : Telugu Ai Length Mark - 0x0000, // U+0c57 : Undefined - 0x0000, // U+0c58 : Undefined - 0x0000, // U+0c59 : Undefined - 0x0000, // 
U+0c5a : Undefined - 0x0000, // U+0c5b : Undefined - 0x0000, // U+0c5c : Undefined - 0x0000, // U+0c5d : Undefined - 0x0000, // U+0c5e : Undefined - 0x0000, // U+0c5f : Undefined - 0x15aa, // U+0c60 : Telugu Letter Vocalic Rr - 0x15a7, // U+0c61 : Telugu Letter Vocalic Ll - 0x0000, // U+0c62 : Undefined - 0x0000, // U+0c63 : Undefined - 0x0000, // U+0c64 : Undefined - 0x0000, // U+0c65 : Undefined - 0x05f1, // U+0c66 : Telugu Digit Zero - 0x05f2, // U+0c67 : Telugu Digit One - 0x05f3, // U+0c68 : Telugu Digit Two - 0x05f4, // U+0c69 : Telugu Digit Three - 0x05f5, // U+0c6a : Telugu Digit Four - 0x05f6, // U+0c6b : Telugu Digit Five - 0x05f7, // U+0c6c : Telugu Digit Six - 0x05f8, // U+0c6d : Telugu Digit Seven - 0x05f9, // U+0c6e : Telugu Digit Eight - 0x05fa, // U+0c6f : Telugu Digit Nine - 0x0000, // U+0c70 : Undefined - 0x0000, // U+0c71 : Undefined - 0x0000, // U+0c72 : Undefined - 0x0000, // U+0c73 : Undefined - 0x0000, // U+0c74 : Undefined - 0x0000, // U+0c75 : Undefined - 0x0000, // U+0c76 : Undefined - 0x0000, // U+0c77 : Undefined - 0x0000, // U+0c78 : Undefined - 0x0000, // U+0c79 : Undefined - 0x0000, // U+0c7a : Undefined - 0x0000, // U+0c7b : Undefined - 0x0000, // U+0c7c : Undefined - 0x0000, // U+0c7d : Undefined - 0x0000, // U+0c7e : Undefined - 0x0000, // U+0c7f : Undefined - 0x0000, // U+0c80 : Undefined - 0x0000, // U+0c81 : Undefined - 0x08a2, // U+0c82 : Kannada Sign Anusvara - 0x08a3, // U+0c83 : Kannada Sign Visarga - 0x0000, // U+0c84 : Undefined - 0x08a4, // U+0c85 : Kannada Letter A - 0x08a5, // U+0c86 : Kannada Letter Aa - 0x08a6, // U+0c87 : Kannada Letter I - 0x08a7, // U+0c88 : Kannada Letter Ii - 0x08a8, // U+0c89 : Kannada Letter U - 0x08a9, // U+0c8a : Kannada Letter Uu - 0x08aa, // U+0c8b : Kannada Letter Vocalic R - 0x18a6, // U+0c8c : Kannada Letter Vocalic L - 0x0000, // U+0c8d : Undefined - 0x08ab, // U+0c8e : Kannada Letter E - 0x08ac, // U+0c8f : Kannada Letter Ee - 0x08ad, // U+0c90 : Kannada Letter Ai - 0x0000, // U+0c91 : 
Undefined - 0x08af, // U+0c92 : Kannada Letter O - 0x08b0, // U+0c93 : Kannada Letter Oo - 0x08b1, // U+0c94 : Kannada Letter Au - 0x08b3, // U+0c95 : Kannada Letter Ka - 0x08b4, // U+0c96 : Kannada Letter Kha - 0x08b5, // U+0c97 : Kannada Letter Ga - 0x08b6, // U+0c98 : Kannada Letter Gha - 0x08b7, // U+0c99 : Kannada Letter Nga - 0x08b8, // U+0c9a : Kannada Letter Ca - 0x08b9, // U+0c9b : Kannada Letter Cha - 0x08ba, // U+0c9c : Kannada Letter Ja - 0x08bb, // U+0c9d : Kannada Letter Jha - 0x08bc, // U+0c9e : Kannada Letter Nya - 0x08bd, // U+0c9f : Kannada Letter Tta - 0x08be, // U+0ca0 : Kannada Letter Ttha - 0x08bf, // U+0ca1 : Kannada Letter Dda - 0x08c0, // U+0ca2 : Kannada Letter Ddha - 0x08c1, // U+0ca3 : Kannada Letter Nna - 0x08c2, // U+0ca4 : Kannada Letter Ta - 0x08c3, // U+0ca5 : Kannada Letter Tha - 0x08c4, // U+0ca6 : Kannada Letter Da - 0x08c5, // U+0ca7 : Kannada Letter Dha - 0x08c6, // U+0ca8 : Kannada Letter Na - 0x0000, // U+0ca9 : Undefined - 0x08c8, // U+0caa : Kannada Letter Pa - 0x08c9, // U+0cab : Kannada Letter Pha - 0x08ca, // U+0cac : Kannada Letter Ba - 0x08cb, // U+0cad : Kannada Letter Bha - 0x08cc, // U+0cae : Kannada Letter Ma - 0x08cd, // U+0caf : Kannada Letter Ya - 0x08cf, // U+0cb0 : Kannada Letter Ra - 0x08d0, // U+0cb1 : Kannada Letter Rra - 0x08d1, // U+0cb2 : Kannada Letter La - 0x08d2, // U+0cb3 : Kannada Letter Lla - 0x0000, // U+0cb4 : Undefined - 0x08d4, // U+0cb5 : Kannada Letter Va - 0x08d5, // U+0cb6 : Kannada Letter Sha - 0x08d6, // U+0cb7 : Kannada Letter Ssa - 0x08d7, // U+0cb8 : Kannada Letter Sa - 0x08d8, // U+0cb9 : Kannada Letter Ha - 0x0000, // U+0cba : Undefined - 0x0000, // U+0cbb : Undefined - 0x0000, // U+0cbc : Undefined - 0x0000, // U+0cbd : Undefined - 0x08da, // U+0cbe : Kannada Vowel Sign Aa - 0x08db, // U+0cbf : Kannada Vowel Sign I - 0x08dc, // U+0cc0 : Kannada Vowel Sign Ii - 0x08dd, // U+0cc1 : Kannada Vowel Sign U - 0x08de, // U+0cc2 : Kannada Vowel Sign Uu - 0x08df, // U+0cc3 : Kannada Vowel 
Sign Vocalic R - 0x18df, // U+0cc4 : Kannada Vowel Sign Vocalic Rr - 0x0000, // U+0cc5 : Undefined - 0x08e0, // U+0cc6 : Kannada Vowel Sign E - 0x08e1, // U+0cc7 : Kannada Vowel Sign Ee - 0x08e2, // U+0cc8 : Kannada Vowel Sign Ai - 0x0000, // U+0cc9 : Undefined - 0x08e4, // U+0cca : Kannada Vowel Sign O - 0x08e5, // U+0ccb : Kannada Vowel Sign Oo - 0x08e6, // U+0ccc : Kannada Vowel Sign Au - 0x08e8, // U+0ccd : Kannada Sign Virama - 0x0000, // U+0cce : Undefined - 0x0000, // U+0ccf : Undefined - 0x0000, // U+0cd0 : Undefined - 0x0000, // U+0cd1 : Undefined - 0x0000, // U+0cd2 : Undefined - 0x0000, // U+0cd3 : Undefined - 0x0000, // U+0cd4 : Undefined - 0x0000, // U+0cd5 : Kannada Length Mark - 0x0000, // U+0cd6 : Kannada Ai Length Mark - 0x0000, // U+0cd7 : Undefined - 0x0000, // U+0cd8 : Undefined - 0x0000, // U+0cd9 : Undefined - 0x0000, // U+0cda : Undefined - 0x0000, // U+0cdb : Undefined - 0x0000, // U+0cdc : Undefined - 0x0000, // U+0cdd : Undefined - 0x18c9, // U+0cde : Kannada Letter Fa - 0x0000, // U+0cdf : Undefined - 0x18aa, // U+0ce0 : Kannada Letter Vocalic Rr - 0x18a7, // U+0ce1 : Kannada Letter Vocalic Ll - 0x0000, // U+0ce2 : Undefined - 0x0000, // U+0ce3 : Undefined - 0x0000, // U+0ce4 : Undefined - 0x0000, // U+0ce5 : Undefined - 0x08f1, // U+0ce6 : Kannada Digit Zero - 0x08f2, // U+0ce7 : Kannada Digit One - 0x08f3, // U+0ce8 : Kannada Digit Two - 0x08f4, // U+0ce9 : Kannada Digit Three - 0x08f5, // U+0cea : Kannada Digit Four - 0x08f6, // U+0ceb : Kannada Digit Five - 0x08f7, // U+0cec : Kannada Digit Six - 0x08f8, // U+0ced : Kannada Digit Seven - 0x08f9, // U+0cee : Kannada Digit Eight - 0x08fa, // U+0cef : Kannada Digit Nine - 0x0000, // U+0cf0 : Undefined - 0x0000, // U+0cf1 : Undefined - 0x0000, // U+0cf2 : Undefined - 0x0000, // U+0cf3 : Undefined - 0x0000, // U+0cf4 : Undefined - 0x0000, // U+0cf5 : Undefined - 0x0000, // U+0cf6 : Undefined - 0x0000, // U+0cf7 : Undefined - 0x0000, // U+0cf8 : Undefined - 0x0000, // U+0cf9 : Undefined - 
0x0000, // U+0cfa : Undefined - 0x0000, // U+0cfb : Undefined - 0x0000, // U+0cfc : Undefined - 0x0000, // U+0cfd : Undefined - 0x0000, // U+0cfe : Undefined - 0x0000, // U+0cff : Undefined - 0x0000, // U+0d00 : Undefined - 0x0000, // U+0d01 : Undefined - 0x09a2, // U+0d02 : Malayalam Sign Anusvara - 0x09a3, // U+0d03 : Malayalam Sign Visarga - 0x0000, // U+0d04 : Undefined - 0x09a4, // U+0d05 : Malayalam Letter A - 0x09a5, // U+0d06 : Malayalam Letter Aa - 0x09a6, // U+0d07 : Malayalam Letter I - 0x09a7, // U+0d08 : Malayalam Letter Ii - 0x09a8, // U+0d09 : Malayalam Letter U - 0x09a9, // U+0d0a : Malayalam Letter Uu - 0x09aa, // U+0d0b : Malayalam Letter Vocalic R - 0x19a6, // U+0d0c : Malayalam Letter Vocalic L - 0x0000, // U+0d0d : Undefined - 0x09ab, // U+0d0e : Malayalam Letter E - 0x09ac, // U+0d0f : Malayalam Letter Ee - 0x09ad, // U+0d10 : Malayalam Letter Ai - 0x0000, // U+0d11 : Undefined - 0x09af, // U+0d12 : Malayalam Letter O - 0x09b0, // U+0d13 : Malayalam Letter Oo - 0x09b1, // U+0d14 : Malayalam Letter Au - 0x09b3, // U+0d15 : Malayalam Letter Ka - 0x09b4, // U+0d16 : Malayalam Letter Kha - 0x09b5, // U+0d17 : Malayalam Letter Ga - 0x09b6, // U+0d18 : Malayalam Letter Gha - 0x09b7, // U+0d19 : Malayalam Letter Nga - 0x09b8, // U+0d1a : Malayalam Letter Ca - 0x09b9, // U+0d1b : Malayalam Letter Cha - 0x09ba, // U+0d1c : Malayalam Letter Ja - 0x09bb, // U+0d1d : Malayalam Letter Jha - 0x09bc, // U+0d1e : Malayalam Letter Nya - 0x09bd, // U+0d1f : Malayalam Letter Tta - 0x09be, // U+0d20 : Malayalam Letter Ttha - 0x09bf, // U+0d21 : Malayalam Letter Dda - 0x09c0, // U+0d22 : Malayalam Letter Ddha - 0x09c1, // U+0d23 : Malayalam Letter Nna - 0x09c2, // U+0d24 : Malayalam Letter Ta - 0x09c3, // U+0d25 : Malayalam Letter Tha - 0x09c4, // U+0d26 : Malayalam Letter Da - 0x09c5, // U+0d27 : Malayalam Letter Dha - 0x09c6, // U+0d28 : Malayalam Letter Na - 0x0000, // U+0d29 : Undefined - 0x09c8, // U+0d2a : Malayalam Letter Pa - 0x09c9, // U+0d2b : Malayalam 
Letter Pha - 0x09ca, // U+0d2c : Malayalam Letter Ba - 0x09cb, // U+0d2d : Malayalam Letter Bha - 0x09cc, // U+0d2e : Malayalam Letter Ma - 0x09cd, // U+0d2f : Malayalam Letter Ya - 0x09cf, // U+0d30 : Malayalam Letter Ra - 0x09d0, // U+0d31 : Malayalam Letter Rra - 0x09d1, // U+0d32 : Malayalam Letter La - 0x09d2, // U+0d33 : Malayalam Letter Lla - 0x09d3, // U+0d34 : Malayalam Letter Llla - 0x09d4, // U+0d35 : Malayalam Letter Va - 0x09d5, // U+0d36 : Malayalam Letter Sha - 0x09d6, // U+0d37 : Malayalam Letter Ssa - 0x09d7, // U+0d38 : Malayalam Letter Sa - 0x09d8, // U+0d39 : Malayalam Letter Ha - 0x0000, // U+0d3a : Undefined - 0x0000, // U+0d3b : Undefined - 0x0000, // U+0d3c : Undefined - 0x0000, // U+0d3d : Undefined - 0x09da, // U+0d3e : Malayalam Vowel Sign Aa - 0x09db, // U+0d3f : Malayalam Vowel Sign I - 0x09dc, // U+0d40 : Malayalam Vowel Sign Ii - 0x09dd, // U+0d41 : Malayalam Vowel Sign U - 0x09de, // U+0d42 : Malayalam Vowel Sign Uu - 0x09df, // U+0d43 : Malayalam Vowel Sign Vocalic R - 0x0000, // U+0d44 : Undefined - 0x0000, // U+0d45 : Undefined - 0x09e0, // U+0d46 : Malayalam Vowel Sign E - 0x09e1, // U+0d47 : Malayalam Vowel Sign Ee - 0x09e2, // U+0d48 : Malayalam Vowel Sign Ai - 0x0000, // U+0d49 : Undefined - 0x09e4, // U+0d4a : Malayalam Vowel Sign O - 0x09e5, // U+0d4b : Malayalam Vowel Sign Oo - 0x09e6, // U+0d4c : Malayalam Vowel Sign Au - 0x09e8, // U+0d4d : Malayalam Sign Virama - 0x0000, // U+0d4e : Undefined - 0x0000, // U+0d4f : Undefined - 0x0000, // U+0d50 : Undefined - 0x0000, // U+0d51 : Undefined - 0x0000, // U+0d52 : Undefined - 0x0000, // U+0d53 : Undefined - 0x0000, // U+0d54 : Undefined - 0x0000, // U+0d55 : Undefined - 0x0000, // U+0d56 : Undefined - 0x0000, // U+0d57 : Malayalam Au Length Mark - 0x0000, // U+0d58 : Undefined - 0x0000, // U+0d59 : Undefined - 0x0000, // U+0d5a : Undefined - 0x0000, // U+0d5b : Undefined - 0x0000, // U+0d5c : Undefined - 0x0000, // U+0d5d : Undefined - 0x0000, // U+0d5e : Undefined - 0x0000, 
// U+0d5f : Undefined - 0x19aa, // U+0d60 : Malayalam Letter Vocalic Rr - 0x19a7, // U+0d61 : Malayalam Letter Vocalic Ll - 0x0000, // U+0d62 : Undefined - 0x0000, // U+0d63 : Undefined - 0x0000, // U+0d64 : Undefined - 0x0000, // U+0d65 : Undefined - 0x09f1, // U+0d66 : Malayalam Digit Zero - 0x09f2, // U+0d67 : Malayalam Digit One - 0x09f3, // U+0d68 : Malayalam Digit Two - 0x09f4, // U+0d69 : Malayalam Digit Three - 0x09f5, // U+0d6a : Malayalam Digit Four - 0x09f6, // U+0d6b : Malayalam Digit Five - 0x09f7, // U+0d6c : Malayalam Digit Six - 0x09f8, // U+0d6d : Malayalam Digit Seven - 0x09f9, // U+0d6e : Malayalam Digit Eight - 0x09fa // U+0d6f : Malayalam Digit Nine - }; - - //////////////////////////////////////////////////////////////////////////// - // SecondIndicByte - // - // This is used if the UnicodeToIndic table 4 high bits are set, this is - // the value of the second Indic byte when applicable. - //////////////////////////////////////////////////////////////////////////// - static byte[] SecondIndicByte = - { - 0x00, - 0xe9, - 0xb8, // U+0952 == 0xf0_0xb8 - 0xbf // U+0970 == 0xf0_0xbf - }; - - //////////////////////////////////////////////////////////////////////////// - // IndicMapping - // - // This table maps the 10 indic code pages to their unicode counterparts. - // There are 0x60 characters in each table. 
The tables are in pairs of 2 - // (1st char, 2nd char) and there are 10 tables (1 for each code page "font") - //////////////////////////////////////////////////////////////////////////// - static int[] IndicMappingIndex = - { - -1, // 0 DEF 0X40 Default // Not a real code page - -1, // 1 RMN 0X41 Roman // Transliteration not supported - 0, // 2 DEV 0X42 Devanagari - 1, // 3 BNG 0X43 Bengali - 2, // 4 TML 0X44 Tamil - 3, // 5 TLG 0X45 Telugu - 1, // 6 ASM 0X46 Assamese (Bengali) - Reuses table 1 - 4, // 7 ORI 0X47 Oriya - 5, // 8 KND 0X48 Kannada - 6, // 9 MLM 0X49 Malayalam - 7, // 10 GJR 0X4A Gujarati - 8 // 11 PNJ 0X4B Punjabi (Gurmukhi) - }; - - //////////////////////////////////////////////////////////////////////////// - // IndicMapping - // - // This table contains 9 tables for the 10 indic code pages to their unicode counterparts. - // There are 0x60 characters in each table. The tables are in pairs of 2 - // (1st char, 2nd char) and there are 10 tables (1 for each code page "font") - // - // The first index is the table index (from the IndicMappingIndex table), - // the 2nd the byte index, the third the character index. - // - // For byte 0 a 0x0000 value indicates an unknown character - // For byte 1 a 0 value indicates no special attributes. - // For byte 1, 200C & 200D are Virama, Nukta special cases - // For byte 1, B8BF is Devanagari stress & abbreviation sign special cases - // - // WARNING: When copying these from windows, ? 0x003F were changed to 0x0000. 
- // - //////////////////////////////////////////////////////////////////////////// - // char[codePageMapIndex][byte][character] - static char[,,] IndicMapping = - { - { - //////////////////////////////////////////////////////////////////////////// - // - // Devanagari Table 0, Code Page (2, 0x42, 57002) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0901', '\x0902', '\x0903', '\x0905', '\x0906', '\x0907', '\x0908', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0909', '\x090a', '\x090b', '\x090e', '\x090f', '\x0910', '\x090d', '\x0912', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0913', '\x0914', '\x0911', '\x0915', '\x0916', '\x0917', '\x0918', '\x0919', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x091a', '\x091b', '\x091c', '\x091d', '\x091e', '\x091f', '\x0920', '\x0921', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0922', '\x0923', '\x0924', '\x0925', '\x0926', '\x0927', '\x0928', '\x0929', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x092a', '\x092b', '\x092c', '\x092d', '\x092e', '\x092f', '\x095f', '\x0930', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0931', '\x0932', '\x0933', '\x0934', '\x0935', '\x0936', '\x0937', '\x0938', - // d8, d9, da, db, dc, dd, de, df, - '\x0939', '\x0000', '\x093e', '\x093f', '\x0940', '\x0941', '\x0942', '\x0943', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0946', '\x0947', '\x0948', '\x0945', '\x094a', '\x094b', '\x094c', '\x0949', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x094d', '\x093c', '\x0964', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0966', '\x0967', '\x0968', '\x0969', '\x096a', '\x096b', '\x096c', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x096d', '\x096e', '\x096f', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0950', '\x0', '\x0', '\x0', '\x0', '\x090c', '\x0961', 
- // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0960', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0958', '\x0959', '\x095a', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x095b', '\x0', '\x0', '\x0', '\x0', '\x095c', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x095d', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x095e', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0962', '\x0963', '\x0', '\x0', '\x0944', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x093d', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\xB8BF', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Bengali & Assemese Table 1', Code Pages (3, '43', 57003 & 6', '46', 57006) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0981', '\x0982', '\x0983', '\x0985', '\x0986', '\x0987', '\x0988', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0989', '\x098a', '\x098b', '\x098f', '\x098f', '\x0990', '\x0990', '\x0993', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0993', '\x0994', '\x0994', '\x0995', '\x0996', '\x0997', '\x0998', '\x0999', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x099a', '\x099b', '\x099c', '\x099d', '\x099e', '\x099f', '\x09a0', '\x09a1', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x09a2', '\x09a3', '\x09a4', '\x09a5', '\x09a6', '\x09a7', '\x09a8', '\x09a8', - // c8, c9, ca, cb, 
cc, cd, ce, cf, - '\x09aa', '\x09ab', '\x09ac', '\x09ad', '\x09ae', '\x09af', '\x09df', '\x09b0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x09b0', '\x09b2', '\x09b2', '\x09b2', '\x09ac', '\x09b6', '\x09b7', '\x09b8', - // d8, d9, da, db, dc, dd, de, df, - '\x09b9', '\x0000', '\x09be', '\x09bf', '\x09c0', '\x09c1', '\x09c2', '\x09c3', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x09c7', '\x09c7', '\x09c8', '\x09c8', '\x09cb', '\x09cb', '\x09cc', '\x09cc', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x09cd', '\x09bc', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x09e6', '\x09e7', '\x09e8', '\x09e9', '\x09ea', '\x09eb', '\x09ec', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x09ed', '\x09ee', '\x09ef', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x098c', '\x09e1', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x09e0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x09dc', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x09dd', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x09e2', '\x09e3', '\x0', '\x0', '\x09c4', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', 
'\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Tamil Table 2', Code Page (4, '44', 57004) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0000', '\x0b82', '\x0b83', '\x0b85', '\x0b86', '\x0b87', '\x0b88', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0b89', '\x0b8a', '\x0000', '\x0b8f', '\x0b8f', '\x0b90', '\x0b90', '\x0b92', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0b93', '\x0b94', '\x0b94', '\x0b95', '\x0b95', '\x0b95', '\x0b95', '\x0b99', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0b9a', '\x0b9a', '\x0b9c', '\x0b9c', '\x0b9e', '\x0b9f', '\x0b9f', '\x0b9f', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0b9f', '\x0ba3', '\x0ba4', '\x0ba4', '\x0ba4', '\x0ba4', '\x0ba8', '\x0ba9', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0baa', '\x0baa', '\x0baa', '\x0baa', '\x0bae', '\x0baf', '\x0baf', '\x0bb0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0bb1', '\x0bb2', '\x0bb3', '\x0bb4', '\x0bb5', '\x0bb7', '\x0bb7', '\x0bb8', - // d8, d9, da, db, dc, dd, de, df, - '\x0bb9', '\x0000', '\x0bbe', '\x0bbf', '\x0bc0', '\x0bc1', '\x0bc2', '\x0000', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0bc6', '\x0bc7', '\x0bc8', '\x0bc8', '\x0bca', '\x0bcb', '\x0bcc', '\x0bcc', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0bcd', '\x0000', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0030', '\x0be7', '\x0be8', '\x0be9', '\x0bea', '\x0beb', '\x0bec', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0bed', '\x0bee', '\x0bef', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', 
'\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Telugu Table 3', Code Page (5, '45', 57005) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0c01', '\x0c02', '\x0c03', '\x0c05', '\x0c06', '\x0c07', '\x0c08', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0c09', '\x0c0a', '\x0c0b', '\x0c0e', '\x0c0f', '\x0c10', '\x0c10', '\x0c12', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0c13', '\x0c14', '\x0c14', '\x0c15', '\x0c16', '\x0c17', '\x0c18', '\x0c19', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0c1a', '\x0c1b', '\x0c1c', '\x0c1d', '\x0c1e', '\x0c1f', '\x0c20', '\x0c21', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0c22', '\x0c23', '\x0c24', '\x0c25', '\x0c26', '\x0c27', '\x0c28', '\x0c28', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0c2a', '\x0c2b', '\x0c2c', '\x0c2d', '\x0c2e', '\x0c2f', '\x0c2f', '\x0c30', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0c31', '\x0c32', '\x0c33', '\x0c33', '\x0c35', '\x0c36', '\x0c37', 
'\x0c38', - // d8, d9, da, db, dc, dd, de, df, - '\x0c39', '\x0000', '\x0c3e', '\x0c3f', '\x0c40', '\x0c41', '\x0c42', '\x0c43', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0c46', '\x0c47', '\x0c48', '\x0c48', '\x0c4a', '\x0c4b', '\x0c4c', '\x0c4c', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0c4d', '\x0000', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0c66', '\x0c67', '\x0c68', '\x0c69', '\x0c6a', '\x0c6b', '\x0c6c', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0c6d', '\x0c6e', '\x0c6f', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0c0c', '\x0c61', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0c60', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0c44', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Oriya Table 4', Code Page (7, '47', 57007) - // - 
//////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0b01', '\x0b02', '\x0b03', '\x0b05', '\x0b06', '\x0b07', '\x0b08', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0b09', '\x0b0a', '\x0b0b', '\x0b0f', '\x0b0f', '\x0b10', '\x0b10', '\x0b10', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0b13', '\x0b14', '\x0b14', '\x0b15', '\x0b16', '\x0b17', '\x0b18', '\x0b19', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0b1a', '\x0b1b', '\x0b1c', '\x0b1d', '\x0b1e', '\x0b1f', '\x0b20', '\x0b21', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0b22', '\x0b23', '\x0b24', '\x0b25', '\x0b26', '\x0b27', '\x0b28', '\x0b28', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0b2a', '\x0b2b', '\x0b2c', '\x0b2d', '\x0b2e', '\x0b2f', '\x0b5f', '\x0b30', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0b30', '\x0b32', '\x0b33', '\x0b33', '\x0b2c', '\x0b36', '\x0b37', '\x0b38', - // d8, d9, da, db, dc, dd, de, df, - '\x0b39', '\x0000', '\x0b3e', '\x0b3f', '\x0b40', '\x0b41', '\x0b42', '\x0b43', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0b47', '\x0b47', '\x0b48', '\x0b48', '\x0b4b', '\x0b4b', '\x0b4c', '\x0b4c', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0b4d', '\x0b3c', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0b66', '\x0b67', '\x0b68', '\x0b69', '\x0b6a', '\x0b6b', '\x0b6c', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0b6d', '\x0b6e', '\x0b6f', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0c0c', '\x0c61', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0c60', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0b5c', - // c0, c1, c2, c3, c4, 
c5, c6, c7, - '\x0b5d', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0c44', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0b3d', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Kannada Table 5', Code Page (8, '48', 57008) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0000', '\x0c82', '\x0c83', '\x0c85', '\x0c86', '\x0c87', '\x0c88', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0c89', '\x0c8a', '\x0c8b', '\x0c8e', '\x0c8f', '\x0c90', '\x0c90', '\x0c92', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0c93', '\x0c94', '\x0c94', '\x0c95', '\x0c96', '\x0c97', '\x0c98', '\x0c99', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0c9a', '\x0c9b', '\x0c9c', '\x0c9d', '\x0c9e', '\x0c9f', '\x0ca0', '\x0ca1', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0ca2', '\x0ca3', '\x0ca4', '\x0ca5', '\x0ca6', '\x0ca7', '\x0ca8', '\x0ca8', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0caa', '\x0cab', '\x0cac', '\x0cad', '\x0cae', '\x0caf', '\x0caf', '\x0cb0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0cb1', '\x0cb2', '\x0cb3', '\x0cb3', '\x0cb5', '\x0cb6', '\x0cb7', '\x0cb8', - // d8, d9, da, db, dc, dd, de, df, - '\x0cb9', '\x0000', '\x0cbe', '\x0cbf', '\x0cc0', '\x0cc1', '\x0cc2', '\x0cc3', - // e0, e1, e2, e3, e4, e5, e6, 
e7, - '\x0cc6', '\x0cc7', '\x0cc8', '\x0cc8', '\x0cca', '\x0ccb', '\x0ccc', '\x0ccc', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0ccd', '\x0000', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0ce6', '\x0ce7', '\x0ce8', '\x0ce9', '\x0cea', '\x0ceb', '\x0cec', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0ced', '\x0cee', '\x0cef', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0c8c', '\x0ce1', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0ce0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0cde', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0cc4', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Malayalam Table 6', Code Page (9, '49', 57009) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0000', '\x0d02', '\x0d03', 
'\x0d05', '\x0d06', '\x0d07', '\x0d08', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0d09', '\x0d0a', '\x0d0b', '\x0d0e', '\x0d0f', '\x0d10', '\x0d10', '\x0d12', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0d13', '\x0d14', '\x0d14', '\x0d15', '\x0d16', '\x0d17', '\x0d18', '\x0d19', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0d1a', '\x0d1b', '\x0d1c', '\x0d1d', '\x0d1e', '\x0d1f', '\x0d20', '\x0d21', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0d22', '\x0d23', '\x0d24', '\x0d25', '\x0d26', '\x0d27', '\x0d28', '\x0d28', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0d2a', '\x0d2b', '\x0d2c', '\x0d2d', '\x0d2e', '\x0d2f', '\x0d2f', '\x0d30', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0d31', '\x0d32', '\x0d33', '\x0d34', '\x0d35', '\x0d36', '\x0d37', '\x0d38', - // d8, d9, da, db, dc, dd, de, df, - '\x0d39', '\x0000', '\x0d3e', '\x0d3f', '\x0d40', '\x0d41', '\x0d42', '\x0d43', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0d46', '\x0d47', '\x0d48', '\x0d48', '\x0d4a', '\x0d4b', '\x0d4c', '\x0d4c', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0d4d', '\x0000', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0d66', '\x0d67', '\x0d68', '\x0d69', '\x0d6a', '\x0d6b', '\x0d6c', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0d6d', '\x0d6e', '\x0d6f', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0d0c', '\x0d61', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0d60', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, 
d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Gujarati Table 7', Code Page (10', '4a', 57010) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0a81', '\x0a82', '\x0a83', '\x0a85', '\x0a86', '\x0a87', '\x0a88', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0a89', '\x0a8a', '\x0a8b', '\x0a8f', '\x0a8f', '\x0a90', '\x0a8d', '\x0a8d', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0a93', '\x0a94', '\x0a91', '\x0a95', '\x0a96', '\x0a97', '\x0a98', '\x0a99', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0a9a', '\x0a9b', '\x0a9c', '\x0a9d', '\x0a9e', '\x0a9f', '\x0aa0', '\x0aa1', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0aa2', '\x0aa3', '\x0aa4', '\x0aa5', '\x0aa6', '\x0aa7', '\x0aa8', '\x0aa8', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0aaa', '\x0aab', '\x0aac', '\x0aad', '\x0aae', '\x0aaf', '\x0aaf', '\x0ab0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0ab0', '\x0ab2', '\x0ab3', '\x0ab3', '\x0ab5', '\x0ab6', '\x0ab7', '\x0ab8', - // d8, d9, da, db, dc, dd, de, df, - '\x0ab9', '\x0000', '\x0abe', '\x0abf', '\x0ac0', '\x0ac1', '\x0ac2', '\x0ac3', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0ac7', '\x0ac7', '\x0ac8', '\x0ac5', '\x0acb', '\x0acb', '\x0acc', '\x0ac9', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0acd', '\x0abc', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', 
'\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0ae6', '\x0ae7', '\x0ae8', '\x0ae9', '\x0aea', '\x0aeb', '\x0aec', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0aed', '\x0aee', '\x0aef', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0ad0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0ae0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0ac4', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0abd', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - }, - - { - //////////////////////////////////////////////////////////////////////////// - // - // Punjabi (Gurmukhi) Table 8', Code Page (11', '4b', 57011) - // - //////////////////////////////////////////////////////////////////////////// - - // Default Unicode Char - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0000', '\x0000', '\x0a02', '\x0000', '\x0a05', '\x0a06', '\x0a07', '\x0a08', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0a09', '\x0a0a', '\x0000', '\x0a0f', '\x0a0f', '\x0a10', '\x0a10', '\x0a10', - // b0, b1, b2, b3, b4, b5, b6, 
b7, - '\x0a13', '\x0a14', '\x0a14', '\x0a15', '\x0a16', '\x0a17', '\x0a18', '\x0a19', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0a1a', '\x0a1b', '\x0a1c', '\x0a1d', '\x0a1e', '\x0a1f', '\x0a20', '\x0a21', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0a22', '\x0a23', '\x0a24', '\x0a25', '\x0a26', '\x0a27', '\x0a28', '\x0a28', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0a2a', '\x0a2b', '\x0a2c', '\x0a2d', '\x0a2e', '\x0a2f', '\x0a2f', '\x0a30', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0a30', '\x0a32', '\x0a33', '\x0a33', '\x0a35', '\x0a36', '\x0a36', '\x0a38', - // d8, d9, da, db, dc, dd, de, df, - '\x0a39', '\x0000', '\x0a3e', '\x0a3f', '\x0a40', '\x0a41', '\x0a42', '\x0000', - // e0, e1, e2, e3, e4, e5, e6, e7, - '\x0a47', '\x0a47', '\x0a48', '\x0a48', '\x0a4b', '\x0a4b', '\x0a4c', '\x0a4c', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x0a4d', '\x0a3c', '\x002e', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0000', '\x0a66', '\x0a67', '\x0a68', '\x0a69', '\x0a6a', '\x0a6b', '\x0a6c', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0a6d', '\x0a6e', '\x0a6f', '\x0000', '\x0000', '\x0000', '\x0000', '\x0000' - }, - - // Alternate Unicode Char & Flags - { - // a0, a1, a2, a3, a4, a5, a6, a7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // a8, a9, aa, ab, ac, ad, ae, af, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // b0, b1, b2, b3, b4, b5, b6, b7, - '\x0', '\x0', '\x0', '\x0', '\x0a59', '\x0a5a', '\x0', '\x0', - // b8, b9, ba, bb, bc, bd, be, bf, - '\x0', '\x0', '\x0a5b', '\x0', '\x0', '\x0', '\x0', '\x0', - // c0, c1, c2, c3, c4, c5, c6, c7, - '\x0a5c', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // c8, c9, ca, cb, cc, cd, ce, cf, - '\x0', '\x0a5e', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d0, d1, d2, d3, d4, d5, d6, d7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // d8, d9, da, db, dc, dd, de, df, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e0, e1, e2, e3, 
e4, e5, e6, e7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // e8, e9, ea, eb, ec, ed, ee, ef, - '\x200C', '\x200D', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f0, f1, f2, f3, f4, f5, f6, f7, - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', - // f8, f9, fa, fb, fc, fd, fe, ff - '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0', '\x0' - } - } - }; - } - -} diff --git a/src/mscorlib/src/System/Text/ISO2022Encoding.cs b/src/mscorlib/src/System/Text/ISO2022Encoding.cs deleted file mode 100644 index fca579fe56..0000000000 --- a/src/mscorlib/src/System/Text/ISO2022Encoding.cs +++ /dev/null @@ -1,1983 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - - -// -// -// Notes: -// -// IsAlwaysNormalized ??? -// Regarding Normalization for ISO-2022-JP (50220, 50221, 50222), its the same rules as EUCJP -// Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings -// Form D is precluded because of 0x00a8, which changes to space + dierises. -// -// Note: I think that IsAlwaysNormalized should probably return true for form C for Japanese 20932 based CPs. -// -// For ISO-2022-KR -// Never normalized, C & D (& therefore KC & KD) are precluded because of Hangul syllables and combined characters. -// -// IsAlwaysNormalized ??? -// Regarding Normalization for ISO-2022-CN (50227, 50229) & HZ-GB2312 (52936) I think is similar to the Japanese case. -// Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings -// Form D is precluded because of 0x00a8, which changes to space + dierises. -// -// Note: I think that IsAlwaysNormalized should probably return true for form C for Chinese 20936 based CPs. 
-// -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding -namespace System.Text -{ - using System.Globalization; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Text; - using System.Runtime.InteropServices; - using System; - using System.Security; - using System.Runtime.CompilerServices; - using System.Runtime.Serialization; - - - /*=================================ISO2022Encoding============================ - ** - ** This is used to support ISO 2022 encodings that use shift/escape sequences. - ** - ==============================================================================*/ - - [Serializable] - internal class ISO2022Encoding : DBCSCodePageEncoding - { - const byte SHIFT_OUT = (byte)0x0E; - const byte SHIFT_IN = (byte)0x0F; - const byte ESCAPE = 0x1B; - const byte LEADBYTE_HALFWIDTH = 0x10; - - // We have to load the 936 code page tables, so impersonate 936 as our base - // This pretends to be other code pages as far as memory sections are concerned. - internal ISO2022Encoding(int codePage) : base(codePage, tableBaseCodePages[codePage % 10]) - { - this.m_bUseMlangTypeForSerialization = true; - } - - // Constructor called by serialization. 
- // Note: We use the base GetObjectData however - internal ISO2022Encoding(SerializationInfo info, StreamingContext context) : base(info, context) - { - // Actually this can't ever get called, CodePageEncoding is our proxy - Debug.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - - static int[] tableBaseCodePages = - { - 932, // 50220 ISO-2022-JP, No halfwidth Katakana, convert to full width - 932, // 50221 ISO-2022-JP, Use escape sequence for half width Katakana - 932, // 50222 ISO-2022-JP, Use shift-in/shift-out for half width Katakana - 0, - 0, - 949, // 50225 ISO-2022-KR, Korean - 936, // 52936 HZ-GB2312, 936 might be better source - 0, //20936, // 50227 ISO-2022-CN, Note: This is just the same as CP 936 in Everett. - 0, - // 50229 is currently unsupported, CP 20000 is currently not built in .nlp file - 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_1 - 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_2 - 0 // ModeASCII - }; - - internal enum ISO2022Modes - { - ModeHalfwidthKatakana = 0, - ModeJIS0208 = 1, - ModeKR = 5, - ModeHZ = 6, - ModeGB2312 = 7, - ModeCNS11643_1 = 9, - ModeCNS11643_2 = 10, - ModeASCII = 11, - - ModeIncompleteEscape = -1, - ModeInvalidEscape = -2, - ModeNOOP = -3 - } - - protected unsafe override String GetMemorySectionName() - { - int iUseCodePage = this.bFlagDataTable ? 
dataTableCodePage : CodePage; - - String strFormat; - - switch (this.CodePage) - { - case 50220: - case 50221: - case 50222: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022JP"; - break; - case 50225: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022KR"; - break; - case 52936: - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_HZ"; - break; - default: - Debug.Assert(false, "[ISO2022Encoding.GetMemorySectionName] Don't expect to get here for code page " + this.CodePage); - strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}"; - break; - } - - String strName = String.Format(CultureInfo.InvariantCulture, strFormat, - iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor, - this.pCodePage->VersionRevision, this.pCodePage->VersionBuild); - - return strName; - } - - // Clean up characters for ISO2022 code pages, etc. - // ISO2022 (50220, 50221, 50222) - // GB-HZ (52936) - protected override bool CleanUpBytes(ref int bytes) - { - switch (this.CodePage) - { - // 932 based code pages - case 50220: - case 50221: - case 50222: - { - if (bytes >= 0x100) - { - // map extended char (0xfa40-0xfc4b) to a special range - // (ported from mlang) - if (bytes >= 0xfa40 && bytes <= 0xfc4b) - { - if ( bytes >= 0xfa40 && bytes <= 0xfa5b ) - { - if ( bytes <= 0xfa49 ) - bytes = bytes - 0x0b51 ; - else if ( bytes >= 0xfa4a && bytes <= 0xfa53 ) - bytes = bytes - 0x072f6 ; - else if ( bytes >= 0xfa54 && bytes <= 0xfa57 ) - bytes = bytes - 0x0b5b ; - else if ( bytes == 0xfa58 ) - bytes = 0x878a ; - else if ( bytes == 0xfa59 ) - bytes = 0x8782 ; - else if ( bytes == 0xfa5a ) - bytes = 0x8784 ; - else if ( bytes == 0xfa5b ) - bytes = 0x879a ; - } - else if ( bytes >= 0xfa5c && bytes <= 0xfc4b ) - { - byte tc = unchecked((byte)bytes); - if ( tc < 0x5c ) - bytes = bytes - 0x0d5f; - else if ( tc >= 0x80 && tc <= 0x9B ) - bytes = bytes - 0x0d1d; - else - bytes = bytes - 0x0d1c; - } - } - - // Convert 932 code page to 20932 like code page range - // (also ported from mlang) - byte bLead = 
unchecked((byte)(bytes >> 8)); - byte bTrail = unchecked((byte)bytes); - - bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71); - bLead = (byte)((bLead << 1) + 1); - if (bTrail > (byte)0x9e) - { - bTrail -= (byte)0x7e; - bLead++; - } - else - { - if (bTrail > (byte)0x7e) - bTrail--; - bTrail -= (byte)0x1f; - } - - bytes = ((int)bLead) << 8 | (int)bTrail; - - // Don't step out of our allocated lead byte area. - // All DBCS lead and trail bytes should be >= 0x21 and <= 0x7e - // This is commented out because Everett/Mlang had illegal PUA - // mappings to ISO2022 code pages that we're maintaining. -// if ((bytes & 0xFF00) < 0x2100 || (bytes & 0xFF00) > 0x7e00 || - // (bytes & 0xFF) < 0x21 || (bytes & 0xFF) > 0x7e) - // return false; - } - else - { - // Adjust 1/2 Katakana - if (bytes >= 0xa1 && bytes <= 0xdf) - bytes += (LEADBYTE_HALFWIDTH << 8) - 0x80; - - // 0x81-0x9f and 0xe0-0xfc CP 932 - // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though) - // b0-df is 1/2 Katakana - if (bytes >= 0x81 && - (bytes <= 0x9f || - (bytes >= 0xe0 && bytes <= 0xfc))) - { - // Don't do lead bytes, we use escape sequences instead. - return false; - } - } - break; - } - case 50225: - { - // For 50225 since we don't rely on lead byte marks, return false and don't add them, - // esp. since we're only a 7 bit code page. - if (bytes >= 0x80 && bytes <= 0xff) - return false; - - // Ignore characters out of range (a1-7f) - if (bytes >= 0x100 && - ((bytes & 0xff) < 0xa1 || (bytes & 0xff) == 0xff || - (bytes & 0xff00) < 0xa100 || (bytes & 0xff00) == 0xff00)) - return false; - - // May as well get them into our 7 bit range - bytes &= 0x7f7f; - - break; - } - case 52936: - { - // Since we don't rely on lead byte marks for 52936, get rid of them so we - // don't end up with extra wierd fffe mappings. 
- if (bytes >= 0x81 && bytes <= 0xfe) - return false; - - break; - } - } - - return true; - } - - // GetByteCount - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(count >= 0, "[ISO2022Encoding.GetByteCount]count is negative"); - Debug.Assert(chars != null, "[ISO2022Encoding.GetByteCount]chars is null"); - - // Just call GetBytes with null byte* to get count - return GetBytes(chars, count, null, 0, baseEncoder); - } - - internal override unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS baseEncoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(chars != null, "[ISO2022Encoding.GetBytes]chars is null"); - Debug.Assert(byteCount >= 0, "[ISO2022Encoding.GetBytes]byteCount is negative"); - Debug.Assert(charCount >= 0, "[ISO2022Encoding.GetBytes]charCount is negative"); - - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[ISO2022Encoding.GetBytes]Attempting to use null encoder fallback"); - - // Fix our encoder - ISO2022Encoder encoder = (ISO2022Encoder)baseEncoder; - - // Our return value - int iCount = 0; - - switch(CodePage) - { - case 50220: - case 50221: - case 50222: - iCount = GetBytesCP5022xJP( chars, charCount, bytes, byteCount, encoder ); - break; - case 50225: - iCount = GetBytesCP50225KR( chars, charCount, bytes, byteCount, encoder ); - break; -// Everett had 50227 the same as 936 -/* case 50227: - iCount = GetBytesCP50227CN( chars, charCount, bytes, byteCount, encoder ); - break; -*/ - case 52936: - iCount = GetBytesCP52936( chars, charCount, bytes, byteCount, encoder ); - break; - } - - return iCount; - } - - // This is internal and called by something else, - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder) - { - // Just assert, we're called internally so these should be safe, checked already - Debug.Assert(bytes != null, "[ISO2022Encoding.GetCharCount]bytes is null"); - Debug.Assert(count >= 0, "[ISO2022Encoding.GetCharCount]byteCount is negative"); - - // Just call getChars with null char* to get count - return GetChars(bytes, count, null, 0, baseDecoder); - } - - internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS baseDecoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[ISO2022Encoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[ISO2022Encoding.GetChars]byteCount is negative"); - Debug.Assert(charCount >= 0, "[ISO2022Encoding.GetChars]charCount is negative"); - - // Fix our decoder - ISO2022Decoder decoder = (ISO2022Decoder)baseDecoder; - int iCount = 0; - - switch (CodePage) - { - case 50220: - case 50221: - case 50222: - iCount = GetCharsCP5022xJP( bytes, byteCount, chars, charCount, decoder); - break; - case 50225: - 
iCount = GetCharsCP50225KR( bytes, byteCount, chars, charCount, decoder); - break; - // Currently 50227 is the same as 936 -// case 50227: - // iCount = GetCharsCP50227CN( bytes, byteCount, chars, charCount, decoder); - // break; - case 52936: - iCount = GetCharsCP52936( bytes, byteCount, chars, charCount, decoder); - break; - default: - Debug.Assert(false, "[ISO2022Encoding.GetChars] had unexpected code page"); - break; - } - - return iCount; - } - - // ISO 2022 Code pages for JP. - // 50220 - No halfwidth Katakana, convert to full width - // 50221 - Use escape sequence for half width Katakana - // 50222 - Use shift-in/shift-out for half width Katakana - // - // These are the JIS code pages, superset of ISO-2022 / ISO-2022-JP-1 - // 0E Shift Out (following bytes are Katakana) - // 0F Shift In (back to "normal" behavior) - // 21-7E Byte ranges (1 or 2 bytes) - // <ESC> $ @ To Double Byte 0208 Mode (actually older code page, but subset of 0208) - // <ESC> $ B To Double Byte 0208 Mode (duplicate) - // <ESC> $ ( D To Double Byte 0212 Mode (previously we misinterpreted this) - // <ESC> $ I To half width Katakana - // <ESC> ( J To JIS-Roman - // <ESC> ( H To JIS-Roman (swedish character set) - // <ESC> ( B To ASCII - // <ESC> & @ Alternate lead in to <ESC> $ B so just ignore it. - // - // So in Katakana mode we add 0x8e as a lead byte and use CP 20932 to convert it - // In ASCII mode we just spit out the single byte. - // In Roman mode we should change 0x5c (\) -> Yen sign and 0x7e (~) to Overline, however - // we didn't in mLang, otherwise roman is like ASCII. - // In 0208 double byte mode we have to |= with 0x8080 and use CP 20932 to convert it. - // In 0212 double byte mode we have to |= with 0x8000 and use CP 20932 to convert it. - // - // Note that JIS Shift In/Shift Out is different than the other ISO2022 encodings. For JIS - // Shift out always shifts to half-width Katakana. 
Chinese encodings use designator sequences - // instead of escape sequences and shift out to the designated sequence or back in to ASCII. - // - // When decoding JIS 0208, MLang used a '*' (0x2a) character in JIS 0208 mode to map the trailing byte - // to halfwidth katakana. I found no description of that behavior, however that block of 0208 is - // undefined, so we maintain that behavior when decoding. We will never generate characters using - // that technique, but the decoder will process them. - // - private unsafe int GetBytesCP5022xJP(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Get our mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode - ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that shift in will go back to (only used by CP 50222) - - // Check our encoder - if (encoder != null) - { - char charLeftOver = encoder.charLeftOver; - - currentMode = encoder.currentMode; - shiftInMode = encoder.shiftInOutMode; - - // We may have a left over character from last time, try and process it. 
- if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP5022xJP]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our char - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort iBytes = mapUnicodeToBytes[ch]; - - StartConvert: - // Check for halfwidth bytes - byte bLeadByte = (byte)(iBytes >> 8); - byte bTrailByte = (byte)(iBytes & 0xff); - - if (bLeadByte == LEADBYTE_HALFWIDTH) - { - // Its Halfwidth Katakana - if (CodePage == 50220) - { - // CodePage 50220 doesn't use halfwidth Katakana, convert to fullwidth - // See if its out of range, fallback if so, throws if recursive fallback - if (bTrailByte < 0x21 || bTrailByte >= 0x21 + HalfToFullWidthKanaTable.Length) - { - buffer.Fallback(ch); - continue; - } - - // Get the full width katakana char to use. - iBytes = unchecked((ushort)(HalfToFullWidthKanaTable[bTrailByte - 0x21] & 0x7F7F)); - - // May have to do all sorts of fun stuff for mode, go back to start convert - goto StartConvert; - } - - // Can use halfwidth Katakana, make sure we're in right mode - - // Make sure we're in right mode - if (currentMode != ISO2022Modes.ModeHalfwidthKatakana) - { - // 50222 or 50221, either shift in/out or escape to get to Katakana mode - if (CodePage == 50222) - { - // Shift Out - if (!buffer.AddByte(SHIFT_OUT)) - break; // convert out of space, stop - - // Don't change modes until after AddByte in case it fails for convert - // We get to shift out to Katakana, make sure we'll go back to the right mode - // (This ends up always being ASCII) - shiftInMode = currentMode; - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - } - else - { - // 50221 does halfwidth katakana by escape sequence - Debug.Assert(CodePage == 50221, "[ISO2022Encoding.GetBytesCP5022xJP]Expected Code Page 50221"); - - // Add our 
escape sequence - if (!buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'I'))) - break; // convert out of space, stop - - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - } - } - - // We know we're in Katakana mode now, so add it. - // Go ahead and add the Katakana byte. Our table tail bytes are 0x80 too big. - if (!buffer.AddByte(unchecked((byte)(bTrailByte & 0x7F)))) - break; // convert out of space, stop - - // Done with this one - continue; - } - else if (bLeadByte != 0) - { - // - // It's a double byte character. - // - - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift In - if (!buffer.AddByte(SHIFT_IN)) - break; // convert out of space, stop - - // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - } - - // Make sure we're in the right mode (JIS 0208 or JIS 0212) - // Note: Right now we don't use JIS 0212. Also this table'd be wrong - - // Its JIS extension 0208 - if (currentMode != ISO2022Modes.ModeJIS0208) - { - // Escape sequence, we can fail after this, mode will be correct for convert - if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)'B'))) - break; // Convert out of space, stop - - currentMode = ISO2022Modes.ModeJIS0208; - } - - // Add our double bytes - if (!buffer.AddByte(unchecked((byte)(bLeadByte)), unchecked((byte)(bTrailByte)))) - break; // Convert out of space, stop - continue; - } - else if (iBytes != 0 || ch == 0) - { - // Single byte Char - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift IN - if (!buffer.AddByte(SHIFT_IN)) - break; // convert ran out of room - - // Need to shift in from katakana. 
(Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - } - - // Its a single byte character, switch to ASCII if we have to - if (currentMode != ISO2022Modes.ModeASCII) - { - if (!buffer.AddByte(ESCAPE,unchecked((byte)'('), unchecked((byte)'B'))) - break; // convert ran out of room - - currentMode = ISO2022Modes.ModeASCII; - } - - // Add the ASCII char - if (!buffer.AddByte(bTrailByte)) - break; // convert had no room left - continue; - } - - // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - } - - // Switch back to ASCII if MustFlush or no encoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // If we're CP 50222 we may have to shift in from Katakana mode first - if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana) - { - // Shift IN, only shift mode if necessary. - if (buffer.AddByte(SHIFT_IN)) - // Need to shift in from katakana. 
(Still might not be right, but won't be shifted out anyway) - currentMode = shiftInMode; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // switch back to ASCII to finish neatly - if (currentMode != ISO2022Modes.ModeASCII && - (CodePage != 50222 || currentMode != ISO2022Modes.ModeHalfwidthKatakana)) - { - // only shift if it was successful - if (buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'B'))) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - } - - // Remember our encoder state - if (bytes != null && encoder != null) - { - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - encoder.shiftInOutMode = shiftInMode; - - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // ISO 2022 Code pages for Korean - CP 50225 - // - // CP 50225 has Shift In/Shift Out codes, and a single designator sequence that is supposed - // to appear once in the file, at the beginning of a line, before any multibyte code points. - // So we stick the designator at the beginning of the output. - // - // These are the KR code page codes for ISO-2022-KR - // 0E Shift Out (following bytes are double byte) - // 0F Shift In (back to ASCII behavior) - // 21-7E Byte ranges (1 or 2 bytes) - // <ESC> $)C Double byte ISO-2022-KR designator - // - // Note that this encoding is a little different than other encodings. The <esc>$)C sequence - // should only appear once per file. (Actually I saw another spec/rfc that said at the beginning - // of each line, but it shouldn't really matter.) 
- // - // During decoding Mlang accepted ' ', '\t, and '\n' as their respective characters, even if - // it was in double byte mode. We maintain that behavior, although I couldn't find a reference or - // reason for that behavior. We never generate data using that shortcut. - // - // Also Mlang always assumed KR mode, even if the designator wasn't found yet, so we do that as - // well. So basically we just ignore <ESC>$)C when decoding. - // - private unsafe int GetBytesCP50225KR(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Get our mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode - ISO2022Modes shiftOutMode = ISO2022Modes.ModeASCII; // ModeKR if already stamped lead bytes - - // Check our encoder - if (encoder != null) - { - // May have leftover stuff - char charLeftOver = encoder.charLeftOver; - currentMode = encoder.currentMode; - shiftOutMode = encoder.shiftInOutMode; - - // We may have a l left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP50225KR]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our data - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort iBytes = mapUnicodeToBytes[ch]; - - // Check for double byte bytes - byte bLeadByte = (byte)(iBytes >> 8); - byte bTrailByte = (byte)(iBytes & 0xff); - - if (bLeadByte != 0) - { - // - // It's a double byte character. 
- // - - // If we haven't done our Korean designator, then do so, if we have any input - if (shiftOutMode != ISO2022Modes.ModeKR) - { - // Add our code page designator sequence - if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)')'), unchecked((byte)'C'))) - break; // No room during convert. - - shiftOutMode = ISO2022Modes.ModeKR; - } - - // May have to switch to ModeKR first - if (currentMode != ISO2022Modes.ModeKR) - { - if (!buffer.AddByte(SHIFT_OUT)) - break; // No convert room - - currentMode = ISO2022Modes.ModeKR; - } - - // Add the bytes - if (!buffer.AddByte(bLeadByte, bTrailByte)) - break; // no convert room - continue; - } - else if (iBytes != 0 || ch == 0) - { - // Its a single byte character, switch to ASCII if we have to - if (currentMode != ISO2022Modes.ModeASCII) - { - if (!buffer.AddByte(SHIFT_IN)) - break; - - currentMode = ISO2022Modes.ModeASCII; - } - - // Add the ASCII char - if (!buffer.AddByte(bTrailByte)) - break; - continue; - } - - // Its unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - } - - // Switch back to ASCII if MustFlush or no encoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // Get back to ASCII to be safe. Only do it if it success. 
- if (buffer.AddByte(SHIFT_IN)) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // Remember our encoder state - if (bytes != null && encoder != null) - { - // If we didn't use the encoder, then there's no chars left over - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - - // We don't use shift out mode, but if we've flushed we need to reset it so it doesn't - // get output again. - if (!encoder.MustFlush || encoder.charLeftOver != (char)0) - { - // We should be not flushing or converting - Debug.Assert(!encoder.MustFlush || !encoder.m_throwOnOverflow, - "[ISO2022Encoding.GetBytesCP50225KR]Expected no left over data or not flushing or not converting"); - encoder.shiftInOutMode = shiftOutMode; - } - else - encoder.shiftInOutMode = ISO2022Modes.ModeASCII; - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - // CP52936 is HZ Encoding - // HZ Encoding has 4 shift sequences: - // ~~ '~' (\u7e) - // ~} shift into 1 byte mode, - // ~{ shift into 2 byte GB 2312-80 - // ~<NL> Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters) - // (This is for mailers that restrict to 70 or 80 or whatever character lines) - // - // According to comment in mlang, lead & trail byte ranges are described in RFC 1843 - // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e - // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe - // - // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set. 
- // (all bytes <= 0x7f) - private unsafe int GetBytesCP52936(char* chars, int charCount, - byte* bytes, int byteCount, ISO2022Encoder encoder) - { - // prepare our helpers - Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer( - this, encoder, bytes, byteCount, chars, charCount); - - // Mode - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; - - // Check our encoder - if (encoder != null) - { - char charLeftOver = encoder.charLeftOver; - currentMode = encoder.currentMode; - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - Debug.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP52936]leftover character should be high surrogate"); - - // It has to be a high surrogate, which we don't support, so it has to be a fallback - buffer.Fallback(charLeftOver); - } - } - - while (buffer.MoreData) - { - // Get our char - char ch = buffer.GetNextChar(); - - // Get our bytes - ushort sChar = mapUnicodeToBytes[ch]; - if (sChar == 0 && ch != 0) - { - // Wasn't a legal byte sequence, its a surrogate or fallback - // Throws if recursive (knows because we called InternalGetNextChar) - buffer.Fallback(ch); - - // Done with our char, now process fallback - continue; - } - - // Check for halfwidth bytes - byte bLeadByte = (byte)(sChar >> 8); - byte bTrailByte = (byte)(sChar & 0xff); - - // If its a double byte, it has to fit in the lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe range - // (including the 0x8080 that our codepage or's to the value) - if ((bLeadByte != 0 && - (bLeadByte < 0xa1 || bLeadByte > 0xf7 || bTrailByte < 0xa1 || bTrailByte > 0xfe)) || - (bLeadByte == 0 && bTrailByte > 0x80 && bTrailByte != 0xff)) - { - // Illegal character, in 936 code page, but not in HZ subset, get fallback for it - buffer.Fallback(ch); - continue; - } - - // sChar is now either ASCII or has an 0x8080 mask - if (bLeadByte != 0) - { - // Its a double byte mode - if (currentMode != ISO2022Modes.ModeHZ) - { 
- // Need to add the double byte mode marker - if (!buffer.AddByte((byte)'~', (byte)'{', 2)) - break; // Stop if no buffer space in convert - - currentMode = ISO2022Modes.ModeHZ; - } - - // Go ahead and add the 2 bytes - if (!buffer.AddByte(unchecked((byte)(bLeadByte & 0x7f)), unchecked((byte)(bTrailByte & 0x7f)))) - break; // Stop if no buffer space in convert - } - else - { - // Its supposed to be ASCII - if (currentMode != ISO2022Modes.ModeASCII) - { - // Need to add the ASCII mode marker - // Will have 1 more byte (or 2 if ~) - if (!buffer.AddByte((byte)'~', (byte)'}', bTrailByte == '~' ? 2:1)) - break; - - currentMode = ISO2022Modes.ModeASCII; - } - - // If its a '~' we'll need an extra one - if (bTrailByte == '~') - { - // Need to add the extra ~ - if (!buffer.AddByte((byte)'~', 1)) - break; - } - - // Need to add the character - if (!buffer.AddByte(bTrailByte)) - break; - } - } - - // Add ASCII shift out if we're at end of decoder - if (currentMode != ISO2022Modes.ModeASCII && - (encoder == null || encoder.MustFlush)) - { - // Need to add the ASCII mode marker - // Only turn off other mode if this works - if (buffer.AddByte((byte)'~',(byte)'}')) - currentMode = ISO2022Modes.ModeASCII; - else - // If not successful, convert will maintain state for next time, also - // AddByte will have decremented our char count, however we need it to remain the same - buffer.GetNextChar(); - } - - // Need to remember our mode - if (encoder != null && bytes != null) - { - // This is ASCII if we had to flush - encoder.currentMode = currentMode; - - if (!buffer.fallbackBuffer.bUsedEncoder) - { - encoder.charLeftOver = (char)0; - } - - encoder.m_charsUsed = buffer.CharsUsed; - } - - // Return our length - return buffer.Count; - } - - private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount, - char* chars, int charCount, ISO2022Decoder decoder) - { - // Get our info. 
- Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - // No mode information yet - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode - ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that we'll shift in to - byte[] escapeBytes = new byte[4]; - int escapeCount = 0; - - if (decoder != null) - { - currentMode = decoder.currentMode; - shiftInMode = decoder.shiftInOutMode; - - // See if we have leftover decoder buffer to use - // Load our bytesLeftOver - escapeCount = decoder.bytesLeftOverCount; - - // Don't want to mess up decoder if we're counting or throw an exception - for (int i = 0; i < escapeCount; i++) - escapeBytes[i] = decoder.bytesLeftOver[i]; - } - - // Do this until the end - while (buffer.MoreData || escapeCount > 0) - { - byte ch; - - if (escapeCount > 0) - { - // Get more escape sequences if necessary - if (escapeBytes[0] == ESCAPE) - { - // Stop if no more input - if (!buffer.MoreData) - { - if (decoder != null && !decoder.MustFlush) - break; - } - else - { - // Add it to the sequence we can check - escapeBytes[escapeCount++] = buffer.GetNextByte(); - - // We have an escape sequence - ISO2022Modes modeReturn = - CheckEscapeSequenceJP(escapeBytes, escapeCount); - - if (modeReturn != ISO2022Modes.ModeInvalidEscape) - { - if (modeReturn != ISO2022Modes.ModeIncompleteEscape) - { - // Processed escape correctly - escapeCount = 0; - - // We're now this mode - currentMode = shiftInMode = modeReturn; - } - - // Either way, continue to get next escape or real byte - continue; - } - } - - // If ModeInvalidEscape, or no input & must flush, then fall through to add escape. - } - - // Read next escape byte and move them down one. 
- ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - } - else - { - // Get our next byte - ch = buffer.GetNextByte(); - - if (ch == ESCAPE) - { - // We'll have an escape sequence, use it if we don't have one buffered already - if (escapeCount == 0) - { - // Start this new escape sequence - escapeBytes[0] = ch; - escapeCount = 1; - continue; - } - - // Flush the previous escape sequence, then reuse this escape byte - buffer.AdjustBytes(-1); - } - } - - if (ch == SHIFT_OUT) - { - shiftInMode = currentMode; - currentMode = ISO2022Modes.ModeHalfwidthKatakana; - continue; - } - else if (ch == SHIFT_IN) - { - currentMode = shiftInMode; - continue; - } - - // Get our full character - ushort iBytes = ch; - bool b2Bytes = false; - - if (currentMode == ISO2022Modes.ModeJIS0208) - { - // - // To handle errors, we need to check: - // 1. if trailbyte is there - // 2. if code is valid - // - if (escapeCount > 0) - { - // Let another escape fall through - if (escapeBytes[0] != ESCAPE) - { - // Move them down one & get the next data - iBytes <<= 8; - iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - b2Bytes = true; - } - } - else if (buffer.MoreData) - { - iBytes <<= 8; - iBytes |= buffer.GetNextByte(); - b2Bytes = true; - } - else - { - // Not enough input, use decoder if possible - if (decoder == null || decoder.MustFlush) - { - // No decoder, do fallback for this byte - buffer.Fallback(ch); - break; - } - - // Stick it in the decoder if we're not counting - if (chars != null) - { - escapeBytes[0] = ch; - escapeCount = 1; - } - break; - } - - // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana - // escape, so use 0x8e00 as katakana lead byte and keep same trail byte. - // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have - // any wierd compatibility issues. 
- if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00)) - { - iBytes = (ushort)(iBytes & 0xff); - iBytes |= (LEADBYTE_HALFWIDTH << 8); // Put us in the halfwidth katakana range - } - } - else if (iBytes >= 0xA1 && iBytes <= 0xDF) - { - // Everett accidentally mapped Katakana like shift-jis (932), - // even though this is a 7 bit code page. We keep that mapping - iBytes |= (LEADBYTE_HALFWIDTH << 8); // Map to halfwidth katakana range - iBytes &= 0xff7f; // remove extra 0x80 - } - else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana ) - { - // Add 0x10 lead byte that our encoding expects for Katakana: - iBytes |= (LEADBYTE_HALFWIDTH << 8); - } - - // We have an iBytes to try to convert. - char c = mapBytesToUnicode[iBytes]; - - // See if it was unknown - if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - // Have to do fallback - if (b2Bytes) - { - if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes)) - break; - } - else - { - if (!buffer.Fallback(ch)) - break; - } - } - else - { - // If we were JIS 0208, then we consumed an extra byte - if (!buffer.AddChar(c, b2Bytes ? 
2:1)) - break; - } - } - - // Make sure our decoder state matches our mode, if not counting - if (chars != null && decoder != null) - { - // Remember it if we don't flush - if (!decoder.MustFlush || escapeCount != 0) - { - // Either not flushing or had state (from convert) - Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, - "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing"); - - decoder.currentMode = currentMode; - decoder.shiftInOutMode = shiftInMode; - - // Remember escape buffer - decoder.bytesLeftOverCount = escapeCount; - decoder.bytesLeftOver = escapeBytes; - } - else - { - // We flush, clear buffer - decoder.currentMode = ISO2022Modes.ModeASCII; - decoder.shiftInOutMode = ISO2022Modes.ModeASCII; - decoder.bytesLeftOverCount = 0; - // Slightly different if counting/not counting - } - - decoder.m_bytesUsed = buffer.BytesUsed; - } - - // Return # of characters we found - return buffer.Count; - } - - // We know we have an escape sequence, so check it starting with the byte after the escape - private ISO2022Modes CheckEscapeSequenceJP( byte[] bytes, int escapeCount ) - { - // Have an escape sequence - if (bytes[0] != ESCAPE) - return ISO2022Modes.ModeInvalidEscape; - - if (escapeCount < 3) - return ISO2022Modes.ModeIncompleteEscape; - - if (bytes[1] == '(') - { - if (bytes[2] == 'B') // <esc>(B - { - return ISO2022Modes.ModeASCII; - } - else if (bytes[2] == 'H') // <esc>(H - { - // Actually this is supposed to be Swedish - // We treat it like ASCII though. 
- return ISO2022Modes.ModeASCII; - } - else if (bytes[2] == 'J') // <esc>(J - { - // Actually this is supposed to be Roman - // 2 characters are different, but historically we treat it as ascii - return ISO2022Modes.ModeASCII; - } - else if (bytes[2] == 'I') // <esc>(I - { - return ISO2022Modes.ModeHalfwidthKatakana; - } - } - else if (bytes[1] == '$') - { - if (bytes[2] == '@' || // <esc>$@ - bytes[2] == 'B') // <esc>$B - { - return ISO2022Modes.ModeJIS0208; - } - else - { - // Looking for <esc>$(D - if (escapeCount < 4) - return ISO2022Modes.ModeIncompleteEscape; - - if (bytes[2] == '(' && bytes[3] == 'D') // <esc>$(D - { - // Mlang treated 0208 like 0212 even though that's wrong - return ISO2022Modes.ModeJIS0208; - } - } - } - else if (bytes[1] == '&') - { - if (bytes[2] == '@') // <esc>&@ - { - // Ignore ESC & @ (prefix to <esc>$B) - return ISO2022Modes.ModeNOOP; - } - } - - // If we get here we fell through and have an invalid/unknown escape sequence - return ISO2022Modes.ModeInvalidEscape; - } - - private byte DecrementEscapeBytes(ref byte[] bytes, ref int count) - { - Debug.Assert(count > 0, "[ISO2022Encoding.DecrementEscapeBytes]count > 0"); - - // Decrement our count - count--; - - // Remember the first one - byte returnValue = bytes[0]; - - // Move them down one. - for (int i = 0; i < count; i++) - { - bytes[i] = bytes[i+1]; - } - - // Clear out the last byte - bytes[count] = 0; - - // Return the old 1st byte - return returnValue; - } - - // Note that in DBCS mode mlang passed through ' ', '\t' and '\n' as SBCS characters - // probably to allow mailer formatting without too much extra work. - private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount, - char* chars, int charCount, ISO2022Decoder decoder) - { - // Get our info. 
- Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - // No mode information yet - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode - - byte[] escapeBytes = new byte[4]; - int escapeCount = 0; - - if (decoder != null) - { - currentMode = decoder.currentMode; - - // See if we have leftover decoder buffer to use - // Load our bytesLeftOver - escapeCount = decoder.bytesLeftOverCount; - - // Don't want to mess up decoder if we're counting or throw an exception - for (int i = 0; i < escapeCount; i++) - escapeBytes[i] = decoder.bytesLeftOver[i]; - } - - // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings. - while (buffer.MoreData || escapeCount > 0) - { - byte ch; - - if (escapeCount > 0) - { - // Get more escape sequences if necessary - if (escapeBytes[0] == ESCAPE) - { - // Stop if no more input - if (!buffer.MoreData) - { - if (decoder != null && !decoder.MustFlush) - break; - } - else - { - // Add it to the sequence we can check - escapeBytes[escapeCount++] = buffer.GetNextByte(); - - // We have an escape sequence - ISO2022Modes modeReturn = - CheckEscapeSequenceKR(escapeBytes, escapeCount); - - if (modeReturn != ISO2022Modes.ModeInvalidEscape) - { - if (modeReturn != ISO2022Modes.ModeIncompleteEscape) - { - // Processed escape correctly, no effect (we know about KR mode) - escapeCount = 0; - } - - // Either way, continue to get next escape or real byte - continue; - } - } - - // If ModeInvalidEscape, or no input & must flush, then fall through to add escape. 
- } - - // Still have something left over in escape buffer - // Get it and move them down one - ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - } - else - { - // Get our next byte - ch = buffer.GetNextByte(); - - if (ch == ESCAPE) - { - // We'll have an escape sequence, use it if we don't have one buffered already - if (escapeCount == 0) - { - // Start this new escape sequence - escapeBytes[0] = ch; - escapeCount = 1; - continue; - } - - // Flush previous escape sequence, then reuse this escape byte - buffer.AdjustBytes(-1); - } - } - - if (ch == SHIFT_OUT) - { - currentMode = ISO2022Modes.ModeKR; - continue; - } - else if (ch == SHIFT_IN) - { - currentMode = ISO2022Modes.ModeASCII; - continue; - } - - // Get our full character - ushort iBytes = ch; - bool b2Bytes = false; - - // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I don't see that in the RFC. - if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n') - { - // - // To handle errors, we need to check: - // 1. if trailbyte is there - // 2. if code is valid - // - if (escapeCount > 0) - { - // Let another escape fall through - if (escapeBytes[0] != ESCAPE) - { - // Move them down one & get the next data - iBytes <<= 8; - iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount); - b2Bytes = true; - } - } - else if (buffer.MoreData) - { - iBytes <<= 8; - iBytes |= buffer.GetNextByte(); - b2Bytes = true; - } - else - { - // Not enough input, use decoder if possible - if (decoder == null || decoder.MustFlush) - { - // No decoder, do fallback for lonely 1st byte - buffer.Fallback(ch); - break; - } - - // Stick it in the decoder if we're not counting - if (chars != null) - { - escapeBytes[0] = ch; - escapeCount = 1; - } - break; - } - } - - // We have a iBytes to try to convert. 
- char c = mapBytesToUnicode[iBytes]; - - // See if it was unknown - if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - // Have to do fallback - if (b2Bytes) - { - if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes)) - break; - } - else - { - if (!buffer.Fallback(ch)) - break; - } - } - else - { - if (!buffer.AddChar(c, b2Bytes ? 2:1)) - break; - } - } - - // Make sure our decoder state matches our mode, if not counting - if (chars != null && decoder != null) - { - // Remember it if we don't flush - if (!decoder.MustFlush || escapeCount != 0) - { - // Either not flushing or had state (from convert) - Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, - "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing"); - - decoder.currentMode = currentMode; - - // Remember escape buffer - decoder.bytesLeftOverCount = escapeCount; - decoder.bytesLeftOver = escapeBytes; - } - else - { - // We flush, clear buffer - decoder.currentMode = ISO2022Modes.ModeASCII; - decoder.shiftInOutMode = ISO2022Modes.ModeASCII; - decoder.bytesLeftOverCount = 0; - } - - decoder.m_bytesUsed = buffer.BytesUsed; - } - - // Return # of characters we found - return buffer.Count; - } - - // We know we have an escape sequence, so check it starting with the byte after the escape - private ISO2022Modes CheckEscapeSequenceKR( byte[] bytes, int escapeCount ) - { - // Have an escape sequence - if (bytes[0] != ESCAPE) - return ISO2022Modes.ModeInvalidEscape; - - if (escapeCount < 4) - return ISO2022Modes.ModeIncompleteEscape; - - if (bytes[1] == '$' && bytes[2] == ')' && bytes[3] == 'C') // <esc>$)C - return ISO2022Modes.ModeKR; - - // If we get here we fell through and have an invalid/unknown escape sequence - return ISO2022Modes.ModeInvalidEscape; - } - - // CP52936 is HZ Encoding - // HZ Encoding has 4 shift sequences: - // ~~ '~' (\u7e) - // ~} shift into 1 byte mode, - // ~{ shift into 2 byte GB 2312-80 - // ~<NL> Maintain 2 byte mode across new lines 
(ignore both ~ and <NL> characters) - // (This is for mailers that restrict to 70 or 80 or whatever character lines) - // - // According to comment in mlang, lead & trail byte ranges are described in RFC 1843 - // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e - // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe - // - // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set. - // (all bytes <= 0x7f) - private unsafe int GetCharsCP52936(byte* bytes, int byteCount, - char* chars, int charCount, ISO2022Decoder decoder) - { - Debug.Assert(byteCount >=0, "[ISO2022Encoding.GetCharsCP52936]count >=0"); - Debug.Assert(bytes!=null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null"); - - // Get our info. - Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( - this, decoder, chars, charCount, bytes, byteCount); - - // No mode information yet - ISO2022Modes currentMode = ISO2022Modes.ModeASCII; - int byteLeftOver = -1; - bool bUsedDecoder = false; - - if (decoder != null) - { - currentMode = decoder.currentMode; - // See if we have leftover decoder buffer to use - // Don't want to mess up decoder if we're counting or throw an exception - if (decoder.bytesLeftOverCount != 0 ) - { - // Load our bytesLeftOver - byteLeftOver = decoder.bytesLeftOver[0]; - } - } - - // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings. - while (buffer.MoreData || byteLeftOver >= 0) - { - byte ch; - - // May have a left over byte - if (byteLeftOver >= 0) - { - ch = (byte)byteLeftOver; - byteLeftOver = -1; - } - else - { - ch = buffer.GetNextByte(); - } - - // We're in escape mode - if (ch == '~') - { - // Next char is type of switch - if (!buffer.MoreData) - { - // We don't have anything left, it'll be in decoder or a ? 
- // don't fail if we are allowing overflows - if (decoder == null || decoder.MustFlush) - { - // We'll be a '?' - buffer.Fallback(ch); - // break if we fail & break if we don't (because !MoreData) - // Add succeeded, continue - break; - } - - // Stick it in decoder - if (decoder != null) - decoder.ClearMustFlush(); - - if (chars != null) - { - decoder.bytesLeftOverCount = 1; - decoder.bytesLeftOver[0] = (byte)'~'; - bUsedDecoder = true; - } - break; - } - - // What type is it?, get 2nd byte - ch = buffer.GetNextByte(); - - if (ch == '~' && currentMode == ISO2022Modes.ModeASCII) - { - // Its just a ~~ replacement for ~, add it - if (!buffer.AddChar((char)ch, 2)) - // Add failed, break for converting - break; - - // Add succeeded, continue - continue; - } - else if (ch == '{') - { - // Switching to Double Byte mode - currentMode = ISO2022Modes.ModeHZ; - continue; - } - else if (ch == '}') - { - // Switching to ASCII mode - currentMode = ISO2022Modes.ModeASCII; - continue; - } - else if (ch == '\n') - { - // Ignore ~\n sequence - continue; - } - else - { - // Unknown escape, back up and try the '~' as a "normal" byte or lead byte - buffer.AdjustBytes(-1); - ch = (byte)'~'; - } - } - - // go ahead and add our data - if (currentMode != ISO2022Modes.ModeASCII) - { - // Should be ModeHZ - Debug.Assert(currentMode == ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ"); - char cm; - - // Everett allowed characters < 0x20 to be passed as if they were ASCII - if (ch < 0x20) - { - // Emit it as ASCII - goto STOREASCII; - } - - // Its multibyte, should have another byte - if (!buffer.MoreData) - { - // No bytes left - // don't fail if we are allowing overflows - if (decoder == null || decoder.MustFlush) - { - // Not enough bytes, fallback lead byte - buffer.Fallback(ch); - - // Break if we fail & break because !MoreData - break; - } - - if (decoder != null) - decoder.ClearMustFlush(); - - // Stick it in decoder - if (chars != null) - { - 
decoder.bytesLeftOverCount = 1; - decoder.bytesLeftOver[0] = ch; - bUsedDecoder = true; - } - break; - } - - // Everett uses space as an escape character for single SBCS bytes - byte ch2 = buffer.GetNextByte(); - ushort iBytes = (ushort)(ch << 8 | ch2); - - if (ch == ' ' && ch2 != 0) - { - // Get next char and treat it like ASCII (Everett treated space like an escape - // allowing the next char to be just ascii) - cm = (char)ch2; - goto STOREMULTIBYTE; - } - - // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e - if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) && - // Everett allowed high bit mappings for same characters (but only if both bits set) - (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe)) - { - // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp) - if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d) - { - iBytes = 0x2121; - goto MULTIBYTE; - } - - // Illegal char, use fallback. If lead byte is 0 have to do it special and do it first - if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes))) - break; - continue; - } - - MULTIBYTE: - iBytes |= 0x8080; - // Look up the multibyte char to stick it in our data - - // We have a iBytes to try to convert. - cm = mapBytesToUnicode[iBytes]; - - STOREMULTIBYTE: - - // See if it was unknown - if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0) - { - // Fall back the unknown stuff - if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes))) - break; - continue; - } - - if (!buffer.AddChar(cm, 2)) - break; // convert ran out of buffer, stop - continue; - } - - // Just ASCII - // We allow some chars > 7f because everett did, so we have to look them up. 
- STOREASCII: - char c = mapBytesToUnicode[ch]; - - // Check if it was unknown - if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0)) - { - // fallback the unkown bytes - if (!buffer.Fallback((byte)ch)) - break; - continue; - } - - // Go ahead and add our ASCII character - if (!buffer.AddChar(c)) - break; // convert ran out of buffer, stop - } - - // Need to remember our state, IF we're not counting - if (chars != null && decoder != null) - { - if (!bUsedDecoder) - { - // If we didn't use it, clear the byte left over - decoder.bytesLeftOverCount = 0; - } - - if (decoder.MustFlush && decoder.bytesLeftOverCount == 0) - { - decoder.currentMode = ISO2022Modes.ModeASCII; - } - else - { - // Either not flushing or had state (from convert) - Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, - "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing"); - - decoder.currentMode = currentMode; - } - decoder.m_bytesUsed = buffer.BytesUsed; - } - - // Return # of characters we found - return buffer.Count; - } - - // Note: These all end up with 1/2 bytes of average byte count, so unless we're 1 we're always - // charCount/2 bytes too big. - public override int GetMaxByteCount(int charCount) - { - if (charCount < 0) - throw new ArgumentOutOfRangeException(nameof(charCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Characters would be # of characters + 1 in case high surrogate is ? * max fallback - long byteCount = (long)charCount + 1; - - if (EncoderFallback.MaxCharCount > 1) - byteCount *= EncoderFallback.MaxCharCount; - - // Start with just generic DBCS values (sort of). 
- int perChar = 2; - int extraStart = 0; - int extraEnd = 0; - - switch (CodePage) - { - case 50220: - case 50221: - // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP - perChar = 5; // 5 max (4.5 average) - extraEnd = 3; // 3 bytes to shift back to ASCII - break; - case 50222: - // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP - perChar = 5; // 5 max (4.5 average) - extraEnd = 4; // 1 byte to shift from Katakana -> DBCS, 3 bytes to shift back to ASCII from DBCS - break; - case 50225: - // 2 bytes per char + 1 byte SO, or 1 byte per char + 1 byte SI. - perChar = 3; // 3 max, (2.5 average) - extraStart = 4; // EUC-KR marker appears at beginning of file. - extraEnd = 1; // 1 byte to shift back to ascii if necessary. - break; - case 52936: - // 2 bytes per char + 2 byte shift, or 1 byte + 1 byte shift - // Worst case: left over surrogate with no low surrogate is extra ?, could have to switch to ASCII, then could have HZ and flush to ASCII mode - perChar = 4; // 4 max, (3.5 average if every other char is HZ/ASCII) - extraEnd = 2; // 2 if we have to shift back to ASCII - break; - } - - // Return our surrogate and End plus perChar for each char. - byteCount *= perChar; - byteCount += extraStart + extraEnd; - - if (byteCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow")); - - return (int)byteCount; - } - - public override int GetMaxCharCount(int byteCount) - { - if (byteCount < 0) - throw new ArgumentOutOfRangeException(nameof(byteCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - int perChar = 1; - int extraDecoder = 1; - - switch (CodePage) - { - case 50220: - case 50221: - case 50222: - case 50225: - perChar = 1; // Worst case all ASCII - extraDecoder = 3; // Could have left over 3 chars of 4 char escape sequence, that all become ? 
- break; - case 52936: - perChar = 1; // Worst case all ASCII - extraDecoder = 1; // sequences are 2 chars, so if next one is illegal, then previous 1 could be ? - break; - } - - // Figure out our length, perchar * char + whatever extra our decoder could do to us. - long charCount = ((long)byteCount * perChar) + extraDecoder; - - // Just in case we have to fall back unknown ones. - if (DecoderFallback.MaxCharCount > 1) - charCount *= DecoderFallback.MaxCharCount; - - if (charCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow")); - - return (int)charCount; - } - - public override Encoder GetEncoder() - { - return new ISO2022Encoder(this); - } - - public override Decoder GetDecoder() - { - return new ISO2022Decoder(this); - } - - [Serializable] - internal class ISO2022Encoder : System.Text.EncoderNLS - { - internal ISO2022Modes currentMode; - internal ISO2022Modes shiftInOutMode; - - internal ISO2022Encoder(EncodingNLS encoding) : base(encoding) - { - // base calls reset - } - - public override void Reset() - { - // Reset - currentMode = ISO2022Modes.ModeASCII; - shiftInOutMode = ISO2022Modes.ModeASCII; - charLeftOver = (char)0; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our encoder? 
- internal override bool HasState - { - get - { - // Don't check shift-out mode, it may be ascii (JP) or not (KR) - return (this.charLeftOver != (char)0 || - currentMode != ISO2022Modes.ModeASCII); - } - } - } - - [Serializable] - internal class ISO2022Decoder : System.Text.DecoderNLS - { - internal byte[] bytesLeftOver; - internal int bytesLeftOverCount; - internal ISO2022Modes currentMode; - internal ISO2022Modes shiftInOutMode; - - internal ISO2022Decoder(EncodingNLS encoding) : base(encoding) - { - // base calls reset - } - - public override void Reset() - { - // Reset - bytesLeftOverCount = 0; - bytesLeftOver = new byte[4]; - currentMode = ISO2022Modes.ModeASCII; - shiftInOutMode = ISO2022Modes.ModeASCII; - if (m_fallbackBuffer != null) - m_fallbackBuffer.Reset(); - } - - // Anything left in our decoder? - internal override bool HasState - { - get - { - // If have bytes left over or not shifted back to ASCII then have problem - return (this.bytesLeftOverCount != 0 || - currentMode != ISO2022Modes.ModeASCII); - } - } - } - - static ushort[] HalfToFullWidthKanaTable = - { - 0xa1a3, // 0x8ea1 : Halfwidth Ideographic Period - 0xa1d6, // 0x8ea2 : Halfwidth Opening Corner Bracket - 0xa1d7, // 0x8ea3 : Halfwidth Closing Corner Bracket - 0xa1a2, // 0x8ea4 : Halfwidth Ideographic Comma - 0xa1a6, // 0x8ea5 : Halfwidth Katakana Middle Dot - 0xa5f2, // 0x8ea6 : Halfwidth Katakana Wo - 0xa5a1, // 0x8ea7 : Halfwidth Katakana Small A - 0xa5a3, // 0x8ea8 : Halfwidth Katakana Small I - 0xa5a5, // 0x8ea9 : Halfwidth Katakana Small U - 0xa5a7, // 0x8eaa : Halfwidth Katakana Small E - 0xa5a9, // 0x8eab : Halfwidth Katakana Small O - 0xa5e3, // 0x8eac : Halfwidth Katakana Small Ya - 0xa5e5, // 0x8ead : Halfwidth Katakana Small Yu - 0xa5e7, // 0x8eae : Halfwidth Katakana Small Yo - 0xa5c3, // 0x8eaf : Halfwidth Katakana Small Tu - 0xa1bc, // 0x8eb0 : Halfwidth Katakana-Hiragana Prolonged Sound Mark - 0xa5a2, // 0x8eb1 : Halfwidth Katakana A - 0xa5a4, // 0x8eb2 : Halfwidth Katakana 
I - 0xa5a6, // 0x8eb3 : Halfwidth Katakana U - 0xa5a8, // 0x8eb4 : Halfwidth Katakana E - 0xa5aa, // 0x8eb5 : Halfwidth Katakana O - 0xa5ab, // 0x8eb6 : Halfwidth Katakana Ka - 0xa5ad, // 0x8eb7 : Halfwidth Katakana Ki - 0xa5af, // 0x8eb8 : Halfwidth Katakana Ku - 0xa5b1, // 0x8eb9 : Halfwidth Katakana Ke - 0xa5b3, // 0x8eba : Halfwidth Katakana Ko - 0xa5b5, // 0x8ebb : Halfwidth Katakana Sa - 0xa5b7, // 0x8ebc : Halfwidth Katakana Si - 0xa5b9, // 0x8ebd : Halfwidth Katakana Su - 0xa5bb, // 0x8ebe : Halfwidth Katakana Se - 0xa5bd, // 0x8ebf : Halfwidth Katakana So - 0xa5bf, // 0x8ec0 : Halfwidth Katakana Ta - 0xa5c1, // 0x8ec1 : Halfwidth Katakana Ti - 0xa5c4, // 0x8ec2 : Halfwidth Katakana Tu - 0xa5c6, // 0x8ec3 : Halfwidth Katakana Te - 0xa5c8, // 0x8ec4 : Halfwidth Katakana To - 0xa5ca, // 0x8ec5 : Halfwidth Katakana Na - 0xa5cb, // 0x8ec6 : Halfwidth Katakana Ni - 0xa5cc, // 0x8ec7 : Halfwidth Katakana Nu - 0xa5cd, // 0x8ec8 : Halfwidth Katakana Ne - 0xa5ce, // 0x8ec9 : Halfwidth Katakana No - 0xa5cf, // 0x8eca : Halfwidth Katakana Ha - 0xa5d2, // 0x8ecb : Halfwidth Katakana Hi - 0xa5d5, // 0x8ecc : Halfwidth Katakana Hu - 0xa5d8, // 0x8ecd : Halfwidth Katakana He - 0xa5db, // 0x8ece : Halfwidth Katakana Ho - 0xa5de, // 0x8ecf : Halfwidth Katakana Ma - 0xa5df, // 0x8ed0 : Halfwidth Katakana Mi - 0xa5e0, // 0x8ed1 : Halfwidth Katakana Mu - 0xa5e1, // 0x8ed2 : Halfwidth Katakana Me - 0xa5e2, // 0x8ed3 : Halfwidth Katakana Mo - 0xa5e4, // 0x8ed4 : Halfwidth Katakana Ya - 0xa5e6, // 0x8ed5 : Halfwidth Katakana Yu - 0xa5e8, // 0x8ed6 : Halfwidth Katakana Yo - 0xa5e9, // 0x8ed7 : Halfwidth Katakana Ra - 0xa5ea, // 0x8ed8 : Halfwidth Katakana Ri - 0xa5eb, // 0x8ed9 : Halfwidth Katakana Ru - 0xa5ec, // 0x8eda : Halfwidth Katakana Re - 0xa5ed, // 0x8edb : Halfwidth Katakana Ro - 0xa5ef, // 0x8edc : Halfwidth Katakana Wa - 0xa5f3, // 0x8edd : Halfwidth Katakana N - 0xa1ab, // 0x8ede : Halfwidth Katakana Voiced Sound Mark - 0xa1ac // 0x8edf : Halfwidth Katakana 
Semi-Voiced Sound Mark - }; - } -} -#endif // FEATURE_CODEPAGES_FILE - diff --git a/src/mscorlib/src/System/Text/Latin1Encoding.cs b/src/mscorlib/src/System/Text/Latin1Encoding.cs index 56a6c1f949..26009bf6c0 100644 --- a/src/mscorlib/src/System/Text/Latin1Encoding.cs +++ b/src/mscorlib/src/System/Text/Latin1Encoding.cs @@ -13,7 +13,6 @@ namespace System.Text using System.Collections; using System.Runtime.CompilerServices; using System.Runtime.Serialization; - using System.Security.Permissions; // @@ -488,7 +487,6 @@ namespace System.Text } } -#if !FEATURE_NORM_IDNA_ONLY public override bool IsAlwaysNormalized(NormalizationForm form) { // Latin-1 contains precomposed characters, so normal for Form C. @@ -498,7 +496,6 @@ namespace System.Text // Only true for form C. return (form == NormalizationForm.FormC); } -#endif // !FEATURE_NORM_IDNA_ONLY // Since our best fit table is small we'll hard code it internal override char[] GetBestFitUnicodeToBytesData() { diff --git a/src/mscorlib/src/System/Text/MLangCodePageEncoding.cs b/src/mscorlib/src/System/Text/MLangCodePageEncoding.cs deleted file mode 100644 index a82db91b98..0000000000 --- a/src/mscorlib/src/System/Text/MLangCodePageEncoding.cs +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - - -// WARNING: -// -// This is just an IObjectReference proxy for the former MLang Encodings (V1.1) -// We keep the old name now even for the Whidbey V2.0 IObjectReference because it also -// works with the Everett V1.1 version. 
-namespace System.Text -{ - using System; - using System.Runtime.Serialization; - using System.Security.Permissions; - using System.Diagnostics; - using System.Diagnostics.Contracts; - - /*=================================MLangCodePageEncoding================================== - ** This class is here only to deserialize the MLang classes from Everett (V1.1) into - ** Appropriate Whidbey (V2.0) objects. We also serialize the Whidbey classes - ** using this proxy since we pretty much need one anyway and that solves Whidbey - ** to Everett compatibility as well. - ==============================================================================*/ - - [Serializable] - internal sealed class MLangCodePageEncoding : IObjectReference, ISerializable - { - // Temp stuff - [NonSerialized] - private int m_codePage; - [NonSerialized] - private bool m_isReadOnly; - [NonSerialized] - private bool m_deserializedFromEverett = false; - - [NonSerialized] - private EncoderFallback encoderFallback = null; - [NonSerialized] - private DecoderFallback decoderFallback = null; - - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization. - internal MLangCodePageEncoding(SerializationInfo info, StreamingContext context) - { - // Any info? 
- if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - // All versions have a code page - this.m_codePage = (int)info.GetValue("m_codePage", typeof(int)); - - // See if we have a code page - try - { - // - // Try Whidbey V2.0 Fields - // - this.m_isReadOnly = (bool)info.GetValue("m_isReadOnly", typeof(bool)); - - this.encoderFallback = (EncoderFallback)info.GetValue("encoderFallback", typeof(EncoderFallback)); - this.decoderFallback = (DecoderFallback)info.GetValue("decoderFallback", typeof(DecoderFallback)); - } - catch (SerializationException) - { - // - // Didn't have Whidbey things, must be Everett - // - this.m_deserializedFromEverett = true; - - // May as well be read only - this.m_isReadOnly = true; - } - } - - // Just get it from GetEncoding - public Object GetRealObject(StreamingContext context) - { - // Get our encoding (Note: This has default fallbacks for readonly and everett cases) - this.realEncoding = Encoding.GetEncoding(this.m_codePage); - - // If its read only then it uses default fallbacks, otherwise pick up the new ones - // Otherwise we want to leave the new one read only - if (!this.m_deserializedFromEverett && !this.m_isReadOnly) - { - this.realEncoding = (Encoding)this.realEncoding.Clone(); - this.realEncoding.EncoderFallback = this.encoderFallback; - this.realEncoding.DecoderFallback = this.decoderFallback; - } - - return this.realEncoding; - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. 
- Debug.Assert(false, "Didn't expect to make it to MLangCodePageEncoding ISerializable.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - -// Same problem with the Encoder, this only happens with Everett Encoders - [Serializable] - internal sealed class MLangEncoder : IObjectReference, ISerializable - { - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization, have to handle deserializing from Everett - internal MLangEncoder(SerializationInfo info, StreamingContext context) - { - // Any info? - if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - this.realEncoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); - } - - // Just get it from GetEncoder - public Object GetRealObject(StreamingContext context) - { - return this.realEncoding.GetEncoder(); - } - - // ISerializable implementation, get data for this object - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. - Debug.Assert(false, "Didn't expect to make it to MLangCodePageEncoding.MLangEncoder.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - } - - - // Same problem with the Decoder, this only happens with Everett Decoders - [Serializable] - internal sealed class MLangDecoder : IObjectReference, ISerializable - { - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization, have to handle deserializing from Everett - internal MLangDecoder(SerializationInfo info, StreamingContext context) - { - // Any info? 
- if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - this.realEncoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); - } - - // Just get it from GetDecoder - public Object GetRealObject(StreamingContext context) - { - return this.realEncoding.GetDecoder(); - } - - // ISerializable implementation, get data for this object - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. - Debug.Assert(false, "Didn't expect to make it to MLangCodePageEncoding.MLangDecoder.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - } - } -} diff --git a/src/mscorlib/src/System/Text/Normalization.Windows.cs b/src/mscorlib/src/System/Text/Normalization.Windows.cs index b2faf0db68..3bcba08934 100644 --- a/src/mscorlib/src/System/Text/Normalization.Windows.cs +++ b/src/mscorlib/src/System/Text/Normalization.Windows.cs @@ -21,19 +21,15 @@ namespace System.Text // // Flags that track whether given normalization form was initialized // -#if !FEATURE_NORM_IDNA_ONLY private static volatile bool NFC; private static volatile bool NFD; private static volatile bool NFKC; private static volatile bool NFKD; -#endif // !FEATURE_NORM_IDNA_ONLY private static volatile bool IDNA; -#if !FEATURE_NORM_IDNA_ONLY private static volatile bool NFCDisallowUnassigned; private static volatile bool NFDDisallowUnassigned; private static volatile bool NFKCDisallowUnassigned; private static volatile bool NFKDDisallowUnassigned; -#endif // !FEATURE_NORM_IDNA_ONLY private static volatile bool IDNADisallowUnassigned; private static volatile bool Other; @@ -77,7 +73,6 @@ namespace System.Text { switch ((ExtendedNormalizationForms)form) { -#if !FEATURE_NORM_IDNA_ONLY case ExtendedNormalizationForms.FormC: if (NFC) return; InitializeForm(form, "normnfc.nlp"); @@ -101,7 +96,6 @@ namespace System.Text InitializeForm(form, "normnfkd.nlp"); 
NFKD = true; break; -#endif // !FEATURE_NORM_IDNA_ONLY case ExtendedNormalizationForms.FormIdna: if (IDNA) return; @@ -109,7 +103,6 @@ namespace System.Text IDNA = true; break; -#if !FEATURE_NORM_IDNA_ONLY case ExtendedNormalizationForms.FormCDisallowUnassigned: if (NFCDisallowUnassigned) return; InitializeForm(form, "normnfc.nlp"); @@ -133,7 +126,6 @@ namespace System.Text InitializeForm(form, "normnfkd.nlp"); NFKDDisallowUnassigned = true; break; -#endif // !FEATURE_NORM_IDNA_ONLY case ExtendedNormalizationForms.FormIdnaDisallowUnassigned: if (IDNADisallowUnassigned) return; diff --git a/src/mscorlib/src/System/Text/Normalization.cs b/src/mscorlib/src/System/Text/Normalization.cs index e7e733a587..c81149d59a 100644 --- a/src/mscorlib/src/System/Text/Normalization.cs +++ b/src/mscorlib/src/System/Text/Normalization.cs @@ -5,32 +5,25 @@ namespace System.Text { // This is the enumeration for Normalization Forms -[System.Runtime.InteropServices.ComVisible(true)] public enum NormalizationForm { -#if !FEATURE_NORM_IDNA_ONLY FormC = 1, FormD = 2, FormKC = 5, FormKD = 6 -#endif // !FEATURE_NORM_IDNA_ONLY } internal enum ExtendedNormalizationForms { -#if !FEATURE_NORM_IDNA_ONLY FormC = 1, FormD = 2, FormKC = 5, FormKD = 6, -#endif // !FEATURE_NORM_IDNA_ONLY FormIdna = 0xd, -#if !FEATURE_NORM_IDNA_ONLY FormCDisallowUnassigned = 0x101, FormDDisallowUnassigned = 0x102, FormKCDisallowUnassigned = 0x105, FormKDDisallowUnassigned = 0x106, -#endif // !FEATURE_NORM_IDNA_ONLY FormIdnaDisallowUnassigned = 0x10d } } diff --git a/src/mscorlib/src/System/Text/SBCSCodePageEncoding.cs b/src/mscorlib/src/System/Text/SBCSCodePageEncoding.cs deleted file mode 100644 index 8b07149fb7..0000000000 --- a/src/mscorlib/src/System/Text/SBCSCodePageEncoding.cs +++ /dev/null @@ -1,1009 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding -namespace System.Text -{ - using System; - using System.Diagnostics; - using System.Diagnostics.Contracts; - using System.Text; - using System.Threading; - using System.Globalization; - using System.Runtime.Serialization; - using System.Security; - using System.Security.Permissions; - - // SBCSCodePageEncoding - [Serializable] - internal class SBCSCodePageEncoding : BaseCodePageEncoding, ISerializable - { - // Pointers to our memory section parts - [NonSerialized] - unsafe char* mapBytesToUnicode = null; // char 256 - [NonSerialized] - unsafe byte* mapUnicodeToBytes = null; // byte 65536 - [NonSerialized] - unsafe int* mapCodePageCached = null; // to remember which CP is cached - - const char UNKNOWN_CHAR=(char)0xFFFD; - - // byteUnknown is used for default fallback only - [NonSerialized] - byte byteUnknown; - [NonSerialized] - char charUnknown; - - public SBCSCodePageEncoding(int codePage) : this(codePage, codePage) - { - } - - internal SBCSCodePageEncoding(int codePage, int dataCodePage) : base(codePage, dataCodePage) - { - } - - // Constructor called by serialization. - // Note: We use the base GetObjectData however - internal SBCSCodePageEncoding(SerializationInfo info, StreamingContext context) : base(0) - { - // Actually this can't ever get called, CodePageEncoding is our proxy - Debug.Assert(false, "Didn't expect to make it to SBCSCodePageEncoding serialization constructor"); - throw new ArgumentNullException("this"); - } - - // We have a managed code page entry, so load our tables - // SBCS data section looks like: - // - // char[256] - what each byte maps to in unicode. No support for surrogates. 0 is undefined code point - // (except 0 for byte 0 is expected to be a real 0) - // - // byte/char* - Data for best fit (unicode->bytes), again no best fit for Unicode - // 1st WORD is Unicode // of 1st character position - // Next bytes are best fit byte for that position. 
Position is incremented after each byte - // byte < 0x20 means skip the next n positions. (Where n is the byte #) - // byte == 1 means that next word is another unicode code point # - // byte == 0 is unknown. (doesn't override initial WCHAR[256] table! - protected override unsafe void LoadManagedCodePage() - { - // Should be loading OUR code page - Debug.Assert(pCodePage->CodePage == this.dataTableCodePage, - "[SBCSCodePageEncoding.LoadManagedCodePage]Expected to load data table code page"); - - // Make sure we're really a 1 byte code page - if (pCodePage->ByteCount != 1) - throw new NotSupportedException( - Environment.GetResourceString("NotSupported_NoCodepageData", CodePage)); - - // Remember our unknown bytes & chars - byteUnknown = (byte)pCodePage->ByteReplace; - charUnknown = pCodePage->UnicodeReplace; - - // Get our mapped section 65536 bytes for unicode->bytes, 256 * 2 bytes for bytes->unicode - // Plus 4 byte to remember CP # when done loading it. (Don't want to get IA64 or anything out of alignment) - byte *pMemorySection = GetSharedMemory(65536*1 + 256*2 + 4 + iExtraBytes); - - mapBytesToUnicode = (char*)pMemorySection; - mapUnicodeToBytes = (byte*)(pMemorySection + 256 * 2); - mapCodePageCached = (int*)(pMemorySection + 256 * 2 + 65536 * 1 + iExtraBytes); - - // If its cached (& filled in) we don't have to do anything else - if (*mapCodePageCached != 0) - { - Debug.Assert(*mapCodePageCached == this.dataTableCodePage, - "[DBCSCodePageEncoding.LoadManagedCodePage]Expected mapped section cached page to be same as data table code page. Cached : " + - *mapCodePageCached + " Expected:" + this.dataTableCodePage); - - if (*mapCodePageCached != this.dataTableCodePage) - throw new OutOfMemoryException( - Environment.GetResourceString("Arg_OutOfMemoryException")); - - // If its cached (& filled in) we don't have to do anything else - return; - } - - // Need to read our data file and fill in our section. 
- // WARNING: Multiple code pieces could do this at once (so we don't have to lock machine-wide) - // so be careful here. Only stick legal values in here, don't stick temporary values. - - // Read our data file and set mapBytesToUnicode and mapUnicodeToBytes appropriately - // First table is just all 256 mappings - char* pTemp = (char*)&(pCodePage->FirstDataWord); - for (int b = 0; b < 256; b++) - { - // Don't want to force 0's to map Unicode wrong. 0 byte == 0 unicode already taken care of - if (pTemp[b] != 0 || b == 0) - { - mapBytesToUnicode[b] = pTemp[b]; - - if (pTemp[b] != UNKNOWN_CHAR) - mapUnicodeToBytes[pTemp[b]] = (byte)b; - } - else - { - mapBytesToUnicode[b] = UNKNOWN_CHAR; - } - } - - // We're done with our mapped section, set our flag so others don't have to rebuild table. - *mapCodePageCached = this.dataTableCodePage; - } - - // Private object for locking instead of locking on a public type for SQL reliability work. - private static Object s_InternalSyncObject; - private static Object InternalSyncObject - { - get - { - if (s_InternalSyncObject == null) - { - Object o = new Object(); - Interlocked.CompareExchange<Object>(ref s_InternalSyncObject, o, null); - } - return s_InternalSyncObject; - } - } - - // Read in our best fit table - protected unsafe override void ReadBestFitTable() - { - // Lock so we don't confuse ourselves. - lock(InternalSyncObject) - { - // If we got a best fit array already, then don't do this - if (arrayUnicodeBestFit == null) - { - // - // Read in Best Fit table. - // - - // First check the SBCS->Unicode best fit table, which starts right after the - // 256 word data table. This table looks like word, word where 1st word is byte and 2nd - // word is replacement for that word. It ends when byte == 0. 
- byte* pData = (byte*)&(pCodePage->FirstDataWord); - pData += 512; - - // Need new best fit array - char[] arrayTemp = new char[256]; - for (int i = 0; i < 256; i++) - arrayTemp[i] = mapBytesToUnicode[i]; - - // See if our words are zero - ushort byteTemp; - while ((byteTemp = *((ushort*)pData)) != 0) - { - - Debug.Assert(arrayTemp[byteTemp] == UNKNOWN_CHAR, String.Format(CultureInfo.InvariantCulture, - "[SBCSCodePageEncoding::ReadBestFitTable] Expected unallocated byte (not 0x{2:X2}) for best fit byte at 0x{0:X2} for code page {1}", - byteTemp, CodePage, (int)arrayTemp[byteTemp])); - pData += 2; - - arrayTemp[byteTemp] = *((char*)pData); - pData += 2; - } - - // Remember our new array - arrayBytesBestFit = arrayTemp; - - // It was on 0, it needs to be on next byte - pData+=2; - byte* pUnicodeToSBCS = pData; - - // Now count our characters from our Unicode->SBCS best fit table, - // which is right after our 256 byte data table - int iBestFitCount = 0; - - // Now do the UnicodeToBytes Best Fit mapping (this is the one we normally think of when we say "best fit") - // pData should be pointing at the first data point for Bytes->Unicode table - int unicodePosition = *((ushort*)pData); - pData += 2; - - while (unicodePosition < 0x10000) - { - // Get the next byte - byte input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next 2 bytes as our byte position - unicodePosition = *((ushort*)pData); - pData+=2; - } - else if (input < 0x20 && input > 0 && input != 0x1e) - { - // Advance input characters - unicodePosition += input; - } - else - { - // Use this character if it isn't zero - if (input > 0) - iBestFitCount++; - - // skip this unicode position in any case - unicodePosition++; - } - } - - // Make an array for our best fit data - arrayTemp = new char[iBestFitCount*2]; - - // Now actually read in the data - // reset pData should be pointing at the first data point for Bytes->Unicode table - pData = pUnicodeToSBCS; - unicodePosition = 
*((ushort*)pData); - pData += 2; - iBestFitCount = 0; - - while (unicodePosition < 0x10000) - { - // Get the next byte - byte input = *pData; - pData++; - - // build our table: - if (input == 1) - { - // Use next 2 bytes as our byte position - unicodePosition = *((ushort*)pData); - pData+=2; - } - else if (input < 0x20 && input > 0 && input != 0x1e) - { - // Advance input characters - unicodePosition += input; - } - else - { - // Check for escape for glyph range - if (input == 0x1e) - { - // Its an escape, so just read next byte directly - input = *pData; - pData++; - } - - // 0 means just skip me - if (input > 0) - { - // Use this character - arrayTemp[iBestFitCount++] = (char)unicodePosition; - // Have to map it to Unicode because best fit will need unicode value of best fit char. - arrayTemp[iBestFitCount++] = mapBytesToUnicode[input]; - - // This won't work if it won't round trip. - Debug.Assert(arrayTemp[iBestFitCount-1] != (char)0, - String.Format(CultureInfo.InvariantCulture, - "[SBCSCodePageEncoding.ReadBestFitTable] No valid Unicode value {0:X4} for round trip bytes {1:X4}, encoding {2}", - (int)mapBytesToUnicode[input], (int)input, CodePage)); - } - unicodePosition++; - } - } - - // Remember it - arrayUnicodeBestFit = arrayTemp; - } - } - } - - // GetByteCount - // Note: We start by assuming that the output will be the same as count. Having - // an encoder or fallback may change that assumption - internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(count >= 0, "[SBCSCodePageEncoding.GetByteCount]count is negative"); - Debug.Assert(chars != null, "[SBCSCodePageEncoding.GetByteCount]chars is null"); - - // Assert because we shouldn't be able to have a null encoder. 
- Debug.Assert(encoderFallback != null, "[SBCSCodePageEncoding.GetByteCount]Attempting to use null fallback"); - - CheckMemorySection(); - - // Need to test fallback - EncoderReplacementFallback fallback = null; - - // Get any left over characters - char charLeftOver = (char)0; - if (encoder != null) - { - charLeftOver = encoder.charLeftOver; - Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), - "[SBCSCodePageEncoding.GetByteCount]leftover character should be high surrogate"); - fallback = encoder.Fallback as EncoderReplacementFallback; - - // Verify that we have no fallbackbuffer, actually for SBCS this is always empty, so just assert - Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer || - encoder.FallbackBuffer.Remaining == 0, - "[SBCSCodePageEncoding.GetByteCount]Expected empty fallback buffer at start"); - } - else - { - // If we aren't using default fallback then we may have a complicated count. - fallback = this.EncoderFallback as EncoderReplacementFallback; - } - - if ((fallback != null && fallback.MaxCharCount == 1)/* || bIsBestFit*/) - { - // Replacement fallback encodes surrogate pairs as two ?? (or two whatever), so return size is always - // same as input size. - // Note that no existing SBCS code pages map code points to supplimentary characters, so this is easy. - - // We could however have 1 extra byte if the last call had an encoder and a funky fallback and - // if we don't use the funky fallback this time. - - // Do we have an extra char left over from last time? - if (charLeftOver > 0) - count++; - - return (count); - } - - // It had a funky fallback, so its more complicated - // Need buffer maybe later - EncoderFallbackBuffer fallbackBuffer = null; - - // prepare our end - int byteCount = 0; - char* charEnd = chars + count; - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - // Since left over char was a surrogate, it'll have to be fallen back. 
- // Get Fallback - Debug.Assert(encoder != null, "[SBCSCodePageEncoding.GetByteCount]Expect to have encoder if we have a charLeftOver"); - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, false); - - // This will fallback a pair if *chars is a low surrogate - fallbackBuffer.InternalFallback(charLeftOver, ref chars); - } - - // Now we may have fallback char[] already from the encoder - - // Go ahead and do it, including the fallback. - char ch; - while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) - { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // get byte for this char - byte bTemp = mapUnicodeToBytes[ch]; - - // Check for fallback, this'll catch surrogate pairs too. - if (bTemp == 0 && ch != (char)0) - { - if (fallbackBuffer == null) - { - // Create & init fallback buffer - if (encoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = encoder.FallbackBuffer; - - // chars has moved so we need to remember figure it out so Exception fallback - // index will be correct - fallbackBuffer.InternalInitialize(charEnd - count, charEnd, encoder, false); - } - - // Get Fallback - fallbackBuffer.InternalFallback(ch, ref chars); - continue; - } - - // We'll use this one - byteCount++; - } - - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[SBCSEncoding.GetByteCount]Expected Empty fallback buffer at end"); - - return (int)byteCount; - } - - internal override unsafe int GetBytes(char* chars, int charCount, - byte* bytes, int byteCount, EncoderNLS encoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[SBCSCodePageEncoding.GetBytes]bytes is null"); - Debug.Assert(byteCount >= 0, "[SBCSCodePageEncoding.GetBytes]byteCount is 
negative"); - Debug.Assert(chars != null, "[SBCSCodePageEncoding.GetBytes]chars is null"); - Debug.Assert(charCount >= 0, "[SBCSCodePageEncoding.GetBytes]charCount is negative"); - - // Assert because we shouldn't be able to have a null encoder. - Debug.Assert(encoderFallback != null, "[SBCSCodePageEncoding.GetBytes]Attempting to use null encoder fallback"); - - CheckMemorySection(); - - // Need to test fallback - EncoderReplacementFallback fallback = null; - - // Get any left over characters - char charLeftOver = (char)0; - if (encoder != null) - { - charLeftOver = encoder.charLeftOver; - Debug.Assert(charLeftOver == 0 || Char.IsHighSurrogate(charLeftOver), - "[SBCSCodePageEncoding.GetBytes]leftover character should be high surrogate"); - fallback = encoder.Fallback as EncoderReplacementFallback; - - // Verify that we have no fallbackbuffer, for SBCS its always empty, so just assert - Debug.Assert(!encoder.m_throwOnOverflow || !encoder.InternalHasFallbackBuffer || - encoder.FallbackBuffer.Remaining == 0, - "[SBCSCodePageEncoding.GetBytes]Expected empty fallback buffer at start"); -// if (encoder.m_throwOnOverflow && encoder.InternalHasFallbackBuffer && -// encoder.FallbackBuffer.Remaining > 0) -// throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty", -// this.EncodingName, encoder.Fallback.GetType())); - } - else - { - // If we aren't using default fallback then we may have a complicated count. - fallback = this.EncoderFallback as EncoderReplacementFallback; - } - - // prepare our end - char* charEnd = chars + charCount; - byte* byteStart = bytes; - char* charStart = chars; - - // See if we do the fast default or slightly slower fallback - if (fallback != null && fallback.MaxCharCount == 1) - { - // Make sure our fallback character is valid first - byte bReplacement = mapUnicodeToBytes[fallback.DefaultString[0]]; - - // Check for replacements in range, otherwise fall back to slow version. 
- if (bReplacement != 0) - { - // We should have exactly as many output bytes as input bytes, unless there's a left - // over character, in which case we may need one more. - - // If we had a left over character will have to add a ? (This happens if they had a funky - // fallback last time, but not this time.) (We can't spit any out though - // because with fallback encoder each surrogate is treated as a seperate code point) - if (charLeftOver > 0) - { - // Have to have room - // Throw even if doing no throw version because this is just 1 char, - // so buffer will never be big enough - if (byteCount == 0) - ThrowBytesOverflow(encoder, true); - - // This'll make sure we still have more room and also make sure our return value is correct. - *(bytes++) = bReplacement; - byteCount--; // We used one of the ones we were counting. - } - - // This keeps us from overrunning our output buffer - if (byteCount < charCount) - { - // Throw or make buffer smaller? - ThrowBytesOverflow(encoder, byteCount < 1); - - // Just use what we can - charEnd = chars + byteCount; - } - - // Simple way - while (chars < charEnd) - { - char ch2 = *chars; - chars++; - - byte bTemp = mapUnicodeToBytes[ch2]; - - // Check for fallback - if (bTemp == 0 && ch2 != (char)0) - *bytes = bReplacement; - else - *bytes = bTemp; - - bytes++; - } - - // Clear encoder - if (encoder != null) - { - encoder.charLeftOver = (char)0; - encoder.m_charsUsed = (int)(chars-charStart); - } - return (int)(bytes - byteStart); - } - } - - // Slower version, have to do real fallback. - - // For fallback we may need a fallback buffer, we know we aren't default fallback - EncoderFallbackBuffer fallbackBuffer = null; - - // prepare our end - byte* byteEnd = bytes + byteCount; - - // We may have a left over character from last time, try and process it. - if (charLeftOver > 0) - { - // Since left over char was a surrogate, it'll have to be fallen back. 
- // Get Fallback - Debug.Assert(encoder != null, "[SBCSCodePageEncoding.GetBytes]Expect to have encoder if we have a charLeftOver"); - fallbackBuffer = encoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(chars, charEnd, encoder, true); - - // This will fallback a pair if *chars is a low surrogate - fallbackBuffer.InternalFallback(charLeftOver, ref chars); - if (fallbackBuffer.Remaining > byteEnd - bytes) - { - // Throw it, if we don't have enough for this we never will - ThrowBytesOverflow(encoder, true); - } - } - - // Now we may have fallback char[] already from the encoder fallback above - - // Go ahead and do it, including the fallback. - char ch; - while ((ch = (fallbackBuffer == null) ? '\0' : fallbackBuffer.InternalGetNextChar()) != 0 || - chars < charEnd) - { - // First unwind any fallback - if (ch == 0) - { - // No fallback, just get next char - ch = *chars; - chars++; - } - - // get byte for this char - byte bTemp = mapUnicodeToBytes[ch]; - - // Check for fallback, this'll catch surrogate pairs too. - if (bTemp == 0 && ch != (char)0) - { - // Get Fallback - if ( fallbackBuffer == null ) - { - // Create & init fallback buffer - if (encoder == null) - fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = encoder.FallbackBuffer; - // chars has moved so we need to remember figure it out so Exception fallback - // index will be correct - fallbackBuffer.InternalInitialize(charEnd - charCount, charEnd, encoder, true); - } - - // Make sure we have enough room. 
Each fallback char will be 1 output char - // (or recursion exception will be thrown) - fallbackBuffer.InternalFallback(ch, ref chars); - if (fallbackBuffer.Remaining > byteEnd - bytes) - { - // Didn't use this char, reset it - Debug.Assert(chars > charStart, - "[SBCSCodePageEncoding.GetBytes]Expected chars to have advanced (fallback)"); - chars--; - fallbackBuffer.InternalReset(); - - // Throw it & drop this data - ThrowBytesOverflow(encoder, chars == charStart); - break; - } - continue; - } - - // We'll use this one - // Bounds check - if (bytes >= byteEnd) - { - // didn't use this char, we'll throw or use buffer - Debug.Assert(fallbackBuffer == null || fallbackBuffer.bFallingBack == false, - "[SBCSCodePageEncoding.GetBytes]Expected to NOT be falling back"); - if (fallbackBuffer == null || fallbackBuffer.bFallingBack == false) - { - Debug.Assert(chars > charStart, - "[SBCSCodePageEncoding.GetBytes]Expected chars to have advanced (normal)"); - chars--; // don't use last char - } - ThrowBytesOverflow(encoder, chars == charStart); // throw ? 
- break; // don't throw, stop - } - - // Go ahead and add it - *bytes = bTemp; - bytes++; - } - - // encoder stuff if we have one - if (encoder != null) - { - // Fallback stuck it in encoder if necessary, but we have to clear MustFlush cases - if (fallbackBuffer != null && !fallbackBuffer.bUsedEncoder) - // Clear it in case of MustFlush - encoder.charLeftOver = (char)0; - - // Set our chars used count - encoder.m_charsUsed = (int)(chars - charStart); - } - - // Expect Empty fallback buffer for SBCS - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[SBCSEncoding.GetBytes]Expected Empty fallback buffer at end"); - - return (int)(bytes - byteStart); - } - - // This is internal and called by something else, - internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS decoder) - { - // Just assert, we're called internally so these should be safe, checked already - Debug.Assert(bytes != null, "[SBCSCodePageEncoding.GetCharCount]bytes is null"); - Debug.Assert(count >= 0, "[SBCSCodePageEncoding.GetCharCount]byteCount is negative"); - - CheckMemorySection(); - - // See if we have best fit - bool bUseBestFit = false; - - // Only need decoder fallback buffer if not using default replacement fallback or best fit fallback. 
- DecoderReplacementFallback fallback = null; - - if (decoder == null) - { - fallback = this.DecoderFallback as DecoderReplacementFallback; - bUseBestFit = this.DecoderFallback.IsMicrosoftBestFitFallback; - } - else - { - fallback = decoder.Fallback as DecoderReplacementFallback; - bUseBestFit = decoder.Fallback.IsMicrosoftBestFitFallback; - Debug.Assert(!decoder.m_throwOnOverflow || !decoder.InternalHasFallbackBuffer || - decoder.FallbackBuffer.Remaining == 0, - "[SBCSCodePageEncoding.GetChars]Expected empty fallback buffer at start"); - } - - if (bUseBestFit || (fallback != null && fallback.MaxCharCount == 1)) - { - // Just return length, SBCS stay the same length because they don't map to surrogate - // pairs and we don't have a decoder fallback. - return count; - } - - // Might need one of these later - DecoderFallbackBuffer fallbackBuffer = null; - - // Have to do it the hard way. - // Assume charCount will be == count - int charCount = count; - byte[] byteBuffer = new byte[1]; - - // Do it our fast way - byte* byteEnd = bytes + count; - - // Quick loop - while (bytes < byteEnd) - { - // Faster if don't use *bytes++; - char c; - c = mapBytesToUnicode[*bytes]; - bytes++; - - // If unknown we have to do fallback count - if (c == UNKNOWN_CHAR) - { - // Must have a fallback buffer - if (fallbackBuffer == null) - { - // Need to adjust count so we get real start - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - count, null); - } - - // Use fallback buffer - byteBuffer[0] = *(bytes - 1); - charCount--; // We'd already reserved one for *(bytes-1) - charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes); - } - } - - // Fallback buffer must be empty - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[SBCSEncoding.GetCharCount]Expected Empty fallback buffer at end"); - - // Converted sequence is same length 
as input - return charCount; - } - - internal override unsafe int GetChars(byte* bytes, int byteCount, - char* chars, int charCount, DecoderNLS decoder) - { - // Just need to ASSERT, this is called by something else internal that checked parameters already - Debug.Assert(bytes != null, "[SBCSCodePageEncoding.GetChars]bytes is null"); - Debug.Assert(byteCount >= 0, "[SBCSCodePageEncoding.GetChars]byteCount is negative"); - Debug.Assert(chars != null, "[SBCSCodePageEncoding.GetChars]chars is null"); - Debug.Assert(charCount >= 0, "[SBCSCodePageEncoding.GetChars]charCount is negative"); - - CheckMemorySection(); - - // See if we have best fit - bool bUseBestFit = false; - - // Do it fast way if using ? replacement or best fit fallbacks - byte* byteEnd = bytes + byteCount; - byte* byteStart = bytes; - char* charStart = chars; - - // Only need decoder fallback buffer if not using default replacement fallback or best fit fallback. - DecoderReplacementFallback fallback = null; - - if (decoder == null) - { - fallback = this.DecoderFallback as DecoderReplacementFallback; - bUseBestFit = this.DecoderFallback.IsMicrosoftBestFitFallback; - } - else - { - fallback = decoder.Fallback as DecoderReplacementFallback; - bUseBestFit = decoder.Fallback.IsMicrosoftBestFitFallback; - Debug.Assert(!decoder.m_throwOnOverflow || !decoder.InternalHasFallbackBuffer || - decoder.FallbackBuffer.Remaining == 0, - "[SBCSCodePageEncoding.GetChars]Expected empty fallback buffer at start"); - } - - if (bUseBestFit || (fallback != null && fallback.MaxCharCount == 1)) - { - // Try it the fast way - char replacementChar; - if (fallback == null) - replacementChar = '?'; // Best fit alwasy has ? 
for fallback for SBCS - else - replacementChar = fallback.DefaultString[0]; - - // Need byteCount chars, otherwise too small buffer - if (charCount < byteCount) - { - // Need at least 1 output byte, throw if must throw - ThrowCharsOverflow(decoder, charCount < 1); - - // Not throwing, use what we can - byteEnd = bytes + charCount; - } - - // Quick loop, just do '?' replacement because we don't have fallbacks for decodings. - while (bytes < byteEnd) - { - char c; - if (bUseBestFit) - { - if (arrayBytesBestFit == null) - { - ReadBestFitTable(); - } - c = arrayBytesBestFit[*bytes]; - } - else - c = mapBytesToUnicode[*bytes]; - bytes++; - - if (c == UNKNOWN_CHAR) - // This is an invalid byte in the ASCII encoding. - *chars = replacementChar; - else - *chars = c; - chars++; - } - - // bytes & chars used are the same - if (decoder != null) - decoder.m_bytesUsed = (int)(bytes - byteStart); - return (int)(chars - charStart); - } - - // Slower way's going to need a fallback buffer - DecoderFallbackBuffer fallbackBuffer = null; - byte[] byteBuffer = new byte[1]; - char* charEnd = chars + charCount; - - // Not quite so fast loop - while (bytes < byteEnd) - { - // Faster if don't use *bytes++; - char c = mapBytesToUnicode[*bytes]; - bytes++; - - // See if it was unknown - if (c == UNKNOWN_CHAR) - { - // Make sure we have a fallback buffer - if (fallbackBuffer == null) - { - if (decoder == null) - fallbackBuffer = this.DecoderFallback.CreateFallbackBuffer(); - else - fallbackBuffer = decoder.FallbackBuffer; - fallbackBuffer.InternalInitialize(byteEnd - byteCount, charEnd); - } - - // Use fallback buffer - Debug.Assert(bytes > byteStart, - "[SBCSCodePageEncoding.GetChars]Expected bytes to have advanced already (unknown byte)"); - byteBuffer[0] = *(bytes - 1); - // Fallback adds fallback to chars, but doesn't increment chars unless the whole thing fits. 
- if (!fallbackBuffer.InternalFallback(byteBuffer, bytes, ref chars)) - { - // May or may not throw, but we didn't get this byte - bytes--; // unused byte - fallbackBuffer.InternalReset(); // Didn't fall this back - ThrowCharsOverflow(decoder, bytes == byteStart); // throw? - break; // don't throw, but stop loop - } - } - else - { - // Make sure we have buffer space - if (chars >= charEnd) - { - Debug.Assert(bytes > byteStart, - "[SBCSCodePageEncoding.GetChars]Expected bytes to have advanced already (known byte)"); - bytes--; // unused byte - ThrowCharsOverflow(decoder, bytes == byteStart); // throw? - break; // don't throw, but stop loop - } - - *(chars) = c; - chars++; - } - } - - // Might have had decoder fallback stuff. - if (decoder != null) - decoder.m_bytesUsed = (int)(bytes - byteStart); - - // Expect Empty fallback buffer for GetChars - Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0, - "[SBCSEncoding.GetChars]Expected Empty fallback buffer at end"); - - return (int)(chars - charStart); - } - - public override int GetMaxByteCount(int charCount) - { - if (charCount < 0) - throw new ArgumentOutOfRangeException(nameof(charCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Characters would be # of characters + 1 in case high surrogate is ? * max fallback - long byteCount = (long)charCount + 1; - - if (EncoderFallback.MaxCharCount > 1) - byteCount *= EncoderFallback.MaxCharCount; - - // 1 to 1 for most characters. Only surrogates with fallbacks have less. 
- - if (byteCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow")); - return (int)byteCount; - } - - public override int GetMaxCharCount(int byteCount) - { - if (byteCount < 0) - throw new ArgumentOutOfRangeException(nameof(byteCount), - Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum")); - Contract.EndContractBlock(); - - // Just return length, SBCS stay the same length because they don't map to surrogate - long charCount = (long)byteCount; - - // 1 to 1 for most characters. Only surrogates with fallbacks have less, unknown fallbacks could be longer. - if (DecoderFallback.MaxCharCount > 1) - charCount *= DecoderFallback.MaxCharCount; - - if (charCount > 0x7fffffff) - throw new ArgumentOutOfRangeException(nameof(byteCount), Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow")); - - return (int)charCount; - } - - // True if and only if the encoding only uses single byte code points. (Ie, ASCII, 1252, etc) - public override bool IsSingleByte - { - get - { - return true; - } - } - - [System.Runtime.InteropServices.ComVisible(false)] - public override bool IsAlwaysNormalized(NormalizationForm form) - { - // Most of these code pages could be decomposed or have compatibility mappings for KC, KD, & D - // additionally the allow unassigned forms and IDNA wouldn't work either, so C is our choice. - if (form == NormalizationForm.FormC) - { - // Form C is only true for some code pages. They have to have all 256 code points assigned - // and not map to unassigned or combinable code points. - switch (CodePage) - { - // Return true for some code pages. 
- case 1252: // (Latin I - ANSI) - case 1250: // (Eastern Europe - ANSI) - case 1251: // (Cyrillic - ANSI) - case 1254: // (Turkish - ANSI) - case 1256: // (Arabic - ANSI) - case 28591: // (ISO 8859-1 Latin I) - case 437: // (United States - OEM) - case 737: // (Greek (aka 437G) - OEM) - case 775: // (Baltic - OEM) - case 850: // (Multilingual (Latin I) - OEM) - case 852: // (Slovak (Latin II) - OEM) - case 855: // (Cyrillic - OEM) - case 858: // (Multilingual (Latin I) - OEM + Euro) - case 860: // (Portuguese - OEM) - case 861: // (Icelandic - OEM) - case 862: // (Hebrew - OEM) - case 863: // (Canadian French - OEM) - case 865: // (Nordic - OEM) - case 866: // (Russian - OEM) - case 869: // (Modern Greek - OEM) - case 10007: // (Cyrillic - MAC) - case 10017: // (Ukraine - MAC) - case 10029: // (Latin II - MAC) - case 28592: // (ISO 8859-2 Eastern Europe) - case 28594: // (ISO 8859-4 Baltic) - case 28595: // (ISO 8859-5 Cyrillic) - case 28599: // (ISO 8859-9 Latin Alphabet No.5) - case 28603: // (ISO/IEC 8859-13:1998 (Lithuanian)) - case 28605: // (ISO 8859-15 Latin 9 (IBM923=IBM819+Euro)) - case 037: // (IBM EBCDIC U.S./Canada) - case 500: // (IBM EBCDIC International) - case 870: // (IBM EBCDIC Latin-2 Multilingual/ROECE) - case 1026: // (IBM EBCDIC Latin-5 Turkey) - case 1047: // (IBM Latin-1/Open System) - case 1140: // (IBM EBCDIC U.S./Canada (037+Euro)) - case 1141: // (IBM EBCDIC Germany (20273(IBM273)+Euro)) - case 1142: // (IBM EBCDIC Denmark/Norway (20277(IBM277+Euro)) - case 1143: // (IBM EBCDIC Finland/Sweden (20278(IBM278)+Euro)) - case 1144: // (IBM EBCDIC Italy (20280(IBM280)+Euro)) - case 1145: // (IBM EBCDIC Latin America/Spain (20284(IBM284)+Euro)) - case 1146: // (IBM EBCDIC United Kingdom (20285(IBM285)+Euro)) - case 1147: // (IBM EBCDIC France (20297(IBM297+Euro)) - case 1148: // (IBM EBCDIC International (500+Euro)) - case 1149: // (IBM EBCDIC Icelandic (20871(IBM871+Euro)) - case 20273: // (IBM EBCDIC Germany) - case 20277: // (IBM EBCDIC 
Denmark/Norway) - case 20278: // (IBM EBCDIC Finland/Sweden) - case 20280: // (IBM EBCDIC Italy) - case 20284: // (IBM EBCDIC Latin America/Spain) - case 20285: // (IBM EBCDIC United Kingdom) - case 20297: // (IBM EBCDIC France) - case 20871: // (IBM EBCDIC Icelandic) - case 20880: // (IBM EBCDIC Cyrillic) - case 20924: // (IBM Latin-1/Open System (IBM924=IBM1047+Euro)) - case 21025: // (IBM EBCDIC Cyrillic (Serbian, Bulgarian)) - case 720: // (Arabic - Transparent ASMO) - case 20866: // (Russian - KOI8) - case 21866: // (Ukrainian - KOI8-U) - return true; - } - } - - // False for IDNA and unknown - return false; - } - } -} -#endif // FEATURE_CODEPAGES_FILE diff --git a/src/mscorlib/src/System/Text/StringBuilder.cs b/src/mscorlib/src/System/Text/StringBuilder.cs index f20146fe00..72247c333e 100644 --- a/src/mscorlib/src/System/Text/StringBuilder.cs +++ b/src/mscorlib/src/System/Text/StringBuilder.cs @@ -41,7 +41,6 @@ namespace System.Text { // Console.WriteLine(sb1); // Console.WriteLine(sb2); // - [System.Runtime.InteropServices.ComVisible(true)] [Serializable] public sealed class StringBuilder : ISerializable { @@ -593,7 +592,7 @@ namespace System.Text { throw new ArgumentOutOfRangeException(nameof(startIndex), Environment.GetResourceString("ArgumentOutOfRange_GenericPositive")); } if (charCount<0) { - throw new ArgumentOutOfRangeException("count", Environment.GetResourceString("ArgumentOutOfRange_GenericPositive")); + throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GenericPositive")); } Contract.Ensures(Contract.Result<StringBuilder>() != null); Contract.EndContractBlock(); @@ -605,7 +604,7 @@ namespace System.Text { throw new ArgumentNullException(nameof(value)); } if (charCount > value.Length - startIndex) { - throw new ArgumentOutOfRangeException("count", Environment.GetResourceString("ArgumentOutOfRange_Index")); + throw new ArgumentOutOfRangeException(nameof(charCount), 
Environment.GetResourceString("ArgumentOutOfRange_Index")); } if (charCount==0) { @@ -715,20 +714,17 @@ namespace System.Text { } } - [System.Runtime.InteropServices.ComVisible(false)] public StringBuilder AppendLine() { Contract.Ensures(Contract.Result<StringBuilder>() != null); return Append(Environment.NewLine); } - [System.Runtime.InteropServices.ComVisible(false)] public StringBuilder AppendLine(string value) { Contract.Ensures(Contract.Result<StringBuilder>() != null); Append(value); return Append(Environment.NewLine); } - [System.Runtime.InteropServices.ComVisible(false)] public void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count) { if (destination == null) { throw new ArgumentNullException(nameof(destination)); @@ -848,7 +844,7 @@ namespace System.Text { } if (length > Length - startIndex) { - throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index")); + throw new ArgumentOutOfRangeException(nameof(length), Environment.GetResourceString("ArgumentOutOfRange_Index")); } Contract.Ensures(Contract.Result<StringBuilder>() != null); Contract.EndContractBlock(); @@ -1205,7 +1201,7 @@ namespace System.Text { } if (charCount < 0) { - throw new ArgumentOutOfRangeException("count", Environment.GetResourceString("ArgumentOutOfRange_GenericPositive")); + throw new ArgumentOutOfRangeException(nameof(charCount), Environment.GetResourceString("ArgumentOutOfRange_GenericPositive")); } if (startIndex > value.Length - charCount) { diff --git a/src/mscorlib/src/System/Text/SurrogateEncoder.cs b/src/mscorlib/src/System/Text/SurrogateEncoder.cs deleted file mode 100644 index bbfa180f29..0000000000 --- a/src/mscorlib/src/System/Text/SurrogateEncoder.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- - -// WARNING: -// -// This is just an IObjectReference proxy for the former V1.1 Surrogate Encoder -// All this does is make an encoder of the correct type, it DOES NOT maintain state. -namespace System.Text -{ - using System; - using System.Runtime.Serialization; - using System.Security.Permissions; - using System.Diagnostics; - using System.Diagnostics.Contracts; - - /*=================================SurrogateEncoder================================== - ** This class is here only to deserialize the SurrogateEncoder class from Everett (V1.1) into - ** Appropriate Whidbey (V2.0) objects. - ==============================================================================*/ - - [Serializable] - internal sealed class SurrogateEncoder : IObjectReference, ISerializable - { - // Might need this when GetRealObjecting - [NonSerialized] - private Encoding realEncoding = null; - - // Constructor called by serialization. - internal SurrogateEncoder(SerializationInfo info, StreamingContext context) - { - // Any info? - if (info==null) throw new ArgumentNullException(nameof(info)); - Contract.EndContractBlock(); - - // All versions have a code page - this.realEncoding = (Encoding)info.GetValue("m_encoding", typeof(Encoding)); - } - - // Just get it from GetEncoding - public Object GetRealObject(StreamingContext context) - { - // Need to get our Encoding's Encoder - return this.realEncoding.GetEncoder(); - } - - // ISerializable implementation - void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context) - { - // We cannot ever call this. 
- Debug.Assert(false, "Didn't expect to make it to SurrogateEncoder.GetObjectData"); - throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException")); - } - } -} - diff --git a/src/mscorlib/src/System/Text/UTF7Encoding.cs b/src/mscorlib/src/System/Text/UTF7Encoding.cs index 624ca735f6..9418d2e768 100644 --- a/src/mscorlib/src/System/Text/UTF7Encoding.cs +++ b/src/mscorlib/src/System/Text/UTF7Encoding.cs @@ -10,13 +10,11 @@ namespace System.Text { using System; using System.Runtime.Serialization; - using System.Security.Permissions; using System.Diagnostics; using System.Diagnostics.Contracts; [Serializable] - [System.Runtime.InteropServices.ComVisible(true)] public class UTF7Encoding : Encoding { private const String base64Chars = @@ -127,7 +125,6 @@ namespace System.Text - [System.Runtime.InteropServices.ComVisible(false)] public override bool Equals(Object value) { UTF7Encoding that = value as UTF7Encoding; @@ -142,7 +139,6 @@ namespace System.Text // Compared to all the other encodings, variations of UTF7 are unlikely - [System.Runtime.InteropServices.ComVisible(false)] public override int GetHashCode() { return this.CodePage + this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode(); @@ -170,20 +166,17 @@ namespace System.Text return EncodingForwarder.GetByteCount(this, chars, index, count); } - [System.Runtime.InteropServices.ComVisible(false)] public override int GetByteCount(String s) { return EncodingForwarder.GetByteCount(this, s); } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetByteCount(char* chars, int count) { return EncodingForwarder.GetByteCount(this, chars, count); } - [System.Runtime.InteropServices.ComVisible(false)] public override int GetBytes(String s, int charIndex, int charCount, byte[] bytes, int byteIndex) { @@ -206,7 +199,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override 
unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { return EncodingForwarder.GetBytes(this, chars, charCount, bytes, byteCount); @@ -221,7 +213,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetCharCount(byte* bytes, int count) { return EncodingForwarder.GetCharCount(this, bytes, count); @@ -234,7 +225,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { return EncodingForwarder.GetChars(this, bytes, byteCount, chars, charCount); @@ -243,7 +233,6 @@ namespace System.Text // Returns a string containing the decoded representation of a range of // bytes in a byte array. - [System.Runtime.InteropServices.ComVisible(false)] public override String GetString(byte[] bytes, int index, int count) { return EncodingForwarder.GetString(this, bytes, index, count); diff --git a/src/mscorlib/src/System/Text/UTF8Encoding.cs b/src/mscorlib/src/System/Text/UTF8Encoding.cs index ba19649b56..191bbfef56 100644 --- a/src/mscorlib/src/System/Text/UTF8Encoding.cs +++ b/src/mscorlib/src/System/Text/UTF8Encoding.cs @@ -20,7 +20,6 @@ namespace System.Text using System; using System.Globalization; using System.Runtime.Serialization; - using System.Security.Permissions; using System.Diagnostics; using System.Diagnostics.Contracts; @@ -37,7 +36,6 @@ namespace System.Text // switch the byte orderings. 
[Serializable] -[System.Runtime.InteropServices.ComVisible(true)] public class UTF8Encoding : Encoding { /* @@ -131,7 +129,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetByteCount(char* chars, int count) { return EncodingForwarder.GetByteCount(this, chars, count); @@ -159,7 +156,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { return EncodingForwarder.GetBytes(this, chars, charCount, bytes, byteCount); @@ -174,7 +170,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetCharCount(byte* bytes, int count) { return EncodingForwarder.GetCharCount(this, bytes, count); @@ -187,7 +182,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { return EncodingForwarder.GetChars(this, bytes, byteCount, chars, charCount); @@ -196,7 +190,6 @@ namespace System.Text // Returns a string containing the decoded representation of a range of // bytes in a byte array. 
- [System.Runtime.InteropServices.ComVisible(false)] public override String GetString(byte[] bytes, int index, int count) { return EncodingForwarder.GetString(this, bytes, index, count); diff --git a/src/mscorlib/src/System/Text/UnicodeEncoding.cs b/src/mscorlib/src/System/Text/UnicodeEncoding.cs index 25255c3230..d8ef18ab05 100644 --- a/src/mscorlib/src/System/Text/UnicodeEncoding.cs +++ b/src/mscorlib/src/System/Text/UnicodeEncoding.cs @@ -11,13 +11,11 @@ namespace System.Text using System; using System.Globalization; using System.Runtime.Serialization; - using System.Security.Permissions; using System.Diagnostics; using System.Diagnostics.Contracts; [Serializable] - [System.Runtime.InteropServices.ComVisible(true)] public class UnicodeEncoding : Encoding { // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization @@ -111,7 +109,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetByteCount(char* chars, int count) { return EncodingForwarder.GetByteCount(this, chars, count); @@ -139,7 +136,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { return EncodingForwarder.GetBytes(this, chars, charCount, bytes, byteCount); @@ -154,7 +150,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public override unsafe int GetCharCount(byte* bytes, int count) { return EncodingForwarder.GetCharCount(this, bytes, count); @@ -167,7 +162,6 @@ namespace System.Text } [CLSCompliant(false)] - [System.Runtime.InteropServices.ComVisible(false)] public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount) { return EncodingForwarder.GetChars(this, bytes, byteCount, chars, charCount); @@ -176,7 +170,6 @@ namespace System.Text // Returns a string containing the decoded 
representation of a range of // bytes in a byte array. - [System.Runtime.InteropServices.ComVisible(false)] public override String GetString(byte[] bytes, int index, int count) { return EncodingForwarder.GetString(this, bytes, index, count); @@ -1659,7 +1652,6 @@ namespace System.Text } - [System.Runtime.InteropServices.ComVisible(false)] public override System.Text.Encoder GetEncoder() { return new EncoderNLS(this); |