src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8.cs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;

namespace System.Text.Unicode
{
    public static class Utf8
    {
        /*
         * Chunk-oriented transcoding APIs that report their outcome via OperationStatus.
         * They play a role similar to Encoding.UTF8.GetBytes / GetChars, but differ in
         * calling convention, error handling, and performance characteristics.
         *
         * replaceInvalidSequences:
         *   true  - every ill-formed subsequence in the source is written to the
         *           destination as U+FFFD and processing continues with the rest of
         *           the input; OperationStatus.InvalidData is never returned.
         *   false - processing stops at the first ill-formed subsequence and
         *           OperationStatus.InvalidData is returned.
         *
         * isFinalBlock:
         *   true  - an incomplete sequence at the end of the source is treated as
         *           invalid data; OperationStatus.NeedMoreData is never returned.
         *   false - an incomplete sequence at the end of the source stops processing
         *           with OperationStatus.NeedMoreData so the caller can supply more.
         *
         * Whenever a status other than Done is returned, the out parameters describe
         * how much data was transcoded successfully, so the position of the offending
         * subsequence can be derived from them.
         */

        /// <summary>
        /// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
        /// </summary>
        /// <param name="source">The UTF-16 input to read.</param>
        /// <param name="destination">The buffer that receives the UTF-8 output.</param>
        /// <param name="numCharsRead">On return, the number of chars consumed from <paramref name="source"/>.</param>
        /// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
        /// <param name="replaceInvalidSequences">
        /// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
        /// <see langword="false"/> to stop and report <see cref="OperationStatus.InvalidData"/>.
        /// </param>
        /// <param name="isFinalBlock">
        /// <see langword="true"/> if no further input will follow, in which case an incomplete trailing
        /// sequence is handled as invalid data; <see langword="false"/> to stop at an incomplete trailing
        /// sequence and report <see cref="OperationStatus.NeedMoreData"/>.
        /// </param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when all of <paramref name="source"/> was consumed;
        /// <see cref="OperationStatus.DestinationTooSmall"/> when the next scalar value did not fit in
        /// <paramref name="destination"/>; otherwise <see cref="OperationStatus.NeedMoreData"/> or
        /// <see cref="OperationStatus.InvalidData"/> as described for the parameters above.
        /// </returns>
        /// <remarks>
        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
        /// </remarks>
        public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
        {
            // The caller's spans are left untouched; these two windows shrink as we go,
            // so the amount processed is simply the difference in lengths at the end.
            ReadOnlySpan<char> pendingChars = source;
            Span<byte> freeBytes = destination;
            OperationStatus outcome = OperationStatus.Done;

            // One scalar value is decoded and re-encoded per iteration.
            while (pendingChars.Length > 0)
            {
                outcome = Rune.DecodeFromUtf16(pendingChars, out Rune scalar, out int charsConsumed);

                if (outcome == OperationStatus.NeedMoreData)
                {
                    // The input stopped after a high surrogate. That is only a problem
                    // when the caller has said that no more input is coming.
                    if (!isFinalBlock)
                    {
                        break;
                    }

                    outcome = OperationStatus.InvalidData;
                }

                if (outcome == OperationStatus.InvalidData)
                {
                    // Ill-formed input: either bail out with the error, or carry on
                    // with U+FFFD standing in for the bad subsequence.
                    if (!replaceInvalidSequences)
                    {
                        break;
                    }

                    scalar = Rune.ReplacementChar;
                }

                // At this point 'scalar' is what must be emitted. The only remaining
                // way to fail is running out of room in the destination.
                if (!scalar.TryEncodeToUtf8(freeBytes, out int bytesWritten))
                {
                    outcome = OperationStatus.DestinationTooSmall;
                    break;
                }

                // Advance by what the decoder consumed rather than by scalar.Utf16SequenceLength:
                // 'scalar' may be a substituted U+FFFD instead of what was actually read.
                pendingChars = pendingChars.Slice(charsConsumed);
                freeBytes = freeBytes.Slice(bytesWritten);

                // A substitution leaves 'outcome' at InvalidData; this iteration succeeded.
                outcome = OperationStatus.Done;
            }

            numCharsRead = source.Length - pendingChars.Length;
            numBytesWritten = destination.Length - freeBytes.Length;

            Debug.Assert((outcome == OperationStatus.Done) == (numCharsRead == source.Length),
                "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");

            return outcome;
        }

        /// <summary>
        /// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
        /// </summary>
        /// <param name="source">The UTF-8 input to read.</param>
        /// <param name="destination">The buffer that receives the UTF-16 output.</param>
        /// <param name="numBytesRead">On return, the number of bytes consumed from <paramref name="source"/>.</param>
        /// <param name="numCharsWritten">On return, the number of chars written to <paramref name="destination"/>.</param>
        /// <param name="replaceInvalidSequences">
        /// <see langword="true"/> to substitute U+FFFD for invalid input and keep going;
        /// <see langword="false"/> to stop and report <see cref="OperationStatus.InvalidData"/>.
        /// </param>
        /// <param name="isFinalBlock">
        /// <see langword="true"/> if no further input will follow, in which case an incomplete trailing
        /// sequence is handled as invalid data; <see langword="false"/> to stop at an incomplete trailing
        /// sequence and report <see cref="OperationStatus.NeedMoreData"/>.
        /// </param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when all of <paramref name="source"/> was consumed;
        /// <see cref="OperationStatus.DestinationTooSmall"/> when the next scalar value did not fit in
        /// <paramref name="destination"/>; otherwise <see cref="OperationStatus.NeedMoreData"/> or
        /// <see cref="OperationStatus.InvalidData"/> as described for the parameters above.
        /// </returns>
        /// <remarks>
        /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
        /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
        /// this method will not return <see cref="OperationStatus.InvalidData"/>.
        /// </remarks>
        public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
        {
            // The caller's spans are left untouched; these two windows shrink as we go,
            // so the amount processed is simply the difference in lengths at the end.
            ReadOnlySpan<byte> pendingBytes = source;
            Span<char> freeChars = destination;
            OperationStatus outcome = OperationStatus.Done;

            // One scalar value is decoded and re-encoded per iteration.
            while (pendingBytes.Length > 0)
            {
                outcome = Rune.DecodeFromUtf8(pendingBytes, out Rune scalar, out int bytesConsumed);

                if (outcome == OperationStatus.NeedMoreData)
                {
                    // The input stopped part-way through a UTF-8 sequence. That is only a
                    // problem when the caller has said that no more input is coming.
                    if (!isFinalBlock)
                    {
                        break;
                    }

                    outcome = OperationStatus.InvalidData;
                }

                if (outcome == OperationStatus.InvalidData)
                {
                    // Ill-formed input: either bail out with the error, or carry on
                    // with U+FFFD standing in for the bad subsequence.
                    if (!replaceInvalidSequences)
                    {
                        break;
                    }

                    scalar = Rune.ReplacementChar;
                }

                // At this point 'scalar' is what must be emitted. The only remaining
                // way to fail is running out of room in the destination.
                if (!scalar.TryEncodeToUtf16(freeChars, out int charsWritten))
                {
                    outcome = OperationStatus.DestinationTooSmall;
                    break;
                }

                // Advance by what the decoder consumed rather than by scalar.Utf8SequenceLength:
                // 'scalar' may be a substituted U+FFFD instead of what was actually read.
                pendingBytes = pendingBytes.Slice(bytesConsumed);
                freeChars = freeChars.Slice(charsWritten);

                // A substitution leaves 'outcome' at InvalidData; this iteration succeeded.
                outcome = OperationStatus.Done;
            }

            numBytesRead = source.Length - pendingBytes.Length;
            numCharsWritten = destination.Length - freeChars.Length;

            Debug.Assert((outcome == OperationStatus.Done) == (numBytesRead == source.Length),
                    "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");

            return outcome;
        }
    }
}