/* src/bson/encoding.c */

/*
 * Copyright 2009-2012 10gen, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Portions Copyright 2001 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */


#include "bson.h"
#include "encoding.h"

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 *
 * The table has one entry per possible byte value (256 entries, laid out
 * as 8 rows of 32).  Callers add 1 to the looked-up value to obtain the
 * total sequence length.  The table itself does not validate anything:
 * bytes that can never start a legal sequence still have an entry, and
 * are rejected later by isLegalUTF8().
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x7F: single-byte sequences, no trailing bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80-0xBF: also 0 here; isLegalUTF8() refuses these as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: one trailing byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0-0xEF: two; 0xF0-0xF7: three; 0xF8-0xFB: four; 0xFC-0xFF: five.
     * The four- and five-trailing-byte entries yield lengths of 5 and 6,
     * which isLegalUTF8() always reports as illegal. */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/* --------------------------------------------------------------------- */

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * The length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * source: first byte of the candidate sequence; exactly `length` bytes
 *         are read from it.
 * length: total number of bytes in the sequence (lead byte included).
 *
 * Returns 1 when the sequence is legal, 0 otherwise.  Any length outside
 * 1..4 returns 0: the Unicode definition of UTF-8 goes up to 4-byte
 * sequences.
 *
 * The bytes are checked from the last one back to the first.  Besides the
 * generic 0x80..0xBF range for trailing bytes, the second byte has a
 * tighter range for some lead bytes:
 *   0xE0 -> 0xA0..0xBF  (rejects overlong 3-byte forms)
 *   0xED -> 0x80..0x9F  (rejects encoded UTF-16 surrogates U+D800..U+DFFF)
 *   0xF0 -> 0x90..0xBF  (rejects overlong 4-byte forms)
 *   0xF4 -> 0x80..0x8F  (rejects code points above U+10FFFF)
 */
static int isLegalUTF8(const unsigned char *source, int length) {
    unsigned char a;
    const unsigned char *srcptr = source + length;
    switch (length) {
        default:
            return 0;
            /* Everything else falls through when "true"... */
        case 4:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
            /* fall through */
        case 3:
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
            /* fall through */
        case 2:
            /* Upper bound only; the lower bound depends on the lead byte. */
            if ((a = (*--srcptr)) > 0xBF) return 0;
            switch (*source) {
                    /* no fall-through in this inner switch */
                case 0xE0:
                    if (a < 0xA0) return 0;
                    break;
                case 0xED:
                    /* ED A0..BF xx would encode a surrogate code point. */
                    if (a > 0x9F) return 0;
                    break;
                case 0xF0:
                    if (a < 0x90) return 0;
                    break;
                case 0xF4:
                    if (a > 0x8F) return 0;
                    break;
                default:
                    if (a < 0x80) return 0;
            }
            /* fall through */
        case 1:
            /* 0x80..0xBF cannot start a sequence; 0xC0/0xC1 are overlong. */
            if (*source >= 0x80 && *source < 0xC2) return 0;
            /* Lead bytes above 0xF4 would exceed U+10FFFF. */
            if (*source > 0xF4) return 0;
    }
    return 1;
}

/*
 * Tell whether a '$'-prefixed field name is one of the db ref names
 * ($ref, $db, or $id).
 *
 * string: the field name; string[0] is not examined here (the caller has
 *         already seen the '$').
 * length: number of bytes in the name.
 *
 * Returns 1 for a db ref name, 0 otherwise.
 *
 * Names of exactly three bytes are compared against "id" and "db" after
 * the first byte.  Names of four or more bytes are only tested for "ref"
 * in bytes 1..3, so a longer name beginning with "$ref" also returns 1.
 */
static int bson_string_is_db_ref(const unsigned char *string, const int length) {
    if (length < 3) {
        return 0;
    }

    if (length == 3) {
        const int is_id = (string[1] == 'i' && string[2] == 'd');

        return is_id || (string[1] == 'd' && string[2] == 'b');
    }

    return string[1] == 'r' && string[2] == 'e' && string[3] == 'f';
}

/*
 * Scan a string once, recording problems in b->err.
 *
 * b:            object whose err field receives the flags below.
 * string:       bytes to examine.
 * length:       number of bytes to examine.
 * check_utf8:   non-zero to require legal UTF-8.
 * check_dot:    non-zero to flag any '.' found at a sequence start.
 * check_dollar: non-zero to flag a leading '$' unless the name is a
 *               db ref name.
 *
 * Returns BSON_ERROR, with BSON_NOT_UTF8 set, as soon as a truncated or
 * illegal UTF-8 sequence is met.  Otherwise returns BSON_OK; note that
 * BSON_FIELD_INIT_DOLLAR and BSON_FIELD_HAS_DOT are only recorded in
 * b->err and do not change the return value.
 */
static int bson_validate_string(bson *b, const unsigned char *string,
        const int length, const char check_utf8, const char check_dot,
        const char check_dollar) {

    int offset;
    int step = 1; /* stays 1 (byte-by-byte scan) when UTF-8 is not checked */

    if (check_dollar && string[0] == '$' && !bson_string_is_db_ref(string, length)) {
        b->err |= BSON_FIELD_INIT_DOLLAR;
    }

    for (offset = 0; offset < length; offset += step) {
        const unsigned char *cursor = string + offset;

        if (check_dot && *cursor == '.') {
            b->err |= BSON_FIELD_HAS_DOT;
        }

        if (!check_utf8) {
            continue;
        }

        /* The lead byte dictates how many bytes this sequence occupies. */
        step = trailingBytesForUTF8[*cursor] + 1;

        /* Reject sequences that run past the end or are malformed. */
        if ((offset + step) > length || !isLegalUTF8(cursor, step)) {
            b->err |= BSON_NOT_UTF8;
            return BSON_ERROR;
        }
    }

    return BSON_OK;
}

/*
 * Check that a string value is legal UTF-8.
 *
 * Runs bson_validate_string() with only the UTF-8 check enabled (the dot
 * and dollar checks are off) and returns its result; any error flag is
 * recorded in b->err.
 */
int bson_check_string(bson *b, const char *string,
                      const int length) {
    const unsigned char *bytes = (const unsigned char *) string;

    return bson_validate_string(b, bytes, length, 1, 0, 0);
}

/*
 * Check a field name: always requires legal UTF-8, and optionally flags
 * '.' characters and a leading '$'.
 *
 * b:            object whose err field receives any flags.
 * string:       the field name.
 * length:       number of bytes in the name.
 * check_dot:    non-zero to enable the '.' check.
 * check_dollar: non-zero to enable the leading '$' check.
 *
 * Returns the result of bson_validate_string().
 */
int bson_check_field_name(bson *b, const char *string,
                          const int length, int check_dot, int check_dollar) {
    /*
     * bson_validate_string() takes its flags as char.  Collapse the int
     * flags to 0/1 first so that a non-zero value whose low byte happens
     * to be zero (e.g. 256) is not narrowed to "off".
     */
    const char dot_flag = (char) (check_dot != 0);
    const char dollar_flag = (char) (check_dollar != 0);

    return bson_validate_string(b, (const unsigned char *) string, length, 1, dot_flag, dollar_flag);
}