diff options
Diffstat (limited to 'src/bson/encoding.c')
-rw-r--r-- | src/bson/encoding.c | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/bson/encoding.c b/src/bson/encoding.c new file mode 100644 index 0000000..4bf405f --- /dev/null +++ b/src/bson/encoding.c @@ -0,0 +1,165 @@ +/* + * Copyright 2009-2012 10gen, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +#include "bson.h" +#include "encoding.h" + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + */ +static const char trailingBytesForUTF8[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 +}; + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * The length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns 0. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ +static int isLegalUTF8(const unsigned char *source, int length) { + unsigned char a; + const unsigned char *srcptr = source + length; + switch (length) { + default: + return 0; + /* Everything else falls through when "true"... */ + case 4: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + case 3: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + case 2: + if ((a = (*--srcptr)) > 0xBF) return 0; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: + if (a < 0xA0) return 0; + break; + case 0xF0: + if (a < 0x90) return 0; + break; + case 0xF4: + if (a > 0x8F) return 0; + break; + default: + if (a < 0x80) return 0; + } + case 1: + if (*source >= 0x80 && *source < 0xC2) return 0; + if (*source > 0xF4) return 0; + } + return 1; +} + +/* If the name is part of a db ref ($ref, $db, or $id), then return true. */ +static int bson_string_is_db_ref(const unsigned char *string, const int length) { + int result = 0; + + if (length >= 4) { + if (string[1] == 'r' && string[2] == 'e' && string[3] == 'f') + result = 1; + } else if (length >= 3) { + if (string[1] == 'i' && string[2] == 'd') + result = 1; + else if (string[1] == 'd' && string[2] == 'b') + result = 1; + } + + return result; +} + +static int bson_validate_string(bson *b, const unsigned char *string, + const int length, const char check_utf8, const char check_dot, + const char check_dollar) { + + int position = 0; + int sequence_length = 1; + + if (check_dollar && string[0] == '$') { + if (!bson_string_is_db_ref(string, length)) + b->err |= BSON_FIELD_INIT_DOLLAR; + } + + while (position < length) { + if (check_dot && *(string + position) == '.') { + b->err |= BSON_FIELD_HAS_DOT; + } + + if (check_utf8) { + sequence_length = trailingBytesForUTF8[*(string + position)] + 1; + if ((position + sequence_length) > length) { + b->err |= BSON_NOT_UTF8; + return BSON_ERROR; + } + if (!isLegalUTF8(string + position, sequence_length)) { + b->err |= BSON_NOT_UTF8; + return BSON_ERROR; + } + } + position += sequence_length; + } + + return BSON_OK; +} + +int bson_check_string(bson *b, const char *string, + const int length) { + + return bson_validate_string(b, (const unsigned char *) string, length, 1, 0, 0); +} + +int bson_check_field_name(bson *b, const char *string, + const int length, int check_dot, int check_dollar) { + + return bson_validate_string(b, (const unsigned char *) string, length, 1, check_dot, check_dollar); +} |