diff options
author | H. Peter Anvin <hpa@zytor.com> | 2008-06-14 16:53:48 -0700 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2008-06-14 16:53:48 -0700 |
commit | 518df30308c555a8f8d7e359cb31688af0c686db (patch) | |
tree | 16657636dfd4f99a7ca4055a7dda9cf400811278 /strfunc.c | |
parent | dfaa278cd5e9d54bd3ecbd6c8cd7544de0134e42 (diff) | |
download | nasm-518df30308c555a8f8d7e359cb31688af0c686db.tar.gz nasm-518df30308c555a8f8d7e359cb31688af0c686db.tar.bz2 nasm-518df30308c555a8f8d7e359cb31688af0c686db.zip |
Implement __utf16__() and __utf32__() for the DB family
Implement __utf16__() and __utf32__() for the DB family of
pseudo-instructions. Not yet implemented for evaluation context.
Diffstat (limited to 'strfunc.c')
-rw-r--r-- | strfunc.c | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/strfunc.c b/strfunc.c new file mode 100644 index 0000000..9fb7270 --- /dev/null +++ b/strfunc.c @@ -0,0 +1,167 @@ +/* + * strfunc.c + * + * String transformation functions + */ + +#include "nasmlib.h" +#include "nasm.h" + +/* + * Convert a string in UTF-8 format to UTF-16LE + */ +static size_t utf8_to_16le(uint8_t *str, size_t len, char *op) +{ +#define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0) + + size_t outlen = 0; + int expect = 0; + uint8_t c; + uint32_t v = 0, vmin = 0; + + while (len--) { + c = *str++; + + if (expect) { + if ((c & 0xc0) != 0x80) { + expect = 0; + return -1; + } else { + v = (v << 6) | (c & 0x3f); + if (!--expect) { + if (v < vmin || v > 0x10ffff || + (v >= 0xd800 && v <= 0xdfff)) { + return -1; + } else if (v > 0xffff) { + v -= 0x10000; + EMIT(0xd800 | (v >> 10)); + EMIT(0xdc00 | (v & 0x3ff)); + } else { + EMIT(v); + } + } + continue; + } + } + + if (c < 0x80) { + EMIT(c); + } else if (c < 0xa0 || c >= 0xfe) { + /* Invalid UTF-8 */ + return -1; + } else if (c < 0xe0) { + v = c & 0x1f; + expect = 1; + vmin = 0x80; + } else if (c < 0xf0) { + v = c & 0x0f; + expect = 2; + vmin = 0x800; + } else if (c < 0xf8) { + v = c & 0x07; + expect = 3; + vmin = 0x10000; + } else if (c < 0xfc) { + v = c & 0x03; + expect = 4; + vmin = 0x200000; + } else { + v = c & 0x01; + expect = 5; + vmin = 0x4000000; + } + } + + return expect ? (size_t)-1 : outlen << 1; + +#undef EMIT +} + +/* + * Convert a string in UTF-8 format to UTF-32LE + */ +static size_t utf8_to_32le(uint8_t *str, size_t len, char *op) +{ +#define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0) + + size_t outlen = 0; + int expect = 0; + uint8_t c; + uint32_t v = 0, vmin = 0; + + while (len--) { + c = *str++; + + if (expect) { + if ((c & 0xc0) != 0x80) { + return -1; + } else { + v = (v << 6) | (c & 0x3f); + if (!--expect) { + if (v < vmin || (v >= 0xd800 && v <= 0xdfff)) { + return -1; + } else { + EMIT(v); + } + } + continue; + } + } + + if (c < 0x80) { + EMIT(c); + } else if (c < 0xa0 || c >= 0xfe) { + /* Invalid UTF-8 */ + return -1; + } else if (c < 0xe0) { + v = c & 0x1f; + expect = 1; + vmin = 0x80; + } else if (c < 0xf0) { + v = c & 0x0f; + expect = 2; + vmin = 0x800; + } else if (c < 0xf8) { + v = c & 0x07; + expect = 3; + vmin = 0x10000; + } else if (c < 0xfc) { + v = c & 0x03; + expect = 4; + vmin = 0x200000; + } else { + v = c & 0x01; + expect = 5; + vmin = 0x4000000; + } + } + + return expect ? (size_t)-1 : outlen << 2; + +#undef EMIT +} + +typedef size_t (*transform_func)(uint8_t *, size_t, char *); + +/* + * Apply a specific string transform and return it in a nasm_malloc'd + * buffer, returning the length. On error, returns (size_t)-1 and no + * buffer is allocated. + */ +size_t string_transform(char *str, size_t len, char **out, enum strfunc func) +{ + /* This should match enum strfunc in nasm.h */ + static const transform_func str_transforms[] = { + utf8_to_16le, + utf8_to_32le, + }; + transform_func transform = str_transforms[func]; + size_t outlen; + uint8_t *s = (uint8_t *)str; + + outlen = transform(s, len, NULL); + if (outlen == (size_t)-1) + return -1; + + return transform(s, len, *out = nasm_malloc(outlen)); +} |