diff options
-rw-r--r-- | doc/nasmdoc.src | 15 | ||||
-rw-r--r-- | nasm.h | 6 | ||||
-rw-r--r-- | strfunc.c | 156 | ||||
-rw-r--r-- | test/utf.asm | 52 | ||||
-rw-r--r-- | tokens.dat | 4 |
5 files changed, 226 insertions, 7 deletions
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src index 217c12a..3c912ee 100644 --- a/doc/nasmdoc.src +++ b/doc/nasmdoc.src @@ -1596,9 +1596,12 @@ operands to \c{DW}, and so forth. \S{unicode} \I{UTF-16}\I{UTF-32}\i{Unicode} Strings -The special operators \i\c{__utf16__} and \i\c{__utf32__} allows -definition of Unicode strings. They take a string in UTF-8 format and -converts it to (littleendian) UTF-16 or UTF-32, respectively. +The special operators \i\c{__utf16__}, \i\c{__utf16le__}, +\i\c{__utf16be__}, \i\c{__utf32__}, \i\c{__utf32le__} and +\i\c{__utf32be__} allow definition of Unicode strings. They take a +string in UTF-8 format and convert it to UTF-16 or UTF-32, +respectively. Unless the \c{be} forms are specified, the output is +little-endian. For example: @@ -1608,9 +1611,9 @@ For example: \c dw u('C:\WINDOWS'), 0 ; Pathname in UTF-16 \c dd w(`A + B = \u206a`), 0 ; String in UTF-32 -\c{__utf16__} and \c{__utf32__} can be applied either to strings -passed to the \c{DB} family instructions, or to character constants in -an expression context. +The UTF operators can be applied either to strings passed to the +\c{DB} family instructions, or to character constants in an expression +context. 
\S{fltconst} \I{floating-point, constants}Floating-Point Constants @@ -224,7 +224,7 @@ enum token_type { /* token types, other than chars */ TOKEN_SEG, /* SEG */ TOKEN_WRT, /* WRT */ TOKEN_FLOATIZE, /* __floatX__ */ - TOKEN_STRFUNC, /* __utf16__, __utf32__ */ + TOKEN_STRFUNC, /* __utf16*__, __utf32*__ */ }; enum floatize { @@ -241,7 +241,11 @@ enum floatize { /* Must match the list in string_transform(), in strfunc.c */ enum strfunc { STRFUNC_UTF16, + STRFUNC_UTF16LE, + STRFUNC_UTF16BE, STRFUNC_UTF32, + STRFUNC_UTF32LE, + STRFUNC_UTF32BE, }; size_t string_transform(char *, size_t, char **, enum strfunc); @@ -111,6 +111,84 @@ static size_t utf8_to_16le(uint8_t *str, size_t len, char *op) } /* + * Convert a string in UTF-8 format to UTF-16BE; returns twice the number of 16-bit units emitted, or (size_t)-1 on malformed input (nothing is stored when op is NULL) + */ +static size_t utf8_to_16be(uint8_t *str, size_t len, char *op) +{ +#define EMIT(x) \ + do { \ + uint16_t _y = (x); \ + if (op) { \ + WRITECHAR(op, _y >> 8); \ + WRITECHAR(op, _y); \ + } \ + outlen++; \ + } while (0) + + size_t outlen = 0; + int expect = 0; + uint8_t c; + uint32_t v = 0, vmin = 0; + + while (len--) { + c = *str++; + + if (expect) { + if ((c & 0xc0) != 0x80) { + expect = 0; + return -1; + } else { + v = (v << 6) | (c & 0x3f); + if (!--expect) { + if (v < vmin || v > 0x10ffff || + (v >= 0xd800 && v <= 0xdfff)) { + return -1; + } else if (v > 0xffff) { + v -= 0x10000; + EMIT(0xd800 | (v >> 10)); /* high (lead) surrogate comes first */ + EMIT(0xdc00 | (v & 0x3ff)); /* then the low (trail) surrogate */ + } else { + EMIT(v); + } + } + continue; + } + } + + if (c < 0x80) { + EMIT(c); + } else if (c < 0xc0 || c >= 0xfe) { + /* Invalid UTF-8 */ + return -1; + } else if (c < 0xe0) { + v = c & 0x1f; + expect = 1; + vmin = 0x80; + } else if (c < 0xf0) { + v = c & 0x0f; + expect = 2; + vmin = 0x800; + } else if (c < 0xf8) { + v = c & 0x07; + expect = 3; + vmin = 0x10000; + } else if (c < 0xfc) { + v = c & 0x03; + expect = 4; + vmin = 0x200000; + } else { + v = c & 0x01; + expect = 5; + vmin = 0x4000000; + } + } + + return expect ? 
(size_t)-1 : outlen << 1; + +#undef EMIT +} + +/* * Convert a string in UTF-8 format to UTF-32LE */ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op) @@ -174,6 +252,80 @@ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op) #undef EMIT } +/* + * Convert a string in UTF-8 format to UTF-32BE; returns four times the number of 32-bit units emitted, or (size_t)-1 on malformed input (nothing is stored when op is NULL) + */ +static size_t utf8_to_32be(uint8_t *str, size_t len, char *op) +{ +#define EMIT(x) \ + do { \ + uint32_t _y = (x); \ + if (op) { \ + WRITECHAR(op,_y >> 24); \ + WRITECHAR(op,_y >> 16); \ + WRITECHAR(op,_y >> 8); \ + WRITECHAR(op,_y); \ + } \ + outlen++; \ + } while (0) + + size_t outlen = 0; + int expect = 0; + uint8_t c; + uint32_t v = 0, vmin = 0; + + while (len--) { + c = *str++; + + if (expect) { + if ((c & 0xc0) != 0x80) { + return -1; + } else { + v = (v << 6) | (c & 0x3f); + if (!--expect) { + if (v < vmin || (v >= 0xd800 && v <= 0xdfff)) { + return -1; + } else { + EMIT(v); + } + } + continue; + } + } + + if (c < 0x80) { + EMIT(c); + } else if (c < 0xc0 || c >= 0xfe) { + /* Invalid UTF-8 */ + return -1; + } else if (c < 0xe0) { + v = c & 0x1f; + expect = 1; + vmin = 0x80; + } else if (c < 0xf0) { + v = c & 0x0f; + expect = 2; + vmin = 0x800; + } else if (c < 0xf8) { + v = c & 0x07; + expect = 3; + vmin = 0x10000; + } else if (c < 0xfc) { + v = c & 0x03; + expect = 4; + vmin = 0x200000; + } else { + v = c & 0x01; + expect = 5; + vmin = 0x4000000; + } + } + + return expect ? 
(size_t)-1 : outlen << 2; + +#undef EMIT +} + typedef size_t (*transform_func)(uint8_t *, size_t, char *); /* @@ -186,7 +338,11 @@ size_t string_transform(char *str, size_t len, char **out, enum strfunc func) /* This should match enum strfunc in nasm.h */ static const transform_func str_transforms[] = { utf8_to_16le, + utf8_to_16le, + utf8_to_16be, + utf8_to_32le, utf8_to_32le, + utf8_to_32be, }; transform_func transform = str_transforms[func]; size_t outlen; diff --git a/test/utf.asm b/test/utf.asm index 4b894f8..00207dc 100644 --- a/test/utf.asm +++ b/test/utf.asm @@ -2,6 +2,10 @@ ;Testname=error; Arguments=-fbin -outf.bin -DERROR; Files=stdout stderr utf.bin %define u(x) __utf16__(x) %define w(x) __utf32__(x) +%define ul(x) __utf16le__(x) +%define wl(x) __utf32le__(x) +%define ub(x) __utf16be__(x) +%define wb(x) __utf32be__(x) db `Test \u306a\U0001abcd\n` dw u(`Test \u306a\U0001abcd\n`) @@ -21,10 +25,58 @@ mov ebx,u(`\U0001abcd`) mov ecx,w(`\U0001abcd`) + db `Test \u306a\U0001abcd\n` + dw ul(`Test \u306a\U0001abcd\n`) + dd wl(`Test \u306a\U0001abcd\n`) + + db `\u306a` + db `\xe3\x81\xaa` + + dw __utf16le__ "Hello, World!" + + nop + + mov ax,ul(`a`) + mov bx,ul(`\u306a`) + mov cx,ul(`\xe3\x81\xaa`) + mov eax,ul(`ab`) + mov ebx,ul(`\U0001abcd`) + mov ecx,wl(`\U0001abcd`) + + db `Test \u306a\U0001abcd\n` + dw ub(`Test \u306a\U0001abcd\n`) + dd wb(`Test \u306a\U0001abcd\n`) + + db `\u306a` + db `\xe3\x81\xaa` + + dw __utf16be__ "Hello, World!" 
+ + nop + + mov ax,ub(`a`) + mov bx,ub(`\u306a`) + mov cx,ub(`\xe3\x81\xaa`) + mov eax,ub(`ab`) + mov ebx,ub(`\U0001abcd`) + mov ecx,wb(`\U0001abcd`) + %ifdef ERROR dw __utf16__ 33 dw __utf16__, 46 dw __utf16__("Hello, World!",16) dw __utf16__("Hello, World!",16 dw u(`\xff`) + + dw __utf16le__ 33 + dw __utf16le__, 46 + dw __utf16le__("Hello, World!",16) + dw __utf16le__("Hello, World!",16 + dw ul(`\xff`) + + dw __utf16be__ 33 + dw __utf16be__, 46 + dw __utf16be__("Hello, World!",16) + dw __utf16be__("Hello, World!",16 + dw ub(`\xff`) %endif @@ -91,7 +91,11 @@ __float128h__ % TOKEN_STRFUNC, 0, STRFUNC_{__*__} __utf16__ +__utf16le__ +__utf16be__ __utf32__ +__utf32le__ +__utf32be__ % TOKEN_*, 0, 0 seg |