diff options
author | Pierre Le Marre <dev@wismill.eu> | 2023-05-13 17:26:24 +0200 |
---|---|---|
committer | Ran Benita <ran@unusedvar.com> | 2023-05-13 22:02:46 +0300 |
commit | 183761ac24544b355aaf362e62d05fa1c184baf8 (patch) | |
tree | 0fb328d8876d92997fca57acfbb4a76dc6ae7d58 | |
parent | 5fbffaf035f0c0edbcf7b2e747ccab9a234101ff (diff) | |
download | libxkbcommon-183761ac24544b355aaf362e62d05fa1c184baf8.tar.gz libxkbcommon-183761ac24544b355aaf362e62d05fa1c184baf8.tar.bz2 libxkbcommon-183761ac24544b355aaf362e62d05fa1c184baf8.zip |
Do not interpret or emit invalid Unicode encoding forms
Surrogates (code points U+D800–U+DFFF) are invalid in both UTF-32 and UTF-8.
See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
and https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G31703
-rw-r--r-- | src/keysym-utf.c | 16 | ||||
-rw-r--r-- | src/utf8.c | 16 | ||||
-rw-r--r-- | test/keysym.c | 4 | ||||
-rw-r--r-- | test/utf8.c | 2 |
4 files changed, 33 insertions, 5 deletions
diff --git a/src/keysym-utf.c b/src/keysym-utf.c index a9d46d1..0bb9a4f 100644 --- a/src/keysym-utf.c +++ b/src/keysym-utf.c @@ -41,6 +41,8 @@ #include "utils.h" #include "utf8.h" +#define NO_KEYSYM_UNICODE_CONVERSION 0 + /* We don't use the uint32_t types here, to save some space. */ struct codepair { uint16_t keysym; @@ -847,7 +849,7 @@ bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym) } /* no matching Unicode value found in table */ - return 0; + return NO_KEYSYM_UNICODE_CONVERSION; } XKB_EXPORT uint32_t @@ -871,6 +873,13 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym) return keysym & 0x7f; /* also check for directly encoded Unicode codepoints */ + + /* Exclude surrogates: they are invalid in UTF-32. + * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 + * for further details. + */ + if (0x0100d800 <= keysym && keysym <= 0x0100dfff) + return NO_KEYSYM_UNICODE_CONVERSION; /* * In theory, this is supposed to start from 0x100100, such that the ASCII * range, which is already covered by 0x00-0xff, can't be encoded in two @@ -900,7 +909,8 @@ xkb_utf32_to_keysym(uint32_t ucs) return XKB_KEY_Delete; /* Unicode non-symbols and code points outside Unicode planes */ - if ((ucs >= 0xfdd0 && ucs <= 0xfdef) || + if ((ucs >= 0xd800 && ucs <= 0xdfff) || + (ucs >= 0xfdd0 && ucs <= 0xfdef) || ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe) return XKB_KEY_NoSymbol; @@ -948,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size) codepoint = xkb_keysym_to_utf32(keysym); - if (codepoint == 0) + if (codepoint == NO_KEYSYM_UNICODE_CONVERSION) return 0; return utf32_to_utf8(codepoint, buffer); @@ -32,6 +32,11 @@ #include "utf8.h" +/* Conformant encoding form conversion from UTF-32 to UTF-8. + * + * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875 + * for further details. 
+*/ int utf32_to_utf8(uint32_t unichar, char *buffer) { @@ -47,6 +52,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer) length = 2; head = 0xc0; } + /* Handle surrogates */ + else if (0xd800 <= unichar && unichar <= 0xdfff) { + goto ill_formed_code_unit_subsequence; + } else if (unichar <= 0xffff) { length = 3; head = 0xe0; @@ -56,8 +65,7 @@ utf32_to_utf8(uint32_t unichar, char *buffer) head = 0xf0; } else { - buffer[0] = '\0'; - return 0; + goto ill_formed_code_unit_subsequence; } for (count = length - 1, shift = 0; count > 0; count--, shift += 6) @@ -67,6 +75,10 @@ utf32_to_utf8(uint32_t unichar, char *buffer) buffer[length] = '\0'; return length + 1; + +ill_formed_code_unit_subsequence: + buffer[0] = '\0'; + return 0; } bool diff --git a/test/keysym.c b/test/keysym.c index 38f967d..a4dba0c 100644 --- a/test/keysym.c +++ b/test/keysym.c @@ -222,6 +222,8 @@ main(void) assert(test_utf8(0x10005d0, "א")); assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf")); + assert(test_utf8(0x0100d800, NULL) == 0); // Unicode surrogates + assert(test_utf8(0x0100dfff, NULL) == 0); // Unicode surrogates assert(test_utf8(0x1110000, NULL) == 0); assert(test_utf32_to_keysym('y', XKB_KEY_y)); @@ -255,6 +257,8 @@ main(void) assert(test_utf32_to_keysym(0x20ac, XKB_KEY_EuroSign)); // Unicode non-characters + assert(test_utf32_to_keysym(0xd800, XKB_KEY_NoSymbol)); // Unicode surrogates + assert(test_utf32_to_keysym(0xdfff, XKB_KEY_NoSymbol)); // Unicode surrogates assert(test_utf32_to_keysym(0xfdd0, XKB_KEY_NoSymbol)); assert(test_utf32_to_keysym(0xfdef, XKB_KEY_NoSymbol)); assert(test_utf32_to_keysym(0xfffe, XKB_KEY_NoSymbol)); diff --git a/test/utf8.c b/test/utf8.c index 214e356..aa3c0d5 100644 --- a/test/utf8.c +++ b/test/utf8.c @@ -170,6 +170,8 @@ test_utf32_to_utf8(void) check_utf32_to_utf8(0x40, 2, "\x40"); check_utf32_to_utf8(0xA1, 3, "\xc2\xa1"); check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81"); + check_utf32_to_utf8(0xd800, 0, ""); // Unicode surrogates + check_utf32_to_utf8(0xdfff, 0, 
""); // Unicode surrogates check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84"); check_utf32_to_utf8(0x110000, 0, ""); check_utf32_to_utf8(0xffffffff, 0, ""); |