diff options
Diffstat (limited to 'src/unicode.c')
-rw-r--r-- | src/unicode.c | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/src/unicode.c b/src/unicode.c new file mode 100644 index 0000000..89c2cbe --- /dev/null +++ b/src/unicode.c @@ -0,0 +1,174 @@ +/** + * \file unicode.c + * + * This file contains general Unicode string manipulation functions. + * It mainly consist of functions for converting between UCS-2 (used on + * the devices) and UTF-8 (used by several applications). + * + * For a deeper understanding of Unicode encoding formats see the + * Wikipedia entries for + * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a> + * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. + * + * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include "config.h" + +#include <stdlib.h> +#include <string.h> +#ifdef HAVE_ICONV +#include "iconv.h" +#else +#error "libmtp unicode.c needs fixing to work without iconv()!" +#endif +#include "libmtp.h" +#include "unicode.h" +#include "util.h" +#include "ptp.h" + +/** + * The size of the buffer (in characters) used for creating string copies. + */ +#define STRING_BUFFER_LENGTH 1024 + +/** + * Gets the length (in characters, not bytes) of a unicode + * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00 + * will return a value of 1. + * + * @param unicstr a UCS-2 Unicode string + * @return the length of the string, in number of characters. If you + * want to know the length in bytes, multiply this by two and + * add two (for zero terminator). + */ +int ucs2_strlen(uint16_t const * const unicstr) +{ + int length; + + /* Unicode strings are terminated with 2 * 0x00 */ + for(length = 0; unicstr[length] != 0x0000U; length ++); + return length; +} + +/** + * Converts a big-endian UTF-16 2-byte string + * to a UTF-8 string. Actually just a UCS-2 internal conversion + * routine that strips off the BOM if there is one. + * + * @param device a pointer to the current device. + * @param unicstr the UTF-16 unicode string to convert + * @return a UTF-8 string. + */ +char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr) +{ + PTPParams *params = (PTPParams *) device->params; + char *stringp = (char *) unicstr; + char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char. + char *locp = loclstr; + size_t nconv; + size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator + size_t convmax = STRING_BUFFER_LENGTH*3; + + loclstr[0]='\0'; + /* Do the conversion. */ + nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax); + if (nconv == (size_t) -1) { + // Return partial string anyway. + *locp = '\0'; + } + loclstr[STRING_BUFFER_LENGTH*3] = '\0'; + // Strip off any BOM, it's totally useless... + if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) { + return strdup(loclstr+3); + } + return strdup(loclstr); +} + +/** + * Converts a UTF-8 string to a big-endian UTF-16 2-byte string + * Actually just a UCS-2 internal conversion. + * + * @param device a pointer to the current device. + * @param localstr the UTF-8 unicode string to convert + * @return a UTF-16 string. + */ +uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr) +{ + PTPParams *params = (PTPParams *) device->params; + char *stringp = (char *) localstr; // cast away "const" + char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char. + char *unip = unicstr; + size_t nconv = 0; + size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator + size_t convmax = STRING_BUFFER_LENGTH*2; + + unicstr[0]='\0'; + unicstr[1]='\0'; + + /* Do the conversion. */ + nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax); + + if (nconv == (size_t) -1) { + // Return partial string anyway. + unip[0] = '\0'; + unip[1] = '\0'; + } + // make sure the string is null terminated + unicstr[STRING_BUFFER_LENGTH*2] = '\0'; + unicstr[STRING_BUFFER_LENGTH*2+1] = '\0'; + + // allocate the string to be returned + // Note: can't use strdup since every other byte is a null byte + int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2; + uint16_t* ret = malloc(ret_len); + memcpy(ret,unicstr,(size_t)ret_len); + return ret; +} + +/** + * This helper function simply removes any consecutive chars + * > 0x7F and replace then with an underscore. In UTF-8 + * consequtive chars > 0x7F represent one single character so + * it has to be done like this (and it's elegant). It will only + * shrink the string in size so no copying is needed. + */ +void strip_7bit_from_utf8(char *str) +{ + int i,j,k; + i = 0; + j = 0; + k = strlen(str); + while (i < k) { + if ((uint8_t) str[i] > 0x7FU) { + str[j] = '_'; + i++; + // Skip over any consequtive > 0x7F chars. + while((uint8_t) str[i] > 0x7FU) { + i++; + } + } else { + str[j] = str[i]; + i++; + } + j++; + } + // Terminate stripped string... + str[j] = '\0'; +} |