summaryrefslogtreecommitdiff
path: root/jnlib/utf8conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'jnlib/utf8conv.c')
-rw-r--r--jnlib/utf8conv.c738
1 files changed, 738 insertions, 0 deletions
diff --git a/jnlib/utf8conv.c b/jnlib/utf8conv.c
new file mode 100644
index 0000000..fee4dc6
--- /dev/null
+++ b/jnlib/utf8conv.c
@@ -0,0 +1,738 @@
+/* utf8conf.c - UTF8 character set conversion
+ * Copyright (C) 1994, 1998, 1999, 2000, 2001,
+ * 2003, 2006, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of JNLIB.
+ *
+ * JNLIB is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * JNLIB is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+#include <errno.h>
+#ifndef HAVE_W32_SYSTEM
+# include <iconv.h>
+#endif
+
+#include "libjnlib-config.h"
+#include "stringhelp.h"
+#include "dynload.h"
+#include "utf8conv.h"
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 16
+#endif
+
+static const char *active_charset_name = "iso-8859-1";
+static int no_translation; /* Set to true if we let simply pass through. */
+static int use_iconv; /* iconv comversion fucntions required. */
+
+
+/* Under W32 we dlopen the iconv dll and don't require any iconv
+ related headers at all. However we need to define some stuff. */
+#ifdef HAVE_W32_SYSTEM
+typedef void *iconv_t;
+#ifndef ICONV_CONST
+#define ICONV_CONST const
+#endif
+static iconv_t (* __stdcall iconv_open) (const char *tocode,
+ const char *fromcode);
+static size_t (* __stdcall iconv) (iconv_t cd,
+ const char **inbuf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft);
+static int (* __stdcall iconv_close) (iconv_t cd);
+
+static int
+load_libiconv (void)
+{
+ static int done;
+
+ if (!done)
+ {
+ void *handle;
+
+ done = 1; /* Do it right now because we might get called recursivly
+ through gettext. */
+
+ handle = dlopen ("iconv.dll", RTLD_LAZY);
+ if (handle)
+ {
+ iconv_open = dlsym (handle, "libiconv_open");
+ if (iconv_open)
+ iconv = dlsym (handle, "libiconv");
+ if (iconv)
+ iconv_close = dlsym (handle, "libiconv_close");
+ }
+ if (!handle || !iconv_close)
+ {
+ log_info (_("error loading `%s': %s\n"),
+ "iconv.dll", dlerror ());
+ log_info (_("please see %s for more information\n"),
+ "http://www.gnupg.org/download/iconv.html");
+ iconv_open = NULL;
+ iconv = NULL;
+ iconv_close = NULL;
+ if (handle)
+ dlclose (handle);
+ }
+ }
+ return iconv_open? 0: -1;
+}
+#endif /*HAVE_W32_SYSTEM*/
+
+
+/* Error handler for iconv failures. This is needed to not clutter the
+ output with repeated diagnostics about a missing conversion. */
+static void
+handle_iconv_error (const char *to, const char *from, int use_fallback)
+{
+ if (errno == EINVAL)
+ {
+ static int shown1, shown2;
+ int x;
+
+ if (to && !strcmp (to, "utf-8"))
+ {
+ x = shown1;
+ shown1 = 1;
+ }
+ else
+ {
+ x = shown2;
+ shown2 = 1;
+ }
+
+ if (!x)
+ log_info (_("conversion from `%s' to `%s' not available\n"),
+ from, to);
+ }
+ else
+ {
+ static int shown;
+
+ if (!shown)
+ log_info (_("iconv_open failed: %s\n"), strerror (errno));
+ shown = 1;
+ }
+
+ if (use_fallback)
+ {
+ /* To avoid further error messages we fallback to Latin-1 for the
+ native encoding. This is justified as one can expect that on a
+ utf-8 enabled system nl_langinfo() will work and thus we won't
+ never get to here. Thus Latin-1 seems to be a reasonable
+ default. */
+ active_charset_name = "iso-8859-1";
+ no_translation = 0;
+ use_iconv = 0;
+ }
+}
+
+
+
+int
+set_native_charset (const char *newset)
+{
+ const char *full_newset;
+
+ if (!newset)
+ {
+#ifdef HAVE_W32_SYSTEM
+ static char codepage[30];
+ unsigned int cpno;
+ const char *aliases;
+
+ /* We are a console program thus we need to use the
+ GetConsoleOutputCP function and not the the GetACP which
+ would give the codepage for a GUI program. Note this is not
+ a bulletproof detection because GetConsoleCP might return a
+ different one for console input. Not sure how to cope with
+ that. If the console Code page is not known we fall back to
+ the system code page. */
+ cpno = GetConsoleOutputCP ();
+ if (!cpno)
+ cpno = GetACP ();
+ sprintf (codepage, "CP%u", cpno );
+ /* Resolve alias. We use a long string string and not the usual
+ array to optimize if the code is taken to a DSO. Taken from
+ libiconv 1.9.2. */
+ newset = codepage;
+ for (aliases = ("CP936" "\0" "GBK" "\0"
+ "CP1361" "\0" "JOHAB" "\0"
+ "CP20127" "\0" "ASCII" "\0"
+ "CP20866" "\0" "KOI8-R" "\0"
+ "CP21866" "\0" "KOI8-RU" "\0"
+ "CP28591" "\0" "ISO-8859-1" "\0"
+ "CP28592" "\0" "ISO-8859-2" "\0"
+ "CP28593" "\0" "ISO-8859-3" "\0"
+ "CP28594" "\0" "ISO-8859-4" "\0"
+ "CP28595" "\0" "ISO-8859-5" "\0"
+ "CP28596" "\0" "ISO-8859-6" "\0"
+ "CP28597" "\0" "ISO-8859-7" "\0"
+ "CP28598" "\0" "ISO-8859-8" "\0"
+ "CP28599" "\0" "ISO-8859-9" "\0"
+ "CP28605" "\0" "ISO-8859-15" "\0"
+ "CP65001" "\0" "UTF-8" "\0");
+ *aliases;
+ aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
+ {
+ if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
+ {
+ newset = aliases + strlen (aliases) + 1;
+ break;
+ }
+ }
+
+#else /*!HAVE_W32_SYSTEM*/
+
+#ifdef HAVE_LANGINFO_CODESET
+ newset = nl_langinfo (CODESET);
+#else /*!HAVE_LANGINFO_CODESET*/
+ /* Try to get the used charset from environment variables. */
+ static char codepage[30];
+ const char *lc, *dot, *mod;
+
+ strcpy (codepage, "iso-8859-1");
+ lc = getenv ("LC_ALL");
+ if (!lc || !*lc)
+ {
+ lc = getenv ("LC_CTYPE");
+ if (!lc || !*lc)
+ lc = getenv ("LANG");
+ }
+ if (lc && *lc)
+ {
+ dot = strchr (lc, '.');
+ if (dot)
+ {
+ mod = strchr (++dot, '@');
+ if (!mod)
+ mod = dot + strlen (dot);
+ if (mod - dot < sizeof codepage && dot != mod)
+ {
+ memcpy (codepage, dot, mod - dot);
+ codepage [mod - dot] = 0;
+ }
+ }
+ }
+ newset = codepage;
+#endif /*!HAVE_LANGINFO_CODESET*/
+#endif /*!HAVE_W32_SYSTEM*/
+ }
+
+ full_newset = newset;
+ if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
+ {
+ newset += 3;
+ if (*newset == '-' || *newset == '_')
+ newset++;
+ }
+
+ /* Note that we silently assume that plain ASCII is actually meant
+ as Latin-1. This makes sense because many Unix system don't have
+ their locale set up properly and thus would get annoying error
+ messages and we have to handle all the "bug" reports. Latin-1 has
+ always been the character set used for 8 bit characters on Unix
+ systems. */
+ if ( !*newset
+ || !ascii_strcasecmp (newset, "8859-1" )
+ || !ascii_strcasecmp (newset, "646" )
+ || !ascii_strcasecmp (newset, "ASCII" )
+ || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
+ )
+ {
+ active_charset_name = "iso-8859-1";
+ no_translation = 0;
+ use_iconv = 0;
+ }
+ else if ( !ascii_strcasecmp (newset, "utf8" )
+ || !ascii_strcasecmp(newset, "utf-8") )
+ {
+ active_charset_name = "utf-8";
+ no_translation = 1;
+ use_iconv = 0;
+ }
+ else
+ {
+ iconv_t cd;
+
+#ifdef HAVE_W32_SYSTEM
+ if (load_libiconv ())
+ return -1;
+#endif /*HAVE_W32_SYSTEM*/
+
+ cd = iconv_open (full_newset, "utf-8");
+ if (cd == (iconv_t)-1)
+ {
+ handle_iconv_error (full_newset, "utf-8", 0);
+ return -1;
+ }
+ iconv_close (cd);
+ cd = iconv_open ("utf-8", full_newset);
+ if (cd == (iconv_t)-1)
+ {
+ handle_iconv_error ("utf-8", full_newset, 0);
+ return -1;
+ }
+ iconv_close (cd);
+ active_charset_name = full_newset;
+ no_translation = 0;
+ use_iconv = 1;
+ }
+ return 0;
+}
+
+const char *
+get_native_charset ()
+{
+ return active_charset_name;
+}
+
+/* Return true if the native charset is utf-8. */
+int
+is_native_utf8 (void)
+{
+ return no_translation;
+}
+
+
+/* Convert string, which is in native encoding to UTF8 and return a
+ new allocated UTF-8 string. This function terminates the process
+ on memory shortage. */
+char *
+native_to_utf8 (const char *orig_string)
+{
+ const unsigned char *string = (const unsigned char *)orig_string;
+ const unsigned char *s;
+ char *buffer;
+ unsigned char *p;
+ size_t length = 0;
+
+ if (no_translation)
+ {
+ /* Already utf-8 encoded. */
+ buffer = jnlib_xstrdup (orig_string);
+ }
+ else if (!use_iconv)
+ {
+ /* For Latin-1 we can avoid the iconv overhead. */
+ for (s = string; *s; s++)
+ {
+ length++;
+ if (*s & 0x80)
+ length++;
+ }
+ buffer = jnlib_xmalloc (length + 1);
+ for (p = (unsigned char *)buffer, s = string; *s; s++)
+ {
+ if ( (*s & 0x80 ))
+ {
+ *p++ = 0xc0 | ((*s >> 6) & 3);
+ *p++ = 0x80 | (*s & 0x3f);
+ }
+ else
+ *p++ = *s;
+ }
+ *p = 0;
+ }
+ else
+ {
+ /* Need to use iconv. */
+ iconv_t cd;
+ const char *inptr;
+ char *outptr;
+ size_t inbytes, outbytes;
+
+ cd = iconv_open ("utf-8", active_charset_name);
+ if (cd == (iconv_t)-1)
+ {
+ handle_iconv_error ("utf-8", active_charset_name, 1);
+ return native_to_utf8 (string);
+ }
+
+ for (s=string; *s; s++ )
+ {
+ length++;
+ if ((*s & 0x80))
+ length += 5; /* We may need up to 6 bytes for the utf8 output. */
+ }
+ buffer = jnlib_xmalloc (length + 1);
+
+ inptr = string;
+ inbytes = strlen (string);
+ outptr = buffer;
+ outbytes = length;
+ if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
+ &outptr, &outbytes) == (size_t)-1)
+ {
+ static int shown;
+
+ if (!shown)
+ log_info (_("conversion from `%s' to `%s' failed: %s\n"),
+ active_charset_name, "utf-8", strerror (errno));
+ shown = 1;
+ /* We don't do any conversion at all but use the strings as is. */
+ strcpy (buffer, string);
+ }
+ else /* Success. */
+ {
+ *outptr = 0;
+ /* We could realloc the buffer now but I doubt that it makes
+ much sense given that it will get freed anyway soon
+ after. */
+ }
+ iconv_close (cd);
+ }
+ return buffer;
+}
+
+
+
+static char *
+do_utf8_to_native (const char *string, size_t length, int delim,
+ int with_iconv)
+{
+ int nleft;
+ int i;
+ unsigned char encbuf[8];
+ int encidx;
+ const unsigned char *s;
+ size_t n;
+ char *buffer = NULL;
+ char *p = NULL;
+ unsigned long val = 0;
+ size_t slen;
+ int resync = 0;
+
+ /* First pass (p==NULL): count the extended utf-8 characters. */
+ /* Second pass (p!=NULL): create string. */
+ for (;;)
+ {
+ for (slen = length, nleft = encidx = 0, n = 0,
+ s = (const unsigned char *)string;
+ slen;
+ s++, slen--)
+ {
+ if (resync)
+ {
+ if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
+ {
+ /* Still invalid. */
+ if (p)
+ {
+ sprintf (p, "\\x%02x", *s);
+ p += 4;
+ }
+ n += 4;
+ continue;
+ }
+ resync = 0;
+ }
+ if (!nleft)
+ {
+ if (!(*s & 0x80))
+ {
+ /* Plain ascii. */
+ if ( delim != -1
+ && (*s < 0x20 || *s == 0x7f || *s == delim
+ || (delim && *s == '\\')))
+ {
+ n++;
+ if (p)
+ *p++ = '\\';
+ switch (*s)
+ {
+ case '\n': n++; if ( p ) *p++ = 'n'; break;
+ case '\r': n++; if ( p ) *p++ = 'r'; break;
+ case '\f': n++; if ( p ) *p++ = 'f'; break;
+ case '\v': n++; if ( p ) *p++ = 'v'; break;
+ case '\b': n++; if ( p ) *p++ = 'b'; break;
+ case 0: n++; if ( p ) *p++ = '0'; break;
+ default:
+ n += 3;
+ if (p)
+ {
+ sprintf (p, "x%02x", *s);
+ p += 3;
+ }
+ break;
+ }
+ }
+ else
+ {
+ if (p)
+ *p++ = *s;
+ n++;
+ }
+ }
+ else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
+ {
+ val = *s & 0x1f;
+ nleft = 1;
+ encidx = 0;
+ encbuf[encidx++] = *s;
+ }
+ else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
+ {
+ val = *s & 0x0f;
+ nleft = 2;
+ encidx = 0;
+ encbuf[encidx++] = *s;
+ }
+ else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
+ {
+ val = *s & 0x07;
+ nleft = 3;
+ encidx = 0;
+ encbuf[encidx++] = *s;
+ }
+ else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
+ {
+ val = *s & 0x03;
+ nleft = 4;
+ encidx = 0;
+ encbuf[encidx++] = *s;
+ }
+ else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
+ {
+ val = *s & 0x01;
+ nleft = 5;
+ encidx = 0;
+ encbuf[encidx++] = *s;
+ }
+ else /* Invalid encoding: print as \xNN. */
+ {
+ if (p)
+ {
+ sprintf (p, "\\x%02x", *s);
+ p += 4;
+ }
+ n += 4;
+ resync = 1;
+ }
+ }
+ else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
+ {
+ if (p)
+ {
+ for (i = 0; i < encidx; i++)
+ {
+ sprintf (p, "\\x%02x", encbuf[i]);
+ p += 4;
+ }
+ sprintf (p, "\\x%02x", *s);
+ p += 4;
+ }
+ n += 4 + 4 * encidx;
+ nleft = 0;
+ encidx = 0;
+ resync = 1;
+ }
+ else
+ {
+ encbuf[encidx++] = *s;
+ val <<= 6;
+ val |= *s & 0x3f;
+ if (!--nleft) /* Ready. */
+ {
+ if (no_translation)
+ {
+ if (p)
+ {
+ for (i = 0; i < encidx; i++)
+ *p++ = encbuf[i];
+ }
+ n += encidx;
+ encidx = 0;
+ }
+ else if (with_iconv)
+ {
+ /* Our strategy for using iconv is a bit strange
+ but it better keeps compatibility with
+ previous versions in regard to how invalid
+ encodings are displayed. What we do is to
+ keep the utf-8 as is and have the real
+ translation step then at the end. Yes, I
+ know that this is ugly. However we are short
+ of the 1.4 release and for this branch we
+ should not mess too much around with iconv
+ things. One reason for this is that we don't
+ know enough about non-GNU iconv
+ implementation and want to minimize the risk
+ of breaking the code on too many platforms. */
+ if ( p )
+ {
+ for (i=0; i < encidx; i++ )
+ *p++ = encbuf[i];
+ }
+ n += encidx;
+ encidx = 0;
+ }
+ else /* Latin-1 case. */
+ {
+ if (val >= 0x80 && val < 256)
+ {
+ /* We can simply print this character */
+ n++;
+ if (p)
+ *p++ = val;
+ }
+ else
+ {
+ /* We do not have a translation: print utf8. */
+ if (p)
+ {
+ for (i = 0; i < encidx; i++)
+ {
+ sprintf (p, "\\x%02x", encbuf[i]);
+ p += 4;
+ }
+ }
+ n += encidx * 4;
+ encidx = 0;
+ }
+ }
+ }
+
+ }
+ }
+ if (!buffer)
+ {
+ /* Allocate the buffer after the first pass. */
+ buffer = p = jnlib_xmalloc (n + 1);
+ }
+ else if (with_iconv)
+ {
+ /* Note: See above for comments. */
+ iconv_t cd;
+ const char *inptr;
+ char *outbuf, *outptr;
+ size_t inbytes, outbytes;
+
+ *p = 0; /* Terminate the buffer. */
+
+ cd = iconv_open (active_charset_name, "utf-8");
+ if (cd == (iconv_t)-1)
+ {
+ handle_iconv_error (active_charset_name, "utf-8", 1);
+ jnlib_free (buffer);
+ return utf8_to_native (string, length, delim);
+ }
+
+ /* Allocate a new buffer large enough to hold all possible
+ encodings. */
+ n = p - buffer + 1;
+ inbytes = n - 1;;
+ inptr = buffer;
+ outbytes = n * MB_LEN_MAX;
+ if (outbytes / MB_LEN_MAX != n)
+ BUG (); /* Actually an overflow. */
+ outbuf = outptr = jnlib_xmalloc (outbytes);
+ if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
+ &outptr, &outbytes) == (size_t)-1)
+ {
+ static int shown;
+
+ if (!shown)
+ log_info (_("conversion from `%s' to `%s' failed: %s\n"),
+ "utf-8", active_charset_name, strerror (errno));
+ shown = 1;
+ /* Didn't worked out. Try again but without iconv. */
+ jnlib_free (buffer);
+ buffer = NULL;
+ jnlib_free (outbuf);
+ outbuf = do_utf8_to_native (string, length, delim, 0);
+ }
+ else /* Success. */
+ {
+ *outptr = 0; /* Make sure it is a string. */
+ /* We could realloc the buffer now but I doubt that it
+ makes much sense given that it will get freed
+ anyway soon after. */
+ jnlib_free (buffer);
+ }
+ iconv_close (cd);
+ return outbuf;
+ }
+ else /* Not using iconv. */
+ {
+ *p = 0; /* Make sure it is a string. */
+ return buffer;
+ }
+ }
+}
+
+/* Convert string, which is in UTF-8 to native encoding. Replace
+ illegal encodings by some "\xnn" and quote all control
+ characters. A character with value DELIM will always be quoted, it
+ must be a vanilla ASCII character. A DELIM value of -1 is special:
+ it disables all quoting of control characters. This function
+ terminates the process on memory shortage. */
+char *
+utf8_to_native (const char *string, size_t length, int delim)
+{
+ return do_utf8_to_native (string, length, delim, use_iconv);
+}
+
+
+
+
+/* Wrapper function for iconv_open, required for W32 as we dlopen that
+ library on that system. */
+jnlib_iconv_t
+jnlib_iconv_open (const char *tocode, const char *fromcode)
+{
+#ifdef HAVE_W32_SYSTEM
+ if (load_libiconv ())
+ return (jnlib_iconv_t)(-1);
+#endif /*HAVE_W32_SYSTEM*/
+
+ return (jnlib_iconv_t)iconv_open (tocode, fromcode);
+}
+
+
+/* Wrapper function for iconv, required for W32 as we dlopen that
+ library on that system. */
+size_t
+jnlib_iconv (jnlib_iconv_t cd,
+ const char **inbuf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft)
+{
+
+#ifdef HAVE_W32_SYSTEM
+ if (load_libiconv ())
+ return 0;
+#endif /*HAVE_W32_SYSTEM*/
+
+ return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft);
+}
+
+/* Wrapper function for iconv_close, required for W32 as we dlopen that
+ library on that system. */
+int
+jnlib_iconv_close (jnlib_iconv_t cd)
+{
+#ifdef HAVE_W32_SYSTEM
+ if (load_libiconv ())
+ return 0;
+#endif /*HAVE_W32_SYSTEM*/
+
+ return iconv_close ((iconv_t)cd);
+}