summaryrefslogtreecommitdiff
path: root/src/iri.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/iri.c')
-rw-r--r--src/iri.c368
1 files changed, 368 insertions, 0 deletions
diff --git a/src/iri.c b/src/iri.c
new file mode 100644
index 0000000..9b16639
--- /dev/null
+++ b/src/iri.c
@@ -0,0 +1,368 @@
+/* IRI related functions.
+ Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <iconv.h>
+#include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
+
+#include "utils.h"
+
+/* RFC3987 section 3.1 mandates STD3 ASCII RULES */
+/* Flags handed to the libidn calls made in this file
+   (idna_to_ascii_8z and idna_to_unicode_8zlz). */
+#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
+
+/* Note: locale encoding is kept in options struct (opt.locale) */
+
+/* Forward declaration: do_conversion is defined further down in this file,
+   after its first caller (locale_to_utf8). */
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
+/* Look for a "charset=XXX" parameter in STR (the key is matched without
+   regard to case) and return the XXX part, which runs up to the next
+   whitespace character or the end of the string.
+
+   Returns NULL when STR is NULL or empty, when no "charset=" key is present,
+   or when check_encoding_name rejects the value.  Otherwise returns the copy
+   produced by strdupdelim; the caller is presumably expected to release
+   it (this function releases it with xfree when the value is rejected). */
+char *
+parse_charset (char *str)
+{
+  char *start, *end, *name;
+
+  if (str == NULL || *str == '\0')
+    return NULL;
+
+  start = strcasestr (str, "charset=");
+  if (start == NULL)
+    return NULL;
+
+  /* Step over the "charset=" key itself. */
+  start += sizeof "charset=" - 1;
+
+  /* sXXXav: which chars should be banned ??? */
+  for (end = start; *end != '\0' && !c_isspace (*end); end++)
+    ;
+
+  /* sXXXav: could strdupdelim return NULL ? */
+  name = strdupdelim (start, end);
+
+  /* Only hand back values that pass the minimal sanity check. */
+  if (check_encoding_name (name))
+    return name;
+
+  xfree (name);
+  return NULL;
+}
+
+/* Return the charset name reported by stringprep_locale_charset (), with
+   its const qualifier cast away.  NOTE(review): any fallback to a default
+   value would happen inside libidn -- not visible from here. */
+char *
+find_locale (void)
+{
+  const char *charset = stringprep_locale_charset ();
+
+  return (char *) charset;
+}
+
+/* Minimal sanity check of an encoding name: every character must be ASCII
+   and must not be whitespace.  Returns true when ENCODING passes (an empty
+   string passes too); otherwise logs a verbose message naming the encoding
+   and returns false. */
+bool
+check_encoding_name (char *encoding)
+{
+  char *p;
+
+  for (p = encoding; *p != '\0'; p++)
+    {
+      if (c_isascii (*p) && !c_isspace (*p))
+        continue;
+
+      /* First offending character found: report and give up. */
+      logprintf (LOG_VERBOSE, _("Encoding %s isn't valid\n"), quote (encoding));
+      return false;
+    }
+
+  return true;
+}
+
+/* Try converting string STR from the locale encoding (opt.locale) to UTF-8.
+
+   Returns a newly allocated string on success.  Returns STR itself when no
+   conversion is needed (locale unknown or already UTF-8), when iconv does
+   not support the conversion, or when the conversion fails.  Callers can
+   therefore tell whether a new string was produced by comparing the result
+   with STR. */
+const char *
+locale_to_utf8 (const char *str)
+{
+  iconv_t l2u;
+  char *converted;
+  bool ok;
+
+  /* That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n"));
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  /* iconv_open signals failure with (iconv_t) -1; only then is the
+     conversion unsupported. */
+  l2u = iconv_open ("UTF-8", opt.locale);
+  if (l2u == (iconv_t)(-1))
+    {
+      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
+                 quote (opt.locale), quote ("UTF-8"));
+      return str;
+    }
+
+  ok = do_conversion (l2u, (char *) str, strlen (str), &converted);
+
+  /* The descriptor is only needed for this one conversion; release it on
+     every path so it is not leaked. */
+  iconv_close (l2u);
+
+  return ok ? (const char *) converted : str;
+}
+
+/* Transcode the INLEN bytes at IN using the conversion descriptor CD.
+
+   On success returns true and stores in *OUT a freshly allocated,
+   NUL-terminated string holding the transcoded text.
+
+   Input bytes that iconv rejects as an invalid or incomplete multibyte
+   sequence are copied to the output unchanged, one byte at a time, and a
+   single verbose message is logged for the whole call.
+
+   On an unexpected iconv error returns false.  Callers must treat *OUT as
+   unspecified in that case; it is nevertheless left pointing at the start
+   of the NUL-terminated partial output, so that a caller which inspects it
+   anyway does not read unterminated memory.
+
+   CD is not closed here; that remains the caller's job. */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* Smallest amount by which the buffer grows, so that every regrow makes
+     progress even when little or no input is left. */
+  enum { MIN_GROWTH = 16 };
+
+  /* sXXXav : hummm hard to guess... */
+  size_t cap = inlen * 2;       /* usable bytes, not counting the final NUL */
+  char *buf = xmalloc (cap + 1);
+  char *dst = buf;              /* next byte to be written */
+  size_t avail = cap;           /* free bytes at DST, not counting the NUL */
+  bool warned = false;
+
+  for (;;)
+    {
+      int err;
+
+      if (iconv (cd, &in, &inlen, &dst, &avail) != (size_t)(-1))
+        {
+          *dst = '\0';
+          *out = buf;
+          return true;
+        }
+
+      /* Save errno right away: the calls below may overwrite it. */
+      err = errno;
+
+      if ((err == EINVAL || err == EILSEQ) && inlen > 0 && avail > 0)
+        {
+          /* Incomplete or invalid multibyte sequence: pass the offending
+             byte through untouched and carry on after it. */
+          if (!warned)
+            {
+              logprintf (LOG_VERBOSE,
+                         _("Incomplete or invalid multibyte sequence encountered\n"));
+              warned = true;
+            }
+
+          *dst++ = *in++;
+          inlen--;
+          avail--;
+        }
+      else if (err == E2BIG
+               || ((err == EINVAL || err == EILSEQ) && inlen > 0))
+        {
+          /* Output buffer full (or no room left to pass a bad byte
+             through): grow it.  Only the bytes actually written are kept,
+             and AVAIL is recomputed from the real capacity so that iconv
+             can never be told about more room than was allocated. */
+          size_t used = (size_t) (dst - buf);
+          size_t extra = inlen * 2;
+          char *bigger;
+
+          if (extra < MIN_GROWTH)
+            extra = MIN_GROWTH;
+
+          cap += extra;
+          bigger = xmalloc (cap + 1);
+          memcpy (bigger, buf, used);
+          xfree (buf);
+          buf = bigger;
+          dst = buf + used;
+          avail = cap - used;
+        }
+      else /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), err);
+          break;
+        }
+    }
+
+  /* Failure: keep *OUT well-formed even though callers should ignore it. */
+  *dst = '\0';
+  *out = buf;
+  return false;
+}
+
+/* Try to "ASCII encode" (IDNA ToASCII) the host name HOST.
+
+   When I->utf8_encode is false, HOST is first transcoded from the remote
+   encoding to UTF-8 with remote_to_utf8; if that reports nothing to encode
+   or an error, NULL is returned.  Otherwise HOST is passed to libidn as is.
+
+   Returns the string allocated by idna_to_ascii_8z on success, or NULL on
+   error (a verbose message with the libidn error is logged). */
+char *
+idn_encode (struct iri *i, char *host)
+{
+  const char *utf8_host = NULL; /* temporary UTF-8 copy of HOST, if made */
+  char *ascii_host = NULL;
+  int ret;
+
+  /* Encode to UTF-8 if not done */
+  if (!i->utf8_encode)
+    {
+      if (!remote_to_utf8 (i, (const char *) host, &utf8_host))
+        return NULL;  /* Nothing to encode or an error occurred */
+      host = (char *) utf8_host;
+    }
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &ascii_host, IDNA_FLAGS);
+
+  /* The UTF-8 copy was only needed as input to libidn; release it on both
+     the success and the error path so it is not leaked. */
+  if (utf8_host)
+    xfree ((char *) utf8_host);
+
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, _("idn_encode failed (%d): %s\n"), ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return ascii_host;
+}
+
+/* Try to decode the "ASCII encoded" host name HOST with
+   idna_to_unicode_8zlz.  Returns the string produced by libidn on success;
+   on error logs a verbose message with the libidn error and returns NULL. */
+char *
+idn_decode (char *host)
+{
+  char *decoded;
+  int rc = idna_to_unicode_8zlz (host, &decoded, IDNA_FLAGS);
+
+  if (rc == IDNA_SUCCESS)
+    return decoded;
+
+  logprintf (LOG_VERBOSE, _("idn_decode failed (%d): %s\n"), rc,
+             quote (idna_strerror (rc)));
+  return NULL;
+}
+
+/* Try to transcode string STR from the remote encoding (I->uri_encoding)
+   to UTF-8.
+
+   Returns true when a new string was produced; *NEW then holds a freshly
+   allocated string owned by the caller.  Returns false when no remote
+   encoding is known, when iconv does not support the conversion, when the
+   conversion fails, or when there is nothing to convert (the result would
+   be identical to STR).  Callers must not use *NEW after a false return. */
+bool
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
+{
+  iconv_t cd;
+  bool converted;
+
+  if (!i->uri_encoding)
+    return false;
+
+  /* When `i->uri_encoding' is UTF-8 there is nothing to convert.  But we
+     must test for non-ASCII symbols for correct hostname processing in
+     `idn_encode' function.  The name is compared without regard to case,
+     as encoding names are elsewhere in this file. */
+  if (!strcasecmp (i->uri_encoding, "UTF-8"))
+    {
+      const char *p;
+
+      for (p = str; *p; p++)
+        if ((unsigned char) *p >= 0x80)
+          {
+            *new = xstrdup (str);
+            return true;
+          }
+      return false;
+    }
+
+  cd = iconv_open ("UTF-8", i->uri_encoding);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  converted = do_conversion (cd, (char *) str, strlen (str), (char **) new);
+  iconv_close (cd);
+
+  /* On failure *NEW is unspecified, so it must not be inspected. */
+  if (!converted)
+    return false;
+
+  /* Test if something was converted */
+  if (!strcmp (str, *new))
+    {
+      xfree ((char *) *new);
+      *new = NULL;
+      return false;
+    }
+
+  return true;
+}
+
+/* Allocate an iri structure and return a pointer to it.  uri_encoding
+   starts as a copy of opt.encoding_remote (NULL when that option is unset),
+   content_encoding and orig_url start as NULL, and utf8_encode is taken
+   from opt.enable_iri. */
+struct iri *
+iri_new (void)
+{
+  struct iri *fresh = xmalloc (sizeof *fresh);
+
+  if (opt.encoding_remote)
+    fresh->uri_encoding = xstrdup (opt.encoding_remote);
+  else
+    fresh->uri_encoding = NULL;
+
+  fresh->content_encoding = NULL;
+  fresh->orig_url = NULL;
+  fresh->utf8_encode = opt.enable_iri;
+
+  return fresh;
+}
+
+/* Return a newly allocated copy of SRC.  Each non-NULL string member
+   (uri_encoding, content_encoding, orig_url) is duplicated with xstrdup,
+   NULL members stay NULL, and utf8_encode is copied as is. */
+struct iri *iri_dup (const struct iri *src)
+{
+  struct iri *copy = xmalloc (sizeof *copy);
+
+  copy->uri_encoding = NULL;
+  if (src->uri_encoding)
+    copy->uri_encoding = xstrdup (src->uri_encoding);
+
+  copy->content_encoding = NULL;
+  if (src->content_encoding)
+    copy->content_encoding = xstrdup (src->content_encoding);
+
+  copy->orig_url = NULL;
+  if (src->orig_url)
+    copy->orig_url = xstrdup (src->orig_url);
+
+  copy->utf8_encode = src->utf8_encode;
+
+  return copy;
+}
+
+/* Release an iri structure: its three string members (each of which may be
+   NULL) first, then the structure itself.  I must not be used afterwards. */
+void
+iri_free (struct iri *i)
+{
+  /* The string members are independent of each other; release them before
+     the structure that holds them. */
+  xfree_null (i->orig_url);
+  xfree_null (i->content_encoding);
+  xfree_null (i->uri_encoding);
+
+  xfree (i);
+}
+
+/* Set the uri_encoding member of I to a copy of CHARSET (or to NULL when
+   CHARSET is NULL).  When a remote encoding was specified in the options
+   (opt.encoding_remote), the call is ignored unless FORCE is true.  Nothing
+   is changed either when I already holds the same encoding name, compared
+   without regard to case. */
+void
+set_uri_encoding (struct iri *i, char *charset, bool force)
+{
+  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
+
+  if (opt.encoding_remote && !force)
+    return;
+
+  if (i->uri_encoding != NULL)
+    {
+      if (charset != NULL && strcasecmp (i->uri_encoding, charset) == 0)
+        return;
+
+      /* A different value is about to replace the old one. */
+      xfree (i->uri_encoding);
+    }
+
+  if (charset != NULL)
+    i->uri_encoding = xstrdup (charset);
+  else
+    i->uri_encoding = NULL;
+}
+
+/* Set the content_encoding member of I to a copy of CHARSET (or to NULL
+   when CHARSET is NULL).  The call is ignored when a remote encoding was
+   specified in the options (opt.encoding_remote), and nothing is changed
+   when I already holds the same encoding name, compared without regard to
+   case. */
+void
+set_content_encoding (struct iri *i, char *charset)
+{
+  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
+
+  if (opt.encoding_remote)
+    return;
+
+  if (i->content_encoding != NULL)
+    {
+      if (charset != NULL && strcasecmp (i->content_encoding, charset) == 0)
+        return;
+
+      /* A different value is about to replace the old one. */
+      xfree (i->content_encoding);
+    }
+
+  if (charset != NULL)
+    i->content_encoding = xstrdup (charset);
+  else
+    i->content_encoding = NULL;
+}
+