/* tld.c --- Handle TLD restriction checking. * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Simon Josefsson. * Copyright (C) 2003, 2004 Free Software Foundation, Inc. * * Author: Thomas Jacob, Internet24.de * * This file is part of GNU Libidn. * * GNU Libidn is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * GNU Libidn is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with GNU Libidn; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA * */ #include /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */ #include /* Get strcmp(). */ #include /* Get specifications. */ #include /* Array of built-in domain restriction structures. See tlds.c. */ extern const Tld_table *_tld_tables[]; /** * tld_get_table - get table for a TLD name in table * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. * @tables: Zero terminated array of #Tld_table info-structures for * TLDs. * * Get the TLD table for a named TLD by searching through the given * TLD table array. * * Return value: Return structure corresponding to TLD @tld by going * thru @tables, or return %NULL if no such structure is found. */ const Tld_table * tld_get_table (const char *tld, const Tld_table ** tables) { const Tld_table **tldtable = NULL; if (!tld || !tables) return NULL; for (tldtable = tables; *tldtable; tldtable++) if (!strcmp ((*tldtable)->name, tld)) return *tldtable; return NULL; } /** * tld_default_table - get table for a TLD name * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. * @overrides: Additional zero terminated array of #Tld_table * info-structures for TLDs, or %NULL to only use library deault * tables. * * Get the TLD table for a named TLD, using the internal defaults, * possibly overrided by the (optional) supplied tables. * * Return value: Return structure corresponding to TLD @tld_str, first * looking through @overrides then thru built-in list, or %NULL if * no such structure found. */ const Tld_table * tld_default_table (const char *tld, const Tld_table ** overrides) { const Tld_table *tldtable = NULL; if (!tld) return NULL; if (overrides) tldtable = tld_get_table (tld, overrides); if (!tldtable) tldtable = tld_get_table (tld, _tld_tables); return tldtable; } #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \ (c) == 0xFF0E || (c) == 0xFF61) /** * tld_get_4 - extract top level domain part in input Unicode string * @in: Array of unicode code points to process. Does not need to be * zero terminated. * @inlen: Number of unicode code points. * @out: Zero terminated ascii result string pointer. * * Isolate the top-level domain of @in and return it as an ASCII * string in @out. * * Return value: Return %TLD_SUCCESS on success, or the corresponding * #Tld_rc error code otherwise. */ int tld_get_4 (const uint32_t * in, size_t inlen, char **out) { const uint32_t *ipos; size_t olen; *out = NULL; if (!in || inlen == 0) return TLD_NODATA; ipos = &in[inlen - 1]; olen = 0; /* Scan backwards for non(latin)letters. */ while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) || (*ipos >= 0x61 && *ipos <= 0x7A))) ipos--, olen++; if (olen > 0 && DOTP (*ipos)) /* Found something that appears a TLD. */ { char *out_s = malloc (sizeof (char) * (olen + 1)); char *opos = out_s; if (!opos) return TLD_MALLOC_ERROR; ipos++; /* Transcribe to lowercase ascii string. */ for (; ipos < &in[inlen]; ipos++, opos++) *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20; *opos = 0; *out = out_s; return TLD_SUCCESS; } return TLD_NO_TLD; } /** * tld_get_4z - extract top level domain part in input Unicode string * @in: Zero terminated array of unicode code points to process. * @out: Zero terminated ascii result string pointer. * * Isolate the top-level domain of @in and return it as an ASCII * string in @out. * * Return value: Return %TLD_SUCCESS on success, or the corresponding * #Tld_rc error code otherwise. */ int tld_get_4z (const uint32_t * in, char **out) { const uint32_t *ipos = in; if (!in) return TLD_NODATA; while (*ipos) ipos++; return tld_get_4 (in, ipos - in, out); } /** * tld_get_z - extract top level domain part in input string * @in: Zero terminated character array to process. * @out: Zero terminated ascii result string pointer. * * Isolate the top-level domain of @in and return it as an ASCII * string in @out. The input string @in may be UTF-8, ISO-8859-1 or * any ASCII compatible character encoding. * * Return value: Return %TLD_SUCCESS on success, or the corresponding * #Tld_rc error code otherwise. */ int tld_get_z (const char *in, char **out) { uint32_t *iucs; size_t i, ilen; int rc; ilen = strlen (in); iucs = calloc (ilen, sizeof (*iucs)); if (!iucs) return TLD_MALLOC_ERROR; for (i = 0; i < ilen; i++) iucs[i] = in[i]; rc = tld_get_4 (iucs, ilen, out); free (iucs); return rc; } /* * tld_checkchar - verify that character is permitted * @ch: 32 bit unicode character to check. * @tld: A #Tld_table data structure to check @ch against. * * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid * character in @tld. * * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a * valid character for the TLD @tld or if @tld is %NULL, * %TLD_INVALID if @ch is invalid as defined by @tld. */ static int _tld_checkchar (uint32_t ch, const Tld_table * tld) { const Tld_table_element *s, *e, *m; if (!tld) return TLD_SUCCESS; /* Check for [-a-z0-9.]. */ if ((ch >= 0x61 && ch <= 0x7A) || (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch)) return TLD_SUCCESS; s = tld->valid; e = s + tld->nvalid; while (s < e) { m = s + ((e - s) >> 1); if (ch < m->start) e = m; else if (ch > m->end) s = m + 1; else return TLD_SUCCESS; } return TLD_INVALID; } /** * tld_check_4t - verify that characters are permitted * @in: Array of unicode code points to process. Does not need to be * zero terminated. * @inlen: Number of unicode code points. * @errpos: Position of offending character is returned here. * @tld: A #Tld_table data structure representing the restrictions for * which the input should be tested. * * Test each of the code points in @in for whether or not * they are allowed by the data structure in @tld, return * the position of the first character for which this is not * the case in @errpos. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code * points are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos, const Tld_table * tld) { const uint32_t *ipos; int rc; if (!tld) /* No data for TLD so everything is valid. */ return TLD_SUCCESS; ipos = in; while (ipos < &in[inlen]) { rc = _tld_checkchar (*ipos, tld); if (rc != TLD_SUCCESS) { if (errpos) *errpos = ipos - in; return rc; } ipos++; } return TLD_SUCCESS; } /** * tld_check_4tz - verify that characters are permitted * @in: Zero terminated array of unicode code points to process. * @errpos: Position of offending character is returned here. * @tld: A #Tld_table data structure representing the restrictions for * which the input should be tested. * * Test each of the code points in @in for whether or not * they are allowed by the data structure in @tld, return * the position of the first character for which this is not * the case in @errpos. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code * points are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld) { const uint32_t *ipos = in; if (!ipos) return TLD_NODATA; while (*ipos) ipos++; return tld_check_4t (in, ipos - in, errpos, tld); } /** * tld_check_4 - verify that characters are permitted * @in: Array of unicode code points to process. Does not need to be * zero terminated. * @inlen: Number of unicode code points. * @errpos: Position of offending character is returned here. * @overrides: A #Tld_table array of additional domain restriction * structures that complement and supersede the built-in information. * * Test each of the code points in @in for whether or not they are * allowed by the information in @overrides or by the built-in TLD * restriction data. When data for the same TLD is available both * internally and in @overrides, the information in @overrides takes * precedence. If several entries for a specific TLD are found, the * first one is used. If @overrides is %NULL, only the built-in * information is used. The position of the first offending character * is returned in @errpos. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code * points are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos, const Tld_table ** overrides) { const Tld_table *tld; char *domain; int rc; if (errpos) *errpos = 0; /* Get TLD name. */ rc = tld_get_4 (in, inlen, &domain); if (rc != TLD_SUCCESS) { if (rc == TLD_NO_TLD) /* No TLD, say OK */ return TLD_SUCCESS; else return rc; } /* Retrieve appropriate data structure. */ tld = tld_default_table (domain, overrides); free (domain); return tld_check_4t (in, inlen, errpos, tld); } /** * tld_check_4z - verify that characters are permitted * @in: Zero-terminated array of unicode code points to process. * @errpos: Position of offending character is returned here. * @overrides: A #Tld_table array of additional domain restriction * structures that complement and supersede the built-in information. * * Test each of the code points in @in for whether or not they are * allowed by the information in @overrides or by the built-in TLD * restriction data. When data for the same TLD is available both * internally and in @overrides, the information in @overrides takes * precedence. If several entries for a specific TLD are found, the * first one is used. If @overrides is %NULL, only the built-in * information is used. The position of the first offending character * is returned in @errpos. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code * points are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_4z (const uint32_t * in, size_t * errpos, const Tld_table ** overrides) { const uint32_t *ipos = in; if (!ipos) return TLD_NODATA; while (*ipos) ipos++; return tld_check_4 (in, ipos - in, errpos, overrides); } /** * tld_check_8z - verify that characters are permitted * @in: Zero-terminated UTF8 string to process. * @errpos: Position of offending character is returned here. * @overrides: A #Tld_table array of additional domain restriction * structures that complement and supersede the built-in information. * * Test each of the characters in @in for whether or not they are * allowed by the information in @overrides or by the built-in TLD * restriction data. When data for the same TLD is available both * internally and in @overrides, the information in @overrides takes * precedence. If several entries for a specific TLD are found, the * first one is used. If @overrides is %NULL, only the built-in * information is used. The position of the first offending character * is returned in @errpos. Note that the error position refers to the * decoded character offset rather than the byte position in the * string. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all * characters are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides) { uint32_t *iucs; size_t ilen; int rc; if (!in) return TLD_NODATA; iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen); if (!iucs) return TLD_MALLOC_ERROR; rc = tld_check_4 (iucs, ilen, errpos, overrides); free (iucs); return rc; } /** * tld_check_lz - verify that characters are permitted * @in: Zero-terminated string in the current locales encoding to process. * @errpos: Position of offending character is returned here. * @overrides: A #Tld_table array of additional domain restriction * structures that complement and supersede the built-in information. * * Test each of the characters in @in for whether or not they are * allowed by the information in @overrides or by the built-in TLD * restriction data. When data for the same TLD is available both * internally and in @overrides, the information in @overrides takes * precedence. If several entries for a specific TLD are found, the * first one is used. If @overrides is %NULL, only the built-in * information is used. The position of the first offending character * is returned in @errpos. Note that the error position refers to the * decoded character offset rather than the byte position in the * string. * * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all * characters are valid or when @tld is null, %TLD_INVALID if a * character is not allowed, or additional error codes on general * failure conditions. */ int tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides) { char *utf8; int rc; if (!in) return TLD_NODATA; utf8 = stringprep_locale_to_utf8 (in); if (!utf8) return TLD_ICONV_ERROR; rc = tld_check_8z (utf8, errpos, overrides); free (utf8); return rc; } /** * Tld_rc: * @TLD_SUCCESS: Successful operation. This value is guaranteed to * always be zero, the remaining ones are only guaranteed to hold * non-zero values, for logical comparison purposes. * @TLD_INVALID: Invalid character found. * @TLD_NODATA: No input data was provided. * @TLD_MALLOC_ERROR: Error during memory allocation. * @TLD_ICONV_ERROR: Error during iconv string conversion. * @TLD_NO_TLD: No top-level domain found in domain string. * * Enumerated return codes of the TLD checking functions. * The value 0 is guaranteed to always correspond to success. */