/* $Id: chars.c 4534 2011-02-24 02:47:25Z astyanax $ */
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009   *
 *   Free Software Foundation, Inc.                                       *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 3, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

#include "proto.h"

/* NOTE(review): the header names of the next two directives were missing
 * in the reviewed copy of this file.  They are restored from what the
 * code below uses: strlen()/strncmp()/strchr()/... and
 * isspace()/isalnum()/tolower()/...  Confirm against the repository. */
#include <string.h>
#include <ctype.h>

#ifdef ENABLE_UTF8
/* NOTE(review): header names restored from the HAVE_* guards. */
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif

static bool use_utf8 = FALSE;
    /* Whether we've enabled UTF-8 support. */
static const wchar_t bad_wchar = 0xFFFD;
    /* If we get an invalid multibyte sequence, we treat it as
     * Unicode FFFD (Replacement Character), unless we're searching
     * for a match to it. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
    /* The UTF-8 encoding of bad_wchar. */
static const int bad_mbchar_len = 3;
    /* The number of bytes in bad_mbchar, not counting the
     * terminating null. */

/* Enable UTF-8 support. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Is UTF-8 support enabled? */
bool using_utf8(void)
{
    return use_utf8;
}
#endif

#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank(): c is blank if it is
 * whitespace and is either a tab or not a control character. */
bool nisblank(int c)
{
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
}
#endif

#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
/* This function is equivalent to iswblank().
*/ bool niswblank(wchar_t wc) { return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc)); } #endif /* Return TRUE if the value of c is in byte range, and FALSE * otherwise. */ bool is_byte(int c) { return ((unsigned int)c == (unsigned char)c); } static void mbtowc_reset(void) { IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0)); } static void wctomb_reset(void) { IGNORE_CALL_RESULT(wctomb(NULL, 0)); } /* This function is equivalent to isalnum() for multibyte characters. */ bool is_alnum_mbchar(const char *c) { assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc_reset(); wc = bad_wchar; } return iswalnum(wc); } else #endif return isalnum((unsigned char)*c); } /* This function is equivalent to isblank() for multibyte characters. */ bool is_blank_mbchar(const char *c) { assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc_reset(); wc = bad_wchar; } return iswblank(wc); } else #endif return isblank((unsigned char)*c); } /* This function is equivalent to iscntrl(), except in that it only * handles non-high-bit control characters. */ bool is_ascii_cntrl_char(int c) { return (0 <= c && c < 32); } /* This function is equivalent to iscntrl(), except in that it also * handles high-bit control characters. */ bool is_cntrl_char(int c) { return (-128 <= c && c < -96) || (0 <= c && c < 32) || (127 <= c && c < 160); } #ifdef ENABLE_UTF8 /* This function is equivalent to iscntrl() for wide characters, except * in that it also handles wide control characters with their high bits * set. */ bool is_cntrl_wchar(wchar_t wc) { return (0 <= wc && wc < 32) || (127 <= wc && wc < 160); } #endif /* This function is equivalent to iscntrl() for multibyte characters, * except in that it also handles multibyte control characters with * their high bits set. 
*/ bool is_cntrl_mbchar(const char *c) { assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc_reset(); wc = bad_wchar; } return is_cntrl_wchar(wc); } else #endif return is_cntrl_char((unsigned char)*c); } /* This function is equivalent to ispunct() for multibyte characters. */ bool is_punct_mbchar(const char *c) { assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); if (c_mb_len < 0) { mbtowc_reset(); wc = bad_wchar; } return iswpunct(wc); } else #endif return ispunct((unsigned char)*c); } /* Return TRUE for a multibyte character found in a word (currently only * an alphanumeric or punctuation character, and only the latter if * allow_punct is TRUE) and FALSE otherwise. */ bool is_word_mbchar(const char *c, bool allow_punct) { assert(c != NULL); return is_alnum_mbchar(c) || (allow_punct ? is_punct_mbchar(c) : FALSE); } /* c is a control character. It displays as ^@, ^?, or ^[ch], where ch * is (c + 64). We return that character. */ char control_rep(char c) { assert(is_cntrl_char(c)); /* Treat newlines embedded in a line as encoded nulls. */ if (c == '\n') return '@'; else if (c == NANO_CONTROL_8) return '?'; else return c + 64; } #ifdef ENABLE_UTF8 /* c is a wide control character. It displays as ^@, ^?, or ^[ch], * where ch is (c + 64). We return that wide character. */ wchar_t control_wrep(wchar_t wc) { assert(is_cntrl_wchar(wc)); /* Treat newlines embedded in a line as encoded nulls. */ if (wc == '\n') return '@'; else if (wc == NANO_CONTROL_8) return '?'; else return wc + 64; } #endif /* c is a multibyte control character. It displays as ^@, ^?, or ^[ch], * where ch is (c + 64). We return that multibyte character. If crep * is an invalid multibyte sequence, it will be replaced with Unicode * 0xFFFD (Replacement Character). 
*/ char *control_mbrep(const char *c, char *crep, int *crep_len) { assert(c != NULL && crep != NULL && crep_len != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc_reset(); *crep_len = bad_mbchar_len; strncpy(crep, bad_mbchar, *crep_len); } else { *crep_len = wctomb(crep, control_wrep(wc)); if (*crep_len < 0) { wctomb_reset(); *crep_len = 0; } } } else { #endif *crep_len = 1; *crep = control_rep(*c); #ifdef ENABLE_UTF8 } #endif return crep; } /* c is a multibyte non-control character. We return that multibyte * character. If crep is an invalid multibyte sequence, it will be * replaced with Unicode 0xFFFD (Replacement Character). */ char *mbrep(const char *c, char *crep, int *crep_len) { assert(c != NULL && crep != NULL && crep_len != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; /* Reject invalid Unicode characters. */ if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) { mbtowc_reset(); *crep_len = bad_mbchar_len; strncpy(crep, bad_mbchar, *crep_len); } else { *crep_len = wctomb(crep, wc); if (*crep_len < 0) { wctomb_reset(); *crep_len = 0; } } } else { #endif *crep_len = 1; *crep = *c; #ifdef ENABLE_UTF8 } #endif return crep; } /* This function is equivalent to wcwidth() for multibyte characters. */ int mbwidth(const char *c) { assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; int width; if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc_reset(); wc = bad_wchar; } width = wcwidth(wc); if (width == -1) { wc = bad_wchar; width = wcwidth(wc); } return width; } else #endif return 1; } /* Return the maximum width in bytes of a multibyte character. */ int mb_cur_max(void) { return #ifdef ENABLE_UTF8 use_utf8 ? MB_CUR_MAX : #endif 1; } /* Convert the Unicode value in chr to a multibyte character with the * same wide character value as chr, if possible. If the conversion * succeeds, return the (dynamically allocated) multibyte character and * its length. 
Otherwise, return an undefined (dynamically allocated) * multibyte character and a length of zero. */ char *make_mbchar(long chr, int *chr_mb_len) { char *chr_mb; assert(chr_mb_len != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { chr_mb = charalloc(MB_CUR_MAX); *chr_mb_len = wctomb(chr_mb, (wchar_t)chr); /* Reject invalid Unicode characters. */ if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) { wctomb_reset(); *chr_mb_len = 0; } } else { #endif *chr_mb_len = 1; chr_mb = mallocstrncpy(NULL, (char *)&chr, 1); #ifdef ENABLE_UTF8 } #endif return chr_mb; } /* Parse a multibyte character from buf. Return the number of bytes * used. If chr isn't NULL, store the multibyte character in it. If * col isn't NULL, store the new display width in it. If *buf is '\t', * we expect col to have the current display width. */ int parse_mbchar(const char *buf, char *chr, size_t *col) { int buf_mb_len; assert(buf != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { /* Get the number of bytes in the multibyte character. */ buf_mb_len = mblen(buf, MB_CUR_MAX); /* If buf contains an invalid multibyte character, only * interpret buf's first byte. */ if (buf_mb_len < 0) { IGNORE_CALL_RESULT(mblen(NULL, 0)); buf_mb_len = 1; } else if (buf_mb_len == 0) buf_mb_len++; /* Save the multibyte character in chr. */ if (chr != NULL) { int i; for (i = 0; i < buf_mb_len; i++) chr[i] = buf[i]; } /* Save the column width of the wide character in col. */ if (col != NULL) { /* If we have a tab, get its width in columns using the * current value of col. */ if (*buf == '\t') *col += tabsize - *col % tabsize; /* If we have a control character, get its width using one * column for the "^" that will be displayed in front of it, * and the width in columns of its visible equivalent as * returned by control_mbrep(). 
*/ else if (is_cntrl_mbchar(buf)) { char *ctrl_buf_mb = charalloc(MB_CUR_MAX); int ctrl_buf_mb_len; (*col)++; ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb, &ctrl_buf_mb_len); *col += mbwidth(ctrl_buf_mb); free(ctrl_buf_mb); /* If we have a normal character, get its width in columns * normally. */ } else *col += mbwidth(buf); } } else { #endif /* Get the number of bytes in the byte character. */ buf_mb_len = 1; /* Save the byte character in chr. */ if (chr != NULL) *chr = *buf; if (col != NULL) { /* If we have a tab, get its width in columns using the * current value of col. */ if (*buf == '\t') *col += tabsize - *col % tabsize; /* If we have a control character, it's two columns wide: * one column for the "^" that will be displayed in front of * it, and one column for its visible equivalent as returned * by control_mbrep(). */ else if (is_cntrl_char((unsigned char)*buf)) *col += 2; /* If we have a normal character, it's one column wide. */ else (*col)++; } #ifdef ENABLE_UTF8 } #endif return buf_mb_len; } /* Return the index in buf of the beginning of the multibyte character * before the one at pos. */ size_t move_mbleft(const char *buf, size_t pos) { size_t pos_prev = pos; assert(buf != NULL && pos <= strlen(buf)); /* There is no library function to move backward one multibyte * character. Here is the naive, O(pos) way to do it. */ while (TRUE) { int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL); if (pos_prev <= buf_mb_len) break; pos_prev -= buf_mb_len; } return pos - pos_prev; } /* Return the index in buf of the beginning of the multibyte character * after the one at pos. */ size_t move_mbright(const char *buf, size_t pos) { return pos + parse_mbchar(buf + pos, NULL, NULL); } #ifndef HAVE_STRCASECMP /* This function is equivalent to strcasecmp(). */ int nstrcasecmp(const char *s1, const char *s2) { return strncasecmp(s1, s2, (size_t)-1); } #endif /* This function is equivalent to strcasecmp() for multibyte strings. 
*/ int mbstrcasecmp(const char *s1, const char *s2) { return mbstrncasecmp(s1, s2, (size_t)-1); } #ifndef HAVE_STRNCASECMP /* This function is equivalent to strncasecmp(). */ int nstrncasecmp(const char *s1, const char *s2, size_t n) { if (s1 == s2) return 0; assert(s1 != NULL && s2 != NULL); for (; *s1 != '\0' && *s2 != '\0' && n > 0; s1++, s2++, n--) { if (tolower(*s1) != tolower(*s2)) break; } return (n > 0) ? tolower(*s1) - tolower(*s2) : 0; } #endif /* This function is equivalent to strncasecmp() for multibyte * strings. */ int mbstrncasecmp(const char *s1, const char *s2, size_t n) { #ifdef ENABLE_UTF8 if (use_utf8) { char *s1_mb, *s2_mb; wchar_t ws1, ws2; if (s1 == s2) return 0; assert(s1 != NULL && s2 != NULL); s1_mb = charalloc(MB_CUR_MAX); s2_mb = charalloc(MB_CUR_MAX); for (; *s1 != '\0' && *s2 != '\0' && n > 0; s1 += move_mbright(s1, 0), s2 += move_mbright(s2, 0), n--) { bool bad_s1_mb = FALSE, bad_s2_mb = FALSE; int s1_mb_len, s2_mb_len; s1_mb_len = parse_mbchar(s1, s1_mb, NULL); if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) { mbtowc_reset(); ws1 = (unsigned char)*s1_mb; bad_s1_mb = TRUE; } s2_mb_len = parse_mbchar(s2, s2_mb, NULL); if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) { mbtowc_reset(); ws2 = (unsigned char)*s2_mb; bad_s2_mb = TRUE; } if (bad_s1_mb != bad_s2_mb || towlower(ws1) != towlower(ws2)) break; } free(s1_mb); free(s2_mb); return (n > 0) ? towlower(ws1) - towlower(ws2) : 0; } else #endif return strncasecmp(s1, s2, n); } #ifndef HAVE_STRCASESTR /* This function is equivalent to strcasestr(). 
*/ char *nstrcasestr(const char *haystack, const char *needle) { size_t haystack_len, needle_len; assert(haystack != NULL && needle != NULL); if (*needle == '\0') return (char *)haystack; haystack_len = strlen(haystack); needle_len = strlen(needle); for (; *haystack != '\0' && haystack_len >= needle_len; haystack++, haystack_len--) { if (strncasecmp(haystack, needle, needle_len) == 0) return (char *)haystack; } return NULL; } #endif /* This function is equivalent to strcasestr() for multibyte strings. */ char *mbstrcasestr(const char *haystack, const char *needle) { #ifdef ENABLE_UTF8 if (use_utf8) { size_t haystack_len, needle_len; assert(haystack != NULL && needle != NULL); if (*needle == '\0') return (char *)haystack; haystack_len = mbstrlen(haystack); needle_len = mbstrlen(needle); for (; *haystack != '\0' && haystack_len >= needle_len; haystack += move_mbright(haystack, 0), haystack_len--) { if (mbstrncasecmp(haystack, needle, needle_len) == 0) return (char *)haystack; } return NULL; } else #endif return (char *) strcasestr(haystack, needle); } #if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP) /* This function is equivalent to strstr(), except in that it scans the * string in reverse, starting at rev_start. */ char *revstrstr(const char *haystack, const char *needle, const char *rev_start) { size_t rev_start_len, needle_len; assert(haystack != NULL && needle != NULL && rev_start != NULL); if (*needle == '\0') return (char *)rev_start; needle_len = strlen(needle); if (strlen(haystack) < needle_len) return NULL; rev_start_len = strlen(rev_start); for (; rev_start >= haystack; rev_start--, rev_start_len++) { if (rev_start_len >= needle_len && strncmp(rev_start, needle, needle_len) == 0) return (char *)rev_start; } return NULL; } #endif /* !NANO_TINY || !DISABLE_TABCOMP */ #ifndef NANO_TINY /* This function is equivalent to strcasestr(), except in that it scans * the string in reverse, starting at rev_start. 
*/ char *revstrcasestr(const char *haystack, const char *needle, const char *rev_start) { size_t rev_start_len, needle_len; assert(haystack != NULL && needle != NULL && rev_start != NULL); if (*needle == '\0') return (char *)rev_start; needle_len = strlen(needle); if (strlen(haystack) < needle_len) return NULL; rev_start_len = strlen(rev_start); for (; rev_start >= haystack; rev_start--, rev_start_len++) { if (rev_start_len >= needle_len && strncasecmp(rev_start, needle, needle_len) == 0) return (char *)rev_start; } return NULL; } /* This function is equivalent to strcasestr() for multibyte strings, * except in that it scans the string in reverse, starting at * rev_start. */ char *mbrevstrcasestr(const char *haystack, const char *needle, const char *rev_start) { #ifdef ENABLE_UTF8 if (use_utf8) { bool begin_line = FALSE; size_t rev_start_len, needle_len; assert(haystack != NULL && needle != NULL && rev_start != NULL); if (*needle == '\0') return (char *)rev_start; needle_len = mbstrlen(needle); if (mbstrlen(haystack) < needle_len) return NULL; rev_start_len = mbstrlen(rev_start); while (!begin_line) { if (rev_start_len >= needle_len && mbstrncasecmp(rev_start, needle, needle_len) == 0) return (char *)rev_start; if (rev_start == haystack) begin_line = TRUE; else { rev_start = haystack + move_mbleft(haystack, rev_start - haystack); rev_start_len++; } } return NULL; } else #endif return revstrcasestr(haystack, needle, rev_start); } #endif /* !NANO_TINY */ /* This function is equivalent to strlen() for multibyte strings. */ size_t mbstrlen(const char *s) { return mbstrnlen(s, (size_t)-1); } #ifndef HAVE_STRNLEN /* This function is equivalent to strnlen(). */ size_t nstrnlen(const char *s, size_t maxlen) { size_t n = 0; assert(s != NULL); for (; *s != '\0' && maxlen > 0; s++, maxlen--, n++) ; return n; } #endif /* This function is equivalent to strnlen() for multibyte strings. 
*/ size_t mbstrnlen(const char *s, size_t maxlen) { assert(s != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { size_t n = 0; for (; *s != '\0' && maxlen > 0; s += move_mbright(s, 0), maxlen--, n++) ; return n; } else #endif return strnlen(s, maxlen); } #if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY) /* This function is equivalent to strchr() for multibyte strings. */ char *mbstrchr(const char *s, const char *c) { assert(s != NULL && c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { bool bad_s_mb = FALSE, bad_c_mb = FALSE; char *s_mb = charalloc(MB_CUR_MAX); const char *q = s; wchar_t ws, wc; int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); if (c_mb_len < 0) { mbtowc_reset(); wc = (unsigned char)*c; bad_c_mb = TRUE; } while (*s != '\0') { int s_mb_len = parse_mbchar(s, s_mb, NULL); if (mbtowc(&ws, s_mb, s_mb_len) < 0) { mbtowc_reset(); ws = (unsigned char)*s; bad_s_mb = TRUE; } if (bad_s_mb == bad_c_mb && ws == wc) break; s += s_mb_len; q += s_mb_len; } free(s_mb); if (*s == '\0') q = NULL; return (char *)q; } else #endif return (char *) strchr(s, *c); } #endif /* !NANO_TINY || !DISABLE_JUSTIFY */ #ifndef NANO_TINY /* This function is equivalent to strpbrk() for multibyte strings. */ char *mbstrpbrk(const char *s, const char *accept) { assert(s != NULL && accept != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { for (; *s != '\0'; s += move_mbright(s, 0)) { if (mbstrchr(accept, s) != NULL) return (char *)s; } return NULL; } else #endif return (char *) strpbrk(s, accept); } /* This function is equivalent to strpbrk(), except in that it scans the * string in reverse, starting at rev_start. */ char *revstrpbrk(const char *s, const char *accept, const char *rev_start) { assert(s != NULL && accept != NULL && rev_start != NULL); for (; rev_start >= s; rev_start--) { const char *q = (*rev_start == '\0') ? 
NULL : strchr(accept, *rev_start); if (q != NULL) return (char *)rev_start; } return NULL; } /* This function is equivalent to strpbrk() for multibyte strings, * except in that it scans the string in reverse, starting at * rev_start. */ char *mbrevstrpbrk(const char *s, const char *accept, const char *rev_start) { assert(s != NULL && accept != NULL && rev_start != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { bool begin_line = FALSE; while (!begin_line) { const char *q = (*rev_start == '\0') ? NULL : mbstrchr(accept, rev_start); if (q != NULL) return (char *)rev_start; if (rev_start == s) begin_line = TRUE; else rev_start = s + move_mbleft(s, rev_start - s); } return NULL; } else #endif return revstrpbrk(s, accept, rev_start); } #endif /* !NANO_TINY */ #if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)) /* Return TRUE if the string s contains one or more blank characters, * and FALSE otherwise. */ bool has_blank_chars(const char *s) { assert(s != NULL); for (; *s != '\0'; s++) { if (isblank(*s)) return TRUE; } return FALSE; } /* Return TRUE if the multibyte string s contains one or more blank * multibyte characters, and FALSE otherwise. */ bool has_blank_mbchars(const char *s) { assert(s != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { bool retval = FALSE; char *chr_mb = charalloc(MB_CUR_MAX); for (; *s != '\0'; s += move_mbright(s, 0)) { parse_mbchar(s, chr_mb, NULL); if (is_blank_mbchar(chr_mb)) { retval = TRUE; break; } } free(chr_mb); return retval; } else #endif return has_blank_chars(s); } #endif /* ENABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */ #ifdef ENABLE_UTF8 /* Return TRUE if wc is valid Unicode, and FALSE otherwise. */ bool is_valid_unicode(wchar_t wc) { return ((0 <= wc && wc <= 0x10FFFF) && (wc <= 0xD7FF || 0xE000 <= wc) && (wc <= 0xFDCF || 0xFDF0 <= wc) && ((wc & 0xFFFF) <= 0xFFFD)); } #endif #ifdef ENABLE_NANORC /* Check if the string s is a valid multibyte string. Return TRUE if it * is, and FALSE otherwise. 
*/ bool is_valid_mbstring(const char *s) { assert(s != NULL); return #ifdef ENABLE_UTF8 use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) : #endif TRUE; } #endif /* ENABLE_NANORC */