summaryrefslogtreecommitdiff
path: root/charsets.c
diff options
context:
space:
mode:
Diffstat (limited to 'charsets.c')
-rw-r--r--charsets.c630
1 files changed, 630 insertions, 0 deletions
diff --git a/charsets.c b/charsets.c
new file mode 100644
index 0000000..07d8197
--- /dev/null
+++ b/charsets.c
@@ -0,0 +1,630 @@
+/* charsets.c
+ * (c) 2002 Mikulas Patocka, Karel 'Clock' Kulhavy
+ * This file is a part of the Links program, released under GPL.
+ */
+
+#include "links.h"
+
+int utf8_table;
+
+struct table_entry {
+ unsigned char c;
+ int u;
+};
+
+struct codepage_desc {
+ unsigned char *name;
+ unsigned char **aliases;
+ struct table_entry *table;
+};
+
+#include "codepage.inc"
+#include "uni_7b.inc"
+#include "entity.inc"
+#include "upcase.inc"
+
+static unsigned char strings[256][2] = {
+ "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
+ "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
+ "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
+ "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
+ "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
+ "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
+ "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
+ "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
+ "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
+ "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
+ "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
+ "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
+ "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
+ "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
+ "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
+ "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
+ "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
+ "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
+ "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
+ "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
+ "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
+ "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
+ "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
+ "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
+ "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
+ "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
+ "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
+ "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
+ "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
+ "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
+ "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
+ "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
+};
+
+static void free_translation_table(struct conv_table *p)
+{
+ int i;
+ for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
+ mem_free(p);
+}
+
+static unsigned char *no_str = "*";
+
+static void new_translation_table(struct conv_table *p)
+{
+ int i;
+ for (i = 0; i < 256; i++) if (p[i].t) free_translation_table(p[i].u.tbl);
+ for (i = 0; i < 128; i++) p[i].t = 0, p[i].u.str = strings[i];
+ for (; i < 256; i++) p[i].t = 0, p[i].u.str = no_str;
+}
+
+static int strange_chars[32] = {
+ 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
+ 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
+ 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
+};
+
+#define U_EQUAL(a, b) unicode_7b[a].x == (b)
+#define U_ABOVE(a, b) unicode_7b[a].x > (b)
+
+unsigned char *u2cp(int u, int to, int fallback)
+{
+ int j, s;
+ again:
+ if (u < 128) return strings[u];
+ if (u == 0xa0) return "\001";
+ if (u == 0xad) return "";
+ if (to == utf8_table) return encode_utf_8(u);
+ if (u < 0xa0) {
+ u = strange_chars[u - 0x80];
+ if (!u) return NULL;
+ goto again;
+ }
+ for (j = 0; codepages[to].table[j].c; j++)
+ if (codepages[to].table[j].u == u)
+ return strings[codepages[to].table[j].c];
+ if (!fallback) return NULL;
+ BIN_SEARCH(N_UNICODE_7B, U_EQUAL, U_ABOVE, u, s);
+ if (s != -1) return unicode_7b[s].s;
+ return NULL;
+}
+
+int cp2u(unsigned ch, int from)
+{
+ struct table_entry *e;
+ if (from == utf8_table) return ch;
+ if (from < 0 || ch < 0x80) return ch;
+ for (e = codepages[from].table; e->c; e++) if (e->c == ch) return e->u;
+ return -1;
+}
+
+static unsigned char utf_buffer[7];
+
+unsigned char *encode_utf_8(int u)
+{
+ memset(utf_buffer, 0, 7);
+ if (u < 0x80) utf_buffer[0] = u;
+ else if (u < 0x800)
+ utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
+ utf_buffer[1] = 0x80 | (u & 0x3f);
+ else if (u < 0x10000)
+ utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
+ utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
+ utf_buffer[2] = 0x80 | (u & 0x3f);
+ else if (u < 0x200000)
+ utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
+ utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
+ utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
+ utf_buffer[3] = 0x80 | (u & 0x3f);
+ else if (u < 0x4000000)
+ utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
+ utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
+ utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
+ utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
+ utf_buffer[4] = 0x80 | (u & 0x3f);
+ else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
+ utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
+ utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
+ utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
+ utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
+ utf_buffer[5] = 0x80 | (u & 0x3f);
+ return utf_buffer;
+}
+
+static void add_utf_8(struct conv_table *ct, int u, unsigned char *str)
+{
+ unsigned char *p = encode_utf_8(u);
+ while (p[1]) {
+ if (ct[*p].t) ct = ct[*p].u.tbl;
+ else {
+ struct conv_table *nct;
+ if (ct[*p].u.str != no_str) {
+ internal("bad utf encoding #1");
+ return;
+ }
+ nct = mem_alloc(sizeof(struct conv_table) * 256);
+ memset(nct, 0, sizeof(struct conv_table) * 256);
+ new_translation_table(nct);
+ ct[*p].t = 1;
+ ct[*p].u.tbl = nct;
+ ct = nct;
+ }
+ p++;
+ }
+ if (ct[*p].t) {
+ internal("bad utf encoding #2");
+ return;
+ }
+ if (ct[*p].u.str == no_str) ct[*p].u.str = str;
+}
+
+static struct conv_table utf_table[256];
+static int utf_table_init = 1;
+
+static void free_utf_table(void)
+{
+ int i;
+ for (i = 128; i < 256; i++) mem_free(utf_table[i].u.str);
+}
+
+static struct conv_table *get_translation_table_to_utf_8(int from)
+{
+ int i;
+ static int lfr = -1;
+ if (from == -1) return NULL;
+ if (from == lfr) return utf_table;
+ lfr = from;
+ if (utf_table_init) memset(utf_table, 0, sizeof(struct conv_table) * 256), utf_table_init = 0;
+ else free_utf_table();
+ for (i = 0; i < 128; i++) utf_table[i].u.str = strings[i];
+ if (from == utf8_table) {
+ for (i = 128; i < 256; i++) utf_table[i].u.str = stracpy(strings[i]);
+ return utf_table;
+ }
+ for (i = 128; i < 256; i++) utf_table[i].u.str = NULL;
+ for (i = 0; codepages[from].table[i].c; i++) {
+ int u = codepages[from].table[i].u;
+ if (!utf_table[codepages[from].table[i].c].u.str)
+ utf_table[codepages[from].table[i].c].u.str = stracpy(encode_utf_8(u));
+ }
+ for (i = 128; i < 256; i++)
+ if (!utf_table[i].u.str) utf_table[i].u.str = stracpy(no_str);
+ return utf_table;
+}
+
+unsigned short int utf8_2_uni_table[0x200] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 192, 0,
+ 0, 0, 256, 0, 0, 0, 320, 0, 0, 0, 384, 0, 0, 0, 448, 0,
+ 0, 0, 512, 0, 0, 0, 576, 0, 0, 0, 640, 0, 0, 0, 704, 0,
+ 0, 0, 768, 0, 0, 0, 832, 0, 0, 0, 896, 0, 0, 0, 960, 0,
+ 0, 0, 1024, 0, 0, 0, 1088, 0, 0, 0, 1152, 0, 0, 0, 1216, 0,
+ 0, 0, 1280, 0, 0, 0, 1344, 0, 0, 0, 1408, 0, 0, 0, 1472, 0,
+ 0, 0, 1536, 0, 0, 0, 1600, 0, 0, 0, 1664, 0, 0, 0, 1728, 0,
+ 0, 0, 1792, 0, 0, 0, 1856, 0, 0, 0, 1920, 0, 0, 0, 1984, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+unsigned char utf_8_1[256] = {
+ 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 6, 6,
+};
+
+static unsigned min_utf_8[9] = {
+ 0, 0x4000000, 0x200000, 0x10000, 0x800, 0x80, 0x100, 0x1,
+};
+
+unsigned get_utf_8(unsigned char **s)
+{
+ unsigned v, min;
+ int l;
+ unsigned char *p = *s;
+ l = utf_8_1[p[0]];
+ min = min_utf_8[l];
+ v = p[0] & ((1 << l) - 1);
+ (*s)++;
+ while (l++ <= 5) {
+ unsigned c = **s - 0x80;
+ if (c >= 0x40) {
+ return 0;
+ }
+ (*s)++;
+ v = (v << 6) + c;
+ }
+ if (v < min) {
+ return 0;
+ }
+ return v;
+}
+
+static struct conv_table table[256];
+static int table_init = 1;
+
+void free_conv_table(void)
+{
+ if (!utf_table_init) free_utf_table();
+ if (!table_init) new_translation_table(table);
+}
+
+struct conv_table *get_translation_table(int from, int to)
+{
+ int i;
+ static int lfr = -1;
+ static int lto = -1;
+ if (/*from == to ||*/ from == -1 || to == -1) return NULL;
+ if (to == utf8_table) return get_translation_table_to_utf_8(from);
+ if (table_init) memset(table, 0, sizeof(struct conv_table) * 256), table_init = 0;
+ if (from == lfr && to == lto) return table;
+ lfr = from; lto = to;
+ new_translation_table(table);
+ if (from == utf8_table) {
+ int j;
+ for (j = 0; codepages[to].table[j].c; j++) add_utf_8(table, codepages[to].table[j].u, codepages[to].table[j].u == 0xa0 ? (unsigned char *)"\001" : codepages[to].table[j].u == 0xad ? (unsigned char *)"" : strings[codepages[to].table[j].c]);
+ for (i = 0; unicode_7b[i].x != -1; i++) if (unicode_7b[i].x >= 0x80) add_utf_8(table, unicode_7b[i].x, unicode_7b[i].s);
+ } else for (i = 128; i < 256; i++) {
+ int j;
+ unsigned char *u;
+ for (j = 0; codepages[from].table[j].c; j++) {
+ if (codepages[from].table[j].c == i) goto f;
+ }
+ continue;
+ f:
+ u = u2cp(codepages[from].table[j].u, to, 1);
+ if (u) table[i].u.str = u;
+ }
+ return table;
+}
+
+static inline int xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
+{
+ while (l2) {
+ if (*s1 > *s2) return 1;
+ if (!*s1 || *s1 < *s2) return -1;
+ s1++, s2++, l2--;
+ }
+ return !!*s1;
+}
+
+int get_entity_number(unsigned char *st, int l)
+{
+ int n = 0;
+ if (upcase(st[0]) == 'X') {
+ st++, l--;
+ if (!l) return -1;
+ do {
+ unsigned char c = upcase(*(st++));
+ if (c >= '0' && c <= '9') n = n * 16 + c - '0';
+ else if (c >= 'A' && c <= 'F') n = n * 16 + c - 'A' + 10;
+ else return -1;
+ if (n >= 0x10000) return -1;
+ } while (--l);
+ } else {
+ if (!l) return -1;
+ do {
+ unsigned char c = *(st++);
+ if (c >= '0' && c <= '9') n = n * 10 + c - '0';
+ else return -1;
+ if (n >= 0x10000) return -1;
+ } while (--l);
+ }
+ return n;
+}
+
+unsigned char *get_entity_string(unsigned char *st, int l, int encoding)
+{
+ int n;
+ if (l <= 0) return NULL;
+ if (st[0] == '#') {
+ if (l == 1) return NULL;
+ if ((n = get_entity_number(st + 1, l - 1)) == -1) return NULL;
+ if (n < 32 && get_attr_val_nl != 2) n = 32;
+ } else {
+ int s = 0, e = N_ENTITIES - 1;
+ while (s <= e) {
+ int c;
+ int m = (s + e) / 2;
+ c = xxstrcmp(entities[m].s, st, l);
+ if (!c) {
+ n = entities[m].c;
+ goto f;
+ }
+ if (c > 0) e = m - 1;
+ else s = m + 1;
+ }
+ return NULL;
+ f:;
+ }
+
+ return u2cp(n, encoding, 1);
+}
+
+unsigned char *convert_string(struct conv_table *ct, unsigned char *c, int l, struct document_options *dopt)
+{
+ unsigned char *buffer;
+ int bp = 0;
+ int pp = 0;
+ if (!ct) {
+ int i;
+ for (i = 0; i < l; i++) if (c[i] == '&') goto xx;
+ return memacpy(c, l);
+ xx:;
+ }
+ buffer = mem_alloc(ALLOC_GR);
+ while (pp < l) {
+ unsigned char *e = NULL; /* against warning */
+ if (c[pp] < 128 && c[pp] != '&') {
+ put_c:
+ buffer[bp++] = c[pp++];
+ if (!(bp & (ALLOC_GR - 1))) {
+ if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
+ buffer = mem_realloc(buffer, bp + ALLOC_GR);
+ }
+ continue;
+ }
+ if (c[pp] != '&') {
+ struct conv_table *t;
+ int i;
+ if (!ct) goto put_c;
+ t = ct;
+ i = pp;
+ decode:
+ if (!t[c[i]].t) {
+ e = t[c[i]].u.str;
+ } else {
+ t = t[c[i++]].u.tbl;
+ if (i >= l) goto put_c;
+ goto decode;
+ }
+ pp = i + 1;
+ } else {
+ int i = pp + 1;
+ if (!dopt || dopt->plain) goto put_c;
+ while (i < l && c[i] != ';' && c[i] != '&' && c[i] > ' ') i++;
+ if (!(e = get_entity_string(&c[pp + 1], i - pp - 1, dopt->cp))) goto put_c;
+ pp = i + (i < l && c[i] == ';');
+ }
+ if (!e[0]) continue;
+ if (!e[1]) {
+ buffer[bp++] = e[0];
+ if (!(bp & (ALLOC_GR - 1))) {
+ if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
+ buffer = mem_realloc(buffer, bp + ALLOC_GR);
+ }
+ continue;
+ }
+ while (*e) {
+ buffer[bp++] = *(e++);
+ if (!(bp & (ALLOC_GR - 1))) {
+ if ((unsigned)bp > MAXINT - ALLOC_GR) overalloc();
+ buffer = mem_realloc(buffer, bp + ALLOC_GR);
+ }
+ }
+ }
+ buffer[bp] = 0;
+ return buffer;
+}
+
+/* 1 match 0 mismatch */
+static inline int compare_names(unsigned char *one, unsigned char *two)
+{
+
+ while(1){
+ if (!*one) return !*two;
+ if (!*two||(upcase(*one)!=upcase(*two))) return 0;
+ one++;
+ two++;
+ }
+
+}
+
+int get_cp_index(unsigned char *n)
+{
+ int i, a, p, q;
+ int ii = -1, ll = 0;
+ for (i = 0; codepages[i].name; i++) {
+ for (a = 0; codepages[i].aliases[a]; a++) {
+ for (p = 0; n[p]; p++) {
+ if (upcase(n[p]) == upcase(codepages[i].aliases[a][0])) {
+ for (q = 1; codepages[i].aliases[a][q]; q++) {
+ if (upcase(n[p+q]) != upcase(codepages[i].aliases[a][q])) goto fail;
+ }
+ if (strlen(codepages[i].aliases[a]) > (size_t)ll) {
+ ll = strlen(codepages[i].aliases[a]);
+ ii = i;
+ }
+ }
+ fail:;
+ }
+ }
+ }
+ return ii;
+}
+
+unsigned char *get_cp_name(int index)
+{
+ if (index < 0) return "none";
+ return codepages[index].name;
+}
+
+unsigned char *get_cp_mime_name(int index)
+{
+ if (index < 0) return "none";
+ if (!codepages[index].aliases) return NULL;
+ return codepages[index].aliases[0];
+}
+
+#define UP_EQUAL(a, b) unicode_upcase[a].lo == (b)
+#define UP_ABOVE(a, b) unicode_upcase[a].lo > (b)
+
+unsigned charset_upcase(unsigned ch, int cp)
+{
+ unsigned u;
+ int res;
+ unsigned char *str;
+ if (ch < 0x80) return upcase(ch);
+ u = cp2u(ch, cp);
+ BIN_SEARCH(sizeof(unicode_upcase) / sizeof(*unicode_upcase), UP_EQUAL, UP_ABOVE, u, res);
+ if (res == -1) return ch;
+ if (cp == utf8_table) return unicode_upcase[res].up;
+ str = u2cp(unicode_upcase[res].up, cp, 0);
+ if (!str || !str[0] || str[1]) return ch;
+ return str[0];
+}
+
+unsigned uni_upcase(unsigned ch)
+{
+ return charset_upcase(ch, utf8_table);
+}
+
+void charset_upcase_string(unsigned char **chp, int cp)
+{
+ unsigned char *ch = *chp;
+ int i;
+ if (cp == utf8_table) {
+ ch = unicode_upcase_string(ch);
+ mem_free(*chp);
+ *chp = ch;
+ } else {
+ for (i = 0; ch[i]; i++) ch[i] = charset_upcase(ch[i], cp);
+ }
+}
+
+unsigned char *unicode_upcase_string(unsigned char *ch)
+{
+ unsigned char *r = init_str();
+ int rl = 0;
+ while (1) {
+ unsigned c;
+ int res;
+ GET_UTF_8(ch, c);
+ if (!c) break;
+ BIN_SEARCH(sizeof(unicode_upcase) / sizeof(*unicode_upcase), UP_EQUAL, UP_ABOVE, c, res);
+ if (res != -1) c = unicode_upcase[res].up;
+ add_to_str(&r, &rl, encode_utf_8(c));
+ }
+ return r;
+}
+
+unsigned char *to_utf8_upcase(unsigned char *str, int cp)
+{
+ unsigned char *str1, *str2;
+ struct conv_table *ct = get_translation_table(cp, utf8_table);
+ str1 = convert_string(ct, str, strlen(str), NULL);
+ str2 = unicode_upcase_string(str1);
+ mem_free(str1);
+ return str2;
+}
+
+int compare_case_utf8(unsigned char *u1, unsigned char *u2)
+{
+ unsigned char *uu1 = u1;
+ unsigned c1, c2;
+ int cc1;
+ while (1) {
+ GET_UTF_8(u2, c2);
+ if (!c2) return u1 - uu1;
+ skip_discr:
+ GET_UTF_8(u1, c1);
+ BIN_SEARCH(sizeof(unicode_upcase) / sizeof(*unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
+ if (cc1 != -1) c1 = unicode_upcase[cc1].up;
+ if (c1 == 0xad) goto skip_discr;
+ if (c1 != c2) return 0;
+ if (c1 == ' ') {
+ unsigned char *x1;
+ do {
+ x1 = u1;
+ GET_UTF_8(u1, c1);
+ BIN_SEARCH(sizeof(unicode_upcase) / sizeof(*unicode_upcase), UP_EQUAL, UP_ABOVE, c1, cc1);
+ if (cc1 != -1) c1 = unicode_upcase[cc1].up;
+ } while (c1 == ' ');
+ u1 = x1;
+ }
+ }
+}
+
+int strlen_utf8(unsigned char *s)
+{
+ int len = 0;
+ while (1) {
+ unsigned c;
+ GET_UTF_8(s, c);
+ if (!c) return len;
+ len++;
+ }
+}
+
+int cp_len(int cp, unsigned char *s)
+{
+ if (cp == utf8_table) return strlen_utf8(s);
+ return strlen(s);
+}
+
+unsigned char *cp_strchr(int charset, unsigned char *str, unsigned chr)
+{
+ if (charset != utf8_table) {
+ if (chr >= 0x100)
+ return NULL;
+ return (unsigned char *)strchr(str, chr);
+ }
+ while (1) {
+ unsigned char *o_str = str;
+ unsigned c;
+ GET_UTF_8(str, c);
+ if (!c) return NULL;
+ if (c == chr) return o_str;
+ }
+}