summaryrefslogtreecommitdiff
path: root/src/tools/hzip.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/hzip.c')
-rw-r--r--src/tools/hzip.c325
1 files changed, 325 insertions, 0 deletions
diff --git a/src/tools/hzip.c b/src/tools/hzip.c
new file mode 100644
index 0000000..cf760e8
--- /dev/null
+++ b/src/tools/hzip.c
@@ -0,0 +1,325 @@
+/* hzip: file compression for sorted dictionaries with optional encryption,
+ * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define CODELEN 65536
+#define BUFSIZE 65536
+#define EXTENSION ".hz"
+
+#define ESCAPE 31
+#define MAGIC "hz0"
+#define MAGIC_ENCRYPTED "hz1"
+
+#define DESC "hzip - dictionary compression utility\n" \
+"Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
+" -P password encrypted compression\n" \
+" -h display this help and exit\n"
+
+enum { code_LEAF, code_TERM, code_NODE};
+
+struct item {
+ unsigned short word;
+ int count;
+ char type;
+ struct item * left;
+ struct item * right;
+};
+
+int fail(const char * err, const char * par) {
+ fprintf(stderr, err, par);
+ return 1;
+}
+
+void code2table(struct item * tree, char **table, char * code, int deep) {
+ int first = 0;
+ if (!code) {
+ first = 1;
+ code = malloc(CODELEN);
+ }
+ code[deep] = '1';
+ if (tree->left) code2table(tree->left, table, code, deep + 1);
+ if (tree->type != code_NODE) {
+ int i = tree->word;
+ code[deep] = '\0';
+ if (tree->type == code_TERM) i = CODELEN; /* terminal code */
+ table[i] = malloc(deep + 1);
+ strcpy(table[i], code);
+ }
+ code[deep] = '0';
+ if (tree->right) code2table(tree->right, table, code, deep + 1);
+ if (first) free(code);
+}
+
+struct item * newitem(int c, struct item * l, struct item * r, int t) {
+ struct item * ni = (struct item *) malloc(sizeof(struct item));
+ ni->type = t;
+ ni->word = 0;
+ ni->count = c;
+ ni->left = l;
+ ni->right = r;
+ return ni;
+}
+
+/* return length of the freq array */
+int get_freqdata(struct item *** dest, FILE * f, unsigned short * termword) {
+ int freq[CODELEN];
+ int i, j, k, n;
+ union {
+ char c[2];
+ unsigned short word;
+ } u;
+ for (i = 0; i < CODELEN; i++) freq[i] = 0;
+ while((j = getc(f)) != -1 && (k = getc(f)) != -1) {
+ u.c[0] = j;
+ u.c[1] = k;
+ freq[u.word]++;
+ }
+ if (j != -1) {
+ u.c[0] = 1;
+ u.c[1] = j;
+ } else {
+ u.c[0] = 0;
+ u.c[1] = 0;
+ }
+
+ *dest = (struct item **) malloc((CODELEN + 1) * sizeof(struct item *));
+ if (!*dest) return -1;
+ for (i = 0, n = 0; i < CODELEN; i++) if (freq[i]) {
+ (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
+ (*dest)[n]->word = i;
+ n++;
+ }
+ /* terminal sequence (also contains the last odd byte of the file) */
+ (*dest)[n] = newitem(1, NULL, NULL, code_TERM);
+ *termword = u.word;
+ return n + 1;
+}
+
+void get_codetable(struct item **l, int n, char ** table) {
+ int i;
+ while (n > 1) {
+ int min = 0;
+ int mi2 = 1;
+ for (i = 1; i < n; i++) {
+ if (l[i]->count < l[min]->count) {
+ mi2 = min;
+ min = i;
+ } else if (l[i]->count < l[mi2]->count) mi2 = i;
+ }
+ l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
+ for (i = mi2 + 1; i < n; i++) l[i - 1] = l[i];
+ n--;
+ }
+ code2table(l[0], table, NULL, 0);
+}
+
+int write_bits(FILE *f, char * bitbuf, int *bits, char * code) {
+ while (*code) {
+ int b = (*bits) % 8;
+ if (!b) bitbuf[(*bits) / 8] = ((*code) - '0') << 7;
+ else bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b));
+ (*bits)++;
+ code++;
+ if (*bits == BUFSIZE * 8) {
+ if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f))
+ return 1;
+ *bits = 0;
+ }
+ }
+ return 0;
+}
+
+int encode_file(char ** table, int n, FILE *f, FILE *f2, unsigned short tw, char * key) {
+ char bitbuf[BUFSIZE];
+ int i, bits = 0;
+ unsigned char cl, ch;
+ int cx[2];
+ union {
+ char c[2];
+ unsigned short word;
+ } u;
+ char * enc = key;
+
+ /* header and codes */
+ fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
+ cl = (unsigned char) (n & 0x00ff);
+ ch = (unsigned char) (n >> 8);
+ if (key) {
+ unsigned char cs;
+ for (cs = 0; *enc; enc++) cs ^= *enc;
+ fprintf(f2, "%c", cs); /* 1-byte check sum */
+ enc = key;
+ ch ^= *enc;
+ if ((*(++enc)) == '\0') enc = key;
+ cl ^= *enc;
+ }
+ fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */
+ for (i = 0; i < BUFSIZE; i++) bitbuf[i] = '\0';
+ for (i = 0; i < CODELEN + 1; i++) if (table[i]) {
+ int nmemb;
+ u.word = (unsigned short) i;
+ if (i == CODELEN) u.word = tw;
+ if (key) {
+ if (*(++enc) == '\0') enc = key;
+ u.c[0] ^= *enc;
+ if (*(++enc) == '\0') enc = key;
+ u.c[1] ^= *enc;
+ }
+ fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
+ bits = 0;
+ if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
+ return 1;
+ if (key) {
+ if (*(++enc) == '\0') enc = key;
+ fprintf(f2, "%c", ((unsigned char) bits) ^ *enc);
+ for (cl = 0; cl <= bits/8; cl++) {
+ if (*(++enc) == '\0') enc = key;
+ bitbuf[cl] ^= *enc;
+ }
+ } else
+ fprintf(f2, "%c", (unsigned char) bits); /* 1-byte code length */
+ nmemb = bits/8 + 1;
+ if (fwrite(bitbuf, 1, bits/8 + 1, f2) != nmemb) /* x-byte code */
+ return 1;
+ }
+
+ /* file encoding */
+ bits = 0;
+ while((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
+ u.c[0] = cx[0];
+ u.c[1] = cx[1];
+ if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
+ return 1;
+ }
+ /* terminal suffixes */
+ if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
+ return 1;
+ if (bits > 0)
+ {
+ int nmemb = bits/8 + 1;
+ if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
+ return 1;
+ }
+ return 0;
+}
+
+int prefixcompress(FILE *f, FILE *tempfile) {
+ char buf[BUFSIZE];
+ char buf2[BUFSIZE * 2];
+ char prev[BUFSIZE];
+ int prevlen = 0;
+ while(fgets(buf,BUFSIZE,f)) {
+ int i, j, k, m, c=0;
+ int pfx = prevlen;
+ char * p = buf2;
+ m = j = 0;
+ for (i = 0; buf[i]; i++) {
+ if ((pfx > 0) && (buf[i] == prev[i])) {
+ j++;
+ } else pfx = 0;
+ }
+ if (i > 0 && buf[i - 1] == '\n') {
+ if (j == i) j--; /* line duplicate */
+ if (j > 29) j = 29;
+ c = j;
+ if (c == '\t') c = 30;
+ /* common suffix */
+ for (; buf[i - m - 2] == prev[prevlen - m - 2] &&
+ m < i - j - 1 && m < 15; m++);
+ if (m == 1) m = 0;
+ } else {
+ j = 0;
+ m = -1;
+ }
+ for (k = j; k < i - m - 1; k++, p++) {
+ if (((unsigned char) buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
+ *p = ESCAPE;
+ p++;
+ }
+ *p = buf[k];
+ }
+ if (m > 0) {
+ *p = m + 31; /* 33-46 */
+ p++;
+ }
+ if (i > 0 && buf[i - 1] == '\n') {
+ size_t nmemb = p - buf2 + 1;
+ *p = c;
+ if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
+ return 1;
+ } else {
+ size_t nmemb = p - buf2;
+ if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
+ return 1;
+ }
+ memcpy(prev, buf, i);
+ prevlen = i;
+ }
+ return 0;
+}
+
+int hzip(const char * filename, char * key) {
+ struct item ** list;
+ char * table[CODELEN + 1];
+ int n;
+ char out[BUFSIZE];
+ FILE *f, *f2, *tempfile;
+ unsigned short termword;
+ strcpy(out, filename);
+ strcat(out, EXTENSION);
+ f = fopen(filename, "r");
+ if (!f) return fail("hzip: %s: Permission denied\n", filename);
+ tempfile = tmpfile();
+ if (!tempfile) {
+ fclose(f);
+ return fail("hzip: cannot create temporary file\n", NULL);
+ }
+ f2 = fopen(out, "wb");
+ if (!f2) {
+ fclose(tempfile);
+ fclose(f);
+ return fail("hzip: %s: Permission denied\n", out);
+ }
+ for (n = 0; n < CODELEN; n++) table[n] = NULL;
+ if (prefixcompress(f, tempfile) != 0) {
+ fclose(f2);
+ fclose(tempfile);
+ fclose(f);
+ return fail("hzip: cannot write file\n", NULL);
+ }
+ rewind(tempfile);
+ n = get_freqdata(&list, tempfile, &termword);
+ get_codetable(list, n, table);
+ rewind(tempfile);
+ n = encode_file(table, n, tempfile, f2, termword, key);
+ fclose(f2);
+ fclose(tempfile);
+ fclose(f);
+ if (n != 0) return fail("hzip: cannot write file\n", NULL);
+ return n;
+}
+
+int main(int argc, char** argv) {
+
+ int i, j = 0;
+ char * key = NULL;
+ for (i = 1; i < argc; i++) {
+ if (*(argv[i]) == '-') {
+ if (*(argv[i] + 1) == 'h')
+ return fail(DESC, NULL);
+ if (*(argv[i] + 1) == 'P') {
+ if (i + 1 == argc)
+ return fail("hzip: missing password\n", NULL);
+ key = argv[i + 1];
+ i++;
+ continue;
+ }
+ return fail("hzip: no such option: %s\n", argv[i]);
+ } else if (hzip(argv[i], key) != 0) return 1; else j = 1;
+ }
+ if (j == 0) return fail("hzip: need a filename parameter\n", NULL);
+ return 0;
+}