summaryrefslogtreecommitdiff
path: root/src/tools/munch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/munch.c')
-rw-r--r--src/tools/munch.c832
1 files changed, 832 insertions, 0 deletions
diff --git a/src/tools/munch.c b/src/tools/munch.c
new file mode 100644
index 0000000..2087efa
--- /dev/null
+++ b/src/tools/munch.c
@@ -0,0 +1,832 @@
+/* Munch a word list and generate a smaller root word list with affixes*/
+
+#include <ctype.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#ifdef __linux__
+#include <error.h>
+#include <errno.h>
+#include <sys/mman.h>
+#endif
+
+#include "munch.h"
+
+int main(int argc, char** argv)
+{
+
+ int i, j, k, n;
+ int rl, p , nwl;
+ int al;
+
+ FILE * wrdlst;
+ FILE * afflst;
+
+ char *nword, *wf, *af;
+ char as[(MAX_PREFIXES + MAX_SUFFIXES)];
+ char * ap;
+
+ struct hentry * ep;
+ struct hentry * ep1;
+ struct affent * pfxp;
+ struct affent * sfxp;
+
+ /* first parse the command line options */
+ /* arg1 - wordlist, arg2 - affix file */
+
+ if (argv[1]) {
+ wf = mystrdup(argv[1]);
+ } else {
+ fprintf(stderr,"correct syntax is:\n");
+ fprintf(stderr,"munch word_list_file affix_file\n");
+ exit(1);
+ }
+ if (argv[2]) {
+ af = mystrdup(argv[2]);
+ } else {
+ fprintf(stderr,"correct syntax is:\n");
+ fprintf(stderr,"munch word_list_file affix_file\n");
+ exit(1);
+ }
+
+ /* open the affix file */
+ afflst = fopen(af,"r");
+ if (!afflst) {
+ fprintf(stderr,"Error - could not open affix description file\n");
+ exit(1);
+ }
+
+ /* step one is to parse the affix file building up the internal
+ affix data structures */
+
+ numpfx = 0;
+ numsfx = 0;
+
+ if (parse_aff_file(afflst)) {
+ fprintf(stderr,"Error - in affix file loading\n");
+ exit(1);
+ }
+ fclose(afflst);
+
+ fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);
+
+ /* affix file is now parsed so create hash table of wordlist on the fly */
+
+ /* open the wordlist */
+ wrdlst = fopen(wf,"r");
+ if (!wrdlst) {
+ fprintf(stderr,"Error - could not open word list file\n");
+ exit(1);
+ }
+
+ if (load_tables(wrdlst)) {
+ fprintf(stderr,"Error building hash tables\n");
+ exit(1);
+ }
+ fclose(wrdlst);
+
+ for (i=0; i< tablesize; i++) {
+ ep = &tableptr[i];
+ if (ep->word == NULL) continue;
+ for ( ; ep != NULL; ep = ep->next) {
+ numroots = 0;
+ aff_chk(ep->word,strlen(ep->word));
+ if (numroots) {
+ /* now there might be a number of combinations */
+ /* of prefixes and suffixes that might match this */
+ /* word. So how to choose? As a first shot look */
+ /* for the shortest remaining root word to */
+ /* to maximize the combinatorial power */
+
+ /* but be careful, do not REQUIRE a specific combination */
+ /* of a prefix and a suffix to generate the word since */
+ /* that violates the rule that the root word with just */
+ /* the prefix or just the suffix must also exist in the */
+ /* wordlist as well */
+
+ /* in fact because of the cross product issue, this not a */
+ /* simple choice since some combinations of previous */
+ /* prefixes and new suffixes may not be valid. */
+ /* The only way to know is to simply try them all */
+
+ rl = 1000;
+ p = -1;
+
+ for (j = 0; j < numroots; j++){
+
+ /* first collect the root word info and build up */
+ /* the potential new affix string */
+ nword = (roots[j].hashent)->word;
+ nwl = strlen(nword);
+ *as = '\0';
+ al = 0;
+ ap = as;
+ if (roots[j].prefix) *ap++ = (roots[j].prefix)->achar;
+ if (roots[j].suffix) *ap++ = (roots[j].suffix)->achar;
+ if ((roots[j].hashent)->affstr) {
+ strcpy(ap,(roots[j].hashent)->affstr);
+ } else {
+ *ap = '\0';
+ }
+ al =strlen(as);
+
+ /* now expand the potential affix string to generate */
+ /* all legal words and make sure they all exist in the */
+ /* word list */
+ numwords = 0;
+ wlist[numwords].word = mystrdup(nword);
+ wlist[numwords].pallow = 0;
+ numwords++;
+ n = 0;
+ if (al)
+ expand_rootword(nword,nwl,as,al);
+ for (k=0; k<numwords; k++) {
+ if (lookup(wlist[k].word)) n++;
+ free(wlist[k].word);
+ wlist[k].word = NULL;
+ wlist[k].pallow = 0;
+ }
+
+ /* if all exist in word list then okay */
+ if (n == numwords) {
+ if (nwl < rl) {
+ rl = nwl;
+ p = j;
+ }
+ }
+ }
+ if (p != -1) {
+ ep1 = roots[p].hashent;
+ pfxp = roots[p].prefix;
+ sfxp = roots[p].suffix;
+ ep1->keep = 1;
+ if (pfxp != NULL) add_affix_char(ep1,pfxp->achar);
+ if (sfxp != NULL) add_affix_char(ep1,sfxp->achar);
+ } else {
+ ep->keep = 1;
+ }
+ } else {
+ ep->keep = 1;
+ }
+ }
+ }
+
+ /* now output only the words to keep along with affixes info */
+ /* first count how many words that is */
+ k = 0;
+ for (i=0; i< tablesize; i++) {
+ ep = &tableptr[i];
+ if (ep->word == NULL) continue;
+ for ( ; ep != NULL; ep = ep->next) {
+ if (ep->keep > 0) k++;
+ }
+ }
+ fprintf(stdout,"%d\n",k);
+
+ for (i=0; i< tablesize; i++) {
+ ep = &tableptr[i];
+ if (ep->word == NULL) continue;
+ for ( ; ep != NULL; ep = ep->next) {
+ if (ep->keep > 0) {
+ if (ep->affstr != NULL) {
+ fprintf(stdout,"%s/%s\n",ep->word,ep->affstr);
+ } else {
+ fprintf(stdout,"%s\n",ep->word);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+
+int parse_aff_file(FILE * afflst)
+{
+ int i, j;
+ int numents = 0;
+ char achar = '\0';
+ short ff=0;
+ char ft;
+ struct affent * ptr= NULL;
+ struct affent * nptr= NULL;
+ char * line = malloc(MAX_LN_LEN);
+
+ while (fgets(line,MAX_LN_LEN,afflst)) {
+ mychomp(line);
+ ft = ' ';
+ fprintf(stderr,"parsing line: %s\n",line);
+ if (strncmp(line,"PFX",3) == 0) ft = 'P';
+ if (strncmp(line,"SFX",3) == 0) ft = 'S';
+ if (ft != ' ') {
+ char * tp = line;
+ char * piece;
+ i = 0;
+ ff = 0;
+ while ((piece=mystrsep(&tp,' '))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: break;
+ case 1: { achar = *piece; break; }
+ case 2: { if (*piece == 'Y') ff = XPRODUCT; break; }
+ case 3: { numents = atoi(piece);
+ ptr = malloc(numents * sizeof(struct affent));
+ ptr->achar = achar;
+ ptr->xpflg = ff;
+ fprintf(stderr,"parsing %c entries %d\n",achar,numents);
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ /* now parse all of the sub entries*/
+ nptr = ptr;
+ for (j=0; j < numents; j++) {
+ if (!fgets(line,MAX_LN_LEN,afflst)) return 1;
+ mychomp(line);
+ tp = line;
+ i = 0;
+ while ((piece=mystrsep(&tp,' '))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: { if (nptr != ptr) {
+ nptr->achar = ptr->achar;
+ nptr->xpflg = ptr->xpflg;
+ }
+ break;
+ }
+ case 1: break;
+ case 2: { nptr->strip = mystrdup(piece);
+ nptr->stripl = strlen(nptr->strip);
+ if (strcmp(nptr->strip,"0") == 0) {
+ free(nptr->strip);
+ nptr->strip=mystrdup("");
+ nptr->stripl = 0;
+ }
+ break;
+ }
+ case 3: { nptr->appnd = mystrdup(piece);
+ nptr->appndl = strlen(nptr->appnd);
+ if (strcmp(nptr->appnd,"0") == 0) {
+ free(nptr->appnd);
+ nptr->appnd=mystrdup("");
+ nptr->appndl = 0;
+ }
+ break;
+ }
+ case 4: { encodeit(nptr,piece);}
+ fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd,
+ nptr->appndl,nptr->strip,nptr->stripl);
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ nptr++;
+ }
+ if (ft == 'P') {
+ ptable[numpfx].aep = ptr;
+ ptable[numpfx].num = numents;
+ fprintf(stderr,"ptable %d num is %d\n",numpfx,ptable[numpfx].num);
+ numpfx++;
+ } else {
+ stable[numsfx].aep = ptr;
+ stable[numsfx].num = numents;
+ fprintf(stderr,"stable %d num is %d\n",numsfx,stable[numsfx].num);
+ numsfx++;
+ }
+ ptr = NULL;
+ nptr = NULL;
+ numents = 0;
+ achar='\0';
+ }
+ }
+ free(line);
+ return 0;
+}
+
+
+void encodeit(struct affent * ptr, char * cs)
+{
+ int nc;
+ int neg;
+ int grp;
+ unsigned char c;
+ int n;
+ int ec;
+ int nm;
+ int i, j, k;
+ unsigned char mbr[MAX_WD_LEN];
+
+ /* now clear the conditions array */
+ for (i=0;i<SET_SIZE;i++) ptr->conds[i] = (unsigned char) 0;
+
+ /* now parse the string to create the conds array */
+ nc = strlen(cs);
+ neg = 0; /* complement indicator */
+ grp = 0; /* group indicator */
+ n = 0; /* number of conditions */
+ ec = 0; /* end condition indicator */
+ nm = 0; /* number of member in group */
+ i = 0;
+ if (strcmp(cs,".")==0) {
+ ptr->numconds = 0;
+ return;
+ }
+ while (i < nc) {
+ c = *((unsigned char *)(cs + i));
+ if (c == '[') {
+ grp = 1;
+ c = 0;
+ }
+ if ((grp == 1) && (c == '^')) {
+ neg = 1;
+ c = 0;
+ }
+ if (c == ']') {
+ ec = 1;
+ c = 0;
+ }
+ if ((grp == 1) && (c != 0)) {
+ *(mbr + nm) = c;
+ nm++;
+ c = 0;
+ }
+ if (c != 0) {
+ ec = 1;
+ }
+ if (ec) {
+ if (grp == 1) {
+ if (neg == 0) {
+ for (j=0;j<nm;j++) {
+ k = (unsigned int) mbr[j];
+ ptr->conds[k] = ptr->conds[k] | (1 << n);
+ }
+ } else {
+ for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
+ for (j=0;j<nm;j++) {
+ k = (unsigned int) mbr[j];
+ ptr->conds[k] = ptr->conds[k] & ~(1 << n);
+ }
+ }
+ neg = 0;
+ grp = 0;
+ nm = 0;
+ } else {
+ /* not a group so just set the proper bit for this char */
+ /* but first handle special case of . inside condition */
+ if (c == '.') {
+ /* wild card character so set them all */
+ for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
+ } else {
+ ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
+ }
+ }
+ n++;
+ ec = 0;
+ }
+ i++;
+ }
+ ptr->numconds = n;
+ return;
+}
+
+
+
+/* search for a prefix */
+void pfx_chk (const char * word, int len, struct affent* ep, int num)
+{
+ struct affent * aent;
+ int cond;
+ int tlen;
+ struct hentry * hent;
+ unsigned char * cp;
+ int i;
+ char tword[MAX_WD_LEN];
+
+ for (aent = ep, i = num; i > 0; aent++, i--) {
+
+ tlen = len - aent->appndl;
+
+ if (tlen > 0 && (aent->appndl == 0 ||
+ strncmp(aent->appnd, word, aent->appndl) == 0)
+ && tlen + aent->stripl >= aent->numconds) {
+
+ if (aent->stripl) strcpy (tword, aent->strip);
+ strcpy((tword + aent->stripl), (word + aent->appndl));
+
+ /* now go through the conds and make sure they all match */
+ cp = (unsigned char *) tword;
+ for (cond = 0; cond < aent->numconds; cond++) {
+ if ((aent->conds[*cp++] & (1 << cond)) == 0)
+ break;
+ }
+
+ if (cond >= aent->numconds) {
+ tlen += aent->stripl;
+ if ((hent = lookup(tword)) != NULL) {
+ if (numroots < MAX_ROOTS) {
+ roots[numroots].hashent = hent;
+ roots[numroots].prefix = aent;
+ roots[numroots].suffix = NULL;
+ numroots++;
+ }
+ }
+ }
+ }
+ }
+}
+
+
+
+void suf_chk (const char * word, int len, struct affent * ep,
+ int num, struct affent * pfxent, int cpflag)
+{
+ struct affent * aent;
+ int tlen;
+ int cond;
+ struct hentry * hent;
+ unsigned char * cp;
+ int i;
+ char tword[MAX_WD_LEN];
+
+ for (aent = ep, i = num; i > 0; aent++, i--) {
+
+ if ((cpflag & XPRODUCT) != 0 && (aent->xpflg & XPRODUCT) == 0)
+ continue;
+
+ tlen = len - aent->appndl;
+ if (tlen > 0 && (aent->appndl == 0 ||
+ strcmp(aent->appnd, (word + tlen)) == 0)
+ && tlen + aent->stripl >= aent->numconds) {
+
+ strcpy (tword, word);
+ cp = (unsigned char *) (tword + tlen);
+ if (aent->stripl) {
+ strcpy ((char *)cp, aent->strip);
+ tlen += aent->stripl;
+ cp = (unsigned char *)(tword + tlen);
+ } else *cp = '\0';
+
+ for (cond = aent->numconds; --cond >= 0; ) {
+ if ((aent->conds[*--cp] & (1 << cond)) == 0) break;
+ }
+ if (cond < 0) {
+ if ((hent = lookup(tword)) != NULL) {
+ if (numroots < MAX_ROOTS) {
+ roots[numroots].hashent = hent;
+ roots[numroots].prefix = pfxent;
+ roots[numroots].suffix = aent;
+ numroots++;
+ }
+ }
+ }
+ }
+ }
+}
+
+
+
+void aff_chk (const char * word, int len)
+{
+ int i;
+ int j;
+ int nh=0;
+ char * nword;
+ int nwl;
+
+ if (len < 4) return;
+
+ for (i=0; i < numpfx; i++) {
+ pfx_chk(word, len, ptable[i].aep, ptable[i].num);
+ }
+
+ nh = numroots;
+
+ if (nh > 0) {
+ for (j=0;j<nh;j++){
+ if (roots[j].prefix->xpflg & XPRODUCT) {
+ nword = mystrdup((roots[j].hashent)->word);
+ nwl = strlen(nword);
+ for (i=0; i < numsfx; i++) {
+ suf_chk(nword,nwl,stable[i].aep, stable[i].num, roots[j].prefix, XPRODUCT);
+ }
+ free(nword);
+ }
+ }
+ }
+ for (i=0; i < numsfx; i++) {
+ suf_chk(word, len, stable[i].aep, stable[i].num, NULL, 0);
+ }
+}
+
+
+
+/* lookup a root word in the hashtable */
+
+struct hentry * lookup(const char *word)
+{
+ struct hentry * dp;
+ dp = &tableptr[hash(word)];
+ if (dp->word == NULL) return NULL;
+ for ( ; dp != NULL; dp = dp->next) {
+ if (strcmp(word,dp->word) == 0) return dp;
+ }
+ return NULL;
+}
+
+
+
+/* add a word to the hash table */
+
+int add_word(char * word)
+{
+ int i;
+ struct hentry * dp;
+ struct hentry * hp = (struct hentry *) malloc (sizeof(struct hentry));
+
+ hp->word = word;
+ hp->affstr = NULL;
+ hp->keep = 0;
+ hp->next = NULL;
+
+ i = hash(word);
+ dp = &tableptr[i];
+
+ if (dp->word == NULL) {
+ *dp = *hp;
+ free(hp);
+ } else {
+ while (dp->next != NULL) dp=dp->next;
+ dp->next = hp;
+ }
+ return 0;
+}
+
+
+
+/* load a word list and build a hash table on the fly */
+
+int load_tables(FILE * wdlst)
+{
+ char * ap;
+ char ts[MAX_LN_LEN];
+
+ /* first read the first line of file to get hash table size */
+ if (! fgets(ts, MAX_LN_LEN-1,wdlst)) return 2;
+ mychomp(ts);
+ tablesize = atoi(ts);
+ tablesize = tablesize + 5;
+ if ((tablesize %2) == 0) tablesize++;
+
+ /* allocate the hash table */
+ tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
+ if (! tableptr) return 3;
+
+ /* loop thorugh all words on much list and add to hash
+ * table and store away word and affix strings in tmpfile
+ */
+
+ while (fgets(ts,MAX_LN_LEN-1,wdlst)) {
+ mychomp(ts);
+ ap = mystrdup(ts);
+ add_word(ap);
+
+ }
+ return 0;
+}
+
+
+/* the hash function is a simple load and rotate
+ * algorithm borrowed
+ */
+
+int hash(const char * word)
+{
+ int i;
+ long hv = 0;
+ for (i=0; i < 4 && *word != 0; i++)
+ hv = (hv << 8) | (*word++);
+ while (*word != 0) {
+ ROTATE(hv,ROTATE_LEN);
+ hv ^= (*word++);
+ }
+ return (unsigned long) hv % tablesize;
+}
+
+
+void add_affix_char(struct hentry * ep, char ac)
+{
+ int al;
+ int i;
+ char * tmp;
+ if (ep->affstr == NULL) {
+ ep->affstr = (char *) malloc(2);
+ *(ep->affstr) = ac;
+ *((ep->affstr)+1) = '\0';
+ return;
+ }
+ al = strlen(ep->affstr);
+ for (i=0; i< al; i++)
+ if (ac == (ep->affstr)[i]) return;
+ tmp = calloc(al+2,1);
+ memcpy(tmp,ep->affstr,(al+1));
+ *(tmp+al) = ac;
+ *(tmp+al+1)='\0';
+ free(ep->affstr);
+ ep->affstr = tmp;
+ return;
+}
+
+
+/* add a prefix to word */
+void pfx_add (const char * word, int len, struct affent* ep, int num)
+{
+ struct affent * aent;
+ int cond;
+ int tlen;
+ unsigned char * cp;
+ int i;
+ char * pp;
+ char tword[MAX_WD_LEN];
+
+
+ for (aent = ep, i = num; i > 0; aent++, i--) {
+
+ /* now make sure all conditions match */
+ if ((len > aent->stripl) && (len >= aent->numconds)) {
+
+ cp = (unsigned char *) word;
+ for (cond = 0; cond < aent->numconds; cond++) {
+ if ((aent->conds[*cp++] & (1 << cond)) == 0)
+ break;
+ }
+ if (cond >= aent->numconds) {
+
+ /* we have a match so add prefix */
+ tlen = 0;
+ if (aent->appndl) {
+ strcpy(tword,aent->appnd);
+ tlen += aent->appndl;
+ }
+ pp = tword + tlen;
+ strcpy(pp, (word + aent->stripl));
+ tlen = tlen + len - aent->stripl;
+
+ if (numwords < MAX_WORDS) {
+ wlist[numwords].word = mystrdup(tword);
+ wlist[numwords].pallow = 0;
+ numwords++;
+ }
+ }
+ }
+ }
+}
+
+
+/* add a suffix to a word */
+void suf_add (const char * word, int len, struct affent * ep, int num)
+{
+ struct affent * aent;
+ int tlen;
+ int cond;
+ unsigned char * cp;
+ int i;
+ char tword[MAX_WD_LEN];
+ char * pp;
+
+ for (aent = ep, i = num; i > 0; aent++, i--) {
+
+ /* if conditions hold on root word
+ * then strip off strip string and add suffix
+ */
+
+ if ((len > aent->stripl) && (len >= aent->numconds)) {
+ cp = (unsigned char *) (word + len);
+ for (cond = aent->numconds; --cond >= 0; ) {
+ if ((aent->conds[*--cp] & (1 << cond)) == 0) break;
+ }
+ if (cond < 0) {
+ /* we have a matching condition */
+ strcpy(tword,word);
+ tlen = len;
+ if (aent->stripl) {
+ tlen -= aent->stripl;
+ }
+ pp = (tword + tlen);
+ if (aent->appndl) {
+ strcpy (pp, aent->appnd);
+ tlen += aent->stripl;
+ } else *pp = '\0';
+
+ if (numwords < MAX_WORDS) {
+ wlist[numwords].word = mystrdup(tword);
+ wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
+ numwords++;
+ }
+ }
+ }
+ }
+}
+
+
+
+int expand_rootword(const char * ts, int wl, const char * ap, int al)
+{
+ int i;
+ int j;
+ int nh=0;
+ int nwl;
+
+ for (i=0; i < numsfx; i++) {
+ if (strchr(ap,(stable[i].aep)->achar)) {
+ suf_add(ts, wl, stable[i].aep, stable[i].num);
+ }
+ }
+
+ nh = numwords;
+
+ if (nh > 1) {
+ for (j=1;j<nh;j++){
+ if (wlist[j].pallow) {
+ for (i=0; i < numpfx; i++) {
+ if (strchr(ap,(ptable[i].aep)->achar)) {
+ if ((ptable[i].aep)->xpflg & XPRODUCT) {
+ nwl = strlen(wlist[j].word);
+ pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ for (i=0; i < numpfx; i++) {
+ if (strchr(ap,(ptable[i].aep)->achar)) {
+ pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
+ }
+ }
+ return 0;
+}
+
+
+
+/* strip strings into token based on single char delimiter
+ * acts like strsep() but only uses a delim char and not
+ * a delim string
+ */
+char * mystrsep(char ** stringp, const char delim)
+{
+ char * rv = NULL;
+ char * mp = *stringp;
+ int n = strlen(mp);
+ if (n > 0) {
+ char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n);
+ if (dp) {
+ int nc;
+ *stringp = dp+1;
+ nc = (int)((unsigned long)dp - (unsigned long)mp);
+ rv = (char *) malloc(nc+1);
+ if (rv) {
+ memcpy(rv,mp,nc);
+ *(rv+nc) = '\0';
+ }
+ } else {
+ rv = (char *) malloc(n+1);
+ if (rv) {
+ memcpy(rv, mp, n);
+ *(rv+n) = '\0';
+ *stringp = mp + n;
+ }
+ }
+ }
+ return rv;
+}
+
+
+char * mystrdup(const char * s)
+{
+ char * d = NULL;
+ if (s) {
+ int sl = strlen(s)+1;
+ d = (char *) malloc(sl);
+ if (d) memcpy(d,s,sl);
+ }
+ return d;
+}
+
+
+void mychomp(char * s)
+{
+ int k = strlen(s);
+ if (k > 0) *(s+k-1) = '\0';
+ if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
+}
+