/* Un-munch a root word list with affix tags * to recreate the original word list */ #include #include #include #include #include #include #include #include #ifdef __linux__ #include #include #include #endif #include "unmunch.h" int main(int argc, char** argv) { int i; int al, wl; FILE * wrdlst; FILE * afflst; char *wf, *af; char * ap; char ts[MAX_LN_LEN]; /* first parse the command line options */ /* arg1 - munched wordlist, arg2 - affix file */ if (argv[1]) { wf = mystrdup(argv[1]); } else { fprintf(stderr,"correct syntax is:\n"); fprintf(stderr,"unmunch dic_file affix_file\n"); exit(1); } if (argv[2]) { af = mystrdup(argv[2]); } else { fprintf(stderr,"correct syntax is:\n"); fprintf(stderr,"unmunch dic_file affix_file\n"); exit(1); } /* open the affix file */ afflst = fopen(af,"r"); if (!afflst) { fprintf(stderr,"Error - could not open affix description file\n"); exit(1); } /* step one is to parse the affix file building up the internal affix data structures */ numpfx = 0; numsfx = 0; fullstrip = 0; if (parse_aff_file(afflst)) { fprintf(stderr,"Error - in affix file loading\n"); exit(1); } fclose(afflst); fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx); /* affix file is now parsed so create hash table of wordlist on the fly */ /* open the wordlist */ wrdlst = fopen(wf,"r"); if (!wrdlst) { fprintf(stderr,"Error - could not open word list file\n"); exit(1); } /* skip over the hash table size */ if (! fgets(ts, MAX_LN_LEN-1,wrdlst)) { fclose(wrdlst); return 2; } mychomp(ts); while (fgets(ts,MAX_LN_LEN-1,wrdlst)) { mychomp(ts); /* split each line into word and affix char strings */ ap = strchr(ts,'/'); if (ap) { *ap = '\0'; ap++; al = strlen(ap); } else { al = 0; ap = NULL; } wl = strlen(ts); numwords = 0; wlist[numwords].word = mystrdup(ts); wlist[numwords].pallow = 0; numwords++; if (al) expand_rootword(ts,wl,ap,al); for (i=0; i < numwords; i++) { fprintf(stdout,"%s\n",wlist[i].word); free(wlist[i].word); wlist[i].word = NULL; wlist[i].pallow = 0; } } fclose(wrdlst); return 0; } int parse_aff_file(FILE * afflst) { int i, j; int numents=0; char achar='\0'; short ff=0; char ft; struct affent * ptr= NULL; struct affent * nptr= NULL; char * line = malloc(MAX_LN_LEN); while (fgets(line,MAX_LN_LEN,afflst)) { mychomp(line); ft = ' '; fprintf(stderr,"parsing line: %s\n",line); if (strncmp(line,"FULLSTRIP",9) == 0) fullstrip = 1; if (strncmp(line,"PFX",3) == 0) ft = 'P'; if (strncmp(line,"SFX",3) == 0) ft = 'S'; if (ft != ' ') { char * tp = line; char * piece; ff = 0; i = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: break; case 1: { achar = *piece; break; } case 2: { if (*piece == 'Y') ff = XPRODUCT; break; } case 3: { numents = atoi(piece); ptr = malloc(numents * sizeof(struct affent)); ptr->achar = achar; ptr->xpflg = ff; fprintf(stderr,"parsing %c entries %d\n",achar,numents); break; } default: break; } i++; } free(piece); } /* now parse all of the sub entries*/ nptr = ptr; for (j=0; j < numents; j++) { if (!fgets(line,MAX_LN_LEN,afflst)) return 1; mychomp(line); tp = line; i = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: { if (nptr != ptr) { nptr->achar = ptr->achar; nptr->xpflg = ptr->xpflg; } break; } case 1: break; case 2: { nptr->strip = mystrdup(piece); nptr->stripl = strlen(nptr->strip); if (strcmp(nptr->strip,"0") == 0) { free(nptr->strip); nptr->strip=mystrdup(""); nptr->stripl = 0; } break; } case 3: { nptr->appnd = mystrdup(piece); nptr->appndl = strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); nptr->appnd=mystrdup(""); nptr->appndl = 0; } break; } case 4: { encodeit(nptr,piece);} fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd, nptr->appndl,nptr->strip,nptr->stripl); default: break; } i++; } free(piece); } nptr++; } if (ft == 'P') { ptable[numpfx].aep = ptr; ptable[numpfx].num = numents; fprintf(stderr,"ptable %d num is %d flag %c\n",numpfx,ptable[numpfx].num,ptr->achar); numpfx++; } else { stable[numsfx].aep = ptr; stable[numsfx].num = numents; fprintf(stderr,"stable %d num is %d flag %c\n",numsfx,stable[numsfx].num,ptr->achar); numsfx++; } ptr = NULL; nptr = NULL; numents = 0; achar='\0'; } } free(line); return 0; } void encodeit(struct affent * ptr, char * cs) { int nc; int neg; int grp; unsigned char c; int n; int ec; int nm; int i, j, k; unsigned char mbr[MAX_WD_LEN]; /* now clear the conditions array */ for (i=0;iconds[i] = (unsigned char) 0; /* now parse the string to create the conds array */ nc = strlen(cs); neg = 0; /* complement indicator */ grp = 0; /* group indicator */ n = 0; /* number of conditions */ ec = 0; /* end condition indicator */ nm = 0; /* number of member in group */ i = 0; if (strcmp(cs,".")==0) { ptr->numconds = 0; return; } while (i < nc) { c = *((unsigned char *)(cs + i)); if (c == '[') { grp = 1; c = 0; } if ((grp == 1) && (c == '^')) { neg = 1; c = 0; } if (c == ']') { ec = 1; c = 0; } if ((grp == 1) && (c != 0)) { *(mbr + nm) = c; nm++; c = 0; } if (c != 0) { ec = 1; } if (ec) { if (grp == 1) { if (neg == 0) { for (j=0;jconds[k] = ptr->conds[k] | (1 << n); } } else { for (j=0;jconds[j] = ptr->conds[j] | (1 << n); for (j=0;jconds[k] = ptr->conds[k] & ~(1 << n); } } neg = 0; grp = 0; nm = 0; } else { /* not a group so just set the proper bit for this char */ /* but first handle special case of . inside condition */ if (c == '.') { /* wild card character so set them all */ for (j=0;jconds[j] = ptr->conds[j] | (1 << n); } else { ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n); } } n++; ec = 0; } i++; } ptr->numconds = n; return; } /* add a prefix to word */ void pfx_add (const char * word, int len, struct affent* ep, int num) { struct affent * aent; int cond; int tlen; unsigned char * cp; int i; char * pp; char tword[MAX_WD_LEN]; for (aent = ep, i = num; i > 0; aent++, i--) { /* now make sure all conditions match */ if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) && ((aent->stripl == 0) || (strncmp(aent->strip, word, aent->stripl) == 0))) { cp = (unsigned char *) word; for (cond = 0; cond < aent->numconds; cond++) { if ((aent->conds[*cp++] & (1 << cond)) == 0) break; } if (cond >= aent->numconds) { /* we have a match so add prefix */ tlen = 0; if (aent->appndl) { strcpy(tword,aent->appnd); tlen += aent->appndl; } pp = tword + tlen; strcpy(pp, (word + aent->stripl)); tlen = tlen + len - aent->stripl; if (numwords < MAX_WORDS) { wlist[numwords].word = mystrdup(tword); wlist[numwords].pallow = 0; numwords++; } } } } } /* add a suffix to a word */ void suf_add (const char * word, int len, struct affent * ep, int num) { struct affent * aent; int tlen; int cond; unsigned char * cp; int i; char tword[MAX_WD_LEN]; char * pp; for (aent = ep, i = num; i > 0; aent++, i--) { /* if conditions hold on root word * then strip off strip string and add suffix */ if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) && ((aent->stripl == 0) || (strcmp(aent->strip, word + len - aent->stripl) == 0))) { cp = (unsigned char *) (word + len); for (cond = aent->numconds; --cond >= 0; ) { if ((aent->conds[*--cp] & (1 << cond)) == 0) break; } if (cond < 0) { /* we have a matching condition */ strcpy(tword,word); tlen = len; if (aent->stripl) { tlen -= aent->stripl; } pp = (tword + tlen); if (aent->appndl) { strcpy (pp, aent->appnd); tlen += aent->stripl; } else *pp = '\0'; if (numwords < MAX_WORDS) { wlist[numwords].word = mystrdup(tword); wlist[numwords].pallow = (aent->xpflg & XPRODUCT); numwords++; } } } } } int expand_rootword(const char * ts, int wl, const char * ap, int al) { int i; int j; int nh=0; int nwl; for (i=0; i < numsfx; i++) { if (strchr(ap,(stable[i].aep)->achar)) { suf_add(ts, wl, stable[i].aep, stable[i].num); } } nh = numwords; if (nh > 1) { for (j=1;jachar)) { if ((ptable[i].aep)->xpflg & XPRODUCT) { nwl = strlen(wlist[j].word); pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num); } } } } } } for (i=0; i < numpfx; i++) { if (strchr(ap,(ptable[i].aep)->achar)) { pfx_add(ts, wl, ptable[i].aep, ptable[i].num); } } return 0; } /* strip strings into token based on single char delimiter * acts like strsep() but only uses a delim char and not * a delim string */ char * mystrsep(char ** stringp, const char delim) { char * rv = NULL; char * mp = *stringp; int n = strlen(mp); if (n > 0) { char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n); if (dp) { int nc; *stringp = dp+1; nc = (int)((unsigned long)dp - (unsigned long)mp); rv = (char *) malloc(nc+1); if (rv) { memcpy(rv,mp,nc); *(rv+nc) = '\0'; } } else { rv = (char *) malloc(n+1); if (rv) { memcpy(rv, mp, n); *(rv+n) = '\0'; *stringp = mp + n; } } } return rv; } char * mystrdup(const char * s) { char * d = NULL; if (s) { int sl = strlen(s)+1; d = (char *) malloc(sl); if (d) memcpy(d,s,sl); } return d; } void mychomp(char * s) { int k = strlen(s); if ((k > 0) && (*(s+k-1) == '\n')) *(s+k-1) = '\0'; if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; }