diff options
Diffstat (limited to 'src/tools/affixcompress')
-rwxr-xr-x | src/tools/affixcompress | 192 |
1 files changed, 192 insertions, 0 deletions
diff --git a/src/tools/affixcompress b/src/tools/affixcompress new file mode 100755 index 0000000..9fc2989 --- /dev/null +++ b/src/tools/affixcompress @@ -0,0 +1,192 @@ +#!/bin/sh +# affix compressor utility for Hunspell +# 2008 (c) László Németh, version 0.3 +# usage: affixcompress sorted_word_list_file [max_affix_rules] +case $# in +0) echo \ +"affixcompress - compress a huge sorted word list to Hunspell format +Usage: + +LC_ALL=C sort word_list >sorted_word_list +affixcompress sorted_word_list [max_affix_rules] + +Default value of max_affix_rules = 5000 + +Note: output may need manually added affix parameters (SET character_encoding, +TRY suggestion_characters etc., see man(4) hunspell)" + exit 0;; +esac + +MAXAFFIX=${2:-5000} + +# profiling +#AWK="pgawk --profile" +AWK="gawk" + +rm -f $1.aff $1.dic +cat $1 | $AWK ' +{ + # calculate frequent suffixes + A[$1] = 1 + len = length($1) + if (len > 2) { +# print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr" + B[substr($1, 1, len - 1)] = substr($1, len, 1); + } + for(i = 2; i < len; i++) { + r = substr($1, 1, i) + if (i == 2) { + if (prev != r) { + delete A + delete B + print "Deleted roots: ", prev > "/dev/stderr" + A[$1] = 1 + } + prev = r + } + if (A[r]) { +# print $1 ": " r " és "substr($1, i + 1, len - i + 1) >"/dev/stderr" + sfx[substr($1, i + 1, len - i + 1)]++ + } else if (B[r] && B[r] != substr($1, i + 1, 1)) { + r2 = substr($1, i + 1, len - i + 1) + sfy[r2,B[r]]++ + } + } +} +END { + for (i in sfx) print i, 0, sfx[i] + for (i in sfy) print i, sfy[i] +} +' | tr '\034' ' ' >affixcompress0.tmp +sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' | +head -$MAXAFFIX >affixcompress1.tmp +cat affixcompress1.tmp | +$AWK ' +function potential_roots() { + # potential roots with most frequent suffixes + for(word in W) if (W[word]==1) { + print word >"word" + len = length(word); + for(i = 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((W[root]!="") && (sfxfr[suff] > 100)) C[root]++ + if (sfz[suff]) { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if ((W[root a[k]]!="") && (sfyfr[root a[k]] > 100)) { + C[root a[k]]++ + } + } + } + } + + # calculate roots + for(word in W) if (W[word]==1) { + print word >"word2" + len = length(word); + z = 0 + # choose most frequent root (maybe the original word) + max = C[word] + maxword = word + maxsuff = 0 + for(i = 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((sfx[suff] != "") && (C[root] > max)) { + max = C[root] + maxword = root + maxsuff = sfx[suff] + } + if (sfz[suff] != "") { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if (C[root a[k]] > max) { + max = C[root a[k]] + maxword = root a[k] + maxsuff = sfy[suff,a[k]] + } + } + } + if (max > 0) { + if (maxsuff > 0) print maxword, maxsuff; else print maxword + A[maxword]++ + z=1 + } else { + for(i = 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((A[root] > 0) && sfx[suff]!="") { + print root, sfx[suff] + z = 1 + break + } + if (sfz[suff]) { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if (A[root a[k]]!="") { + print root a[k], sfy[suff,a[k]] + z = 1 + break + } + } + } + } + if (z == 0) { + print word + A[word]++ + } + } + delete A + delete C +} +FILENAME == "-" { + if ($2 == 0) { + sfx[$1] = NR + sfxfr[$1] = $3 + } else { + sfy[$1,$2] = NR + sfyfr[$1,$2] = $3 + sfz[$1] = sfz[$1] " " $2 + } + maxsuf = NR + next +} +{ + cap = substr($1, 1, 3) + if (cap != prev) { + potential_roots() + delete W + print "Deleted class:", prev > "/dev/stderr" + } + prev = cap + W[$1] = 1 +} +END { + potential_roots() + # write out frequent suffixes + out=FILENAME ".aff" + print "FLAG num" >out + for (i in sfx) if (sfx[i] > 0) { + print "SFX", sfx[i], "Y 1" >out + print "SFX", sfx[i], "0", i, "." >out + } + for (i in sfy) if (sfy[i] > 0) { + print "SFX", sfy[i], "Y 1" >out + split(i, c, "\034"); + print "SFX", sfy[i], c[2], c[1], c[2] >out + } +} +' - $1 >affixcompress2.tmp +sort -nk 2 affixcompress2.tmp >affixcompress3.tmp +cat affixcompress3.tmp | $AWK -v out="$1.dic" ' +{ + if (A[$1]=="") A[$1]=$2; + else if ($2!="") A[$1] = A[$1] "," $2 +} +END { + for (i in A) n++ + print n >out + for (i in A) { + if (A[i]=="") print i + else print i "/" A[i] + } +} +' | sort >>$1.dic |