summaryrefslogtreecommitdiff
path: root/src/tools/affixcompress
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/affixcompress')
-rwxr-xr-x src/tools/affixcompress 192
1 files changed, 192 insertions, 0 deletions
diff --git a/src/tools/affixcompress b/src/tools/affixcompress
new file mode 100755
index 0000000..9fc2989
--- /dev/null
+++ b/src/tools/affixcompress
@@ -0,0 +1,192 @@
+#!/bin/sh
+# affix compressor utility for Hunspell
+# 2008 (c) László Németh, version 0.3
+# usage: affixcompress sorted_word_list_file [max_affix_rules]
+
+# Called without arguments: there is nothing to compress, so print the help
+# text on standard output and leave with a success status.
+if [ $# -eq 0 ]; then
+    cat <<'EOF'
+affixcompress - compress a huge sorted word list to Hunspell format
+Usage:
+
+LC_ALL=C sort word_list >sorted_word_list
+affixcompress sorted_word_list [max_affix_rules]
+
+Default value of max_affix_rules = 5000
+
+Note: output may need manually added affix parameters (SET character_encoding,
+TRY suggestion_characters etc., see man(4) hunspell)
+EOF
+    exit 0
+fi
+
+# Upper bound on the number of suffix rules kept by the first stage; the
+# optional second argument overrides the default of 5000.
+MAXAFFIX=${2:-5000}
+
+# profiling
+#AWK="pgawk --profile"
+AWK="gawk"
+
+# Remove the results of a previous run.  The names are quoted so that a word
+# list path containing blanks or glob characters is treated as a single file
+# name instead of being split or expanded by the shell.
+rm -f "$1.aff" "$1.dic"
+cat $1 | $AWK '
+{
+ # calculate frequent suffixes
+ A[$1] = 1
+ len = length($1)
+ if (len > 2) {
+# print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr"
+ B[substr($1, 1, len - 1)] = substr($1, len, 1);
+ }
+ for(i = 2; i < len; i++) {
+ r = substr($1, 1, i)
+ if (i == 2) {
+ if (prev != r) {
+ delete A
+ delete B
+ print "Deleted roots: ", prev > "/dev/stderr"
+ A[$1] = 1
+ }
+ prev = r
+ }
+ if (A[r]) {
+# print $1 ": " r " és "substr($1, i + 1, len - i + 1) >"/dev/stderr"
+ sfx[substr($1, i + 1, len - i + 1)]++
+ } else if (B[r] && B[r] != substr($1, i + 1, 1)) {
+ r2 = substr($1, i + 1, len - i + 1)
+ sfy[r2,B[r]]++
+ }
+ }
+}
+END {
+ for (i in sfx) print i, 0, sfx[i]
+ for (i in sfy) print i, sfy[i]
+}
+' | tr '\034' ' ' >affixcompress0.tmp
+sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' |
+head -$MAXAFFIX >affixcompress1.tmp
+# Stage 2: feed the selected rule table (affixcompress1.tmp) to the awk
+# program below on standard input; the program text starts on the next line.
+cat affixcompress1.tmp |
+$AWK '
+# Print one dictionary line ("root" or "root flag") for every word currently
+# stored in W.
+#
+# Globals read:    W (words of the current class), sfx/sfy (rule numbers),
+#                  sfxfr/sfyfr (rule frequencies), sfz (for each suffix, the
+#                  blank separated list of final characters it may replace).
+# Globals written: A (roots already printed) and C (root scores); both are
+#                  emptied again before returning.  All other variables used
+#                  here are ordinary awk globals as well.
+function potential_roots() {
+    # potential roots with most frequent suffixes
+    # Pass 1: C[r] counts how many words of the class can be derived from the
+    # word r by a rule whose frequency is above 100.
+    for (word in W) if (W[word] == 1) {
+        len = length(word);
+        for (i = 2; i < len; i++) {
+            root = substr(word, 1, i)
+            suff = substr(word, i + 1, len - i + 1)
+            if ((W[root] != "") && (sfxfr[suff] > 100)) C[root]++
+            if (sfz[suff]) {
+                l = split(sfz[suff], a)
+                for (k = 1; k <= l; k++) {
+                    # sfyfr is keyed by (suffix, replaced character), the
+                    # same key that is used when the rule table is loaded.
+                    if ((W[root a[k]] != "") && (sfyfr[suff,a[k]] > 100)) {
+                        C[root a[k]]++
+                    }
+                }
+            }
+        }
+    }
+
+    # calculate roots
+    # Pass 2: print exactly one line for each word.
+    for (word in W) if (W[word] == 1) {
+        len = length(word);
+        z = 0          # becomes 1 once a line was printed for this word
+        # choose most frequent root (maybe the original word)
+        max = C[word]
+        maxword = word
+        maxsuff = 0
+        for (i = 2; i < len; i++) {
+            root = substr(word, 1, i)
+            suff = substr(word, i + 1, len - i + 1)
+            if ((sfx[suff] != "") && (C[root] > max)) {
+                max = C[root]
+                maxword = root
+                maxsuff = sfx[suff]
+            }
+            if (sfz[suff] != "") {
+                l = split(sfz[suff], a)
+                for (k = 1; k <= l; k++) if (C[root a[k]] > max) {
+                    max = C[root a[k]]
+                    maxword = root a[k]
+                    maxsuff = sfy[suff,a[k]]
+                }
+            }
+        }
+        if (max > 0) {
+            # A scored root exists: use the best one.
+            if (maxsuff > 0) print maxword, maxsuff; else print maxword
+            A[maxword]++
+            z = 1
+        } else {
+            # No scored root: reuse the first split whose root was already
+            # printed for this class.
+            for (i = 2; i < len; i++) {
+                root = substr(word, 1, i)
+                suff = substr(word, i + 1, len - i + 1)
+                if ((A[root] > 0) && sfx[suff] != "") {
+                    print root, sfx[suff]
+                    z = 1
+                    break
+                }
+                if (sfz[suff]) {
+                    l = split(sfz[suff], a)
+                    for (k = 1; k <= l; k++) if (A[root a[k]] != "") {
+                        print root a[k], sfy[suff,a[k]]
+                        z = 1
+                        break
+                    }
+                }
+                # The break above only leaves the k loop; stop scanning the
+                # remaining splits too, so the word is printed once only.
+                if (z == 1) break
+            }
+        }
+        if (z == 0) {
+            # Nothing matched: the word is stored as its own root.
+            print word
+            A[word]++
+        }
+    }
+    delete A
+    delete C
+}
+# Lines read from standard input form the rule table of stage 1:
+#   "suffix 0 count"     plain suffix
+#   "suffix char count"  suffix replacing the final character "char"
+# The input line number (NR) is used as the numeric flag of the rule.
+FILENAME == "-" {
+    if ($2 == 0) {
+        sfx[$1] = NR
+        sfxfr[$1] = $3
+    } else {
+        sfy[$1,$2] = NR
+        sfyfr[$1,$2] = $3
+        # sfz lists, blank separated, every character this suffix replaces.
+        sfz[$1] = sfz[$1] " " $2
+    }
+    next
+}
+# Remaining input is the word list.  Words are collected in W as long as
+# their first three characters stay the same; on a change the finished class
+# is written out by potential_roots() and W is emptied.
+{
+    cap = substr($1, 1, 3)
+    if (cap != prev) {
+        potential_roots()
+        delete W
+        print "Deleted class:", prev > "/dev/stderr"
+    }
+    prev = cap
+    W[$1] = 1
+}
+END {
+    # Write out the last class.
+    potential_roots()
+    # write out frequent suffixes
+    # The affix file is named after the last input file read (the word list).
+    out=FILENAME ".aff"
+    print "FLAG num" >out
+    # Plain suffixes: strip nothing ("0"), add the suffix, any word (".").
+    for (i in sfx) if (sfx[i] > 0) {
+        print "SFX", sfx[i], "Y 1" >out
+        print "SFX", sfx[i], "0", i, "." >out
+    }
+    # Replacing suffixes: the key is suffix SUBSEP character; strip the
+    # character, add the suffix, only for words ending in that character.
+    for (i in sfy) if (sfy[i] > 0) {
+        print "SFX", sfy[i], "Y 1" >out
+        split(i, c, "\034");
+        print "SFX", sfy[i], c[2], c[1], c[2] >out
+    }
+}
+' - "$1" >affixcompress2.tmp
+# Stage 3: build the .dic file.
+# The "root [flag]" lines are sorted by flag number, then the flags of equal
+# roots are merged into one comma separated list.  The entry count goes
+# straight into the dictionary file; the entries themselves pass through
+# sort and are appended after it.
+sort -nk 2 affixcompress2.tmp >affixcompress3.tmp
+$AWK -v out="$1.dic" '
+{
+    if (A[$1] == "") A[$1] = $2;
+    else if ($2 != "") A[$1] = A[$1] "," $2
+}
+END {
+    # Start from 0 so that an empty input yields the count 0, not a blank
+    # line.
+    n = 0
+    for (i in A) n++
+    print n >out
+    # Flush and close the count line now, so it is in the file before the
+    # sort at the end of the pipeline starts appending the entries.
+    close(out)
+    for (i in A) {
+        if (A[i] == "") print i
+        else print i "/" A[i]
+    }
+}
+' <affixcompress3.tmp | sort >>"$1.dic"