diff options
Diffstat (limited to 'src/tools')
-rw-r--r-- | src/tools/Makefile.am | 27 | ||||
-rw-r--r-- | src/tools/Makefile.in | 755 | ||||
-rwxr-xr-x | src/tools/affixcompress | 192 | ||||
-rw-r--r-- | src/tools/analyze.cxx | 79 | ||||
-rw-r--r-- | src/tools/chmorph.cxx | 86 | ||||
-rw-r--r-- | src/tools/example.cxx | 65 | ||||
-rw-r--r-- | src/tools/hunspell.cxx | 1785 | ||||
-rw-r--r-- | src/tools/hunzip.cxx | 22 | ||||
-rw-r--r-- | src/tools/hzip.c | 325 | ||||
-rw-r--r-- | src/tools/ispellaff2myspell | 472 | ||||
-rwxr-xr-x | src/tools/makealias | 115 | ||||
-rw-r--r-- | src/tools/munch.c | 832 | ||||
-rw-r--r-- | src/tools/munch.h | 121 | ||||
-rw-r--r-- | src/tools/unmunch.c | 514 | ||||
-rw-r--r-- | src/tools/unmunch.h | 78 | ||||
-rwxr-xr-x | src/tools/wordforms | 35 | ||||
-rw-r--r-- | src/tools/wordlist2hunspell | 38 |
17 files changed, 5541 insertions, 0 deletions
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am new file mode 100644 index 0000000..8074eea --- /dev/null +++ b/src/tools/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS=analyze chmorph hunspell munch unmunch hzip hunzip + +INCLUDES=-I${top_srcdir}/src/hunspell -I${top_srcdir}/src/parsers + +hzip_SOURCES=hzip.c +hunzip_SOURCES=hunzip.cxx +hunzip_LDADD = ../hunspell/libhunspell-1.3.la + +munch_SOURCES=munch.c munch.h +unmunch_SOURCES=unmunch.c unmunch.h + +example_SOURCES=example.cxx +example_LDADD = ../hunspell/libhunspell-1.3.la + +hunspell_SOURCES=hunspell.cxx +hunspell_LDADD = @LIBINTL@ @LIBICONV@ ../parsers/libparsers.a \ + ../hunspell/libhunspell-1.3.la @CURSESLIB@ @READLINELIB@ + +analyze_SOURCES=analyze.cxx +analyze_LDADD = ../hunspell/libhunspell-1.3.la + +chmorph_SOURCES=chmorph.cxx +chmorph_LDADD = ../parsers/libparsers.a ../hunspell/libhunspell-1.3.la + +noinst_PROGRAMS=example + +dist_bin_SCRIPTS=makealias affixcompress wordforms ispellaff2myspell wordlist2hunspell diff --git a/src/tools/Makefile.in b/src/tools/Makefile.in new file mode 100644 index 0000000..075f0b7 --- /dev/null +++ b/src/tools/Makefile.in @@ -0,0 +1,755 @@ +# Makefile.in generated by automake 1.11.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, +# Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +@SET_MAKE@ + + +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +bin_PROGRAMS = analyze$(EXEEXT) chmorph$(EXEEXT) hunspell$(EXEEXT) \ + munch$(EXEEXT) unmunch$(EXEEXT) hzip$(EXEEXT) hunzip$(EXEEXT) +noinst_PROGRAMS = example$(EXEEXT) +subdir = src/tools +DIST_COMMON = $(dist_bin_SCRIPTS) $(srcdir)/Makefile.am \ + $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/codeset.m4 \ + $(top_srcdir)/m4/gettext.m4 $(top_srcdir)/m4/glibc2.m4 \ + $(top_srcdir)/m4/glibc21.m4 $(top_srcdir)/m4/iconv.m4 \ + $(top_srcdir)/m4/intdiv0.m4 $(top_srcdir)/m4/intl.m4 \ + $(top_srcdir)/m4/intlmacosx.m4 $(top_srcdir)/m4/intmax.m4 \ + $(top_srcdir)/m4/inttypes-pri.m4 \ + $(top_srcdir)/m4/inttypes_h.m4 $(top_srcdir)/m4/lcmessage.m4 \ + $(top_srcdir)/m4/lib-ld.m4 $(top_srcdir)/m4/lib-link.m4 \ + $(top_srcdir)/m4/lib-prefix.m4 $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/lock.m4 $(top_srcdir)/m4/longlong.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/nls.m4 $(top_srcdir)/m4/po.m4 \ + $(top_srcdir)/m4/printf-posix.m4 $(top_srcdir)/m4/progtest.m4 \ + $(top_srcdir)/m4/size_max.m4 $(top_srcdir)/m4/stdint_h.m4 \ + $(top_srcdir)/m4/uintmax_t.m4 $(top_srcdir)/m4/visibility.m4 \ + $(top_srcdir)/m4/wchar_t.m4 $(top_srcdir)/m4/wint_t.m4 \ + $(top_srcdir)/m4/xsize.m4 $(top_srcdir)/configure.ac 
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" +PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS) +am_analyze_OBJECTS = analyze.$(OBJEXT) +analyze_OBJECTS = $(am_analyze_OBJECTS) +analyze_DEPENDENCIES = ../hunspell/libhunspell-1.3.la +am_chmorph_OBJECTS = chmorph.$(OBJEXT) +chmorph_OBJECTS = $(am_chmorph_OBJECTS) +chmorph_DEPENDENCIES = ../parsers/libparsers.a \ + ../hunspell/libhunspell-1.3.la +am_example_OBJECTS = example.$(OBJEXT) +example_OBJECTS = $(am_example_OBJECTS) +example_DEPENDENCIES = ../hunspell/libhunspell-1.3.la +am_hunspell_OBJECTS = hunspell.$(OBJEXT) +hunspell_OBJECTS = $(am_hunspell_OBJECTS) +hunspell_DEPENDENCIES = ../parsers/libparsers.a \ + ../hunspell/libhunspell-1.3.la +am_hunzip_OBJECTS = hunzip.$(OBJEXT) +hunzip_OBJECTS = $(am_hunzip_OBJECTS) +hunzip_DEPENDENCIES = ../hunspell/libhunspell-1.3.la +am_hzip_OBJECTS = hzip.$(OBJEXT) +hzip_OBJECTS = $(am_hzip_OBJECTS) +hzip_LDADD = $(LDADD) +am_munch_OBJECTS = munch.$(OBJEXT) +munch_OBJECTS = $(am_munch_OBJECTS) +munch_LDADD = $(LDADD) +am_unmunch_OBJECTS = unmunch.$(OBJEXT) +unmunch_OBJECTS = $(am_unmunch_OBJECTS) +unmunch_LDADD = $(LDADD) +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + 
$(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +SCRIPTS = $(dist_bin_SCRIPTS) +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +SOURCES = $(analyze_SOURCES) $(chmorph_SOURCES) $(example_SOURCES) \ + $(hunspell_SOURCES) $(hunzip_SOURCES) $(hzip_SOURCES) \ + $(munch_SOURCES) $(unmunch_SOURCES) +DIST_SOURCES = $(analyze_SOURCES) $(chmorph_SOURCES) \ + $(example_SOURCES) $(hunspell_SOURCES) $(hunzip_SOURCES) \ + $(hzip_SOURCES) $(munch_SOURCES) $(unmunch_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AR = @AR@ +AS = @AS@ +AUTOCONF = @AUTOCONF@ 
+AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BUILD_INCLUDED_LIBINTL = @BUILD_INCLUDED_LIBINTL@ +CATOBJEXT = @CATOBJEXT@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CFLAG_VISIBILITY = @CFLAG_VISIBILITY@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CURSESLIB = @CURSESLIB@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATADIRNAME = @DATADIRNAME@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GENCAT = @GENCAT@ +GETTEXT_MACRO_VERSION = @GETTEXT_MACRO_VERSION@ +GLIBC2 = @GLIBC2@ +GLIBC21 = @GLIBC21@ +GMSGFMT = @GMSGFMT@ +GMSGFMT_015 = @GMSGFMT_015@ +GREP = @GREP@ +HAVE_ASPRINTF = @HAVE_ASPRINTF@ +HAVE_POSIX_PRINTF = @HAVE_POSIX_PRINTF@ +HAVE_SNPRINTF = @HAVE_SNPRINTF@ +HAVE_VISIBILITY = @HAVE_VISIBILITY@ +HAVE_WPRINTF = @HAVE_WPRINTF@ +HUNSPELL_VERSION_MAJOR = @HUNSPELL_VERSION_MAJOR@ +HUNSPELL_VERSION_MINOR = @HUNSPELL_VERSION_MINOR@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +INSTOBJEXT = @INSTOBJEXT@ +INTLBISON = @INTLBISON@ +INTLLIBS = @INTLLIBS@ +INTLOBJS = @INTLOBJS@ +INTL_LIBTOOL_SUFFIX_PREFIX = @INTL_LIBTOOL_SUFFIX_PREFIX@ +INTL_MACOSX_LIBS = @INTL_MACOSX_LIBS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBICONV = @LIBICONV@ +LIBINTL = @LIBINTL@ +LIBMULTITHREAD = @LIBMULTITHREAD@ +LIBOBJS = @LIBOBJS@ +LIBPTH = @LIBPTH@ +LIBPTH_PREFIX = @LIBPTH_PREFIX@ +LIBS = @LIBS@ +LIBTHREAD = @LIBTHREAD@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBC = @LTLIBC@ +LTLIBICONV = @LTLIBICONV@ +LTLIBINTL = @LTLIBINTL@ +LTLIBMULTITHREAD = @LTLIBMULTITHREAD@ +LTLIBOBJS = @LTLIBOBJS@ +LTLIBPTH = @LTLIBPTH@ +LTLIBTHREAD = @LTLIBTHREAD@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +MSGFMT = @MSGFMT@ +MSGFMT_015 = 
@MSGFMT_015@ +MSGMERGE = @MSGMERGE@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +POSUB = @POSUB@ +PRI_MACROS_BROKEN = @PRI_MACROS_BROKEN@ +RANLIB = @RANLIB@ +READLINELIB = @READLINELIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +USE_INCLUDED_LIBINTL = @USE_INCLUDED_LIBINTL@ +USE_NLS = @USE_NLS@ +VERSION = @VERSION@ +WINDRES = @WINDRES@ +WOE32 = @WOE32@ +WOE32DLL = @WOE32DLL@ +XFAILED = @XFAILED@ +XGETTEXT = @XGETTEXT@ +XGETTEXT_015 = @XGETTEXT_015@ +XGETTEXT_EXTRA_OPTIONS = @XGETTEXT_EXTRA_OPTIONS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +lt_ECHO = @lt_ECHO@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ 
+psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +INCLUDES = -I${top_srcdir}/src/hunspell -I${top_srcdir}/src/parsers +hzip_SOURCES = hzip.c +hunzip_SOURCES = hunzip.cxx +hunzip_LDADD = ../hunspell/libhunspell-1.3.la +munch_SOURCES = munch.c munch.h +unmunch_SOURCES = unmunch.c unmunch.h +example_SOURCES = example.cxx +example_LDADD = ../hunspell/libhunspell-1.3.la +hunspell_SOURCES = hunspell.cxx +hunspell_LDADD = @LIBINTL@ @LIBICONV@ ../parsers/libparsers.a \ + ../hunspell/libhunspell-1.3.la @CURSESLIB@ @READLINELIB@ + +analyze_SOURCES = analyze.cxx +analyze_LDADD = ../hunspell/libhunspell-1.3.la +chmorph_SOURCES = chmorph.cxx +chmorph_LDADD = ../parsers/libparsers.a ../hunspell/libhunspell-1.3.la +dist_bin_SCRIPTS = makealias affixcompress wordforms ispellaff2myspell wordlist2hunspell +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .cxx .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/tools/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu src/tools/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + for p in $$list; do echo "$$p $$p"; done | \ + sed 's/$(EXEEXT)$$//' | \ + while read p p1; do if test -f $$p || test -f $$p1; \ + then echo "$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n;h' -e 's|.*|.|' \ + -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ + sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) files[d] = files[d] " " $$1; \ + else { print "f", $$3 "/" $$4, $$1; } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + files=`for p in 
$$list; do echo "$$p"; done | \ + sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ + -e 's/$$/$(EXEEXT)/' `; \ + test -n "$$list" || exit 0; \ + echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ + cd "$(DESTDIR)$(bindir)" && rm -f $$files + +clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ + echo " rm -f" $$list; \ + rm -f $$list || exit $$?; \ + test -n "$(EXEEXT)" || exit 0; \ + list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f" $$list; \ + rm -f $$list + +clean-noinstPROGRAMS: + @list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \ + echo " rm -f" $$list; \ + rm -f $$list || exit $$?; \ + test -n "$(EXEEXT)" || exit 0; \ + list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f" $$list; \ + rm -f $$list +analyze$(EXEEXT): $(analyze_OBJECTS) $(analyze_DEPENDENCIES) + @rm -f analyze$(EXEEXT) + $(CXXLINK) $(analyze_OBJECTS) $(analyze_LDADD) $(LIBS) +chmorph$(EXEEXT): $(chmorph_OBJECTS) $(chmorph_DEPENDENCIES) + @rm -f chmorph$(EXEEXT) + $(CXXLINK) $(chmorph_OBJECTS) $(chmorph_LDADD) $(LIBS) +example$(EXEEXT): $(example_OBJECTS) $(example_DEPENDENCIES) + @rm -f example$(EXEEXT) + $(CXXLINK) $(example_OBJECTS) $(example_LDADD) $(LIBS) +hunspell$(EXEEXT): $(hunspell_OBJECTS) $(hunspell_DEPENDENCIES) + @rm -f hunspell$(EXEEXT) + $(CXXLINK) $(hunspell_OBJECTS) $(hunspell_LDADD) $(LIBS) +hunzip$(EXEEXT): $(hunzip_OBJECTS) $(hunzip_DEPENDENCIES) + @rm -f hunzip$(EXEEXT) + $(CXXLINK) $(hunzip_OBJECTS) $(hunzip_LDADD) $(LIBS) +hzip$(EXEEXT): $(hzip_OBJECTS) $(hzip_DEPENDENCIES) + @rm -f hzip$(EXEEXT) + $(LINK) $(hzip_OBJECTS) $(hzip_LDADD) $(LIBS) +munch$(EXEEXT): $(munch_OBJECTS) $(munch_DEPENDENCIES) + @rm -f munch$(EXEEXT) + $(LINK) $(munch_OBJECTS) $(munch_LDADD) $(LIBS) +unmunch$(EXEEXT): $(unmunch_OBJECTS) $(unmunch_DEPENDENCIES) + @rm -f unmunch$(EXEEXT) + $(LINK) $(unmunch_OBJECTS) $(unmunch_LDADD) $(LIBS) +install-dist_binSCRIPTS: $(dist_bin_SCRIPTS) + 
@$(NORMAL_INSTALL) + test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" + @list='$(dist_bin_SCRIPTS)'; test -n "$(bindir)" || list=; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n' \ + -e 'h;s|.*|.|' \ + -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) { files[d] = files[d] " " $$1; \ + if (++n[d] == $(am__install_max)) { \ + print "f", d, files[d]; n[d] = 0; files[d] = "" } } \ + else { print "f", d "/" $$4, $$1 } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-dist_binSCRIPTS: + @$(NORMAL_UNINSTALL) + @list='$(dist_bin_SCRIPTS)'; test -n "$(bindir)" || exit 0; \ + files=`for p in $$list; do echo "$$p"; done | \ + sed -e 's,.*/,,;$(transform)'`; \ + test -n "$$list" || exit 0; \ + echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ + cd "$(DESTDIR)$(bindir)" && rm -f $$files + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/analyze.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chmorph.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/example.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hunspell.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hunzip.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hzip.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/munch.Po@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/unmunch.Po@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +.cxx.o: +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< + +.cxx.obj: +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ 
+@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cxx.lo: +@am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || 
$(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) $(SCRIPTS) +installdirs: + for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic clean-libtool \ + clean-noinstPROGRAMS mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: install-binPROGRAMS install-dist_binSCRIPTS + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-dist_binSCRIPTS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ + clean-generic clean-libtool clean-noinstPROGRAMS ctags \ + distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-binPROGRAMS \ + install-data install-data-am install-dist_binSCRIPTS \ + install-dvi install-dvi-am install-exec install-exec-am \ + install-html install-html-am install-info install-info-am \ + install-man install-pdf install-pdf-am install-ps \ + install-ps-am install-strip installcheck installcheck-am \ + installdirs maintainer-clean maintainer-clean-generic \ + mostlyclean mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \ + uninstall-am uninstall-binPROGRAMS uninstall-dist_binSCRIPTS + + +# Tell versions [3.59,3.63) of 
GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src/tools/affixcompress b/src/tools/affixcompress new file mode 100755 index 0000000..9fc2989 --- /dev/null +++ b/src/tools/affixcompress @@ -0,0 +1,192 @@ +#!/bin/sh +# affix compressor utility for Hunspell +# 2008 (c) László Németh, version 0.3 +# usage: affixcompress sorted_word_list_file [max_affix_rules] +case $# in +0) echo \ +"affixcompress - compress a huge sorted word list to Hunspell format +Usage: + +LC_ALL=C sort word_list >sorted_word_list +affixcompress sorted_word_list [max_affix_rules] + +Default value of max_affix_rules = 5000 + +Note: output may need manually added affix parameters (SET character_encoding, +TRY suggestion_characters etc., see man(4) hunspell)" + exit 0;; +esac + +MAXAFFIX=${2:-5000} + +# profiling +#AWK="pgawk --profile" +AWK="gawk" + +rm -f $1.aff $1.dic +cat $1 | $AWK ' +{ + # calculate frequent suffixes + A[$1] = 1 + len = length($1) + if (len > 2) { +# print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr" + B[substr($1, 1, len - 1)] = substr($1, len, 1); + } + for(i = 2; i < len; i++) { + r = substr($1, 1, i) + if (i == 2) { + if (prev != r) { + delete A + delete B + print "Deleted roots: ", prev > "/dev/stderr" + A[$1] = 1 + } + prev = r + } + if (A[r]) { +# print $1 ": " r " és "substr($1, i + 1, len - i + 1) >"/dev/stderr" + sfx[substr($1, i + 1, len - i + 1)]++ + } else if (B[r] && B[r] != substr($1, i + 1, 1)) { + r2 = substr($1, i + 1, len - i + 1) + sfy[r2,B[r]]++ + } + } +} +END { + for (i in sfx) print i, 0, sfx[i] + for (i in sfy) print i, sfy[i] +} +' | tr '\034' ' ' >affixcompress0.tmp +sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' | +head -$MAXAFFIX >affixcompress1.tmp +cat affixcompress1.tmp | +$AWK ' +function potential_roots() { + # potential roots with most frequent suffixes + for(word in W) if (W[word]==1) { + print word >"word" + len = length(word); + for(i 
= 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((W[root]!="") && (sfxfr[suff] > 100)) C[root]++ + if (sfz[suff]) { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if ((W[root a[k]]!="") && (sfyfr[root a[k]] > 100)) { + C[root a[k]]++ + } + } + } + } + + # calculate roots + for(word in W) if (W[word]==1) { + print word >"word2" + len = length(word); + z = 0 + # choose most frequent root (maybe the original word) + max = C[word] + maxword = word + maxsuff = 0 + for(i = 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((sfx[suff] != "") && (C[root] > max)) { + max = C[root] + maxword = root + maxsuff = sfx[suff] + } + if (sfz[suff] != "") { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if (C[root a[k]] > max) { + max = C[root a[k]] + maxword = root a[k] + maxsuff = sfy[suff,a[k]] + } + } + } + if (max > 0) { + if (maxsuff > 0) print maxword, maxsuff; else print maxword + A[maxword]++ + z=1 + } else { + for(i = 2; i < len; i++) { + root = substr(word, 1, i) + suff = substr(word, i + 1, len - i + 1) + if ((A[root] > 0) && sfx[suff]!="") { + print root, sfx[suff] + z = 1 + break + } + if (sfz[suff]) { + l = split(sfz[suff], a) + for (k=1; k <= l; k++) if (A[root a[k]]!="") { + print root a[k], sfy[suff,a[k]] + z = 1 + break + } + } + } + } + if (z == 0) { + print word + A[word]++ + } + } + delete A + delete C +} +FILENAME == "-" { + if ($2 == 0) { + sfx[$1] = NR + sfxfr[$1] = $3 + } else { + sfy[$1,$2] = NR + sfyfr[$1,$2] = $3 + sfz[$1] = sfz[$1] " " $2 + } + maxsuf = NR + next +} +{ + cap = substr($1, 1, 3) + if (cap != prev) { + potential_roots() + delete W + print "Deleted class:", prev > "/dev/stderr" + } + prev = cap + W[$1] = 1 +} +END { + potential_roots() + # write out frequent suffixes + out=FILENAME ".aff" + print "FLAG num" >out + for (i in sfx) if (sfx[i] > 0) { + print "SFX", sfx[i], "Y 1" >out + print "SFX", sfx[i], "0", i, "." 
>out + } + for (i in sfy) if (sfy[i] > 0) { + print "SFX", sfy[i], "Y 1" >out + split(i, c, "\034"); + print "SFX", sfy[i], c[2], c[1], c[2] >out + } +} +' - $1 >affixcompress2.tmp +sort -nk 2 affixcompress2.tmp >affixcompress3.tmp +cat affixcompress3.tmp | $AWK -v out="$1.dic" ' +{ + if (A[$1]=="") A[$1]=$2; + else if ($2!="") A[$1] = A[$1] "," $2 +} +END { + for (i in A) n++ + print n >out + for (i in A) { + if (A[i]=="") print i + else print i "/" A[i] + } +} +' | sort >>$1.dic diff --git a/src/tools/analyze.cxx b/src/tools/analyze.cxx new file mode 100644 index 0000000..03434fa --- /dev/null +++ b/src/tools/analyze.cxx @@ -0,0 +1,79 @@ + +#include <cstring> +#include <cstdlib> +#include <cstdio> + +#include "hunspell.hxx" + +#ifndef WIN32 +using namespace std; +#endif + + + +int main(int argc, char **argv) +{ + + FILE *wtclst; + int i; + int dp; + char buf[101]; + Hunspell *pMS; + + /* first parse the command line options */ + + for (i = 1; i < 3; i++) + if (!argv[i]) { + fprintf(stderr, "correct syntax is:\nanalyze affix_file"); + fprintf(stderr, " dictionary_file file_of_words_to_check\n"); + fprintf(stderr, "use two words per line for morphological generation\n"); + exit(1); + } + + /* open the words to check list */ + + wtclst = fopen(argv[3], "r"); + if (!wtclst) { + fprintf(stderr, "Error - could not open file to check\n"); + exit(1); + } + + pMS = new Hunspell(argv[1], argv[2]); + while (fgets(buf, 100, wtclst)) { + *(buf + strlen(buf) - 1) = '\0'; + if (*buf == '\0') continue; + // morphgen demo + char * s = strchr(buf, ' '); + if (s) { + *s = '\0'; + char ** result; + int n = pMS->generate(&result, buf, s+1); + for (int i = 0; i < n; i++) { + fprintf(stdout, "generate(%s, %s) = %s\n", buf, s+1, result[i]); + } + pMS->free_list(&result, n); + if (n == 0) fprintf(stdout, "generate(%s, %s) = NO DATA\n", buf, s+1); + } else { + dp = pMS->spell(buf); + fprintf(stdout, "> %s\n", buf); + if (dp) { + char ** result; + int n = pMS->analyze(&result, buf); + for 
(int i = 0; i < n; i++) { + fprintf(stdout, "analyze(%s) = %s\n", buf, result[i]); + } + pMS->free_list(&result, n); + n = pMS->stem(&result, buf); + for (int i = 0; i < n; i++) { + fprintf(stdout, "stem(%s) = %s\n", buf, result[i]); + } + pMS->free_list(&result, n); + } else { + fprintf(stdout, "Unknown word.\n"); + } + } + } + delete pMS; + fclose(wtclst); + return 0; +} diff --git a/src/tools/chmorph.cxx b/src/tools/chmorph.cxx new file mode 100644 index 0000000..0faa8f0 --- /dev/null +++ b/src/tools/chmorph.cxx @@ -0,0 +1,86 @@ +#include <cstring> +#include <cstdlib> +#include <cstdio> + +#include "hunspell.hxx" +#include "textparser.hxx" + +#ifndef W32 +using namespace std; +#endif + +int +main(int argc, char** argv) +{ + FILE * f; + + /* first parse the command line options */ + + for (int i = 1; i < 6; i++) + if (!argv[i]) { + fprintf(stderr, + "chmorph - change affixes by morphological analysis and generation\n" + "correct syntax is:\nchmorph affix_file " + "dictionary_file file_to_convert STRING1 STRING2\n" + "STRINGS may be arbitrary parts of the morphological descriptions\n" + "example: chmorph hu.aff hu.dic hu.txt SG_2 SG_3 " + " (convert informal Hungarian second person texts to formal third person texts)\n"); + exit(1); + } + + /* open the words to check list */ + + f = fopen(argv[3], "r"); + if (!f) { + fprintf(stderr, "Error - could not open file to check\n"); + exit(1); + } + + Hunspell *pMS = new Hunspell(argv[1], argv[2]); + TextParser * p = new TextParser("qwertzuiopasdfghjklyxcvbnméáúõûóüöíQWERTZUIOPASDFGHJKLYXCVBNMÍÉÁÕÚÖÜÓÛ"); + + char buf[MAXLNLEN]; + char * next; + + while(fgets(buf,MAXLNLEN,f)) { + p->put_line(buf); + while ((next=p->next_token())) { + char ** pl; + int pln = pMS->analyze(&pl, next); + if (pln) { + int gen = 0; + for (int i = 0; i < pln; i++) { + char *pos = strstr(pl[i], argv[4]); + if (pos) { + char * r = (char * ) malloc(strlen(pl[i]) - + strlen(argv[4]) + strlen(argv[5]) + 1); + strncpy(r, pl[i], pos - pl[i]); + 
strcpy(r + (pos - pl[i]), argv[5]); + strcat(r, pos + strlen(argv[4])); + free(pl[i]); + pl[i] = r; + gen = 1; + } + } + if (gen) { + char **pl2; + int pl2n = pMS->generate(&pl2, next, pl, pln); + if (pl2n) { + p->change_token(pl2[0]); + pMS->free_list(&pl2, pl2n); + // jump over the (possibly un)modified word + free(next); + next=p->next_token(); + } + } + pMS->free_list(&pl, pln); + } + free(next); + } + fprintf(stdout, "%s\n", p->get_line()); + } + + delete p; + fclose(f); + return 0; +} diff --git a/src/tools/example.cxx b/src/tools/example.cxx new file mode 100644 index 0000000..093a038 --- /dev/null +++ b/src/tools/example.cxx @@ -0,0 +1,65 @@ +#include <cstring> +#include <cstdlib> +#include <cstdio> + +#include "hunspell.hxx" + +extern char * mystrdup(const char * s); + +using namespace std; + +int +main(int argc, char** argv) +{ + + FILE* wtclst; + + /* first parse the command line options */ + + if (argc < 4) { + fprintf(stderr,"example (now it works with more dictionary files):\n"); + fprintf(stderr,"example affix_file dictionary_file(s) file_of_words_to_check\n"); + exit(1); + } + + /* open the words to check list */ + wtclst = fopen(argv[argc - 1],"r"); + if (!wtclst) { + fprintf(stderr,"Error - could not open file of words to check\n"); + exit(1); + } + + int k; + int dp; + char buf[101]; + + Hunspell * pMS= new Hunspell(argv[1], argv[2]); + + // load extra dictionaries + if (argc > 4) for (k = 3; k < argc - 1; k++) pMS->add_dic(argv[k]); + + while(fgets(buf, 100, wtclst)) { + k = strlen(buf); + *(buf + k - 1) = '\0'; + dp = pMS->spell(buf); + if (dp) { + fprintf(stdout,"\"%s\" is okay\n",buf); + fprintf(stdout,"\n"); + } else { + fprintf(stdout,"\"%s\" is incorrect!\n",buf); + fprintf(stdout," suggestions:\n"); + char ** wlst; + int ns = pMS->suggest(&wlst,buf); + for (int i=0; i < ns; i++) { + fprintf(stdout," ...\"%s\"\n",wlst[i]); + } + pMS->free_list(&wlst, ns); + fprintf(stdout,"\n"); + } + } + + delete pMS; + fclose(wtclst); + return 0; +} + 
diff --git a/src/tools/hunspell.cxx b/src/tools/hunspell.cxx new file mode 100644 index 0000000..07ad6bb --- /dev/null +++ b/src/tools/hunspell.cxx @@ -0,0 +1,1785 @@ +// glibc < 3.0 (for mkstemp) +#ifndef __USE_MISC +#define __USE_MISC +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "config.h" +#include "hunspell.hxx" +#include "csutil.hxx" + +#ifndef HUNSPELL_EXTRA +#define suggest_auto suggest +#endif + +#define HUNSPELL_VERSION VERSION +#define INPUTLEN 50 + +#define HUNSPELL_PIPE_HEADING "@(#) International Ispell Version 3.2.06 (but really Hunspell "VERSION")\n" +#define HUNSPELL_HEADING "Hunspell " + +//for debugging only +//#define LOG + +#define DEFAULTDICNAME "default" + +#ifdef WIN32 + +#define LIBDIR "C:\\Hunspell\\" +#define USEROOODIR "Application Data\\OpenOffice.org 2\\user\\wordbook" +#define OOODIR \ + "C:\\Program files\\OpenOffice.org 2.4\\share\\dict\\ooo\\;" \ + "C:\\Program files\\OpenOffice.org 2.3\\share\\dict\\ooo\\;" \ + "C:\\Program files\\OpenOffice.org 2.2\\share\\dict\\ooo\\;" \ + "C:\\Program files\\OpenOffice.org 2.1\\share\\dict\\ooo\\;" \ + "C:\\Program files\\OpenOffice.org 2.0\\share\\dict\\ooo\\" +#define HOME "%USERPROFILE%\\" +#define DICBASENAME "hunspell_" +#define LOGFILE "C:\\Hunspell\\log" +#define DIRSEPCH '\\' +#define DIRSEP "\\" +#define PATHSEP ";" + +#include "textparser.hxx" +#include "htmlparser.hxx" +#include "latexparser.hxx" +#include "manparser.hxx" +#include "firstparser.hxx" + +#else + +// Not Windows +#include <sys/types.h> +#include <dirent.h> +#include <unistd.h> +#include "textparser.hxx" +#include "htmlparser.hxx" +#include "latexparser.hxx" +#include "manparser.hxx" +#include "firstparser.hxx" + +#define LIBDIR \ + "/usr/share/hunspell:" \ + "/usr/share/myspell:" \ + "/usr/share/myspell/dicts:" \ + "/Library/Spelling" +#define USEROOODIR \ + ".openoffice.org/3/user/wordbook:" \ + ".openoffice.org2/user/wordbook:" \ + ".openoffice.org2.0/user/wordbook:" \ + 
"Library/Spelling" +#define OOODIR \ + "/opt/openoffice.org/basis3.0/share/dict/ooo:" \ + "/usr/lib/openoffice.org/basis3.0/share/dict/ooo:" \ + "/opt/openoffice.org2.4/share/dict/ooo:" \ + "/usr/lib/openoffice.org2.4/share/dict/ooo:" \ + "/opt/openoffice.org2.3/share/dict/ooo:" \ + "/usr/lib/openoffice.org2.3/share/dict/ooo:" \ + "/opt/openoffice.org2.2/share/dict/ooo:" \ + "/usr/lib/openoffice.org2.2/share/dict/ooo:" \ + "/opt/openoffice.org2.1/share/dict/ooo:" \ + "/usr/lib/openoffice.org2.1/share/dict/ooo:" \ + "/opt/openoffice.org2.0/share/dict/ooo:" \ + "/usr/lib/openoffice.org2.0/share/dict/ooo" +#define HOME getenv("HOME") +#define DICBASENAME ".hunspell_" +#define LOGFILE "/tmp/hunspell.log" +#define DIRSEPCH '/' +#define DIRSEP "/" +#define PATHSEP ":" +#endif + +#ifdef HAVE_ICONV +#include <iconv.h> +char text_conv[MAXLNLEN]; +#endif + +#if ENABLE_NLS +# ifdef HAVE_LOCALE_H +# include <locale.h> +# ifdef HAVE_LANGINFO_CODESET +# include <langinfo.h> +# endif +# endif +# ifdef HAVE_LIBINTL_H +# include <libintl.h> +# else +# include <../../intl/libintl.h> +# endif +#else +# define gettext +# undef HAVE_LOCALE_H +# undef HAVE_LIBINTL_H +#endif + +#ifdef HAVE_CURSES_H +#ifdef HAVE_NCURSESW_H +#include <ncurses.h> +#else +#include <curses.h> +#endif +#endif + +#ifdef HAVE_READLINE +#include <readline/readline.h> +#else +#define readline scanline +#endif + +#define TEMPNAME "hunSPELL.bak" + +extern char * mystrdup(const char * s); + +// file formats: + +enum { FMT_TEXT, FMT_LATEX, FMT_HTML, FMT_MAN, FMT_FIRST }; + +struct wordlist { + char * word; + wordlist * next; +}; + +// global variables + +char * wordchars = NULL; +char * dicpath = NULL; +int wordchars_len; +unsigned short * wordchars_utf16 = NULL; +int wordchars_utf16_free = 0; +int wordchars_utf16_len; +char * dicname = NULL; +char * privdicname = NULL; +const char * currentfilename = NULL; + +int modified; // modified file sign +enum { NORMAL, + BADWORD, // print only bad words + WORDFILTER, // print 
only bad words from 1 word/line input + BADLINE, // print only lines with bad words + STEM, // stem input words + ANALYZE, // analyze input words + PIPE, // print only stars for LyX compatibility + AUTO0, // search typical error (based on SuggestMgr::suggest_auto()) + AUTO, // automatic spelling to standard output + AUTO2, // automatic spelling to standard output with sed log + AUTO3 }; // automatic spelling to standard output with gcc error format +int filter_mode = NORMAL; +int printgood = 0; // print only good words and lines +int showpath = 0; // show detected path of the dictionary +int checkurl = 0; // check URLs and mail addresses +int warn = 0; // warn potential mistakes (dictionary words with WARN flags) +const char * ui_enc = NULL; // locale character encoding (default for I/O) +const char * io_enc = NULL; // I/O character encoding + +#define DMAX 10 // maximal count of loaded dictionaries + +const char * dic_enc[DMAX]; // dictionary encoding +char * path = NULL; +int dmax = 0; // dictionary count + +// functions + +#ifdef HAVE_ICONV +static const char* fix_encoding_name(const char *enc) +{ + if (strcmp(enc, "TIS620-2533") == 0) + enc = "TIS620"; + return enc; +} +#endif + +/* change character encoding */ +char * chenc(char * st, const char * enc1, const char * enc2) { + char * out = st; +#ifdef HAVE_ICONV + if (enc1 && enc2 && strcmp(enc1, enc2) != 0) { + + size_t c1 = strlen(st) + 1; + size_t c2 = MAXLNLEN; + char * source = st; + char * dest = text_conv; + iconv_t conv = iconv_open(fix_encoding_name(enc2), fix_encoding_name(enc1)); + if (conv == (iconv_t) -1) { + fprintf(stderr, gettext("error - iconv_open: %s -> %s\n"), enc2, enc1); + } else { + size_t res = iconv(conv, (ICONV_CONST char **) &source, &c1, &dest, &c2); + iconv_close(conv); + if (res != (size_t) -1) out = text_conv; + } + } +#endif + return out; +} + +TextParser * get_parser(int format, char * extension, Hunspell * pMS) { + TextParser * p = NULL; + int io_utf8 = 0; + char * denc = 
pMS->get_dic_encoding(); +#ifdef HAVE_ICONV + initialize_utf_tbl(); // also need for 8-bit tokenization + if (io_enc) { + if ((strcmp(io_enc, "UTF-8") == 0) || + (strcmp(io_enc, "utf-8") == 0) || + (strcmp(io_enc, "UTF8") == 0) || + (strcmp(io_enc, "utf8") == 0)) { + io_utf8 = 1; + io_enc = "UTF-8"; + } + } else if (ui_enc) { + io_enc = ui_enc; + if (strcmp(ui_enc, "UTF-8") == 0) io_utf8 = 1; + } else { + io_enc = denc; + if (strcmp(denc, "UTF-8") == 0) io_utf8 = 1; + } + + if (io_utf8) { + wordchars_utf16 = pMS->get_wordchars_utf16(&wordchars_utf16_len); + if ((strcmp(denc, "UTF-8") != 0) && pMS->get_wordchars()) { + char * wchars = (char *) pMS->get_wordchars(); + int wlen = strlen(wchars); + size_t c1 = wlen; + size_t c2 = MAXLNLEN; + char * dest = text_conv; + iconv_t conv = iconv_open("UTF-8", fix_encoding_name(denc)); + if (conv == (iconv_t) -1) { + fprintf(stderr, gettext("error - iconv_open: UTF-8 -> %s\n"), denc); + wordchars_utf16 = NULL; + wordchars_utf16_len = 0; + } else { + iconv(conv, (ICONV_CONST char **) &wchars, &c1, &dest, &c2); + iconv_close(conv); + wordchars_utf16 = (unsigned short *) malloc(sizeof(unsigned short) * wlen); + int n = u8_u16((w_char *) wordchars_utf16, wlen, text_conv); + if (n > 0) flag_qsort(wordchars_utf16, 0, n); + wordchars_utf16_len = n; + wordchars_utf16_free = 1; + } + } + } else { + // 8-bit input encoding + // detect letters by unicodeisalpha() for tokenization + char letters[MAXLNLEN]; + char * pletters = letters; + char ch[2]; + char u8[10]; + *pletters = '\0'; + iconv_t conv = iconv_open("UTF-8", fix_encoding_name(io_enc)); + if (conv == (iconv_t) -1) { + fprintf(stderr, gettext("error - iconv_open: UTF-8 -> %s\n"), io_enc); + } else { + for (int i = 32; i < 256; i++) { + size_t c1 = 1; + size_t c2 = 10; + char * dest = u8; + u8[0] = '\0'; + char * ch8bit = ch; + ch[0] = (char) i; + ch[1] = '\0'; + size_t res = iconv(conv, (ICONV_CONST char **) &ch8bit, &c1, &dest, &c2); + if (res != (size_t) -1) { + unsigned short 
idx; + w_char w; + w.l = 0; + w.h = 0; + u8_u16(&w, 1, u8); + idx = (w.h << 8) + w.l; + if (unicodeisalpha(idx)) { + *pletters = (char) i; + pletters++; + } + } + } + iconv_close(conv); + } + *pletters = '\0'; + + // UTF-8 wordchars -> 8 bit wordchars + int len = 0; + char * wchars = (char *) pMS->get_wordchars(); + if (wchars) { + if ((strcmp(denc, "UTF-8")==0)) { + pMS->get_wordchars_utf16(&len); + } else { + len = strlen(wchars); + } + char * dest = letters + strlen(letters); // append wordchars + size_t c1 = len + 1; + size_t c2 = len + 1; + iconv_t conv = iconv_open(fix_encoding_name(io_enc), fix_encoding_name(denc)); + if (conv == (iconv_t) -1) { + fprintf(stderr, gettext("error - iconv_open: %s -> %s\n"), io_enc, denc); + } else { + iconv(conv, (ICONV_CONST char **) &wchars, &c1, &dest, &c2); + iconv_close(conv); + *dest = '\0'; + } + } + if (*letters) wordchars = mystrdup(letters); + } +#else + if (strcmp(denc, "UTF-8") == 0) { + wordchars_utf16 = pMS->get_wordchars_utf16(&wordchars_utf16_len); + io_utf8 = 1; + } else { + char * casechars = get_casechars(denc); + wordchars = (char *) pMS->get_wordchars(); + if (casechars && wordchars) { + casechars = (char *) realloc(casechars, strlen(casechars) + strlen(wordchars) + 1); + strcat(casechars, wordchars); + } + wordchars = casechars; + } + io_enc = denc; +#endif + + if (io_utf8) { + switch (format) { + case FMT_LATEX: p = new LaTeXParser(wordchars_utf16, wordchars_utf16_len); break; + case FMT_HTML: p = new HTMLParser(wordchars_utf16, wordchars_utf16_len); break; + case FMT_MAN: p = new ManParser(wordchars_utf16, wordchars_utf16_len); break; + case FMT_FIRST: p = new FirstParser(wordchars); + } + } else { + switch (format) { + case FMT_LATEX: p = new LaTeXParser(wordchars); break; + case FMT_HTML: p = new HTMLParser(wordchars); break; + case FMT_MAN: p = new ManParser(wordchars); break; + case FMT_FIRST: p = new FirstParser(wordchars); + } + } + + if ((!p) && (extension)) { + if ((strcmp(extension, "html") == 
0) || + (strcmp(extension, "htm") == 0) || + (strcmp(extension, "xml") == 0)) { + if (io_utf8) { + p = new HTMLParser(wordchars_utf16, wordchars_utf16_len); + } else { + p = new HTMLParser(wordchars); + } + } else if (((extension[0] > '0') && (extension[0] <= '9'))) { + if (io_utf8) { + p = new ManParser(wordchars_utf16, wordchars_utf16_len); + } else { + p = new ManParser(wordchars); + } + } else if ((strcmp(extension, "tex") == 0)) { + if (io_utf8) { + p = new LaTeXParser(wordchars_utf16, wordchars_utf16_len); + } else { + p = new LaTeXParser(wordchars); + } + } + } + if (!p) { + if (io_utf8) { + p = new TextParser(wordchars_utf16, wordchars_utf16_len); + } else { + p = new TextParser(wordchars); + } + } + p->set_url_checking(checkurl); + return p; +} + + +#ifdef LOG +void log(char * message) +{ + FILE *f = fopen(LOGFILE,"a"); + if (f) { + fprintf(f,"%s\n",message); + fclose(f); + } else { + fprintf(stderr,"Logfile..."); + } +} +#endif + +int putdic(char * word, Hunspell * pMS) +{ + char * w; + + word = chenc(word, ui_enc, dic_enc[0]); + + if (((w = strstr(word + 1, "/")) == NULL)) { + if (*word == '*') return pMS->remove(word + 1); + else return pMS->add(word); + } else { + char c; + int ret; + c = *w; + *w = '\0'; + if (*(w+1) == '/') { + ret = pMS->add_with_affix(word, w + 2); // word//pattern (back comp.) 
+ } else { + ret = pMS->add_with_affix(word, w + 1); // word/pattern + } + *w = c; + return ret; + } +} + +void load_privdic(char * filename, Hunspell * pMS) +{ + char buf[MAXLNLEN]; + FILE *dic = fopen(filename,"r"); + if (dic) { + while(fgets(buf,MAXLNLEN,dic)) { + if (*(buf + strlen(buf) - 1) == '\n') *(buf + strlen(buf) - 1) = '\0'; + putdic(buf,pMS); + } + fclose(dic); + } +} + +int exist(char * filename) +{ + FILE *f = fopen(filename,"r"); + if (f) { + fclose(f); + return 1; + } + return 0; +} + +int save_privdic(char * filename, char * filename2, wordlist * w) +{ + wordlist * r; + FILE *dic = fopen(filename,"r"); + if (dic) { + fclose(dic); + dic = fopen(filename,"a"); + } else { + dic = fopen(filename2,"a"); + } + if (! dic) return 0; + while (w != NULL) { + char *word = chenc(w->word, io_enc, ui_enc); + fprintf(dic,"%s\n",word); +#ifdef LOG + log(word);log("\n"); +#endif + r = w; + free(w->word); + w = w->next; + free(r); + } + fclose(dic); + return 1; +} + +char * basename(char * s, char c) { + char * p = s + strlen(s); + while ((*p != c) && (p != s)) p--; + if (*p == c) p++; + return p; +} + +#ifdef HAVE_CURSES_H +char * scanline(char * message) { + char input[INPUTLEN]; + printw(message); + echo(); + getnstr(input, INPUTLEN); + noecho(); + return mystrdup(input); +} +#endif + +// check words in the dictionaries (and set first checked dictionary) +int check(Hunspell ** pMS, int * d, char * token, int * info, char ** root) { + for (int i = 0; i < dmax; i++) { + if (pMS[*d]->spell(chenc(token, io_enc, dic_enc[*d]), info, root) && !(warn && (*info & SPELL_WARN))) { + return 1; + } + if (++(*d) == dmax) *d = 0; + } + return 0; +} + +void pipe_interface(Hunspell ** pMS, int format, FILE * fileid) { + char buf[MAXLNLEN]; + char * buf2; + wordlist * dicwords = NULL; + char * token; + int pos; + int bad; + int lineno = 0; + int terse_mode = 0; + int verbose_mode = 0; + int d = 0; + + TextParser * parser = get_parser(format, NULL, pMS[0]); + + if ((filter_mode == 
NORMAL)) { + fprintf(stdout,gettext(HUNSPELL_HEADING)); + fprintf(stdout,HUNSPELL_VERSION); + if (pMS[0]->get_version()) fprintf(stdout," - %s", pMS[0]->get_version()); + fprintf(stdout,"\n"); + fflush(stdout); + } + +nextline: while(fgets(buf, MAXLNLEN, fileid)) { + if (*(buf + strlen(buf) - 1) == '\n') *(buf + strlen(buf) - 1) = '\0'; + lineno++; +#ifdef LOG + log(buf); +#endif + bad = 0; + pos = 0; + + // execute commands + if (filter_mode == PIPE) { + pos = -1; + switch (buf[0]) { + case '%': { verbose_mode = terse_mode = 0; break; } + case '!': { terse_mode = 1; break; } + case '`': { verbose_mode = 1; break; } + case '+': { + delete parser; + parser = get_parser(FMT_LATEX, NULL, pMS[0]); + parser->set_url_checking(checkurl); + break; + } + case '-': { + delete parser; + parser = get_parser(format, NULL, pMS[0]); + break; + } + case '@': { putdic(buf+1, pMS[d]); break; } + case '*': { + struct wordlist* i = + (struct wordlist *) malloc (sizeof(struct wordlist)); + i->word = mystrdup(buf+1); + i->next = dicwords; + dicwords = i; + putdic(buf+1, pMS[d]); + break; + } + case '#': { + if (HOME) strcpy(buf,HOME); else { + fprintf(stderr, gettext("error - missing HOME variable\n")); + continue; + } +#ifndef WIN32 + strcat(buf,"/"); +#endif + buf2 = buf+strlen(buf); + if (!privdicname) { + strcat(buf,DICBASENAME); + strcat(buf,basename(dicname,DIRSEPCH)); + } else { + strcat(buf,privdicname); + } + if (save_privdic(buf2, buf, dicwords)) { + dicwords=NULL; + } + break; + } + case '^': { + pos = 1; + } + + default: { + pos = 0; + } + + } // end switch + } // end filter_mode == PIPE + +if (pos >= 0) { + parser->put_line(buf + pos); + while ((token = parser->next_token())) { + switch (filter_mode) { + + case BADWORD: { + if (!check(pMS, &d, token, NULL, NULL)) { + bad = 1; + if (! 
printgood) fprintf(stdout,"%s\n", token); + } else { + if (printgood) fprintf(stdout,"%s\n", token); + } + free(token); + continue; + } + + case WORDFILTER: { + if (!check(pMS, &d, token, NULL, NULL)) { + bad = 1; + if (! printgood) fprintf(stdout,"%s\n", buf); + } else { + if (printgood) fprintf(stdout,"%s\n", buf); + } + free(token); + goto nextline; + } + + case BADLINE: { + if (!check(pMS, &d, token, NULL, NULL)) { + bad = 1; + } + free(token); + continue; + } + + case AUTO0: + case AUTO: + case AUTO2: + case AUTO3: { + FILE * f = (filter_mode == AUTO) ? stderr : stdout; + if (!check(pMS, &d, token, NULL, NULL)) { + char ** wlst = NULL; + bad = 1; + int ns = pMS[d]->suggest_auto(&wlst, chenc(token, io_enc, dic_enc[d])); + if (ns > 0) { + parser->change_token(chenc(wlst[0], dic_enc[d], io_enc)); + if (filter_mode == AUTO3) { + fprintf(f,"%s:%d: Locate: %s | Try: %s\n", + currentfilename, lineno, + token, chenc(wlst[0], dic_enc[d], io_enc)); + } else if (filter_mode == AUTO2) { + fprintf(f,"%ds/%s/%s/g; # %s\n", lineno, + token, chenc(wlst[0], dic_enc[d], io_enc), buf); + } else { + fprintf(f,gettext("Line %d: %s -> "), lineno, + chenc(token, io_enc, ui_enc)); + fprintf(f, "%s\n", + chenc(wlst[0], dic_enc[d], ui_enc)); + } + } + pMS[d]->free_list(&wlst, ns); + } + free(token); + continue; + } + + case STEM: { + char ** result; + int n = pMS[d]->stem(&result, chenc(token, io_enc, dic_enc[d])); + for (int i = 0; i < n; i++) { + fprintf(stdout, "%s %s\n", token, chenc(result[i], dic_enc[d], ui_enc)); + } + pMS[d]->free_list(&result, n); + if (n == 0 && token[strlen(token) - 1] == '.') { + token[strlen(token) - 1] = '\0'; + n = pMS[d]->stem(&result, token); + for (int i = 0; i < n; i++) { + fprintf(stdout, "%s %s\n", token, chenc(result[i], dic_enc[d], ui_enc)); + } + pMS[d]->free_list(&result, n); + } + if (n == 0) fprintf(stdout, "%s\n", chenc(token, dic_enc[d], ui_enc)); + fprintf(stdout, "\n"); + free(token); + continue; + } + + case ANALYZE: { + char ** result; 
+ int n = pMS[d]->analyze(&result, chenc(token, io_enc, dic_enc[d])); + for (int i = 0; i < n; i++) { + fprintf(stdout, "%s %s\n", token, chenc(result[i], dic_enc[d], ui_enc)); + } + pMS[d]->free_list(&result, n); + if (n == 0 && token[strlen(token) - 1] == '.') { + token[strlen(token) - 1] = '\0'; + n = pMS[d]->analyze(&result, token); + for (int i = 0; i < n; i++) { + fprintf(stdout, "%s %s\n", token, chenc(result[i], dic_enc[d], ui_enc)); + } + pMS[d]->free_list(&result, n); + } + if (n == 0) fprintf(stdout, "%s\n", chenc(token, dic_enc[d], ui_enc)); + fprintf(stdout, "\n"); + free(token); + continue; + } + + case PIPE: { + int info; + char * root = NULL; + if (check(pMS, &d, token, &info, &root)) { + if (!terse_mode) { + if (verbose_mode) fprintf(stdout,"* %s\n", token); + else fprintf(stdout,"*\n"); + } + fflush(stdout); + } else { + char ** wlst = NULL; + int ns = pMS[d]->suggest(&wlst, token); + if (ns == 0) { + fprintf(stdout,"# %s %d", token, + parser->get_tokenpos() + pos); + } else { + fprintf(stdout,"& %s %d %d: ", token, ns, + parser->get_tokenpos() + pos); + fprintf(stdout,"%s", chenc(wlst[0], dic_enc[d], io_enc)); + } + for (int j = 1; j < ns; j++) { + fprintf(stdout, ", %s", chenc(wlst[j], dic_enc[d], io_enc)); + } + pMS[d]->free_list(&wlst, ns); + fprintf(stdout, "\n"); + fflush(stdout); + } + if (root) free(root); + free(token); + continue; + } + case NORMAL: { + int info; + char * root = NULL; + if (check(pMS, &d, token, &info, &root)) { + if (info & SPELL_COMPOUND) { + fprintf(stdout,"-\n"); + } else if (root) { + fprintf(stdout,"+ %s\n", chenc(root, dic_enc[d], ui_enc)); + } else { + fprintf(stdout,"*\n"); + } + fflush(stdout); + if (root) free(root); + } else { + char ** wlst = NULL; + int ns = pMS[d]->suggest(&wlst, chenc(token, io_enc, dic_enc[d])); + if (ns == 0) { + fprintf(stdout,"# %s %d", chenc(token, io_enc, ui_enc), + parser->get_tokenpos() + pos); + } else { + fprintf(stdout,"& %s %d %d: ", chenc(token, io_enc, ui_enc), ns, + 
parser->get_tokenpos() + pos); + fprintf(stdout,"%s", chenc(wlst[0], dic_enc[d], ui_enc)); + } + for (int j = 1; j < ns; j++) { + fprintf(stdout, ", %s", chenc(wlst[j], dic_enc[d], ui_enc)); + } + pMS[d]->free_list(&wlst, ns); + fprintf(stdout, "\n"); + fflush(stdout); + } + free(token); + } + } + } + + switch (filter_mode) { + case AUTO: { + fprintf(stdout,"%s\n", parser->get_line()); + break; + } + + case BADLINE: { + if (((printgood) && (!bad)) || + (!printgood && (bad))) fprintf(stdout,"%s\n",buf); + break; + } + + case PIPE: + case NORMAL: { + fprintf(stdout,"\n"); + fflush(stdout); + break; + } + + } +} // if +} // while + +if (parser) delete(parser); + +} // pipe_interface + +#ifdef HAVE_READLINE + +#ifdef HAVE_CURSES_H +static const char * rltext; + +// set base text of input line +static int set_rltext () +{ + if (rltext) + { + rl_insert_text (rltext); + rltext = NULL; + rl_startup_hook = (rl_hook_func_t *)NULL; + } + return 0; +} + +#endif + +// Readline escape +static int rl_escape (int count, int key) +{ + rl_delete_text(0, rl_end); + rl_done = 1; + return 0; +} +#endif + +#ifdef HAVE_CURSES_H +int expand_tab(char * dest, char * src, int limit) { + int i = 0; + int u8 = ((ui_enc != NULL) && (strcmp(ui_enc, "UTF-8") == 0)) ? 1 : 0; + int chpos = 0; + for(int j = 0; (i < limit) && (src[j] != '\0') && (src[j] != '\r'); j++) { + dest[i] = src[j]; + if (src[j] == '\t') { + int end = 8 - (chpos % 8); + for(int k = 0; k < end; k++) { + dest[i] = ' '; + i++; + chpos++; + } + } else { + i++; + if (!u8 || (src[j] & 0xc0) != 0x80) chpos++; + } + } + dest[i] = '\0'; + return chpos; +} + +// UTF-8-aware version of strncpy (but output is always null terminated) +// What we should deal in is cursor position cells in a terminal emulator, +// i.e. 
the number of visual columns occupied like wcwidth/wcswidth does +// What we're really current doing is to deal in the number of characters, +// like mbstowcs which isn't quite correct, but close enough for western +// text in UTF-8 +void strncpyu8(char * dest, const char * src, int begin, int n) { + int u8 = ((ui_enc != NULL) && (strcmp(ui_enc, "UTF-8") == 0)) ? 1 : 0; + int i = 0; + while (i < begin + n) { + if (i >= begin) + { + if (!*src) + break; + *dest++ = *src; + } + if (!u8 || (*src & 0xc0) != 0x80) + i++; + ++src; + } + *dest = '\0'; +} + +//See strncpyu8 for gotchas +int strlenu8(const char * src) { + int u8 = ((ui_enc != NULL) && (strcmp(ui_enc, "UTF-8") == 0)) ? 1 : 0; + int i = 0; + while (*src) { + if (!u8 || (*src & 0xc0) != 0x80) + i++; + ++src; + } + return i; +} + +void dialogscreen(TextParser * parser, char * token, + char * filename, int forbidden, char ** wlst, int ns) { + int x, y; + char line[MAXLNLEN]; + char line2[MAXLNLEN]; + getmaxyx(stdscr,y,x); + clear(); + + if (forbidden & SPELL_FORBIDDEN) printw(gettext("FORBIDDEN!")); else + if (forbidden & SPELL_WARN) printw(gettext("Spelling mistake?")); + printw(gettext("\t%s\t\tFile: %s\n\n"), chenc(token, io_enc, ui_enc), filename); + + // handle long lines and tabulators + + char lines[MAXPREVLINE][MAXLNLEN]; + + for (int i = 0; i < MAXPREVLINE; i++) { + expand_tab(lines[i], chenc(parser->get_prevline(i), io_enc, ui_enc), MAXLNLEN); + } + + int prevline = 0; + + strncpy(line, parser->get_prevline(0), parser->get_tokenpos()); + line[parser->get_tokenpos()] = '\0'; + int tokenbeg = expand_tab(line2, chenc(line, io_enc, ui_enc), MAXLNLEN); + + strncpy(line, parser->get_prevline(0), parser->get_tokenpos() + strlen(token)); + line[parser->get_tokenpos() + strlen(token)] = '\0'; + int tokenend = expand_tab(line2, chenc(line, io_enc, ui_enc), MAXLNLEN); + + int rowindex = tokenend / x; + int beginrow = rowindex - tokenbeg / x; + if (beginrow >= MAXPREVLINE) beginrow = MAXPREVLINE - 1; + + for (int i 
= 0; i < MAXPREVLINE; i++) { + strncpyu8(line, lines[prevline], x * rowindex, x); + mvprintw(MAXPREVLINE + 1 - i, 0, "%s", line); + rowindex--; + if (rowindex == -1) { + prevline++; + rowindex = strlenu8(lines[prevline]) / x; + } + } + + int linestartpos = tokenbeg - (tokenbeg % x); + strncpyu8(line, lines[0], x * rowindex + linestartpos, tokenbeg % x); + mvprintw(MAXPREVLINE + 1 - beginrow, 0, "%s", line); + attron(A_REVERSE); + printw("%s", chenc(token, io_enc, ui_enc)); + attroff(A_REVERSE); + + mvprintw(MAXPREVLINE + 2, 0, "\n"); + for (int i = 0; i < ns; i++) { + if ((ns > 10) && (i < 10)) { + printw(" 0%d: %s\n", i, chenc(wlst[i], io_enc, ui_enc)); + } else { + printw(" %d: %s\n", i, chenc(wlst[i], io_enc, ui_enc)); + } + } + +/* TRANSLATORS: the capital letters are shortcuts, mark one letter similarly + in your translation and translate the standalone letter accordingly later */ + mvprintw(y-3, 0, "%s\n", + gettext("\n[SPACE] R)epl A)ccept I)nsert U)ncap S)tem Q)uit e(X)it or ? for help\n")); +} + +char * lower_first_char(char *token, const char *io_enc, int langnum) +{ + const char *utf8str = chenc(token, io_enc, "UTF-8"); + int max = strlen(utf8str); + w_char *u = new w_char[max]; + int len = u8_u16(u, max, utf8str); + unsigned short idx = (u[0].h << 8) + u[0].l; + idx = unicodetolower(idx, langnum); + u[0].h = (unsigned char) (idx >> 8); + u[0].l = (unsigned char) (idx & 0x00FF); + char *scratch = (char*)malloc(max + 1 + 4); + u16_u8(scratch, max+4, u, len); + delete[] u; + char *result = chenc(scratch, "UTF-8", io_enc); + if (result != scratch) + { + free (scratch); + result = mystrdup(result); + } + return result; +} + + // for terminal interface +int dialog(TextParser * parser, Hunspell * pMS, char * token, char * filename, + char ** wlst, int ns, int forbidden) { + char buf[MAXLNLEN]; + char * buf2; + wordlist * dicwords = NULL; + int c; + + dialogscreen(parser, token, filename, forbidden, wlst, ns); + + char firstletter='\0'; + + while ((c=getch())) 
{ + switch (c) { + case '0': + case '1': if ((firstletter=='\0') && (ns>10)) { + firstletter=c; + break; + } + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + modified=1; + if ((firstletter!='\0') && (firstletter=='1')) { + c += 10; + } + c -= '0'; + if (c>=ns) break; + parser->change_token(wlst[c]); + goto ki; + } + case ' ': { + goto ki; + } + case '?': { + clear(); +printw(gettext("Whenever a word is found that is not in the dictionary\n" + "it is printed on the first line of the screen. If the dictionary\n" + "contains any similar words, they are listed with a number\n" + "next to each one. You have the option of replacing the word\n" + "completely, or choosing one of the suggested words.\n")); +printw(gettext("\nCommands are:\n\n")); +printw(gettext("R Replace the misspelled word completely.\n")); +printw(gettext("Space Accept the word this time only.\n")); +printw(gettext("A Accept the word for the rest of this session.\n")); +printw(gettext("I Accept the word, and put it in your private dictionary.\n")); +printw(gettext("U Accept and add lowercase version to private dictionary.\n")); +printw(gettext( +"S\tAsk a stem and a model word and store them in the private dictionary.\n" +"\tThe stem will be accepted also with the affixes of the model word.\n" +)); +printw(gettext("0-n Replace with one of the suggested words.\n")); +printw(gettext("X Write the rest of this file, ignoring misspellings, and start next file.\n")); +printw(gettext("Q Quit immediately. Asks for confirmation. Leaves file unchanged.\n")); +printw(gettext("^Z Suspend program. Restart with fg command.\n")); +printw(gettext("? 
Show this help screen.\n")); +printw(gettext("\n-- Type space to continue -- \n")); + while (getch()!=' '); + } + case 12: { + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } + default: { +/* TRANSLATORS: translate this letter according to the shortcut letter used + previously in the translation of "R)epl" before */ + if (c==(gettext("r"))[0]) { + char i[MAXLNLEN]; + char *temp; + + modified=1; + + +#ifdef HAVE_READLINE + endwin(); + rltext = ""; + if (rltext && *rltext) rl_startup_hook = set_rltext; +#endif + temp = readline(gettext("Replace with: ")); +#ifdef HAVE_READLINE + initscr(); + cbreak(); +#endif + + if ((!temp) || (temp[0] == '\0')) { + free(temp); + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } + + strncpy(i, temp, MAXLNLEN); + free(temp); + + parser->change_token(i); + + return 2; // replace + } +/* TRANSLATORS: translate these letters according to the shortcut letter used + previously in the translation of "U)ncap" and I)nsert before */ + int u_key = gettext("u")[0]; + int i_key = gettext("i")[0]; + + if (c==u_key || c==i_key) { + struct wordlist* i = (struct wordlist *) malloc (sizeof(struct wordlist)); + i->word = (c==i_key) ? 
mystrdup(token) : lower_first_char(token, io_enc, pMS->get_langnum()); + i->next = dicwords; + dicwords = i; + // save + if (HOME) strcpy(buf,HOME); else { + fprintf(stderr, gettext("error - missing HOME variable\n")); + break; + } +#ifndef WIN32 + strcat(buf,"/"); +#endif + buf2 = buf+strlen(buf); + if (!privdicname) { + strcat(buf,DICBASENAME); + strcat(buf,basename(dicname,DIRSEPCH)); + } else { + strcat(buf,privdicname); + } + if (save_privdic(buf2, buf, dicwords)) { + dicwords=NULL; + } else { + fprintf(stderr,gettext("Cannot update personal dictionary.")); + break; + } + } // no break +/* TRANSLATORS: translate this letter according to the shortcut letter used + previously in the translation of "U)ncap" and I)nsert before */ + if ((c==(gettext("u"))[0]) || (c==(gettext("i"))[0]) || (c==(gettext("a"))[0])) { + modified=1; + putdic(token, pMS); + goto ki; + } +/* TRANSLATORS: translate this letter according to the shortcut letter used + previously in the translation of "S)tem" before */ + if (c==(gettext("s"))[0]) { + modified=1; + + char w[MAXLNLEN], w2[MAXLNLEN], w3[MAXLNLEN]; + char *temp; + + strncpy(w, token, MAXLNLEN); + temp = basename(w, '-'); + if (w < temp) { + *(temp-1) = '\0'; + } else { + char ** poslst = NULL; +#ifdef HUNSPELL_EXPERIMENTAL + int ps = pMS->suggest_pos_stems(&poslst, token); +#else + int ps = 0; +#endif + if (ps > 0) { + strcpy(buf, poslst[0]); + for (int i = 0; i < ps; i++) { + if (strlen(poslst[i]) <= strlen(buf)) strcpy(buf, poslst[i]); + free(poslst[i]); + } + strcpy(w, buf); + } + if (poslst) free(poslst); + } + +#ifdef HAVE_READLINE + endwin(); + rltext = w; + if (rltext && *rltext) rl_startup_hook = set_rltext; +#endif + temp = readline(gettext("New word (stem): ")); + + if ((!temp) || (temp[0] == '\0')) { + free(temp); +#ifdef HAVE_READLINE + initscr(); + cbreak(); +#endif + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } + + strncpy(w, temp, MAXLNLEN); + free(temp); + +#ifdef HAVE_READLINE + 
initscr(); + cbreak(); +#endif + dialogscreen(parser, token, filename, forbidden, wlst, ns); + refresh(); + +#ifdef HAVE_READLINE + endwin(); + rltext = ""; + if (rltext && *rltext) rl_startup_hook = set_rltext; +#endif + temp = readline(gettext("Model word (a similar dictionary word): ")); + +#ifdef HAVE_READLINE + initscr(); + cbreak(); +#endif + + if ((!temp) || (temp[0] == '\0')) { + free(temp); + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } + + strncpy(w2, temp, MAXLNLEN); + free(temp); + + if (strlen(w) + strlen(w2) + 2 < MAXLNLEN) { + sprintf(w3, "%s/%s", w, w2); + } else break; + + if (!putdic(w3, pMS)) { + + struct wordlist* i = + (struct wordlist *) malloc (sizeof(struct wordlist)); + i->word = mystrdup(w3); + i->next = dicwords; + dicwords = i; + + if (strlen(w) + strlen(w2) + 4 < MAXLNLEN) { + sprintf(w3, "%s-/%s-", w, w2); + if (putdic(w3, pMS)) { + struct wordlist* i = + (struct wordlist *) malloc (sizeof(struct wordlist)); + i->word = mystrdup(w3); + i->next = dicwords; + dicwords = i; + } + } + // save + + if (HOME) strcpy(buf,HOME); else { + fprintf(stderr, gettext("error - missing HOME variable\n")); + continue; + } +#ifndef WIN32 + strcat(buf,"/"); +#endif + buf2 = buf + strlen(buf); + if (!privdicname) { + strcat(buf,DICBASENAME); + strcat(buf,basename(dicname,DIRSEPCH)); + } else { + strcat(buf,privdicname); + } + if (save_privdic(buf2, buf, dicwords)) { + dicwords = NULL; + } else { + fprintf(stderr, gettext("Cannot update personal dictionary.")); + break; + } + + } else { + dialogscreen(parser, token, filename, forbidden, wlst, ns); + printw(gettext("Model word must be in the dictionary. 
Press any key!")); + getch(); + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } + goto ki; + } +/* TRANSLATORS: translate this letter according to the shortcut letter used + previously in the translation of "e(X)it" before */ + if (c==(gettext("x"))[0]) { + return 1; + } +/* TRANSLATORS: translate this letter according to the shortcut letter used + previously in the translation of "Q)uit" before */ + if (c==(gettext("q"))[0]) { + if (modified) { + printw(gettext("Are you sure you want to throw away your changes? ")); +/* TRANSLATORS: translate this letter according to the shortcut letter y)es */ + if (getch()==(gettext("y"))[0]) return -1; + dialogscreen(parser, token, filename, forbidden, wlst, ns); + break; + } else { + return -1; + } + } + } + } + } + ki: return 0; +} + +int interactive_line(TextParser * parser, Hunspell ** pMS, char * filename, FILE * tempfile) +{ + char * token; + int dialogexit = 0; + int info; + int d = 0; + while ((token=parser->next_token())) { + if (!check(pMS, &d, token, &info, NULL)) { + dialogscreen(parser, token, filename, info, NULL, 0); // preview + refresh(); + char ** wlst = NULL; + int ns = pMS[d]->suggest(&wlst, chenc(token, io_enc, dic_enc[d])); + if (ns==0) { + dialogexit = dialog(parser, pMS[d], token, filename, wlst, ns, info); + } else { + for (int j = 0; j < ns; j++) { + char d2io[MAXLNLEN]; + strcpy(d2io, chenc(wlst[j], dic_enc[d], io_enc)); + wlst[j] = (char *) realloc(wlst[j], strlen(d2io) + 1); + strcpy(wlst[j], d2io); + } + dialogexit = dialog(parser, pMS[d], token, filename, wlst, ns, info); + } + for (int j = 0; j < ns; j++) { + free(wlst[j]); + } + free(wlst); + } + free(token); + if ((dialogexit==-1) || (dialogexit==1)) goto ki2; + } + + ki2: fprintf(tempfile,"%s\n",token=parser->get_line()); + free(token); + return dialogexit; +} + +void interactive_interface(Hunspell ** pMS, char * filename, int format) +{ + char buf[MAXLNLEN]; + + FILE *text; + + text = fopen(filename, "r"); + + int 
dialogexit; + int check=1; + + TextParser * parser; + char * extension = basename(filename, '.'); + parser = get_parser(format, extension, pMS[0]); + + char * tempname = (char *) malloc(strlen(filename) + strlen(TEMPNAME) + 1); + strcpy(tempname, filename); + strcpy(basename(tempname, DIRSEPCH), TEMPNAME); + + FILE *tempfile; + + if (!(tempfile = fopen(tempname, "w"))) { + fprintf(stderr, gettext("Can't create tempfile %s.\n"), tempname); + endwin(); + exit(1); + } + + while(fgets(buf,MAXLNLEN,text)) { + if (check) { + if (*(buf + strlen(buf) - 1) == '\n') *(buf + strlen(buf) - 1) = '\0'; + parser->put_line(buf); + dialogexit = interactive_line(parser,pMS,filename,tempfile); + switch (dialogexit) { + case -1: { + clear(); + refresh(); + unlink(tempname); + endwin(); + exit(0); + } + case 1: { + check = 0; + } + } + } else { + fprintf(tempfile,"%s",buf); + } + } + fclose(text); + fclose(tempfile); + delete parser; + + if (! modified) { + unlink(tempname); + } else { + rename(tempname, filename); + } + free(tempname); +} + +#endif + +char * add(char * dest, const char * st) { + if (!dest) { + dest = mystrdup(st); + } else { + dest = (char *) realloc(dest, strlen(dest) + strlen(st) + 1); + strcat(dest, st); + } + return dest; +} + +char * exist2(char * dir, int len, const char * name, const char * ext) { + char buf[MAXLNLEN]; + const char * sep = (len == 0) ? "": DIRSEP; + strncpy(buf, dir, len); + strcpy(buf + len, sep); + strcat(buf, name); + strcat(buf, ext); + if (exist(buf)) return mystrdup(buf); + strcat(buf, HZIP_EXTENSION); + if (exist(buf)) { + buf[strlen(buf) - strlen(HZIP_EXTENSION)] = '\0'; + return mystrdup(buf); + } + return NULL; +} + +#ifndef WIN32 +int listdicpath(char * dir, int len) { + char buf[MAXLNLEN]; + const char * sep = (len == 0) ? 
"": DIRSEP; + strncpy(buf, dir, len); + strcpy(buf + len, sep); + DIR *d = opendir(buf); + if (!d) return 0; + struct dirent * de; + while ((de = readdir(d))) { + int len = strlen(de->d_name); + if ((len > 4 && strcmp(de->d_name + len - 4, ".dic") == 0) || + (len > 7 && strcmp(de->d_name + len - 7, ".dic.hz") == 0)) { + char * s = mystrdup(de->d_name); + s[len - ((s[len - 1] == 'z') ? 7 : 4)] = '\0'; + fprintf(stderr, "%s%s\n", buf, s); + free(s); + } + } + closedir(d); + return 1; +} +#endif + +// search existing path for file "name + ext" +char * search(char * begin, char * name, const char * ext) { + char * end = begin; + while (1) { + while (!((*end == *PATHSEP) || (*end == '\0'))) end++; + char * res = NULL; + if (name) { + res = exist2(begin, end - begin, name, ext); + } else { +#ifndef WIN32 + listdicpath(begin, end - begin); +#endif + } + if ((*end == '\0') || res) return res; + end++; + begin = end; + } +} + +int main(int argc, char** argv) +{ + char buf[MAXLNLEN]; + Hunspell * pMS[DMAX]; + char * key = NULL; + int arg_files = -1; // first filename argumentum position in argv + int format = FMT_TEXT; + int argstate = 0; + +#ifdef ENABLE_NLS +# ifdef HAVE_LOCALE_H + setlocale(LC_ALL, ""); + textdomain("hunspell"); +# ifdef HAVE_LANGINFO_CODESET + ui_enc = nl_langinfo(CODESET); +# endif +# endif +#endif + +#ifdef HAVE_READLINE + rl_set_key("", rl_escape, rl_get_keymap()); + rl_bind_key('\t', rl_insert); +#endif + +#ifdef LOG + log("START"); +#endif + + for(int i=1; i<argc; i++) { +#ifdef LOG + log(argv[i]); +#endif + + if (argstate == 1) { + if (dicname) free(dicname); + dicname = mystrdup(argv[i]); + argstate = 0; + } else if (argstate == 2) { + if (privdicname) free(privdicname); + privdicname = mystrdup(argv[i]); + argstate = 0; + } else if (argstate == 3) { + io_enc = argv[i]; + argstate = 0; + } else if (argstate == 4) { + key = argv[i]; + argstate = 0; + } else if (strcmp(argv[i],"-d")==0) argstate=1; + else if (strcmp(argv[i],"-p")==0) argstate=2; + 
else if (strcmp(argv[i],"-i")==0) argstate=3; + else if (strcmp(argv[i],"-P")==0) argstate=4; + else if ((strcmp(argv[i],"-h") == 0) || (strcmp(argv[i],"--help") == 0)) { + fprintf(stderr,gettext("Usage: hunspell [OPTION]... [FILE]...\n")); + fprintf(stderr,gettext("Check spelling of each FILE. Without FILE, check standard input.\n\n")); + fprintf(stderr,gettext(" -1\t\tcheck only first field in lines (delimiter = tabulator)\n")); + fprintf(stderr,gettext(" -a\t\tIspell's pipe interface\n")); + fprintf(stderr,gettext(" --check-url\tCheck URLs, e-mail addresses and directory paths\n")); + fprintf(stderr,gettext(" -d d[,d2,...]\tuse d (d2 etc.) dictionaries\n")); + fprintf(stderr,gettext(" -D\t\tshow available dictionaries\n")); + fprintf(stderr,gettext(" -G\t\tprint only correct words or lines\n")); + fprintf(stderr,gettext(" -h, --help\tdisplay this help and exit\n")); + fprintf(stderr,gettext(" -H\t\tHTML input file format\n")); + fprintf(stderr,gettext(" -i enc\tinput encoding\n")); + fprintf(stderr,gettext(" -l\t\tprint misspelled words\n")); + fprintf(stderr,gettext(" -L\t\tprint lines with misspelled words\n")); + fprintf(stderr,gettext(" -m \t\tanalyze the words of the input text\n")); + fprintf(stderr,gettext(" -n\t\tnroff/troff input file format\n")); + fprintf(stderr,gettext(" -p dict\tset dict custom dictionary\n")); + fprintf(stderr,gettext(" -r\t\twarn of the potential mistakes (rare words)\n")); + fprintf(stderr,gettext(" -P password\tset password for encrypted dictionaries\n")); + fprintf(stderr,gettext(" -s \t\tstem the words of the input text\n")); + fprintf(stderr,gettext(" -t\t\tTeX/LaTeX input file format\n")); +// experimental functions: missing Unicode support +// fprintf(stderr,gettext(" -u\t\tshow typical misspellings\n")); +// fprintf(stderr,gettext(" -u2\t\tprint typical misspellings in sed format\n")); +// fprintf(stderr,gettext(" -u3\t\tprint typical misspellings in gcc error format\n")); +// fprintf(stderr,gettext(" -U\t\tautomatic 
correction of typical misspellings to stdout\n")); + fprintf(stderr,gettext(" -v, --version\tprint version number\n")); + fprintf(stderr,gettext(" -vv\t\tprint Ispell compatible version number\n")); + fprintf(stderr,gettext(" -w\t\tprint misspelled words (= lines) from one word/line input.\n\n")); + fprintf(stderr,gettext("Example: hunspell -d en_US file.txt # interactive spelling\n" + " hunspell -l file.txt # print misspelled words\n" + " hunspell -i utf-8 file.txt # check UTF-8 encoded file\n\n")); + fprintf(stderr,gettext("Bug reports: http://hunspell.sourceforge.net\n")); + exit(0); + } else if ((strcmp(argv[i],"-vv")==0) || (strcmp(argv[i],"-v")==0) || (strcmp(argv[i],"--version")==0)) { + fprintf(stdout,gettext(HUNSPELL_PIPE_HEADING)); + fprintf(stdout,"\n"); + if (strcmp(argv[i],"-vv")!=0) { + fprintf(stdout,gettext("\nCopyright (C) 2002-2008 L\303\241szl\303\263 N\303\251meth. License: MPL/GPL/LGPL.\n\n" + "Based on OpenOffice.org's Myspell library.\n" + "Myspell's copyright (C) Kevin Hendricks, 2001-2002, License: BSD.\n\n")); + fprintf(stdout,gettext("This is free software; see the source for copying conditions. There is NO\n" + "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE,\n" + "to the extent permitted by law.\n")); + } + exit(0); + } else if ((strcmp(argv[i],"-a")==0)) { + filter_mode = PIPE; + fprintf(stdout,gettext(HUNSPELL_PIPE_HEADING)); + fflush(stdout); + } else if ((strcmp(argv[i],"-m")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: Make possible root/affix combinations that aren't in the dictionary. + hunspell: Analyze the words of the input text + */ + if (filter_mode != PIPE) + filter_mode = ANALYZE; + } else if ((strcmp(argv[i],"-s")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: Stop itself with a SIGTSTP signal after each line of input. 
+ hunspell: Stem the words of the input text + */ + if (filter_mode != PIPE) + filter_mode = STEM; + } else if ((strcmp(argv[i],"-t")==0)) { + format = FMT_LATEX; + } else if ((strcmp(argv[i],"-n")==0)) { + format = FMT_MAN; + } else if ((strcmp(argv[i],"-H")==0)) { + format = FMT_HTML; + } else if ((strcmp(argv[i],"-l")==0)) { + filter_mode = BADWORD; + } else if ((strcmp(argv[i],"-w")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: Specify additional characters that can be part of a word. + hunspell: Print misspelled words (= lines) from one word/line input + */ + if (filter_mode != PIPE) + filter_mode = WORDFILTER; + } else if ((strcmp(argv[i],"-L")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: Number of lines of context to be shown at the bottom of the screen + hunspell: Print lines with misspelled words + */ + if (filter_mode != PIPE) + filter_mode = BADLINE; + } else if ((strcmp(argv[i],"-u")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: None + hunspell: Show typical misspellings + */ + if (filter_mode != PIPE) + filter_mode = AUTO0; + } else if ((strcmp(argv[i],"-U")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: None + hunspell: Automatic correction of typical misspellings to stdout + */ + if (filter_mode != PIPE) + filter_mode = AUTO; + } else if ((strcmp(argv[i],"-u2")==0)) { + /* + if -a was used, don't override, i.e. keep ispell compatability + ispell: None + hunspell: Print typical misspellings in sed format + */ + if (filter_mode != PIPE) + filter_mode = AUTO2; + } else if ((strcmp(argv[i],"-u3")==0)) { + /* + if -a was used, don't override, i.e. 
keep ispell compatability + ispell: None + hunspell: Print typical misspellings in gcc error format + */ + if (filter_mode != PIPE) + filter_mode = AUTO3; + } else if ((strcmp(argv[i],"-G")==0)) { + printgood = 1; + } else if ((strcmp(argv[i],"-1")==0)) { + format = FMT_FIRST; + } else if ((strcmp(argv[i],"-D")==0)) { + showpath = 1; + } else if ((strcmp(argv[i],"-r")==0)) { + warn = 1; +fprintf(stderr, "BEKAPCS"); + } else if ((strcmp(argv[i],"--check-url")==0)) { + checkurl = 1; + } else if ((arg_files==-1) && ((argv[i][0] != '-') && (argv[i][0] != '\0'))) { + arg_files = i; + if (! exist(argv[i])) { // first check (before time-consuming dic. load) + fprintf(stderr,gettext("Can't open %s.\n"),argv[i]); +#ifdef HAVE_CURSES_H + endwin(); +#endif + exit(1); + } + } + } + + if (printgood && (filter_mode == NORMAL)) filter_mode = BADWORD; + + if (! dicname) { + if (! (dicname=getenv("DICTIONARY"))) { + /* + * Search in order of LC_ALL, LC_MESSAGES & + * LANG + */ + const char *tests[] = { "LC_ALL", "LC_MESSAGES", "LANG" }; + for (size_t i = 0; i < sizeof(tests) / sizeof(const char*); ++i) { + if ((dicname=getenv(tests[i])) && strcmp(dicname, "") != 0) { + dicname = mystrdup(dicname); + char * dot = strchr(dicname, '.'); + if (dot) *dot = '\0'; + char * at = strchr(dicname, '@'); + if (at) *at = '\0'; + break; + } + } + + if (dicname && ((strcmp(dicname, "C") == 0) || (strcmp(dicname, "POSIX") == 0))) { + free(dicname); + dicname=mystrdup("en_US"); + } + + if (! 
dicname) { + dicname=mystrdup(DEFAULTDICNAME); + } + } else { + dicname = mystrdup(dicname); + } + } + path = add(mystrdup("."), PATHSEP); // <- check path in local directory + path = add(path, PATHSEP); // <- check path in root directory + if (getenv("DICPATH")) path = add(add(path, getenv("DICPATH")), PATHSEP); + path = add(add(path, LIBDIR), PATHSEP); + if (HOME) path = add(add(add(add(path, HOME), DIRSEP), USEROOODIR), PATHSEP); + path = add(path, OOODIR); + + if (showpath) { + fprintf(stderr, gettext("SEARCH PATH:\n%s\n"), path); + fprintf(stderr, gettext("AVAILABLE DICTIONARIES (path is not mandatory for -d option):\n")); + search(path, NULL, NULL); + } + + if (!privdicname) privdicname = mystrdup(getenv("WORDLIST")); + + char * dicplus = strchr(dicname, ','); + if (dicplus) *dicplus = '\0'; + char * aff = search(path, dicname, ".aff"); + char * dic = search(path, dicname, ".dic"); + if (aff && dic) { + if (showpath) { + fprintf(stderr, gettext("LOADED DICTIONARY:\n%s\n%s\n"), aff, dic); + } + pMS[0] = new Hunspell(aff, dic, key); + dic_enc[0] = pMS[0]->get_dic_encoding(); + dmax = 1; + if (pMS[0] && dicplus) while (dicplus) { + char * dicname2 = dicplus + 1; + dicplus = strchr(dicname2, ','); + if (dicplus) *dicplus = '\0'; + free(aff); + free(dic); + aff = search(path, dicname2, ".aff"); + dic = search(path, dicname2, ".dic"); + if (aff && dic) { + if (dmax < DMAX) { + pMS[dmax] = new Hunspell(aff, dic, key); + dic_enc[dmax] = pMS[dmax]->get_dic_encoding(); + dmax++; + } else fprintf(stderr, gettext("error - %s exceeds dictionary limit.\n"), dicname2); + } else if (dic) pMS[dmax-1]->add_dic(dic); + } + } else { + fprintf(stderr,gettext("Can't open affix or dictionary files for dictionary named \"%s\".\n"), dicname); + exit(1); + } + + /* open the private dictionaries */ + if (HOME) { + strcpy(buf,HOME); +#ifndef WIN32 + strcat(buf,"/"); +#endif + if (!privdicname) { + strcat(buf,DICBASENAME); + strcat(buf,basename(dicname,DIRSEPCH)); + load_privdic(buf, 
pMS[0]); + strcpy(buf,DICBASENAME); + strcat(buf,basename(dicname,DIRSEPCH)); + load_privdic(buf, pMS[0]); + } else { + strcat(buf,privdicname); + load_privdic(buf, pMS[0]); + strcpy(buf,privdicname); + load_privdic(buf, pMS[0]); + } + } + + if (arg_files==-1) { + pipe_interface(pMS, format, stdin); + } else if (filter_mode != NORMAL) { + for (int i = arg_files; i < argc; i++) { + if (exist(argv[i])) { + modified = 0; + currentfilename = argv[i]; + FILE * f = fopen(argv[i], "r"); + pipe_interface(pMS, format, f); + fclose(f); + } else { + fprintf(stderr, gettext("Can't open %s.\n"), argv[i]); + exit(1); + } + } + } else if (filter_mode == NORMAL) { +#ifdef HAVE_CURSES_H + initscr(); + cbreak(); + noecho(); + nonl(); + intrflush(stdscr,FALSE); + + for (int i = arg_files; i < argc; i++) { + if (exist(argv[i])) { + modified = 0; + interactive_interface(pMS, argv[i], format); + } else { + fprintf(stderr, gettext("Can't open %s.\n"), argv[i]); + endwin(); + exit(1); + } + } + + clear(); + refresh(); + endwin(); +#else + fprintf(stderr, gettext("Hunspell has been compiled without Ncurses user interface.\n")); +#endif + } + + if (dicname) free(dicname); + if (privdicname) free(privdicname); + if (path) free(path); + if (aff) free(aff); + if (dic) free(dic); + if (wordchars) free(wordchars); + if (wordchars_utf16_free) free(wordchars_utf16); +#ifdef HAVE_ICONV + free_utf_tbl(); +#endif + for (int i = 0; i < dmax; i++) delete pMS[i]; + return 0; +} diff --git a/src/tools/hunzip.cxx b/src/tools/hunzip.cxx new file mode 100644 index 0000000..5d1581d --- /dev/null +++ b/src/tools/hunzip.cxx @@ -0,0 +1,22 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "hunzip.hxx" + +#define DESC "hunzip - decompress a hzip file to the standard output\n" \ +"Usage: hunzip file.hz [password]\n" + +int fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return 1; +} + +int main(int argc, char** argv) { + Hunzip * h; + const char * s; + if (argc 
== 1 || strcmp(argv[1], "-h") == 0) return fail(DESC, NULL); + h = new Hunzip(argv[1], (argc > 2) ? argv[2] : NULL); + while (h && (s = h->getline())) printf("%s", s); + return 0; +} diff --git a/src/tools/hzip.c b/src/tools/hzip.c new file mode 100644 index 0000000..cf760e8 --- /dev/null +++ b/src/tools/hzip.c @@ -0,0 +1,325 @@ +/* hzip: file compression for sorted dictionaries with optional encryption, + * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define CODELEN 65536 +#define BUFSIZE 65536 +#define EXTENSION ".hz" + +#define ESCAPE 31 +#define MAGIC "hz0" +#define MAGIC_ENCRYPTED "hz1" + +#define DESC "hzip - dictionary compression utility\n" \ +"Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \ +" -P password encrypted compression\n" \ +" -h display this help and exit\n" + +enum { code_LEAF, code_TERM, code_NODE}; + +struct item { + unsigned short word; + int count; + char type; + struct item * left; + struct item * right; +}; + +int fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return 1; +} + +void code2table(struct item * tree, char **table, char * code, int deep) { + int first = 0; + if (!code) { + first = 1; + code = malloc(CODELEN); + } + code[deep] = '1'; + if (tree->left) code2table(tree->left, table, code, deep + 1); + if (tree->type != code_NODE) { + int i = tree->word; + code[deep] = '\0'; + if (tree->type == code_TERM) i = CODELEN; /* terminal code */ + table[i] = malloc(deep + 1); + strcpy(table[i], code); + } + code[deep] = '0'; + if (tree->right) code2table(tree->right, table, code, deep + 1); + if (first) free(code); +} + +struct item * newitem(int c, struct item * l, struct item * r, int t) { + struct item * ni = (struct item *) malloc(sizeof(struct item)); + ni->type = t; + ni->word = 0; + ni->count = c; + ni->left = l; + ni->right = r; + return ni; +} + +/* return length of the freq array */ +int get_freqdata(struct item 
*** dest, FILE * f, unsigned short * termword) { + int freq[CODELEN]; + int i, j, k, n; + union { + char c[2]; + unsigned short word; + } u; + for (i = 0; i < CODELEN; i++) freq[i] = 0; + while((j = getc(f)) != -1 && (k = getc(f)) != -1) { + u.c[0] = j; + u.c[1] = k; + freq[u.word]++; + } + if (j != -1) { + u.c[0] = 1; + u.c[1] = j; + } else { + u.c[0] = 0; + u.c[1] = 0; + } + + *dest = (struct item **) malloc((CODELEN + 1) * sizeof(struct item *)); + if (!*dest) return -1; + for (i = 0, n = 0; i < CODELEN; i++) if (freq[i]) { + (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF); + (*dest)[n]->word = i; + n++; + } + /* terminal sequence (also contains the last odd byte of the file) */ + (*dest)[n] = newitem(1, NULL, NULL, code_TERM); + *termword = u.word; + return n + 1; +} + +void get_codetable(struct item **l, int n, char ** table) { + int i; + while (n > 1) { + int min = 0; + int mi2 = 1; + for (i = 1; i < n; i++) { + if (l[i]->count < l[min]->count) { + mi2 = min; + min = i; + } else if (l[i]->count < l[mi2]->count) mi2 = i; + } + l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE); + for (i = mi2 + 1; i < n; i++) l[i - 1] = l[i]; + n--; + } + code2table(l[0], table, NULL, 0); +} + +int write_bits(FILE *f, char * bitbuf, int *bits, char * code) { + while (*code) { + int b = (*bits) % 8; + if (!b) bitbuf[(*bits) / 8] = ((*code) - '0') << 7; + else bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b)); + (*bits)++; + code++; + if (*bits == BUFSIZE * 8) { + if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f)) + return 1; + *bits = 0; + } + } + return 0; +} + +int encode_file(char ** table, int n, FILE *f, FILE *f2, unsigned short tw, char * key) { + char bitbuf[BUFSIZE]; + int i, bits = 0; + unsigned char cl, ch; + int cx[2]; + union { + char c[2]; + unsigned short word; + } u; + char * enc = key; + + /* header and codes */ + fprintf(f2, "%s", (key ? 
MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */ + cl = (unsigned char) (n & 0x00ff); + ch = (unsigned char) (n >> 8); + if (key) { + unsigned char cs; + for (cs = 0; *enc; enc++) cs ^= *enc; + fprintf(f2, "%c", cs); /* 1-byte check sum */ + enc = key; + ch ^= *enc; + if ((*(++enc)) == '\0') enc = key; + cl ^= *enc; + } + fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */ + for (i = 0; i < BUFSIZE; i++) bitbuf[i] = '\0'; + for (i = 0; i < CODELEN + 1; i++) if (table[i]) { + int nmemb; + u.word = (unsigned short) i; + if (i == CODELEN) u.word = tw; + if (key) { + if (*(++enc) == '\0') enc = key; + u.c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + u.c[1] ^= *enc; + } + fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */ + bits = 0; + if (write_bits(f2, bitbuf, &bits, table[i]) != 0) + return 1; + if (key) { + if (*(++enc) == '\0') enc = key; + fprintf(f2, "%c", ((unsigned char) bits) ^ *enc); + for (cl = 0; cl <= bits/8; cl++) { + if (*(++enc) == '\0') enc = key; + bitbuf[cl] ^= *enc; + } + } else + fprintf(f2, "%c", (unsigned char) bits); /* 1-byte code length */ + nmemb = bits/8 + 1; + if (fwrite(bitbuf, 1, bits/8 + 1, f2) != nmemb) /* x-byte code */ + return 1; + } + + /* file encoding */ + bits = 0; + while((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) { + u.c[0] = cx[0]; + u.c[1] = cx[1]; + if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0) + return 1; + } + /* terminal suffixes */ + if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0) + return 1; + if (bits > 0) + { + int nmemb = bits/8 + 1; + if (fwrite(bitbuf, 1, nmemb, f2) != nmemb) + return 1; + } + return 0; +} + +int prefixcompress(FILE *f, FILE *tempfile) { + char buf[BUFSIZE]; + char buf2[BUFSIZE * 2]; + char prev[BUFSIZE]; + int prevlen = 0; + while(fgets(buf,BUFSIZE,f)) { + int i, j, k, m, c=0; + int pfx = prevlen; + char * p = buf2; + m = j = 0; + for (i = 0; buf[i]; i++) { + if ((pfx > 0) && (buf[i] == prev[i])) { + j++; + } else pfx = 0; + } + if (i > 0 
&& buf[i - 1] == '\n') { + if (j == i) j--; /* line duplicate */ + if (j > 29) j = 29; + c = j; + if (c == '\t') c = 30; + /* common suffix */ + for (; buf[i - m - 2] == prev[prevlen - m - 2] && + m < i - j - 1 && m < 15; m++); + if (m == 1) m = 0; + } else { + j = 0; + m = -1; + } + for (k = j; k < i - m - 1; k++, p++) { + if (((unsigned char) buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') { + *p = ESCAPE; + p++; + } + *p = buf[k]; + } + if (m > 0) { + *p = m + 31; /* 33-46 */ + p++; + } + if (i > 0 && buf[i - 1] == '\n') { + size_t nmemb = p - buf2 + 1; + *p = c; + if (fwrite(buf2, 1, nmemb, tempfile) != nmemb) + return 1; + } else { + size_t nmemb = p - buf2; + if (fwrite(buf2, 1, nmemb, tempfile) != nmemb) + return 1; + } + memcpy(prev, buf, i); + prevlen = i; + } + return 0; +} + +int hzip(const char * filename, char * key) { + struct item ** list; + char * table[CODELEN + 1]; + int n; + char out[BUFSIZE]; + FILE *f, *f2, *tempfile; + unsigned short termword; + strcpy(out, filename); + strcat(out, EXTENSION); + f = fopen(filename, "r"); + if (!f) return fail("hzip: %s: Permission denied\n", filename); + tempfile = tmpfile(); + if (!tempfile) { + fclose(f); + return fail("hzip: cannot create temporary file\n", NULL); + } + f2 = fopen(out, "wb"); + if (!f2) { + fclose(tempfile); + fclose(f); + return fail("hzip: %s: Permission denied\n", out); + } + for (n = 0; n < CODELEN; n++) table[n] = NULL; + if (prefixcompress(f, tempfile) != 0) { + fclose(f2); + fclose(tempfile); + fclose(f); + return fail("hzip: cannot write file\n", NULL); + } + rewind(tempfile); + n = get_freqdata(&list, tempfile, &termword); + get_codetable(list, n, table); + rewind(tempfile); + n = encode_file(table, n, tempfile, f2, termword, key); + fclose(f2); + fclose(tempfile); + fclose(f); + if (n != 0) return fail("hzip: cannot write file\n", NULL); + return n; +} + +int main(int argc, char** argv) { + + int i, j = 0; + char * key = NULL; + for (i = 1; i < argc; i++) { + if (*(argv[i]) == 
'-') { + if (*(argv[i] + 1) == 'h') + return fail(DESC, NULL); + if (*(argv[i] + 1) == 'P') { + if (i + 1 == argc) + return fail("hzip: missing password\n", NULL); + key = argv[i + 1]; + i++; + continue; + } + return fail("hzip: no such option: %s\n", argv[i]); + } else if (hzip(argv[i], key) != 0) return 1; else j = 1; + } + if (j == 0) return fail("hzip: need a filename parameter\n", NULL); + return 0; +} diff --git a/src/tools/ispellaff2myspell b/src/tools/ispellaff2myspell new file mode 100644 index 0000000..5d60c09 --- /dev/null +++ b/src/tools/ispellaff2myspell @@ -0,0 +1,472 @@ +#!/usr/bin/perl -w +# -*- coding: iso-8859-1 -*- +# $Id: ispellaff2myspell,v 1.2 2010/02/23 12:05:51 caolan Exp $ +# +# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + +sub usage { + print "ispellaff2myspell: A program to convert ispell affix tables to myspell format +(C) 2002-2005 Agustin Martin Domingo <agustin.martin\@hispalinux.es> License: GPL + +Usage: + ispellaff2myspell [options] <affixfile> + + Options: + --affixfile=s Affix file + --bylocale Use current locale setup for upper/lowercase + conversion + --charset=s Use specified charset for upper/lowercase + conversion (defaults to latin1) + --debug Print debugging info + --extraflags Allow some non alphabetic flags + --lowercase=s Lowercase string + --myheader=s Header file + --printcomments Print commented lines in output + --replacements=s Replacements file + --split=i Split flags with more that i entries + --uppercase=s Uppercase string + --wordlist=s Still unused + + Currently allowed valued for charset are: latin1, latin2, latin3 + +This script does not create the dict file. Something like + +( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict + +should do the work, with mydict.words+ being the ispell munched wordlist + +"; + exit; +} + +sub debugprint { + if ( $debug ){ + print STDERR "@_"; + } +} + +sub shipoutflag{ + my $flag_entries=scalar @flag_array; + + if ( $flag_entries != 0 ){ + if ( $split ){ + while ( @flag_array ){ + my @flag_subarray=splice(@flag_array,0,$split); + my $subflag_entries=scalar @flag_subarray; + if ( scalar @flag_array ){ + print "$myaffix $flagname $flagcombine $subflag_entries S\n"; + } else { + print "$myaffix $flagname $flagcombine $subflag_entries\n"; + } + print join("\n",@flag_subarray); + print "\n\n"; + } + } else { + print "$myaffix $flagname $flagcombine $flag_entries\n"; + print join("\n",@flag_array); + print "\n\n"; + } + } + @flag_array=(); + $flagname=''; + $flagcombine=''; +} + +sub mylc{ + my $inputstring=shift; + my $outputstring; + + if ( $bylocale ){ + { + use locale; + $outputstring = lc $inputstring; + } + } else { + if ( $charset eq "latin0" ){ + 
$lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ½¨¸'; + $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ¼¦´'; + } elsif ( $charset eq "latin1" ){ + $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ'; + $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'; + } elsif ( $charset eq "latin2" ){ + $lowercase='a-z±³µ¶¹º»¼¾¿àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ'; + $uppercase='A-Z¡£¥¦©ª«¬®¯ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'; + } elsif ( $charset eq "latin3" ){ + $lowercase='a-z±¶¹º»¼¿àáâäåæçèéêëìíîïñòóôõö÷øùúûüýþ'; + $uppercase='A-Z¡¦©ª«¬¯ÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖ×ØÙÚÛÜÝÞ'; +# } elsif ( $charset eq "other_charset" ){ +# die "latin2 still unimplemented"; + } else { + if ( not $lowercase and not $uppercase ){ + die "Unsupported charset [$charset] + +Explicitly use --lowercase=string and --uppercase=string +options. Remember that both string must match exactly, but +case changed. +"; + } + } + $outputstring=$inputstring; + eval "\$outputstring=~tr/$uppercase/$lowercase/"; + } + return $outputstring; +} + +sub validate_flag (){ + my $flag = shift; + if ($flag=~m/[a-zA-Z]+/){ + return $flag; + } elsif ( $hasextraflags ){ + foreach ( keys %theextraflags ){ + if ($flag =~ m/^$_/){ + $flag =~ s/^$_//; + return $flag; + } + } + } + return ''; +} + +sub process_replacements{ + my $file = shift; + my @replaces = (); + + open (REPLACE,"< $file") || + die "Error: Could not open replacements file: $file\n"; + while (<REPLACE>){ + next unless m/^REP[\s\t]*\D.*/; + next if m/^REP\s+[0-9]+/; + s/\015\012//; + s/\015//; + chomp; + push @replaces, $_; + } + close REPLACE; + my $number = scalar @replaces; + print "REP $number\n"; + foreach ( @replaces ){ + print $_ . 
"\n"; + } +} + +# ----------------------------------------------------------- +# Now the progran start, after the functions are defined +# ----------------------------------------------------------- + +use Getopt::Long; + +# Initializing option values +$affixfile = ''; +$bylocale = ''; +$charset = ''; +$debug = ''; +$lowercase = ''; +$myheader = ''; +$printcomments = ''; +$replacements = ''; +$split = ''; +$uppercase = ''; +$wordlist = ''; +$hasextraflags = ''; +@flag_array = (); +%theextraflags = (); +# Initializing root values +$rootremove = "0"; +$rootname = ''; +$addtoroot = ''; +$comment = ''; +# Initializing flag values +$flagname = ''; +$flagcombine = ''; +$inflags = ''; + +GetOptions ('affixfile=s' => \$affixfile, + 'bylocale' => \$bylocale, + 'charset=s' => \$charset, + 'debug' => \$debug, + 'extraflags:s' => sub { + $hasextraflags = 1; + shift; + $theflag = shift; + $theextraflags{$theflag}++ if $theflag}, + 'lowercase=s' => \$lowercase, + 'myheader=s' => \$myheader, + 'printcomments' => \$printcomments, + 'replacements=s'=> \$replacements, + 'split=i' => \$split, + 'uppercase=s' => \$uppercase, + 'wordlist=s' => \$wordlist) or usage; + +if ( not $affixfile ){ + $affixfile=shift or usage; +} + +if ( $charset and ( $lowercase or $uppercase )){ + die "Error: charset and lowercase/uppercase options +are incompatible. Use either charset or lowercase/uppercase options to +specify the patterns +" +} elsif ( not $lowercase and not $uppercase and not $charset ){ + $charset="latin1"; +} + +if ( scalar(keys %theextraflags) == 0 && $hasextraflags ){ + $theextraflags{"\\\\"}++; +} + +debugprint "$affixfile $charset"; + +open (AFFIXFILE,"< $affixfile") || + die "Error: Could not open affix file: $affixfile"; + +if ( $myheader ){ + my $myspell_header=`cat $myheader`; + print $myspell_header . 
"\n"; +} + +while (<AFFIXFILE>){ + chomp; + if (/^\s*\#.*/){ + debugprint "Ignoring line $.\n"; + print "$_\n" if $printcomments; + } elsif (/^\s*$/){ + debugprint "Ignoring line $.\n"; + } elsif (/^\s*prefixes/){ + debugprint "Prefixes starting in line $.\n"; + $affix="PFX"; + } elsif (/^\s*suffixes/){ + debugprint "Suffixes starting in line $.\n"; + $affix="SFX"; + } elsif (/^[\s\t]*flag.*/){ + next if not $affix; # In case we are still in the preamble + shipoutflag if $inflags; + $inflags="yes"; + s/^[\s\t]*flag[\s\t]*//; + s/[\s\t]*:.*$//; + debugprint "Found flag $_ in line $.\n"; + + if (/\*/){ + s/[\*\s]//g; + $flagcombine="Y"; + debugprint "Flag renamed to $_ with combine=$flagcombine\n"; + } else { + $flagcombine="N"; + } + + if ( $flagname = &validate_flag($_) ){ + $myaffix = $affix; + } else { + $myaffix = "\# $affix"; + $flagname = $_; + print STDERR "Ignoring invalid flag $flagname in line $.\n"; + } + } elsif ( $affix and $inflags ) { + ($rootname,@comments) = split('#',$_); + $comment = '# ' . join('#',@comments); + + $rootname =~ s/\s*//g; + $rootname = mylc $rootname; + ($rootname,$addtoroot) = split('>',$rootname); + + if ( $addtoroot =~ s/^\-//g ){ + ($rootremove,$addtoroot) = split(',',$addtoroot); + $addtoroot = "0" unless $addtoroot; + $addtoroot = "0" if ( $addtoroot eq "-"); + } else { + $rootremove = "0"; + } + $addtoroot =~ s/\\\-/\-/g; # prefix ANTI\- to anti- + + if ( $rootname eq '.' 
&& $rootremove ne "0" ){ + $rootname = $rootremove; + } + + debugprint "$rootname, $addtoroot, $rootremove\n"; + if ( $printcomments ){ + $affix_line=sprintf("%s %s %-5s %-11s %-24s %s", + $myaffix, $flagname, $rootremove, + $addtoroot, $rootname, $comment); + } else { + $affix_line=sprintf("%s %s %-5s %-11s %s", + $myaffix, $flagname, $rootremove, + $addtoroot, $rootname); + } + $rootremove = "0"; + $rootname = ''; + $addtoroot = ''; + $comment = ''; + @comments = (); + push @flag_array,$affix_line; + debugprint "$affix_line\n"; + } else { + # + } +} +shipoutflag; + +close AFFIXFILE; + +if ( $replacements ){ + &process_replacements($replacements); +} + +__END__ + +=head1 NAME + +B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format. + +=head1 SYNOPSIS + + ispellaff2myspell [options] <affixfile> --myheader your_header + + Options: + + --affixfile=s Affix file + --bylocale Use current locale setup for upper/lowercase + conversion + --charset=s Use specified charset for upper/lowercase + conversion (defaults to latin1) + --debug Print debugging info + --extraflags=s Allow some non alphabetic flags + --lowercase=s Lowercase string + --myheader=s Header file + --printcomments Print commented lines in output + --replacements=s Replacements file + --split=i Split flags with more that i entries + --uppercase=s Uppercase string + +=head1 DESCRIPTION + +B<ispellaff2myspell> is a script that will convert ispell affix tables +to myspell format in a more or less successful way. + +This script does not create the dict file. Something like + +( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict + +should do the work, with mydict.words+ being the munched wordlist + +=head1 OPTIONS + +=over 8 + +=item B<--affixfile=s> + +Affix file. You can put it directly in the command line. + +=item B<--bylocale> + +Use current locale setup for upper/lowercase conversion. 
Make sure +that the selected locale matches the dictionary one, or you might get +into trouble. + +=item B<--charset=s> + +Use specified charset for upper/lowercase conversion (defaults to latin1). +Currently allowed values for charset are: latin0, latin1, latin2, latin3. + +=item B<--debug> + +Print some debugging info. + +=item B<--extraflags:s> + +Allows some non alphabetic flags. + +When invoked with no value the supported flags are currently those +corresponding to chars represented with the escape char B<\> as +first char. B<\> will be stripped. + +When given with the flag prefix will allow that flag and strip the +given prefix. Be careful when giving the prefix to properly escape chars, +e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to +B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all +flags and pass them unmodified. + +You will need a call to -e for each flag type, e.g., +B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>). + +When a prefix is explicitly set, the default value (anything starting with B<\>) +is disabled and you need to enable it explicitly as in previous example. + +=item B<--lowercase=s> + +Lowercase string. Manually set the string of lowercase chars. This +requires B<--uppercase> having exactly that string but uppercase. + +=item B<--myheader=s> + +Header file. The myspell aff header. You need to write it +manually. This can contain everything you want to be before the affix table + +=item B<--printcomments> + +Print commented lines in output. + +=item B<--replacements=file> + +Add a pre-defined replacements table taken from 'file' to the .aff file. +Will skip lines not beginning with REP, and set the replacements number +appropriately. + +=item B<--split=i> + +Split flags with more than i entries. This can be of interest for flags +having a lot of entries. Will split the flag in chunks containing B<i> +entries. + +=item B<--uppercase=s> + +Uppercase string.
Manually set the string of uppercase chars. This +requires B<--lowercase> having exactly that string but lowercase. + +=back + +If your encoding is currently unsupported you can send me a file with +the two strings of lower and uppercase chars. Note that they must match +exactly but case changed. It will look something like + + $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ'; + $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'; + +=head1 SEE ALSO + +The OpenOffice.org Lingucomponent Project home page + +L<http://lingucomponent.openoffice.org/index.html> + +and the document + +L<http://lingucomponent.openoffice.org/affix.readme> + +that provides information about the basics of the myspell affix file format. + +You can also take a look at + + /usr/share/doc/libmyspell-dev/affix.readme.gz + /usr/share/doc/libmyspell-dev/README.compoundwords + /usr/share/doc/libmyspell-dev/README.replacetable + +in your Debian system. + +=head1 AUTHORS + +Agustin Martin <agustin.martin@hispalinux.es> + +=cut diff --git a/src/tools/makealias b/src/tools/makealias new file mode 100755 index 0000000..60d93b6 --- /dev/null +++ b/src/tools/makealias @@ -0,0 +1,115 @@ +#!/bin/sh +# makealias: make alias compressed dic and aff files +# Usage: makealias file.dic file.aff (not makealias file.aff file.dic!)
+# Version: 2007-10-26 + +case $# in +0|1) +echo 'makealias: make alias compressed dic and aff files +Usage: makealias file.dic file.aff (not makefile file.aff file.dic!)' >/dev/stderr +exit;; +esac + +DIC=`basename $1 .dic` +AFF=`basename $2 .aff` + +# FLAG type definition must be before alias definitions +grep '^FLAG' $2 >"${AFF}_alias.aff" + +awk 'BEGIN{n=1;m=1} +function cutslash(st) { + if (split(st,t,"/") > 1) return t[1] + return st +} +function ltrim(st) { + sub(/^ +/,"",st) + return st +} +FILENAME ~ /.dic$/ && $1 ~ "/[^ \t]" { + split($1,t,"/") + if(!a[t[2]]){ + a[t[2]]=n + b[n]=t[2] + n++ + } + if (NF > 1) { + $1 = "" + if(!a2[$0]){ + a2[$0]=m + c[m]=$0 + m++ + } + print t[1]"/"a[t[2]] "\t" a2[$0] + } else { + print t[1]"/"a[t[2]] + } + next +} +FILENAME ~ /.dic$/ && NF > 1 { + x = $1 + $1 = "" + if(!a2[$0]){ + a2[$0]=m + c[m]=$0 + m++ + } + print cutslash(x) "\t" a2[$0] + next +} +FILENAME ~ /.dic$/ { print cutslash($1) } +FILENAME ~ /.aff$/ && /^[PS]FX/ && ($4 ~ /\/[^ ]/) && NF > 4 { + split($4,t,"/") + if(!a[t[2]]){ + a[t[2]]=n + b[n]=t[2] + n++ + } + begin = $1 " " $2 " " $3 " " (t[1]"/"a[t[2]]) " " $5 + if ($6!="") ok = 1; else ok = 0; + $1 = "" + $2 = "" + $3 = "" + $4 = "" + $5 = "" + if(ok){ + if(!a2[$0]){ + a2[$0]=m + c[m]=$0 + m++ + } + print begin " " a2[$0] >>"/dev/stderr" + } else print begin >>"/dev/stderr" + next +} +FILENAME ~ /.aff$/ && /^[PS]FX/ && NF > 4 { + begin = $1 " " $2 " " $3 " " cutslash($4) " " $5 + if ($6!="") ok = 1; else ok = 0; + $1 = "" + $2 = "" + $3 = "" + $4 = "" + $5 = "" + if(ok) { + if (!a2[$0]){ + a2[$0]=m + c[m]=$0 + m++ + } + print begin " " a2[$0] >>"/dev/stderr" + } else print begin >>"/dev/stderr" + next +} +FILENAME ~ /.aff$/ { print $0 >>"/dev/stderr" } +END{ + if (n>1) { + print "AF", n-1 >>"'${AFF}_alias.aff'" + for(i=1;i<n;i++) print "AF", b[i],"#",i >>"'${AFF}_alias.aff'" + } + if (m>1) { + print "AM", m-1 >>"'${AFF}_alias.aff'" + for(i=1;i<m;i++) print "AM " ltrim(c[i]) >>"'${AFF}_alias.aff'" + } +}' $1 
$2 >${DIC}_alias.dic 2>${AFF}_alias.$$ +grep -v '^FLAG' ${AFF}_alias.$$ >>${AFF}_alias.aff +echo "output: ${DIC}_alias.dic, ${AFF}_alias.aff" +rm ${AFF}_alias.$$ diff --git a/src/tools/munch.c b/src/tools/munch.c new file mode 100644 index 0000000..2087efa --- /dev/null +++ b/src/tools/munch.c @@ -0,0 +1,832 @@ +/* Munch a word list and generate a smaller root word list with affixes*/ + +#include <ctype.h> +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#ifdef __linux__ +#include <error.h> +#include <errno.h> +#include <sys/mman.h> +#endif + +#include "munch.h" + +int main(int argc, char** argv) +{ + + int i, j, k, n; + int rl, p , nwl; + int al; + + FILE * wrdlst; + FILE * afflst; + + char *nword, *wf, *af; + char as[(MAX_PREFIXES + MAX_SUFFIXES)]; + char * ap; + + struct hentry * ep; + struct hentry * ep1; + struct affent * pfxp; + struct affent * sfxp; + + /* first parse the command line options */ + /* arg1 - wordlist, arg2 - affix file */ + + if (argv[1]) { + wf = mystrdup(argv[1]); + } else { + fprintf(stderr,"correct syntax is:\n"); + fprintf(stderr,"munch word_list_file affix_file\n"); + exit(1); + } + if (argv[2]) { + af = mystrdup(argv[2]); + } else { + fprintf(stderr,"correct syntax is:\n"); + fprintf(stderr,"munch word_list_file affix_file\n"); + exit(1); + } + + /* open the affix file */ + afflst = fopen(af,"r"); + if (!afflst) { + fprintf(stderr,"Error - could not open affix description file\n"); + exit(1); + } + + /* step one is to parse the affix file building up the internal + affix data structures */ + + numpfx = 0; + numsfx = 0; + + if (parse_aff_file(afflst)) { + fprintf(stderr,"Error - in affix file loading\n"); + exit(1); + } + fclose(afflst); + + fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx); + + /* affix file is now parsed so create hash table of wordlist on the fly */ + + /* open the wordlist */ + wrdlst = 
fopen(wf,"r"); + if (!wrdlst) { + fprintf(stderr,"Error - could not open word list file\n"); + exit(1); + } + + if (load_tables(wrdlst)) { + fprintf(stderr,"Error building hash tables\n"); + exit(1); + } + fclose(wrdlst); + + for (i=0; i< tablesize; i++) { + ep = &tableptr[i]; + if (ep->word == NULL) continue; + for ( ; ep != NULL; ep = ep->next) { + numroots = 0; + aff_chk(ep->word,strlen(ep->word)); + if (numroots) { + /* now there might be a number of combinations */ + /* of prefixes and suffixes that might match this */ + /* word. So how to choose? As a first shot look */ + /* for the shortest remaining root word to */ + /* to maximize the combinatorial power */ + + /* but be careful, do not REQUIRE a specific combination */ + /* of a prefix and a suffix to generate the word since */ + /* that violates the rule that the root word with just */ + /* the prefix or just the suffix must also exist in the */ + /* wordlist as well */ + + /* in fact because of the cross product issue, this not a */ + /* simple choice since some combinations of previous */ + /* prefixes and new suffixes may not be valid. 
*/ + /* The only way to know is to simply try them all */ + + rl = 1000; + p = -1; + + for (j = 0; j < numroots; j++){ + + /* first collect the root word info and build up */ + /* the potential new affix string */ + nword = (roots[j].hashent)->word; + nwl = strlen(nword); + *as = '\0'; + al = 0; + ap = as; + if (roots[j].prefix) *ap++ = (roots[j].prefix)->achar; + if (roots[j].suffix) *ap++ = (roots[j].suffix)->achar; + if ((roots[j].hashent)->affstr) { + strcpy(ap,(roots[j].hashent)->affstr); + } else { + *ap = '\0'; + } + al =strlen(as); + + /* now expand the potential affix string to generate */ + /* all legal words and make sure they all exist in the */ + /* word list */ + numwords = 0; + wlist[numwords].word = mystrdup(nword); + wlist[numwords].pallow = 0; + numwords++; + n = 0; + if (al) + expand_rootword(nword,nwl,as,al); + for (k=0; k<numwords; k++) { + if (lookup(wlist[k].word)) n++; + free(wlist[k].word); + wlist[k].word = NULL; + wlist[k].pallow = 0; + } + + /* if all exist in word list then okay */ + if (n == numwords) { + if (nwl < rl) { + rl = nwl; + p = j; + } + } + } + if (p != -1) { + ep1 = roots[p].hashent; + pfxp = roots[p].prefix; + sfxp = roots[p].suffix; + ep1->keep = 1; + if (pfxp != NULL) add_affix_char(ep1,pfxp->achar); + if (sfxp != NULL) add_affix_char(ep1,sfxp->achar); + } else { + ep->keep = 1; + } + } else { + ep->keep = 1; + } + } + } + + /* now output only the words to keep along with affixes info */ + /* first count how many words that is */ + k = 0; + for (i=0; i< tablesize; i++) { + ep = &tableptr[i]; + if (ep->word == NULL) continue; + for ( ; ep != NULL; ep = ep->next) { + if (ep->keep > 0) k++; + } + } + fprintf(stdout,"%d\n",k); + + for (i=0; i< tablesize; i++) { + ep = &tableptr[i]; + if (ep->word == NULL) continue; + for ( ; ep != NULL; ep = ep->next) { + if (ep->keep > 0) { + if (ep->affstr != NULL) { + fprintf(stdout,"%s/%s\n",ep->word,ep->affstr); + } else { + fprintf(stdout,"%s\n",ep->word); + } + } + } + } + return 0; 
+} + + +int parse_aff_file(FILE * afflst) +{ + int i, j; + int numents = 0; + char achar = '\0'; + short ff=0; + char ft; + struct affent * ptr= NULL; + struct affent * nptr= NULL; + char * line = malloc(MAX_LN_LEN); + + while (fgets(line,MAX_LN_LEN,afflst)) { + mychomp(line); + ft = ' '; + fprintf(stderr,"parsing line: %s\n",line); + if (strncmp(line,"PFX",3) == 0) ft = 'P'; + if (strncmp(line,"SFX",3) == 0) ft = 'S'; + if (ft != ' ') { + char * tp = line; + char * piece; + i = 0; + ff = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: break; + case 1: { achar = *piece; break; } + case 2: { if (*piece == 'Y') ff = XPRODUCT; break; } + case 3: { numents = atoi(piece); + ptr = malloc(numents * sizeof(struct affent)); + ptr->achar = achar; + ptr->xpflg = ff; + fprintf(stderr,"parsing %c entries %d\n",achar,numents); + break; + } + default: break; + } + i++; + } + free(piece); + } + /* now parse all of the sub entries*/ + nptr = ptr; + for (j=0; j < numents; j++) { + if (!fgets(line,MAX_LN_LEN,afflst)) return 1; + mychomp(line); + tp = line; + i = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: { if (nptr != ptr) { + nptr->achar = ptr->achar; + nptr->xpflg = ptr->xpflg; + } + break; + } + case 1: break; + case 2: { nptr->strip = mystrdup(piece); + nptr->stripl = strlen(nptr->strip); + if (strcmp(nptr->strip,"0") == 0) { + free(nptr->strip); + nptr->strip=mystrdup(""); + nptr->stripl = 0; + } + break; + } + case 3: { nptr->appnd = mystrdup(piece); + nptr->appndl = strlen(nptr->appnd); + if (strcmp(nptr->appnd,"0") == 0) { + free(nptr->appnd); + nptr->appnd=mystrdup(""); + nptr->appndl = 0; + } + break; + } + case 4: { encodeit(nptr,piece);} + fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd, + nptr->appndl,nptr->strip,nptr->stripl); + default: break; + } + i++; + } + free(piece); + } + nptr++; + } + if (ft == 'P') { + ptable[numpfx].aep = ptr; + ptable[numpfx].num = numents; + 
fprintf(stderr,"ptable %d num is %d\n",numpfx,ptable[numpfx].num); + numpfx++; + } else { + stable[numsfx].aep = ptr; + stable[numsfx].num = numents; + fprintf(stderr,"stable %d num is %d\n",numsfx,stable[numsfx].num); + numsfx++; + } + ptr = NULL; + nptr = NULL; + numents = 0; + achar='\0'; + } + } + free(line); + return 0; +} + + +void encodeit(struct affent * ptr, char * cs) +{ + int nc; + int neg; + int grp; + unsigned char c; + int n; + int ec; + int nm; + int i, j, k; + unsigned char mbr[MAX_WD_LEN]; + + /* now clear the conditions array */ + for (i=0;i<SET_SIZE;i++) ptr->conds[i] = (unsigned char) 0; + + /* now parse the string to create the conds array */ + nc = strlen(cs); + neg = 0; /* complement indicator */ + grp = 0; /* group indicator */ + n = 0; /* number of conditions */ + ec = 0; /* end condition indicator */ + nm = 0; /* number of member in group */ + i = 0; + if (strcmp(cs,".")==0) { + ptr->numconds = 0; + return; + } + while (i < nc) { + c = *((unsigned char *)(cs + i)); + if (c == '[') { + grp = 1; + c = 0; + } + if ((grp == 1) && (c == '^')) { + neg = 1; + c = 0; + } + if (c == ']') { + ec = 1; + c = 0; + } + if ((grp == 1) && (c != 0)) { + *(mbr + nm) = c; + nm++; + c = 0; + } + if (c != 0) { + ec = 1; + } + if (ec) { + if (grp == 1) { + if (neg == 0) { + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds[k] = ptr->conds[k] | (1 << n); + } + } else { + for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds[k] = ptr->conds[k] & ~(1 << n); + } + } + neg = 0; + grp = 0; + nm = 0; + } else { + /* not a group so just set the proper bit for this char */ + /* but first handle special case of . 
inside condition */ + if (c == '.') { + /* wild card character so set them all */ + for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + } else { + ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n); + } + } + n++; + ec = 0; + } + i++; + } + ptr->numconds = n; + return; +} + + + +/* search for a prefix */ +void pfx_chk (const char * word, int len, struct affent* ep, int num) +{ + struct affent * aent; + int cond; + int tlen; + struct hentry * hent; + unsigned char * cp; + int i; + char tword[MAX_WD_LEN]; + + for (aent = ep, i = num; i > 0; aent++, i--) { + + tlen = len - aent->appndl; + + if (tlen > 0 && (aent->appndl == 0 || + strncmp(aent->appnd, word, aent->appndl) == 0) + && tlen + aent->stripl >= aent->numconds) { + + if (aent->stripl) strcpy (tword, aent->strip); + strcpy((tword + aent->stripl), (word + aent->appndl)); + + /* now go through the conds and make sure they all match */ + cp = (unsigned char *) tword; + for (cond = 0; cond < aent->numconds; cond++) { + if ((aent->conds[*cp++] & (1 << cond)) == 0) + break; + } + + if (cond >= aent->numconds) { + tlen += aent->stripl; + if ((hent = lookup(tword)) != NULL) { + if (numroots < MAX_ROOTS) { + roots[numroots].hashent = hent; + roots[numroots].prefix = aent; + roots[numroots].suffix = NULL; + numroots++; + } + } + } + } + } +} + + + +void suf_chk (const char * word, int len, struct affent * ep, + int num, struct affent * pfxent, int cpflag) +{ + struct affent * aent; + int tlen; + int cond; + struct hentry * hent; + unsigned char * cp; + int i; + char tword[MAX_WD_LEN]; + + for (aent = ep, i = num; i > 0; aent++, i--) { + + if ((cpflag & XPRODUCT) != 0 && (aent->xpflg & XPRODUCT) == 0) + continue; + + tlen = len - aent->appndl; + if (tlen > 0 && (aent->appndl == 0 || + strcmp(aent->appnd, (word + tlen)) == 0) + && tlen + aent->stripl >= aent->numconds) { + + strcpy (tword, word); + cp = (unsigned char *) (tword + tlen); + if (aent->stripl) { + strcpy ((char *)cp, 
aent->strip); + tlen += aent->stripl; + cp = (unsigned char *)(tword + tlen); + } else *cp = '\0'; + + for (cond = aent->numconds; --cond >= 0; ) { + if ((aent->conds[*--cp] & (1 << cond)) == 0) break; + } + if (cond < 0) { + if ((hent = lookup(tword)) != NULL) { + if (numroots < MAX_ROOTS) { + roots[numroots].hashent = hent; + roots[numroots].prefix = pfxent; + roots[numroots].suffix = aent; + numroots++; + } + } + } + } + } +} + + + +void aff_chk (const char * word, int len) +{ + int i; + int j; + int nh=0; + char * nword; + int nwl; + + if (len < 4) return; + + for (i=0; i < numpfx; i++) { + pfx_chk(word, len, ptable[i].aep, ptable[i].num); + } + + nh = numroots; + + if (nh > 0) { + for (j=0;j<nh;j++){ + if (roots[j].prefix->xpflg & XPRODUCT) { + nword = mystrdup((roots[j].hashent)->word); + nwl = strlen(nword); + for (i=0; i < numsfx; i++) { + suf_chk(nword,nwl,stable[i].aep, stable[i].num, roots[j].prefix, XPRODUCT); + } + free(nword); + } + } + } + for (i=0; i < numsfx; i++) { + suf_chk(word, len, stable[i].aep, stable[i].num, NULL, 0); + } +} + + + +/* lookup a root word in the hashtable */ + +struct hentry * lookup(const char *word) +{ + struct hentry * dp; + dp = &tableptr[hash(word)]; + if (dp->word == NULL) return NULL; + for ( ; dp != NULL; dp = dp->next) { + if (strcmp(word,dp->word) == 0) return dp; + } + return NULL; +} + + + +/* add a word to the hash table */ + +int add_word(char * word) +{ + int i; + struct hentry * dp; + struct hentry * hp = (struct hentry *) malloc (sizeof(struct hentry)); + + hp->word = word; + hp->affstr = NULL; + hp->keep = 0; + hp->next = NULL; + + i = hash(word); + dp = &tableptr[i]; + + if (dp->word == NULL) { + *dp = *hp; + free(hp); + } else { + while (dp->next != NULL) dp=dp->next; + dp->next = hp; + } + return 0; +} + + + +/* load a word list and build a hash table on the fly */ + +int load_tables(FILE * wdlst) +{ + char * ap; + char ts[MAX_LN_LEN]; + + /* first read the first line of file to get hash table size */ + 
if (! fgets(ts, MAX_LN_LEN-1,wdlst)) return 2; + mychomp(ts); + tablesize = atoi(ts); + tablesize = tablesize + 5; + if ((tablesize %2) == 0) tablesize++; + + /* allocate the hash table */ + tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry)); + if (! tableptr) return 3; + + /* loop thorugh all words on much list and add to hash + * table and store away word and affix strings in tmpfile + */ + + while (fgets(ts,MAX_LN_LEN-1,wdlst)) { + mychomp(ts); + ap = mystrdup(ts); + add_word(ap); + + } + return 0; +} + + +/* the hash function is a simple load and rotate + * algorithm borrowed + */ + +int hash(const char * word) +{ + int i; + long hv = 0; + for (i=0; i < 4 && *word != 0; i++) + hv = (hv << 8) | (*word++); + while (*word != 0) { + ROTATE(hv,ROTATE_LEN); + hv ^= (*word++); + } + return (unsigned long) hv % tablesize; +} + + +void add_affix_char(struct hentry * ep, char ac) +{ + int al; + int i; + char * tmp; + if (ep->affstr == NULL) { + ep->affstr = (char *) malloc(2); + *(ep->affstr) = ac; + *((ep->affstr)+1) = '\0'; + return; + } + al = strlen(ep->affstr); + for (i=0; i< al; i++) + if (ac == (ep->affstr)[i]) return; + tmp = calloc(al+2,1); + memcpy(tmp,ep->affstr,(al+1)); + *(tmp+al) = ac; + *(tmp+al+1)='\0'; + free(ep->affstr); + ep->affstr = tmp; + return; +} + + +/* add a prefix to word */ +void pfx_add (const char * word, int len, struct affent* ep, int num) +{ + struct affent * aent; + int cond; + int tlen; + unsigned char * cp; + int i; + char * pp; + char tword[MAX_WD_LEN]; + + + for (aent = ep, i = num; i > 0; aent++, i--) { + + /* now make sure all conditions match */ + if ((len > aent->stripl) && (len >= aent->numconds)) { + + cp = (unsigned char *) word; + for (cond = 0; cond < aent->numconds; cond++) { + if ((aent->conds[*cp++] & (1 << cond)) == 0) + break; + } + if (cond >= aent->numconds) { + + /* we have a match so add prefix */ + tlen = 0; + if (aent->appndl) { + strcpy(tword,aent->appnd); + tlen += aent->appndl; + } + pp = 
tword + tlen; + strcpy(pp, (word + aent->stripl)); + tlen = tlen + len - aent->stripl; + + if (numwords < MAX_WORDS) { + wlist[numwords].word = mystrdup(tword); + wlist[numwords].pallow = 0; + numwords++; + } + } + } + } +} + + +/* add a suffix to a word */ +void suf_add (const char * word, int len, struct affent * ep, int num) +{ + struct affent * aent; + int tlen; + int cond; + unsigned char * cp; + int i; + char tword[MAX_WD_LEN]; + char * pp; + + for (aent = ep, i = num; i > 0; aent++, i--) { + + /* if conditions hold on root word + * then strip off strip string and add suffix + */ + + if ((len > aent->stripl) && (len >= aent->numconds)) { + cp = (unsigned char *) (word + len); + for (cond = aent->numconds; --cond >= 0; ) { + if ((aent->conds[*--cp] & (1 << cond)) == 0) break; + } + if (cond < 0) { + /* we have a matching condition */ + strcpy(tword,word); + tlen = len; + if (aent->stripl) { + tlen -= aent->stripl; + } + pp = (tword + tlen); + if (aent->appndl) { + strcpy (pp, aent->appnd); + tlen += aent->stripl; + } else *pp = '\0'; + + if (numwords < MAX_WORDS) { + wlist[numwords].word = mystrdup(tword); + wlist[numwords].pallow = (aent->xpflg & XPRODUCT); + numwords++; + } + } + } + } +} + + + +int expand_rootword(const char * ts, int wl, const char * ap, int al) +{ + int i; + int j; + int nh=0; + int nwl; + + for (i=0; i < numsfx; i++) { + if (strchr(ap,(stable[i].aep)->achar)) { + suf_add(ts, wl, stable[i].aep, stable[i].num); + } + } + + nh = numwords; + + if (nh > 1) { + for (j=1;j<nh;j++){ + if (wlist[j].pallow) { + for (i=0; i < numpfx; i++) { + if (strchr(ap,(ptable[i].aep)->achar)) { + if ((ptable[i].aep)->xpflg & XPRODUCT) { + nwl = strlen(wlist[j].word); + pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num); + } + } + } + } + } + } + + for (i=0; i < numpfx; i++) { + if (strchr(ap,(ptable[i].aep)->achar)) { + pfx_add(ts, wl, ptable[i].aep, ptable[i].num); + } + } + return 0; +} + + + +/* strip strings into token based on single char delimiter 
+ * acts like strsep() but only uses a delim char and not + * a delim string + */ +char * mystrsep(char ** stringp, const char delim) +{ + char * rv = NULL; + char * mp = *stringp; + int n = strlen(mp); + if (n > 0) { + char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + if (dp) { + int nc; + *stringp = dp+1; + nc = (int)((unsigned long)dp - (unsigned long)mp); + rv = (char *) malloc(nc+1); + if (rv) { + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + } + } else { + rv = (char *) malloc(n+1); + if (rv) { + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + } + } + } + return rv; +} + + +char * mystrdup(const char * s) +{ + char * d = NULL; + if (s) { + int sl = strlen(s)+1; + d = (char *) malloc(sl); + if (d) memcpy(d,s,sl); + } + return d; +} + + +void mychomp(char * s) +{ + int k = strlen(s); + if (k > 0) *(s+k-1) = '\0'; + if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; +} + diff --git a/src/tools/munch.h b/src/tools/munch.h new file mode 100644 index 0000000..ee75878 --- /dev/null +++ b/src/tools/munch.h @@ -0,0 +1,121 @@ +/* munch header file */ + +#define MAX_LN_LEN 200 +#define MAX_WD_LEN 200 +#define MAX_PREFIXES 256 +#define MAX_SUFFIXES 256 +#define MAX_ROOTS 20 +#define MAX_WORDS 5000 + +#define ROTATE_LEN 5 + +#define ROTATE(v,q) \ + (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); + +#define SET_SIZE 256 + +#define XPRODUCT (1 << 0) + +/* the affix table entry */ + +struct affent +{ + char * appnd; + char * strip; + short appndl; + short stripl; + char achar; + char xpflg; + short numconds; + char conds[SET_SIZE]; +}; + + +struct affixptr +{ + struct affent * aep; + int num; +}; + +/* the prefix and suffix table */ +int numpfx; /* Number of prefixes in table */ +int numsfx; /* Number of suffixes in table */ + +/* the prefix table */ +struct affixptr ptable[MAX_PREFIXES]; + +/* the suffix table */ +struct affixptr stable[MAX_SUFFIXES]; + + +/* data structure to store results of lookups */ +struct matches +{ + struct hentry * 
hashent; /* hash table entry */ + struct affent * prefix; /* Prefix used, or NULL */ + struct affent * suffix; /* Suffix used, or NULL */ +}; + +int numroots; /* number of root words found */ +struct matches roots[MAX_ROOTS]; /* list of root words found */ + +/* hashing stuff */ + +struct hentry +{ + char * word; + char * affstr; + struct hentry * next; + int keep; +}; + + +int tablesize; +struct hentry * tableptr; + +/* unmunch stuff */ + +int numwords; /* number of words found */ +struct dwords +{ + char * word; + int pallow; +}; + +struct dwords wlist[MAX_WORDS]; /* list words found */ + + +/* the routines */ + +int parse_aff_file(FILE* afflst); + +void encodeit(struct affent * ptr, char * cs); + +int load_tables(FILE * wrdlst); + +int hash(const char *); + +int add_word(char *); + +struct hentry * lookup(const char *); + +void aff_chk (const char * word, int len); + +void pfx_chk (const char * word, int len, struct affent* ep, int num); + +void suf_chk (const char * word, int len, struct affent * ep, int num, + struct affent * pfxent, int cpflag); + +void add_affix_char(struct hentry * hent, char ac); + +int expand_rootword(const char *, int, const char*, int); + +void pfx_add (const char * word, int len, struct affent* ep, int num); + +void suf_add (const char * word, int len, struct affent * ep, int num); + +char * mystrsep(char ** stringp, const char delim); + +char * mystrdup(const char * s); + +void mychomp(char * s); diff --git a/src/tools/unmunch.c b/src/tools/unmunch.c new file mode 100644 index 0000000..6bbd09c --- /dev/null +++ b/src/tools/unmunch.c @@ -0,0 +1,514 @@ +/* Un-munch a root word list with affix tags + * to recreate the original word list + */ + +#include <ctype.h> +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#ifdef __linux__ +#include <error.h> +#include <errno.h> +#include <sys/mman.h> +#endif + +#include "unmunch.h" + + +int 
main(int argc, char** argv) +{ + + int i; + int al, wl; + + FILE * wrdlst; + FILE * afflst; + + char *wf, *af; + char * ap; + char ts[MAX_LN_LEN]; + + /* first parse the command line options */ + /* arg1 - munched wordlist, arg2 - affix file */ + + if (argv[1]) { + wf = mystrdup(argv[1]); + } else { + fprintf(stderr,"correct syntax is:\n"); + fprintf(stderr,"unmunch dic_file affix_file\n"); + exit(1); + } + if (argv[2]) { + af = mystrdup(argv[2]); + } else { + fprintf(stderr,"correct syntax is:\n"); + fprintf(stderr,"unmunch dic_file affix_file\n"); + exit(1); + } + + /* open the affix file */ + afflst = fopen(af,"r"); + if (!afflst) { + fprintf(stderr,"Error - could not open affix description file\n"); + exit(1); + } + + /* step one is to parse the affix file building up the internal + affix data structures */ + + numpfx = 0; + numsfx = 0; + fullstrip = 0; + + if (parse_aff_file(afflst)) { + fprintf(stderr,"Error - in affix file loading\n"); + exit(1); + } + + fclose(afflst); + + fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx); + + /* affix file is now parsed so create hash table of wordlist on the fly */ + + /* open the wordlist */ + wrdlst = fopen(wf,"r"); + if (!wrdlst) { + fprintf(stderr,"Error - could not open word list file\n"); + exit(1); + } + + /* skip over the hash table size */ + if (! 
fgets(ts, MAX_LN_LEN-1,wrdlst)) { + fclose(wrdlst); + return 2; + } + mychomp(ts); + + while (fgets(ts,MAX_LN_LEN-1,wrdlst)) { + mychomp(ts); + /* split each line into word and affix char strings */ + ap = strchr(ts,'/'); + if (ap) { + *ap = '\0'; + ap++; + al = strlen(ap); + } else { + al = 0; + ap = NULL; + } + + wl = strlen(ts); + + numwords = 0; + wlist[numwords].word = mystrdup(ts); + wlist[numwords].pallow = 0; + numwords++; + + if (al) + expand_rootword(ts,wl,ap,al); + + for (i=0; i < numwords; i++) { + fprintf(stdout,"%s\n",wlist[i].word); + free(wlist[i].word); + wlist[i].word = NULL; + wlist[i].pallow = 0; + } + + } + + fclose(wrdlst); + return 0; +} + + + + +int parse_aff_file(FILE * afflst) +{ + int i, j; + int numents=0; + char achar='\0'; + short ff=0; + char ft; + struct affent * ptr= NULL; + struct affent * nptr= NULL; + char * line = malloc(MAX_LN_LEN); + + while (fgets(line,MAX_LN_LEN,afflst)) { + mychomp(line); + ft = ' '; + fprintf(stderr,"parsing line: %s\n",line); + if (strncmp(line,"FULLSTRIP",9) == 0) fullstrip = 1; + if (strncmp(line,"PFX",3) == 0) ft = 'P'; + if (strncmp(line,"SFX",3) == 0) ft = 'S'; + if (ft != ' ') { + char * tp = line; + char * piece; + ff = 0; + i = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: break; + case 1: { achar = *piece; break; } + case 2: { if (*piece == 'Y') ff = XPRODUCT; break; } + case 3: { numents = atoi(piece); + ptr = malloc(numents * sizeof(struct affent)); + ptr->achar = achar; + ptr->xpflg = ff; + fprintf(stderr,"parsing %c entries %d\n",achar,numents); + break; + } + default: break; + } + i++; + } + free(piece); + } + /* now parse all of the sub entries*/ + nptr = ptr; + for (j=0; j < numents; j++) { + if (!fgets(line,MAX_LN_LEN,afflst)) return 1; + mychomp(line); + tp = line; + i = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: { if (nptr != ptr) { + nptr->achar = ptr->achar; + nptr->xpflg = ptr->xpflg; + } + 
break; + } + case 1: break; + case 2: { nptr->strip = mystrdup(piece); + nptr->stripl = strlen(nptr->strip); + if (strcmp(nptr->strip,"0") == 0) { + free(nptr->strip); + nptr->strip=mystrdup(""); + nptr->stripl = 0; + } + break; + } + case 3: { nptr->appnd = mystrdup(piece); + nptr->appndl = strlen(nptr->appnd); + if (strcmp(nptr->appnd,"0") == 0) { + free(nptr->appnd); + nptr->appnd=mystrdup(""); + nptr->appndl = 0; + } + break; + } + case 4: { encodeit(nptr,piece);} + fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd, + nptr->appndl,nptr->strip,nptr->stripl); + default: break; + } + i++; + } + free(piece); + } + nptr++; + } + if (ft == 'P') { + ptable[numpfx].aep = ptr; + ptable[numpfx].num = numents; + fprintf(stderr,"ptable %d num is %d flag %c\n",numpfx,ptable[numpfx].num,ptr->achar); + numpfx++; + } else { + stable[numsfx].aep = ptr; + stable[numsfx].num = numents; + fprintf(stderr,"stable %d num is %d flag %c\n",numsfx,stable[numsfx].num,ptr->achar); + numsfx++; + } + ptr = NULL; + nptr = NULL; + numents = 0; + achar='\0'; + } + } + free(line); + return 0; +} + + +void encodeit(struct affent * ptr, char * cs) +{ + int nc; + int neg; + int grp; + unsigned char c; + int n; + int ec; + int nm; + int i, j, k; + unsigned char mbr[MAX_WD_LEN]; + + /* now clear the conditions array */ + for (i=0;i<SET_SIZE;i++) ptr->conds[i] = (unsigned char) 0; + + /* now parse the string to create the conds array */ + nc = strlen(cs); + neg = 0; /* complement indicator */ + grp = 0; /* group indicator */ + n = 0; /* number of conditions */ + ec = 0; /* end condition indicator */ + nm = 0; /* number of member in group */ + i = 0; + if (strcmp(cs,".")==0) { + ptr->numconds = 0; + return; + } + while (i < nc) { + c = *((unsigned char *)(cs + i)); + if (c == '[') { + grp = 1; + c = 0; + } + if ((grp == 1) && (c == '^')) { + neg = 1; + c = 0; + } + if (c == ']') { + ec = 1; + c = 0; + } + if ((grp == 1) && (c != 0)) { + *(mbr + nm) = c; + nm++; + c = 0; + } + if (c != 0) { + 
ec = 1; + } + if (ec) { + if (grp == 1) { + if (neg == 0) { + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds[k] = ptr->conds[k] | (1 << n); + } + } else { + for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds[k] = ptr->conds[k] & ~(1 << n); + } + } + neg = 0; + grp = 0; + nm = 0; + } else { + /* not a group so just set the proper bit for this char */ + /* but first handle special case of . inside condition */ + if (c == '.') { + /* wild card character so set them all */ + for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + } else { + ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n); + } + } + n++; + ec = 0; + } + i++; + } + ptr->numconds = n; + return; +} + + + +/* add a prefix to word */ +void pfx_add (const char * word, int len, struct affent* ep, int num) +{ + struct affent * aent; + int cond; + int tlen; + unsigned char * cp; + int i; + char * pp; + char tword[MAX_WD_LEN]; + + + for (aent = ep, i = num; i > 0; aent++, i--) { + + /* now make sure all conditions match */ + if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) && + ((aent->stripl == 0) || + (strncmp(aent->strip, word, aent->stripl) == 0))) { + + cp = (unsigned char *) word; + for (cond = 0; cond < aent->numconds; cond++) { + if ((aent->conds[*cp++] & (1 << cond)) == 0) + break; + } + if (cond >= aent->numconds) { + + /* we have a match so add prefix */ + tlen = 0; + if (aent->appndl) { + strcpy(tword,aent->appnd); + tlen += aent->appndl; + } + pp = tword + tlen; + strcpy(pp, (word + aent->stripl)); + tlen = tlen + len - aent->stripl; + + if (numwords < MAX_WORDS) { + wlist[numwords].word = mystrdup(tword); + wlist[numwords].pallow = 0; + numwords++; + } + } + } + } +} + + +/* add a suffix to a word */ +void suf_add (const char * word, int len, struct affent * ep, int num) +{ + struct affent * aent; + int tlen; + int cond; + unsigned char * cp; + int i; + 
char tword[MAX_WD_LEN]; + char * pp; + + for (aent = ep, i = num; i > 0; aent++, i--) { + + /* if conditions hold on root word + * then strip off strip string and add suffix + */ + + if ((len + fullstrip > aent->stripl) && (len >= aent->numconds) && + ((aent->stripl == 0) || + (strcmp(aent->strip, word + len - aent->stripl) == 0))) { + cp = (unsigned char *) (word + len); + for (cond = aent->numconds; --cond >= 0; ) { + if ((aent->conds[*--cp] & (1 << cond)) == 0) break; + } + if (cond < 0) { + /* we have a matching condition */ + strcpy(tword,word); + tlen = len; + if (aent->stripl) { + tlen -= aent->stripl; + } + pp = (tword + tlen); + if (aent->appndl) { + strcpy (pp, aent->appnd); + tlen += aent->stripl; + } else *pp = '\0'; + + if (numwords < MAX_WORDS) { + wlist[numwords].word = mystrdup(tword); + wlist[numwords].pallow = (aent->xpflg & XPRODUCT); + numwords++; + } + } + } + } +} + + + +int expand_rootword(const char * ts, int wl, const char * ap, int al) +{ + int i; + int j; + int nh=0; + int nwl; + + for (i=0; i < numsfx; i++) { + if (strchr(ap,(stable[i].aep)->achar)) { + suf_add(ts, wl, stable[i].aep, stable[i].num); + } + } + + nh = numwords; + + if (nh > 1) { + for (j=1;j<nh;j++){ + if (wlist[j].pallow) { + for (i=0; i < numpfx; i++) { + if (strchr(ap,(ptable[i].aep)->achar)) { + if ((ptable[i].aep)->xpflg & XPRODUCT) { + nwl = strlen(wlist[j].word); + pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num); + } + } + } + } + } + } + + for (i=0; i < numpfx; i++) { + if (strchr(ap,(ptable[i].aep)->achar)) { + pfx_add(ts, wl, ptable[i].aep, ptable[i].num); + } + } + return 0; +} + + +/* strip strings into token based on single char delimiter + * acts like strsep() but only uses a delim char and not + * a delim string + */ +char * mystrsep(char ** stringp, const char delim) +{ + char * rv = NULL; + char * mp = *stringp; + int n = strlen(mp); + if (n > 0) { + char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + if (dp) { + int nc; + *stringp = 
dp+1; + nc = (int)((unsigned long)dp - (unsigned long)mp); + rv = (char *) malloc(nc+1); + if (rv) { + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + } + } else { + rv = (char *) malloc(n+1); + if (rv) { + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + } + } + } + return rv; +} + + +char * mystrdup(const char * s) +{ + char * d = NULL; + if (s) { + int sl = strlen(s)+1; + d = (char *) malloc(sl); + if (d) memcpy(d,s,sl); + } + return d; +} + + +void mychomp(char * s) +{ + int k = strlen(s); + if ((k > 0) && (*(s+k-1) == '\n')) *(s+k-1) = '\0'; + if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; +} + diff --git a/src/tools/unmunch.h b/src/tools/unmunch.h new file mode 100644 index 0000000..0c8a6bc --- /dev/null +++ b/src/tools/unmunch.h @@ -0,0 +1,78 @@ +/* unmunch header file */ + +#define MAX_LN_LEN 200 +#define MAX_WD_LEN 200 +#define MAX_PREFIXES 256 +#define MAX_SUFFIXES 256 +#define MAX_WORDS 500000 + +#define ROTATE_LEN 5 + +#define ROTATE(v,q) \ + (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); + +#define SET_SIZE 256 + +#define XPRODUCT (1 << 0) + +/* the affix table entry */ + +struct affent +{ + char * appnd; + char * strip; + short appndl; + short stripl; + char achar; + char xpflg; + short numconds; + char conds[SET_SIZE]; +}; + + +struct affixptr +{ + struct affent * aep; + int num; +}; + +/* the prefix and suffix table */ +int numpfx; /* Number of prefixes in table */ +int numsfx; /* Number of suffixes in table */ + +/* the prefix table */ +struct affixptr ptable[MAX_PREFIXES]; + +/* the suffix table */ +struct affixptr stable[MAX_SUFFIXES]; + +int fullstrip; + + +int numwords; /* number of words found */ +struct dwords +{ + char * word; + int pallow; +}; + +struct dwords wlist[MAX_WORDS]; /* list words found */ + + +/* the routines */ + +int parse_aff_file(FILE* afflst); + +void encodeit(struct affent * ptr, char * cs); + +int expand_rootword(const char *, int, const char*, int); + +void pfx_add (const char * word, int len, struct 
affent* ep, int num); + +void suf_add (const char * word, int len, struct affent * ep, int num); + +char * mystrsep(char ** stringp, const char delim); + +char * mystrdup(const char * s); + +void mychomp(char * s); diff --git a/src/tools/wordforms b/src/tools/wordforms new file mode 100755 index 0000000..dabc346 --- /dev/null +++ b/src/tools/wordforms @@ -0,0 +1,35 @@ +#!/bin/sh +case $# in +0|1|2) echo "Usage: wordforms [-s | -p] dictionary.aff dictionary.dic word +-s: print only suffixed forms +-p: print only prefixed forms +"; exit 1;; +esac +fx=0 +case $1 in +-s) fx=1; shift;; +-p) fx=2; shift;; +esac +test -h /tmp/wordforms.aff && rm /tmp/wordforms.aff +ln -s $PWD/$1 /tmp/wordforms.aff +# prepared dic only with the query word +echo 1 >/tmp/wordforms.dic +grep "^$3/" $2 >>/tmp/wordforms.dic +echo $3 | awk -v "fx=$fx" ' +fx!=2 && FILENAME!="-" && /^SFX/ && NF > 4{split($4,a,"/");clen=($3=="0") ? 0 : length($3);sfx[a[1],clen]=a[1];sfxc[a[1],clen]=clen;next} +fx!=1 && FILENAME!="-" && /^PFX/ && NF > 4{split($4,a,"/");clen=($3=="0") ? 0 : length($3);pfx[a[1],clen]=a[1];pfxc[a[1],clen]=clen;next} +FILENAME=="-"{ +wlen=length($1) +if (fx==0 || fx==2) { + for (j in pfx) {if (wlen<=pfxc[j]) continue; print (pfx[j]=="0" ? "" : pfx[j]) substr($1, pfxc[j]+1)} +} +if (fx==0 || fx==1) { + for(i in sfx){clen=sfxc[i];if (wlen<=clen) continue; print substr($1, 1, wlen-clen) (sfx[i]=="0" ? "": sfx[i]) } +} +if (fx==0) { +for (j in pfx) {if (wlen<=pfxc[j]) continue; + for(i in sfx){clen=sfxc[i];if (wlen<=clen || wlen <= (clen + pfxc[j]))continue; + print (pfx[j]=="0" ? "" : pfx[j]) substr($1, pfxc[j]+1, wlen-clen-pfxc[j]) (sfx[i]=="0" ? 
"": sfx[i]) }} +} +} +' /tmp/wordforms.aff - | hunspell -d /tmp/wordforms -G -l diff --git a/src/tools/wordlist2hunspell b/src/tools/wordlist2hunspell new file mode 100644 index 0000000..09a2bb2 --- /dev/null +++ b/src/tools/wordlist2hunspell @@ -0,0 +1,38 @@ +#!/bin/sh +# +# (C) 2008 Caolán McNamara <caolanm@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +# This creates a LANG_TERRITORY .aff & .dic from a wordlist. +# It is only a simple wordlist spellchecking dictionary output, no +# knowledge of language rules can be extrapolated to shrink the +# wordlist or provide .aff rules for extending wordstems + +if [ $# -lt 2 ]; then + echo "Usage: wordlist2hunspell wordlist_file locale" + echo "e.g. wordlist2hunspell breton.words br_FR to create br_FR.dic and br_FR.aff in cwd" + exit 1 +fi + +export LANG=$2.utf8 +echo "# A basic .aff for a raw wordlist, created through wordlist2hunspell" > $2.aff +echo SET UTF-8 >> $2.aff +#see https://bugzilla.redhat.com/show_bug.cgi?id=462184 for the "C" hacks +echo TRY `sed 's/./&\n/g' $1 | sed '/^$/d' | LC_ALL=C sort -n | LC_ALL=C uniq -c | LC_ALL=C sort -rn | tr -s ' ' | cut -d ' ' -f 3 | tr -d '\n'` >> $2.aff +cat $1 | sed '/^$/d' | wc -l > $2.dic +LC_ALL=C sort $1 | sed '/^$/d' >> $2.dic + +echo Basic $2.dic and $2.aff created |