summaryrefslogtreecommitdiff
path: root/src/tools/ispellaff2myspell
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/ispellaff2myspell')
-rw-r--r--src/tools/ispellaff2myspell472
1 files changed, 472 insertions, 0 deletions
diff --git a/src/tools/ispellaff2myspell b/src/tools/ispellaff2myspell
new file mode 100644
index 0000000..5d60c09
--- /dev/null
+++ b/src/tools/ispellaff2myspell
@@ -0,0 +1,472 @@
+#!/usr/bin/perl -w
+# -*- coding: iso-8859-1 -*-
+# $Id: ispellaff2myspell,v 1.2 2010/02/23 12:05:51 caolan Exp $
+#
+# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+sub usage {
+ print "ispellaff2myspell: A program to convert ispell affix tables to myspell format
+(C) 2002-2005 Agustin Martin Domingo <agustin.martin\@hispalinux.es> License: GPL
+
+Usage:
+ ispellaff2myspell [options] <affixfile>
+
+ Options:
+ --affixfile=s Affix file
+ --bylocale Use current locale setup for upper/lowercase
+ conversion
+ --charset=s Use specified charset for upper/lowercase
+ conversion (defaults to latin1)
+ --debug Print debugging info
+ --extraflags Allow some non alphabetic flags
+ --lowercase=s Lowercase string
+ --myheader=s Header file
+ --printcomments Print commented lines in output
+ --replacements=s Replacements file
+ --split=i Split flags with more that i entries
+ --uppercase=s Uppercase string
+ --wordlist=s Still unused
+
+ Currently allowed valued for charset are: latin1, latin2, latin3
+
+This script does not create the dict file. Something like
+
+( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
+
+should do the work, with mydict.words+ being the ispell munched wordlist
+
+";
+ exit;
+}
+
+sub debugprint {
+ if ( $debug ){
+ print STDERR "@_";
+ }
+}
+
+sub shipoutflag{
+ my $flag_entries=scalar @flag_array;
+
+ if ( $flag_entries != 0 ){
+ if ( $split ){
+ while ( @flag_array ){
+ my @flag_subarray=splice(@flag_array,0,$split);
+ my $subflag_entries=scalar @flag_subarray;
+ if ( scalar @flag_array ){
+ print "$myaffix $flagname $flagcombine $subflag_entries S\n";
+ } else {
+ print "$myaffix $flagname $flagcombine $subflag_entries\n";
+ }
+ print join("\n",@flag_subarray);
+ print "\n\n";
+ }
+ } else {
+ print "$myaffix $flagname $flagcombine $flag_entries\n";
+ print join("\n",@flag_array);
+ print "\n\n";
+ }
+ }
+ @flag_array=();
+ $flagname='';
+ $flagcombine='';
+}
+
+sub mylc{
+ my $inputstring=shift;
+ my $outputstring;
+
+ if ( $bylocale ){
+ {
+ use locale;
+ $outputstring = lc $inputstring;
+ }
+ } else {
+ if ( $charset eq "latin0" ){
+ $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ½¨¸';
+ $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ¼¦´';
+ } elsif ( $charset eq "latin1" ){
+ $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
+ $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
+ } elsif ( $charset eq "latin2" ){
+ $lowercase='a-z±³µ¶¹º»¼¾¿àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
+ $uppercase='A-Z¡£¥¦©ª«¬®¯ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
+ } elsif ( $charset eq "latin3" ){
+ $lowercase='a-z±¶¹º»¼¿àáâäåæçèéêëìíîïñòóôõö÷øùúûüýþ';
+ $uppercase='A-Z¡¦©ª«¬¯ÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖ×ØÙÚÛÜÝÞ';
+# } elsif ( $charset eq "other_charset" ){
+# die "latin2 still unimplemented";
+ } else {
+ if ( not $lowercase and not $uppercase ){
+ die "Unsupported charset [$charset]
+
+Explicitly use --lowercase=string and --uppercase=string
+options. Remember that both string must match exactly, but
+case changed.
+";
+ }
+ }
+ $outputstring=$inputstring;
+ eval "\$outputstring=~tr/$uppercase/$lowercase/";
+ }
+ return $outputstring;
+}
+
+sub validate_flag (){
+ my $flag = shift;
+ if ($flag=~m/[a-zA-Z]+/){
+ return $flag;
+ } elsif ( $hasextraflags ){
+ foreach ( keys %theextraflags ){
+ if ($flag =~ m/^$_/){
+ $flag =~ s/^$_//;
+ return $flag;
+ }
+ }
+ }
+ return '';
+}
+
+sub process_replacements{
+ my $file = shift;
+ my @replaces = ();
+
+ open (REPLACE,"< $file") ||
+ die "Error: Could not open replacements file: $file\n";
+ while (<REPLACE>){
+ next unless m/^REP[\s\t]*\D.*/;
+ next if m/^REP\s+[0-9]+/;
+ s/\015\012//;
+ s/\015//;
+ chomp;
+ push @replaces, $_;
+ }
+ close REPLACE;
+ my $number = scalar @replaces;
+ print "REP $number\n";
+ foreach ( @replaces ){
+ print $_ . "\n";
+ }
+}
+
+# -----------------------------------------------------------
+# Now the progran start, after the functions are defined
+# -----------------------------------------------------------
+
+use Getopt::Long;
+
+# Initializing option values
+$affixfile = '';
+$bylocale = '';
+$charset = '';
+$debug = '';
+$lowercase = '';
+$myheader = '';
+$printcomments = '';
+$replacements = '';
+$split = '';
+$uppercase = '';
+$wordlist = '';
+$hasextraflags = '';
+@flag_array = ();
+%theextraflags = ();
+# Initializing root values
+$rootremove = "0";
+$rootname = '';
+$addtoroot = '';
+$comment = '';
+# Initializing flag values
+$flagname = '';
+$flagcombine = '';
+$inflags = '';
+
+GetOptions ('affixfile=s' => \$affixfile,
+ 'bylocale' => \$bylocale,
+ 'charset=s' => \$charset,
+ 'debug' => \$debug,
+ 'extraflags:s' => sub {
+ $hasextraflags = 1;
+ shift;
+ $theflag = shift;
+ $theextraflags{$theflag}++ if $theflag},
+ 'lowercase=s' => \$lowercase,
+ 'myheader=s' => \$myheader,
+ 'printcomments' => \$printcomments,
+ 'replacements=s'=> \$replacements,
+ 'split=i' => \$split,
+ 'uppercase=s' => \$uppercase,
+ 'wordlist=s' => \$wordlist) or usage;
+
+if ( not $affixfile ){
+ $affixfile=shift or usage;
+}
+
+if ( $charset and ( $lowercase or $uppercase )){
+ die "Error: charset and lowercase/uppercase options
+are incompatible. Use either charset or lowercase/uppercase options to
+specify the patterns
+"
+} elsif ( not $lowercase and not $uppercase and not $charset ){
+ $charset="latin1";
+}
+
+if ( scalar(keys %theextraflags) == 0 && $hasextraflags ){
+ $theextraflags{"\\\\"}++;
+}
+
+debugprint "$affixfile $charset";
+
+open (AFFIXFILE,"< $affixfile") ||
+ die "Error: Could not open affix file: $affixfile";
+
+if ( $myheader ){
+ my $myspell_header=`cat $myheader`;
+ print $myspell_header . "\n";
+}
+
+while (<AFFIXFILE>){
+ chomp;
+ if (/^\s*\#.*/){
+ debugprint "Ignoring line $.\n";
+ print "$_\n" if $printcomments;
+ } elsif (/^\s*$/){
+ debugprint "Ignoring line $.\n";
+ } elsif (/^\s*prefixes/){
+ debugprint "Prefixes starting in line $.\n";
+ $affix="PFX";
+ } elsif (/^\s*suffixes/){
+ debugprint "Suffixes starting in line $.\n";
+ $affix="SFX";
+ } elsif (/^[\s\t]*flag.*/){
+ next if not $affix; # In case we are still in the preamble
+ shipoutflag if $inflags;
+ $inflags="yes";
+ s/^[\s\t]*flag[\s\t]*//;
+ s/[\s\t]*:.*$//;
+ debugprint "Found flag $_ in line $.\n";
+
+ if (/\*/){
+ s/[\*\s]//g;
+ $flagcombine="Y";
+ debugprint "Flag renamed to $_ with combine=$flagcombine\n";
+ } else {
+ $flagcombine="N";
+ }
+
+ if ( $flagname = &validate_flag($_) ){
+ $myaffix = $affix;
+ } else {
+ $myaffix = "\# $affix";
+ $flagname = $_;
+ print STDERR "Ignoring invalid flag $flagname in line $.\n";
+ }
+ } elsif ( $affix and $inflags ) {
+ ($rootname,@comments) = split('#',$_);
+ $comment = '# ' . join('#',@comments);
+
+ $rootname =~ s/\s*//g;
+ $rootname = mylc $rootname;
+ ($rootname,$addtoroot) = split('>',$rootname);
+
+ if ( $addtoroot =~ s/^\-//g ){
+ ($rootremove,$addtoroot) = split(',',$addtoroot);
+ $addtoroot = "0" unless $addtoroot;
+ $addtoroot = "0" if ( $addtoroot eq "-");
+ } else {
+ $rootremove = "0";
+ }
+ $addtoroot =~ s/\\\-/\-/g; # prefix ANTI\- to anti-
+
+ if ( $rootname eq '.' && $rootremove ne "0" ){
+ $rootname = $rootremove;
+ }
+
+ debugprint "$rootname, $addtoroot, $rootremove\n";
+ if ( $printcomments ){
+ $affix_line=sprintf("%s %s %-5s %-11s %-24s %s",
+ $myaffix, $flagname, $rootremove,
+ $addtoroot, $rootname, $comment);
+ } else {
+ $affix_line=sprintf("%s %s %-5s %-11s %s",
+ $myaffix, $flagname, $rootremove,
+ $addtoroot, $rootname);
+ }
+ $rootremove = "0";
+ $rootname = '';
+ $addtoroot = '';
+ $comment = '';
+ @comments = ();
+ push @flag_array,$affix_line;
+ debugprint "$affix_line\n";
+ } else {
+ #
+ }
+}
+shipoutflag;
+
+close AFFIXFILE;
+
+if ( $replacements ){
+ &process_replacements($replacements);
+}
+
+__END__
+
+=head1 NAME
+
+B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.
+
+=head1 SYNOPSIS
+
+ ispellaff2myspell [options] <affixfile> --myheader your_header
+
+ Options:
+
+ --affixfile=s Affix file
+ --bylocale Use current locale setup for upper/lowercase
+ conversion
+ --charset=s Use specified charset for upper/lowercase
+ conversion (defaults to latin1)
+ --debug Print debugging info
+ --extraflags=s Allow some non alphabetic flags
+ --lowercase=s Lowercase string
+ --myheader=s Header file
+ --printcomments Print commented lines in output
+ --replacements=s Replacements file
+ --split=i Split flags with more that i entries
+ --uppercase=s Uppercase string
+
+=head1 DESCRIPTION
+
+B<ispellaff2myspell> is a script that will convert ispell affix tables
+to myspell format in a more or less successful way.
+
+This script does not create the dict file. Something like
+
+( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
+
+should do the work, with mydict.words+ being the munched wordlist
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--affixfile=s>
+
+Affix file. You can put it directly in the command line.
+
+=item B<--bylocale>
+
+Use current locale setup for upper/lowercase conversion. Make sure
+that the selected locale match the dictionary one, or you might get
+into trouble.
+
+=item B<--charset=s>
+
+Use specified charset for upper/lowercase conversion (defaults to latin1).
+Currently allowed values for charset are: latin0, latin1, latin2, latin3.
+
+=item B<--debug>
+
+Print some debugging info.
+
+=item B<--extraflags:s>
+
+Allows some non alphabetic flags.
+
+When invoked with no value the supported flags are currently those
+corresponding to chars represented with the escape char B<\> as
+first char. B<\> will be stripped.
+
+When given with the flag prefix will allow that flag and strip the
+given prefix. Be careful when giving the prefix to properly escape chars,
+e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
+B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
+flags and pass them unmodified.
+
+You will need a call to -e for each flag type, e.g.,
+B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).
+
+When a prefix is explicitely set, the default value (anything starting by B<\>)
+is disabled and you need to enable it explicitely as in previous example.
+
+=item B<--lowercase=s>
+
+Lowercase string. Manually set the string of lowercase chars. This
+requires B<--uppercase> having exactly that string but uppercase.
+
+=item B<--myheader=s>
+
+Header file. The myspell aff header. You need to write it
+manually. This can contain everything you want to be before the affix table
+
+=item B<--printcomments>
+
+Print commented lines in output.
+
+=item B<--replacements=file>
+
+Add a pre-defined replacements table taken from 'file' to the .aff file.
+Will skip lines not beginning with REP, and set the replacements number
+appropriately.
+
+=item B<--split=i>
+
+Split flags with more that i entries. This can be of interest for flags
+having a lot of entries. Will split the flag in chunks containing B<i>
+entries.
+
+=item B<--uppercase=s>
+
+Uppercase string. Manually set the sring of uppercase chars. This
+requires B<--lowercase> having exactly that string but lowercase.
+
+=back
+
+If your encoding is currently unsupported you can send me a file with
+the two strings of lower and uppercase chars. Note that they must match
+exactly but case changed. It will look something like
+
+ $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
+ $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
+
+=head1 SEE ALSO
+
+The OpenOffice.org Lingucomponent Project home page
+
+L<http://lingucomponent.openoffice.org/index.html>
+
+and the document
+
+L<http://lingucomponent.openoffice.org/affix.readme>
+
+that provides information about the basics of the myspell affix file format.
+
+You can also take a look at
+
+ /usr/share/doc/libmyspell-dev/affix.readme.gz
+ /usr/share/doc/libmyspell-dev/README.compoundwords
+ /usr/share/doc/libmyspell-dev/README.replacetable
+
+in your Debian system.
+
+=head1 AUTHORS
+
+Agustin Martin <agustin.martin@hispalinux.es>
+
+=cut