diff options
Diffstat (limited to 'src/tools/ispellaff2myspell')
-rw-r--r-- | src/tools/ispellaff2myspell | 472 |
1 files changed, 472 insertions, 0 deletions
#!/usr/bin/perl -w
# -*- coding: iso-8859-1 -*-
# $Id: ispellaff2myspell,v 1.2 2010/02/23 12:05:51 caolan Exp $
#
# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

use strict;
use warnings;

use Getopt::Long;

# -----------------------------------------------------------
# Command line option values (filled in by GetOptions below)
# -----------------------------------------------------------
my $affixfile     = '';
my $bylocale      = '';
my $charset       = '';
my $debug         = '';
my $lowercase     = '';
my $myheader      = '';
my $printcomments = '';
my $replacements  = '';
my $split         = '';
my $uppercase     = '';
my $wordlist      = '';    # accepted on the command line, still unused
my $hasextraflags = '';
my %theextraflags = ();    # regex prefix => count, for non alphabetic flags

# -----------------------------------------------------------
# Conversion state shared between the main loop and shipoutflag
# -----------------------------------------------------------
my @flag_array  = ();      # formatted entries of the flag being collected
my $flagname    = '';      # name of the flag being collected
my $flagcombine = '';      # "Y"/"N" cross product marker of that flag
my $inflags     = '';      # true once the first flag line has been seen
my $affix       = '';      # "PFX"/"SFX", section currently being read
my $myaffix     = '';      # $affix, or "# $affix" for ignored flags

# Upper/lowercase tr/// lists for the built-in charsets.  The 8 bit
# characters are written as \xNN escapes so that the tables do not depend
# on the encoding this script itself happens to be stored in.
my %case_table_for = (
    latin0 => {
        lower => 'a-z\xE0-\xF6\xF8-\xFE\xBD\xA8\xB8',
        upper => 'A-Z\xC0-\xD6\xD8-\xDE\xBC\xA6\xB4',
    },
    latin1 => {
        lower => 'a-z\xE0-\xF6\xF8-\xFE',
        upper => 'A-Z\xC0-\xD6\xD8-\xDE',
    },
    latin2 => {
        lower => 'a-z\xB1\xB3\xB5\xB6\xB9-\xBC\xBE\xBF\xE0-\xF6\xF8-\xFE',
        upper => 'A-Z\xA1\xA3\xA5\xA6\xA9-\xAC\xAE\xAF\xC0-\xD6\xD8-\xDE',
    },
    latin3 => {
        lower => 'a-z\xB1\xB6\xB9-\xBC\xBF\xE0-\xE2\xE4-\xEF\xF1-\xFE',
        upper => 'A-Z\xA1\xA6\xA9-\xAC\xAF\xC0-\xC2\xC4-\xCF\xD1-\xDE',
    },
);

# Compiled uppercase -> lowercase translator, built on first use by mylc.
my $lowercaser;

# Print the help text to STDOUT and terminate the program.
sub usage {
    print <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es> License: GPL

Usage:
    ispellaff2myspell [options] <affixfile>

    Options:
    --affixfile=s      Affix file
    --bylocale         Use current locale setup for upper/lowercase
                       conversion
    --charset=s        Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
    --debug            Print debugging info
    --extraflags       Allow some non alphabetic flags
    --lowercase=s      Lowercase string
    --myheader=s       Header file
    --printcomments    Print commented lines in output
    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
    --uppercase=s      Uppercase string
    --wordlist=s       Still unused

    Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like

( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the ispell munched wordlist

END_USAGE
    exit;
}

# Print the given message to STDERR, but only when --debug is active.
sub debugprint {
    if ($debug) {
        print STDERR "@_";
    }
    return;
}

# Write the entries collected in @flag_array as one myspell flag block
# (or as several blocks of at most $split entries when --split is given)
# and reset the per-flag state.  Does nothing when no entry was collected.
sub shipoutflag {
    if (@flag_array) {
        if ($split) {
            while (@flag_array) {
                my @chunk = splice @flag_array, 0, $split;
                my $chunk_entries = scalar @chunk;

                # Every chunk but the last one carries a trailing "S".
                my $marker = @flag_array ? ' S' : '';
                print "$myaffix $flagname $flagcombine $chunk_entries$marker\n";
                print join( "\n", @chunk ), "\n\n";
            }
        }
        else {
            my $flag_entries = scalar @flag_array;
            print "$myaffix $flagname $flagcombine $flag_entries\n";
            print join( "\n", @flag_array ), "\n\n";
        }
    }
    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
    return;
}

# Build the code reference used by mylc to lowercase a string, from the
# table of the selected --charset or from --uppercase/--lowercase.
# Dies when the charset is unknown and no explicit strings were given, or
# when the strings do not form a valid tr/// operator.
sub build_lowercaser {
    my ( $upper, $lower );

    if ( exists $case_table_for{$charset} ) {
        $upper = $case_table_for{$charset}{upper};
        $lower = $case_table_for{$charset}{lower};
    }
    elsif ( $lowercase or $uppercase ) {
        ( $upper, $lower ) = ( $uppercase, $lowercase );
    }
    else {
        die "Unsupported charset [$charset]

Explicitly use --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
    }

    # tr/// lists are fixed at compile time, so a string eval is needed.
    # NOTE(review): --uppercase/--lowercase are evaluated as Perl source
    # here; only feed this script strings you trust.
    my $translator = eval
        "sub { my \$text = shift; \$text =~ tr/$upper/$lower/; return \$text; }";
    if ( not $translator ) {
        die "Error: Could not build case conversion from [$upper] "
            . "to [$lower]: $@\n";
    }
    return $translator;
}

# Return the lowercased version of the given string, using the current
# locale with --bylocale and the charset/explicit tables otherwise.
sub mylc {
    my ($inputstring) = @_;

    if ($bylocale) {
        use locale;
        return lc $inputstring;
    }

    # The translator is compiled once instead of once per rule line.
    $lowercaser = build_lowercaser() unless $lowercaser;
    return $lowercaser->($inputstring);
}

# Return the flag name to use in the output, or '' when the flag is not
# allowed.  Flags containing an ASCII letter are returned unchanged; with
# --extraflags, flags starting with one of the given prefixes (regular
# expressions) are returned with that prefix stripped.
sub validate_flag {
    my ($flag) = @_;

    # NOTE(review): this matches a letter anywhere in the flag, kept as in
    # the original; confirm whether an anchored match was intended.
    return $flag if $flag =~ m/[a-zA-Z]+/;

    if ($hasextraflags) {

        # Longest prefix first, so that overlapping prefixes give the same
        # result on every run instead of depending on hash order.
        my @prefixes =
            sort { length($b) <=> length($a) or $a cmp $b }
            keys %theextraflags;
        for my $prefix (@prefixes) {
            return $flag if $flag =~ s/^$prefix//;
        }
    }
    return '';
}

# Print a myspell REP table built from the REP lines of the given file.
# Lines not starting with REP and the "REP <number>" count line are
# skipped; a fresh count line is written instead.
sub process_replacements {
    my ($file) = @_;
    my @replaces = ();

    open my $rep_fh, '<', $file
        or die "Error: Could not open replacements file: $file ($!)\n";
    while ( my $line = <$rep_fh> ) {
        next unless $line =~ m/^REP\s*\D.*/;
        next if $line =~ m/^REP\s+[0-9]+/;
        $line =~ s/\015\012//;
        $line =~ s/\015//;
        chomp $line;
        push @replaces, $line;
    }
    close $rep_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    for my $replace (@replaces) {
        print $replace . "\n";
    }
    return;
}

# Turn one ispell rule line ("condition > [-strip,]append # comment") of
# the current flag into the matching myspell entry line and return it.
sub build_affix_entry {
    my ($line) = @_;

    my ( $rule, @comments ) = split /#/, $line;
    my $comment = '# ' . join( '#', @comments );

    $rule = '' unless defined $rule;
    $rule =~ s/\s+//g;
    $rule = mylc($rule);

    my ( $rootname, $addtoroot ) = split />/, $rule;

    # A line without ">" leaves these undefined; use empty fields instead
    # of tripping over undefined values below.
    $rootname  = '' unless defined $rootname;
    $addtoroot = '' unless defined $addtoroot;

    my $rootremove = '0';
    if ( $addtoroot =~ s/^\-// ) {
        ( $rootremove, $addtoroot ) = split /,/, $addtoroot;
        $rootremove = '0' unless defined $rootremove and length $rootremove;
        $addtoroot  = '0' unless $addtoroot;
        $addtoroot  = '0' if $addtoroot eq '-';
    }
    $addtoroot =~ s/\\\-/\-/g;    # prefix ANTI\- to anti-

    # myspell needs a real condition when something is stripped.
    if ( $rootname eq '.' && $rootremove ne '0' ) {
        $rootname = $rootremove;
    }

    debugprint("$rootname, $addtoroot, $rootremove\n");

    if ($printcomments) {
        return sprintf '%s %s %-5s %-11s %-24s %s',
            $myaffix, $flagname, $rootremove, $addtoroot, $rootname,
            $comment;
    }
    return sprintf '%s %s %-5s %-11s %s',
        $myaffix, $flagname, $rootremove, $addtoroot, $rootname;
}

# -----------------------------------------------------------
# Now the program starts, after the functions are defined
# -----------------------------------------------------------

GetOptions(
    'affixfile=s'  => \$affixfile,
    'bylocale'     => \$bylocale,
    'charset=s'    => \$charset,
    'debug'        => \$debug,
    'extraflags:s' => sub {
        my ( undef, $prefix ) = @_;
        $hasextraflags = 1;
        $theextraflags{$prefix}++ if $prefix;
    },
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
) or usage();

if ( not $affixfile ) {
    $affixfile = shift(@ARGV) or usage();
}

if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
";
}
elsif ( not $lowercase and not $uppercase and not $charset ) {
    $charset = "latin1";
}

# A negative chunk size would make shipoutflag loop forever.
if ( $split and $split < 0 ) {
    die "Error: --split needs a positive number of entries\n";
}

# --extraflags without value: allow flags escaped with a backslash.
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}

debugprint("$affixfile $charset\n");

open my $affix_fh, '<', $affixfile
    or die "Error: Could not open affix file: $affixfile ($!)\n";

if ($myheader) {

    # Read the header from Perl instead of going through a shell "cat".
    open my $header_fh, '<', $myheader
        or die "Error: Could not open header file: $myheader ($!)\n";
    local $/ = undef;    # slurp mode, limited to this block
    my $myspell_header = <$header_fh>;
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}

while ( my $line = <$affix_fh> ) {
    chomp $line;
    if ( $line =~ /^\s*\#/ ) {
        debugprint("Ignoring line $.\n");
        print "$line\n" if $printcomments;
    }
    elsif ( $line =~ /^\s*$/ ) {
        debugprint("Ignoring line $.\n");
    }
    elsif ( $line =~ /^\s*prefixes/ ) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    }
    elsif ( $line =~ /^\s*suffixes/ ) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    }
    elsif ( $line =~ /^\s*flag/ ) {
        next if not $affix;    # In case we are still in the preamble
        shipoutflag() if $inflags;
        $inflags = "yes";
        $line =~ s/^\s*flag\s*//;
        $line =~ s/\s*:.*$//;
        debugprint("Found flag $line in line $.\n");

        if ( $line =~ /\*/ ) {
            $line =~ s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint(
                "Flag renamed to $line with combine=$flagcombine\n");
        }
        else {
            $flagcombine = "N";
        }

        if ( $flagname = validate_flag($line) ) {
            $myaffix = $affix;
        }
        else {
            # Keep the entries of a rejected flag, but commented out.
            $myaffix  = "\# $affix";
            $flagname = $line;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    }
    elsif ( $affix and $inflags ) {
        my $affix_line = build_affix_entry($line);
        push @flag_array, $affix_line;
        debugprint("$affix_line\n");
    }
}
shipoutflag();

close $affix_fh;

if ($replacements) {
    process_replacements($replacements);
}

__END__

=head1 NAME

B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.

=head1 SYNOPSIS

 ispellaff2myspell [options] <affixfile> --myheader your_header

   Options:

    --affixfile=s      Affix file
    --bylocale         Use current locale setup for upper/lowercase
                       conversion
    --charset=s        Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
    --debug            Print debugging info
    --extraflags=s     Allow some non alphabetic flags
    --lowercase=s      Lowercase string
    --myheader=s       Header file
    --printcomments    Print commented lines in output
    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
    --uppercase=s      Uppercase string

=head1 DESCRIPTION

B<ispellaff2myspell> is a script that will convert ispell affix tables
to myspell format in a more or less successful way.

This script does not create the dict file. Something like

 ( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the munched wordlist

=head1 OPTIONS

=over 8

=item B<--affixfile=s>

Affix file. You can put it directly in the command line.

=item B<--bylocale>

Use current locale setup for upper/lowercase conversion. Make sure
that the selected locale matches the dictionary one, or you might get
into trouble.

=item B<--charset=s>

Use specified charset for upper/lowercase conversion (defaults to latin1).
Currently allowed values for charset are: latin0, latin1, latin2, latin3.

=item B<--debug>

Print some debugging info.

=item B<--extraflags:s>

Allows some non alphabetic flags.

When invoked with no value the supported flags are currently those
corresponding to chars represented with the escape char B<\> as
first char. B<\> will be stripped.

When given with the flag prefix will allow that flag and strip the
given prefix. Be careful when giving the prefix to properly escape chars,
e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
flags and pass them unmodified.

You will need a call to -e for each flag type, e.g.,
B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).

When a prefix is explicitly set, the default value (anything starting by B<\>)
is disabled and you need to enable it explicitly as in previous example.

=item B<--lowercase=s>

Lowercase string. Manually set the string of lowercase chars. This
requires B<--uppercase> having exactly that string but uppercase.

=item B<--myheader=s>

Header file. The myspell aff header. You need to write it
manually. This can contain everything you want to be before the affix table

=item B<--printcomments>

Print commented lines in output.

=item B<--replacements=file>

Add a pre-defined replacements table taken from 'file' to the .aff file.
Will skip lines not beginning with REP, and set the replacements number
appropriately.

=item B<--split=i>

Split flags with more than i entries. This can be of interest for flags
having a lot of entries. Will split the flag in chunks containing B<i>
entries.

=item B<--uppercase=s>

Uppercase string. Manually set the string of uppercase chars. This
requires B<--lowercase> having exactly that string but lowercase.

=back

If your encoding is currently unsupported you can send me a file with
the two strings of lower and uppercase chars. Note that they must match
exactly but case changed. It will look something like

 $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
 $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';

=head1 SEE ALSO

The OpenOffice.org Lingucomponent Project home page

L<http://lingucomponent.openoffice.org/index.html>

and the document

L<http://lingucomponent.openoffice.org/affix.readme>

that provides information about the basics of the myspell affix file format.

You can also take a look at

 /usr/share/doc/libmyspell-dev/affix.readme.gz
 /usr/share/doc/libmyspell-dev/README.compoundwords
 /usr/share/doc/libmyspell-dev/README.replacetable

in your Debian system.

=head1 AUTHORS

Agustin Martin <agustin.martin@hispalinux.es>

=cut