#!/bin/sh
#
# $Id: munchlist.X,v 1.70 2015-02-08 00:35:41-08 geoff Exp $
#
# Copyright 1987, 1988, 1989, 1992, 1993, 1999, 2001, 2005, Geoff Kuenning,
# Claremont, CA.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
#    such. Binary redistributions based on modified source code
#    must be clearly marked as modified versions in the documentation
#    and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
#    link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
#    products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# Given a list of words for ispell, generate a reduced list
# in which all possible affixes have been collapsed. The reduced
# list will match the same list as the original.
#
# Usage:
#
#	munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] \
#	  [-w chars] [-v] [file] ...
#
# Options:
#
#	-l lang	Specifies the language table to be used. The default
#		is "$LIBDIR/default.aff".
#	-c lang	Specifies "conversion" language table. If this option is
#		given, the input file(s) will be assumed to be described by
#		this table, rather than the table given in the -l option.
#		This may be used to convert between incompatible language
#		tables. (When in doubt, use this option -- it doesn't
#		hurt, and it may save you from creating a dictionary that has
#		illegal words in it). The default is no conversion.
#	-T suff Specifies that the source word lists are in the format
#		of a "suff"-suffixed file, rather than in the
#		canonical form. For example, "-T tex" specifies that
#		string characters in the word lists are in TeX format.
#		The string character conversions are taken from the language
#		table specified by the "-l" switch.
#	-s hashfile
#		Remove any words that are already covered by the
#		dictionary in 'hashfile'. The words will be removed
#		only if all affixes are covered. This option should not be
#		specified when the main dictionary is being munched.
#		'Hashfile' must have been created with the language
#		table given in the -l option, but this is not checked.
#	-D	Leave temporary files for debugging purposes
#	-w chars
#		Passed on to ispell (specify chars that are part of a word)
#		Unfortunately, special characters must be quoted twice
#		rather than once when invoking this script. Also, since
#		buildhash doesn't accept this option, the final ispell -l
#		step ignores it, making it somewhat less than useful.
#	-v	Report progress to stderr.
#
# The given input files are merged, then processed by 'ispell -c'
# to generate possible affix lists; these are then combined
# and reduced. The final result is written to standard output.
#
# For portability to older systems, I have avoided getopt.
#
#		Geoff Kuenning
#		2/28/87
#
# $Log: munchlist.X,v $
# Revision 1.70 2015-02-08 00:35:41-08 geoff
# Be a bit more paranoid about creating temporary files. Fix a problem
# with detecting a new-style sort that refuses to be backwards
# compatible (and yes, it's still cretinism to break backwards
# compatibility--but I have to put up with the cretins).
#
# Revision 1.69 2005/04/28 14:46:51 geoff
# Remove references to the now-obsolete count file.
#
# Revision 1.68 2005/04/27 01:18:34 geoff
# Work around idiotic POSIX incompatibilities in sort. Add secure
# temp-file handling.
#
# Revision 1.67 2005/04/14 23:11:36 geoff
# Pass the -w switch to icombine.
#
# Revision 1.66 2005/04/14 21:25:52 geoff
# Make the temporary-file handling safer (using mktemp, if it exists).
#
# Revision 1.65 2005/04/14 14:39:33 geoff
# Use /tmp as the default temp directory
#
# Revision 1.64 2005/04/14 14:38:23 geoff
# Update license. Protect against modernized (i.e., incompatible) and
# internationalized sort commands. Change the debugging names of the
# minimal-affixes count and stat files.
#
# Revision 1.63 2002/06/20 23:46:16 geoff
# Add yet more locale definitions so that we won't run into bugs caused
# by sorting inconsistencies.
#
# Revision 1.62 2001/09/06 00:30:28 geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.61 2001/07/25 21:51:46 geoff
# Minor license update.
#
# Revision 1.60 2001/07/23 20:24:04 geoff
# Update the copyright and the license.
#
# Revision 1.59 2001/06/07 08:02:18 geoff
# Fix a copule of typos in comments.
#
# Revision 1.58 2000/11/14 07:27:04 geoff
# Don't generate an extra dot when attempting to preserve the count
# files in -D mode.
#
# Revision 1.57 2000/10/06 23:59:48 geoff
# Don't assume dot is in the path
#
# Revision 1.56 1999/01/07 01:22:42 geoff
# Update the copyright.
#
# Revision 1.55 1997/12/02 06:25:01 geoff
# Start the cross-expansions loop count at 1, not zero.
#
# Revision 1.54 1997/12/01 00:53:52 geoff
# Abort the munchlist cross-product loop if it goes over 100 passes.
#
# Revision 1.53 1995/01/08 23:23:36 geoff
# Support variable hashfile suffixes for DOS purposes.
#
# Revision 1.52 1994/12/27 23:08:46 geoff
# Dynamically determine how to pass backslashes to 'tr' so that it'll
# work on any machine. Define LC_CTYPE to work around yet more
# internationalized sort programs. Work around a bug in GNU uniq that
# uses the wrong separator between counts and duplicated lines.
#
# Revision 1.51 1994/11/21 07:02:54 geoff
# Correctly quote the arguments to 'tr' when detecting systems with
# unsigned sorts. Be sure to provide a zero exit status on all systems,
# even if MUNCHDEBUG is not set.
#
# Revision 1.50 1994/10/25 05:46:05 geoff
# Export values for LANG and LOCALE in an attempt to override some
# stupidly-internationalized sort programs.
#
# Revision 1.49 1994/10/04 03:51:30 geoff
# Add the MUNCHMAIL feature. If the MUNCHMAIL environment variable is
# set to an email address, debugging information about the munchlist run
# will automatically be collected and mailed to that address.
#
# Revision 1.48 1994/05/17 06:32:06 geoff
# Don't look for affix tables in LIBDIR if the name contains a slash
#
# Revision 1.47 1994/04/27 02:50:48 geoff
# Fix some cosmetic flaws in the verbose-mode messages.
#
# Revision 1.46 1994/01/25 07:11:59 geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#

LIBDIR=/usr/lib/ispell
TDIR=${TMPDIR-/tmp}
# All scratch files live in a private, unpredictably named directory.
MUNCHDIR=`mktemp -d "${TDIR}/munchXXXXXXXXXX" 2>/dev/null` || {
    echo "$0: Failed to create temporary directory, exiting..." 1>&2
    exit 1
}
TMP=${MUNCHDIR}/munch.
MAILDEBUGDIR=${MUNCHDIR-/tmp}
if [ "X$MUNCHMAIL" != X ]
then
    # Capture a full shell trace on stderr; it is mailed at the end of
    # the run.
    exec 2> "${MAILDEBUGDIR}/munchlist.mail"
    echo "munchlist $*" 1>&2
    set -vx
fi
SORTTMP="-T ${TDIR}"			# !!SORTTMP!!
DBDIR=${MUNCHDEBUGDIR-$MAILDEBUGDIR}

# Detect MS-DOS systems and arrange to use their silly suffix system
if [ -z "$COMSPEC$ComSpec" ]
then
    EXE=""
else
    EXE=".exe"
fi

#
# Set up some program names.  This prefers the versions that are in
# the same directory as munchlist was run from; if that can't be
# figured out, it prefers local versions and finally ones chosen from
# $PATH.
#
# This code could be simplified by using the dirname command, but it's
# not available everywhere.  For the same reason, we use -r rather than
# -x to test for executable files.
#
case "$0" in
    */*)
	bindir=`expr "$0" : '\(.*\)/[^/]*'`
	;;
    *)
	bindir='.'
	;;
esac

#
# findprog name
#
# Print the path to use for helper program "name": the copy next to
# munchlist if readable, else the copy in the current directory, else
# the bare name (to be resolved through $PATH).
#
findprog()
    {
    if [ -r "$bindir/$1$EXE" ]
    then
	echo "$bindir/$1$EXE"
    elif [ -r "./$1$EXE" ]
    then
	echo "./$1$EXE"
    else
	echo "$1"
    fi
    }

BUILDHASH=`findprog buildhash`
COMBINE=`findprog icombine`
JOIN=`findprog ijoin`
ISPELL=`findprog ispell`

# In one of the most incredibly stupid decisions of all time, some
# genius decided to break backwards compatibility by "deprecating" the
# old-style sort switches even though it was trivial to recognize both
# styles.  The result is that thousands of people (like me) will
# have to rewrite shell scripts to tolerate that stupidity.  (It's not
# that the new syntax is bad--it's definitely easier to understand.
# But that doesn't excuse breaking compatibility.)
#
CRETIN_SORT=true

#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
# LC_ALL must be exported too: it overrides every other LC_* variable,
# and merely assigning it does not place it in the environment of
# child processes unless it was already exported by the caller.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above.  It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME

#
# usage
#
# Print the usage message, remove the temporary directory, and exit
# with status 2.
#
usage()
    {
    echo 'Usage: munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [-v] [file] ...' \
      1>&2
    rm -rf "$MUNCHDIR"
    exit 2
    }

debug=no
dictopt=
langtabs=${LIBDIR}/default.aff
convtabs=
strip=no
icflags=
verbose=false
# The following value of "wchars" is necessary to prevent ispell from
# receiving a null argument if -w is not specified.  As long as "A" is
# a member of the existing character set, ispell will ignore the argument.
wchars=-wA
while [ $# != 0 ]
do
    # Options that take a value must actually have one; otherwise the
    # second "shift" below would run past the end of the argument list,
    # which aborts some shells without cleaning up the temp directory.
    case "$1" in
	-l | -c | -s | -T | -w)
	    [ $# -ge 2 ]  ||  usage
	    ;;
    esac
    case "$1" in
	-l)
	    case "$2" in
		*/*)
		    langtabs=$2
		    ;;
		*)
		    if [ -r "$2" ]
		    then
			langtabs="$2"
		    else
			langtabs="${LIBDIR}/$2"
		    fi
		    ;;
	    esac
	    if [ ! -r "$langtabs" ]
	    then
		echo "Can't open language table '$2'" 1>&2
		rm -rf "$MUNCHDIR"
		exit 1
	    fi
	    shift
	    ;;
	-c)
	    if [ -r "$2" ]
	    then
		convtabs="$2"
	    elif [ -r "${LIBDIR}/$2" ]
	    then
		convtabs="${LIBDIR}/$2"
	    else
		echo "Can't open conversion language table '$2'" 1>&2
		rm -rf "$MUNCHDIR"
		exit 1
	    fi
	    shift
	    ;;
	-s)
	    dictopt="-d $2"
	    strip=yes
	    shift
	    ;;
	-D)
	    debug=yes
	    ;;
	-T)
	    icflags="-T $2"
	    shift
	    ;;
	-v)
	    verbose=true
	    ;;
	-w)
	    wchars="-w$2"
	    shift
	    ;;
	--)
	    shift
	    break
	    ;;
	-)
	    break
	    ;;
	-*)
	    usage
	    ;;
	*)
	    break
	    ;;
    esac
    shift
done
# Mail-debugging mode implies both verbose and debug output.
if [ "X$MUNCHMAIL" != X ]
then
    verbose=true
    debug=yes
fi
# Remove the scratch directory on HUP, INT, PIPE and TERM.
trap 'rm -rf "$MUNCHDIR"; exit 1' 1 2 13 15
#
# Names of temporary files.
# This is just to make the code a little easier
# to read.
#
EXPANDEDINPUT=${TMP}a
STRIPPEDINPUT=${TMP}b
CRUNCHEDINPUT=${TMP}c
PRODUCTLIST=${TMP}d
EXPANDEDPAIRS=${TMP}e
LEGALFLAGLIST=${TMP}f
JOINEDPAIRS=${TMP}g
MINIMALAFFIXES=${TMP}h
CROSSROOTS=${TMP}i
CROSSEXPANDED=${TMP}j
CROSSPAIRS=${TMP}k
CROSSILLEGAL=${TMP}l
ILLEGALCOMBOS=${TMP}m
FAKEDICT=${TMP}n
# Ispell insists that hash files have a ".hash" suffix
FAKEHASH=${TMP}o.hash
AWKSCRIPT=${TMP}p
# If the file exists then we should exit with error
ERRORFLAGFILE=${TMP}z
if [ "$debug" = yes ]
then
    # Pre-create every temporary file and hard-link it under a readable
    # name in DBDIR so that its contents remain inspectable afterwards.
    touch $EXPANDEDINPUT $STRIPPEDINPUT $CRUNCHEDINPUT $PRODUCTLIST \
      $EXPANDEDPAIRS $LEGALFLAGLIST $JOINEDPAIRS $MINIMALAFFIXES \
      $CROSSROOTS $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $ILLEGALCOMBOS \
      $FAKEDICT $FAKEHASH $AWKSCRIPT
    rm -f ${DBDIR}/EXPANDEDINPUT ${DBDIR}/STRIPPEDINPUT \
      ${DBDIR}/CRUNCHEDINPUT ${DBDIR}/PRODUCTLIST ${DBDIR}/EXPANDEDPAIRS \
      ${DBDIR}/LEGALFLAGLIST ${DBDIR}/JOINEDPAIRS ${DBDIR}/MINIMALAFFIXES \
      ${DBDIR}/CROSSROOTS ${DBDIR}/CROSSEXPANDED ${DBDIR}/CROSSPAIRS \
      ${DBDIR}/CROSSILLEGAL ${DBDIR}/ILLEGALCOMBOS ${DBDIR}/FAKEDICT \
      ${DBDIR}/FAKEHASH.hash ${DBDIR}/AWKSCRIPT \
      ${DBDIR}/CROSSROOTS.[0-9]* ${DBDIR}/CROSSEXP.[0-9]* \
      ${DBDIR}/CROSSPAIRS.[0-9]* ${DBDIR}/CROSSILLEGAL.[0-9]*
    ln $EXPANDEDINPUT ${DBDIR}/EXPANDEDINPUT
    ln $STRIPPEDINPUT ${DBDIR}/STRIPPEDINPUT
    ln $CRUNCHEDINPUT ${DBDIR}/CRUNCHEDINPUT
    ln $PRODUCTLIST ${DBDIR}/PRODUCTLIST
    ln $EXPANDEDPAIRS ${DBDIR}/EXPANDEDPAIRS
    ln $LEGALFLAGLIST ${DBDIR}/LEGALFLAGLIST
    ln $JOINEDPAIRS ${DBDIR}/JOINEDPAIRS
    ln $MINIMALAFFIXES ${DBDIR}/MINIMALAFFIXES
    ln $CROSSROOTS ${DBDIR}/CROSSROOTS
    ln $CROSSEXPANDED ${DBDIR}/CROSSEXPANDED
    ln $CROSSPAIRS ${DBDIR}/CROSSPAIRS
    ln $CROSSILLEGAL ${DBDIR}/CROSSILLEGAL
    ln $ILLEGALCOMBOS ${DBDIR}/ILLEGALCOMBOS
    ln $FAKEDICT ${DBDIR}/FAKEDICT
    ln $FAKEHASH ${DBDIR}/FAKEHASH.hash
    ln $AWKSCRIPT ${DBDIR}/AWKSCRIPT
fi

#
# run command [args...]
#
# Execute the command.  If it fails, record the failure in
# ERRORFLAGFILE and exit with the command's status.  Because the
# helpers are invoked inside pipelines, that exit only terminates the
# pipeline's subshell; checkerrorflagfile picks the failure up later.
#
run()
    {
    "$@"  ||  {
	status=$?
	echo "$* failed with $status" > "$ERRORFLAGFILE"
	exit $status
	}
    }

#
# checkerrorflagfile
#
# If a command started through "run" has failed, print the recorded
# message on stderr, remove the temporary directory, and exit 1.
#
checkerrorflagfile()
    {
    [ ! -e "$ERRORFLAGFILE" ]  ||  {
	cat "$ERRORFLAGFILE" 1>&2
	rm -rf "$MUNCHDIR"
	exit 1
	}
    }

rm -f "$ERRORFLAGFILE"

# From here on the ispell helpers are always invoked through "run".
# These variables are deliberately expanded unquoted so that they
# split into "run" plus the program path.
JOIN="run $JOIN"
COMBINE="run $COMBINE"
ISPELL="run $ISPELL"

set -e
#
# Create a dummy dictionary to hold a compiled copy of the language
# table.  Initially, it holds the conversion table, if it exists.
#
case "X$convtabs" in
    X)
	convtabs="$langtabs"
	;;
esac
echo 'QQQQQQQQ' > $FAKEDICT
$BUILDHASH -s $FAKEDICT $convtabs $FAKEHASH \
  ||  (echo "Couldn't create fake hash file" 1>&2; rm -rf "$MUNCHDIR"; exit 1) \
  ||  exit 1
#
# Figure out how 'sort' sorts signed fields, for arguments to ijoin.
# This is a little bit of a tricky pipe, but the result is that SIGNED
# is set to "-s" if characters with the top bit set sort before those
# without, and "-u" if the reverse is true.  How does it work?  The
# first "tr" step generates two lines, one containing "-u", the other
# with the same but with the high-order bit set.  The second "tr"
# changes the high-bit "-u" back to "-s".  If the high-bit "-u" was
# sorted first, the sed step will select "-s" for SIGNED; otherwise
# it'll pick "-u".  We have to be careful about backslash quoting
# conventions, because some systems differ.
#
backslash=\\
for i in 0 1 2 3
do
    # Keep doubling the backslash until tr understands "\141" as "a".
    # The substitution is quoted so that an empty result cannot turn
    # the test into a syntax error.
    if [ "X$(echo a | tr "${backslash}141" b)" = Xb ]
    then
	break
    fi
    backslash="$backslash$backslash"
done
# The echo argument contains a literal newline so that two lines are
# produced regardless of how this system's echo treats backslashes.
SIGNED=`echo '-s
-u' | tr s "${backslash}365" | sort | tr "${backslash}365" s | sed -e 1q`
#
# Collect all the input and expand all the affix options ($ISPELL -e),
# and preserve (sorted) for later joining in EXPANDEDINPUT.  The icombine
# step is to make sure that unneeded capitalizations (e.g., Farmer and farmer)
# are weeded out.  The first sort must be folded for icombine; the second
# must be unfolded for join.
#
$verbose  &&  echo "Collecting input." 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1f,1 -k 1'
else
    sortopts='+0f -1 +0'
fi
# The second argument of each tr below is a literal newline: every
# expansion that ispell prints is put on a line of its own.
if [ $# -eq 0 ]
then
    $ISPELL "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
else
    cat "$@" | $ISPELL "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
fi \
  | sort $SORTTMP -u $sortopts \
  | $COMBINE $icflags "$wchars" $langtabs \
  | sort $SORTTMP -u > $EXPANDEDINPUT
checkerrorflagfile
#
# If a conversion table existed, recreate the fake hash file with the
# "real" language table.
#
case "$convtabs" in
    $langtabs)
	;;
    *)
	$BUILDHASH -s $FAKEDICT $langtabs $FAKEHASH \
	  ||  (echo "Couldn't create fake hash file" 1>&2; \
		rm -rf "$MUNCHDIR"; exit 1) \
	  ||  exit 1
	;;
esac
rm -f ${FAKEDICT}*
#
# If the -s (strip) option was specified, remove all
# expanded words that are covered by the dictionary.  This produces
# the final list of expanded words that this dictionary must cover.
# Leave the list in STRIPPEDINPUT.
#
if [ "X$strip" = "Xno" ]
then
    rm -f $STRIPPEDINPUT
    ln $EXPANDEDINPUT $STRIPPEDINPUT
    if [ "$debug" = yes ]
    then
	rm -f ${DBDIR}/STRIPPEDINPUT
	ln $STRIPPEDINPUT ${DBDIR}/STRIPPEDINPUT
    fi
else
    $verbose  &&  echo "Stripping words already in the dictionary." 1>&2
    $ISPELL "$wchars" -l $dictopt -p /dev/null < $EXPANDEDINPUT \
      > $STRIPPEDINPUT
    checkerrorflagfile
fi
#
# Figure out what the flag-marking character is.
#
$verbose  &&  echo "Finding flag marker." 1>&2
flagmarker=`$ISPELL -D -d $FAKEHASH \
  | sed -n -e '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*)
	# Strip the backslash from a backslash-escaped marker.
	flagmarker=`expr "$flagmarker" : '.\(.\)'`
	;;
esac
checkerrorflagfile
#
# Munch the input to generate roots and affixes ($ISPELL -c).  We are
# only interested in words that have at least one affix (grep -F $flagmarker);
# the next step will pick up the rest.  Some of the roots are illegal.  We
# use join to restrict the output to those root words that are found
# in the original dictionary.
#
$verbose  &&  echo "Generating roots and affixes." 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1,1 -k 2'
else
    sortopts='+0 -1 +1'
fi
# The flag marker is a literal character, so it is matched with -F; as
# a regular expression a marker such as "|" or "." would match lines
# that contain no flags at all.
$ISPELL "$wchars" -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \
  | tr " " '
' \
  | grep -a -F -e "$flagmarker" | sort $SORTTMP -u "-t$flagmarker" $sortopts \
  | $JOIN $SIGNED "-t$flagmarker" - $EXPANDEDINPUT > $CRUNCHEDINPUT
checkerrorflagfile
#
# We now have a list of legal roots, and of affixes that apply to the
# root words.  However, it is possible for some affix flags to generate more
# than one output word.  For example, with the flag table entry
#
#	flag R:	. > ER
#		. > ERS
#
# the input "BOTHER" will generate an entry "BOTH/R" in CRUNCHEDINPUT.  But
# this will accept "BOTHER" and "BOTHERS" in the dictionary, which is
# wrong (in this case, though it's good English).
#
# To cure this problem, we first have to know which flags generate which
# expansions.  We use $ISPELL -e3 to expand the flags (the second e causes
# the root and flag to be included in the output), and get pairs
# suitable for joining.  In the example above, we would get
#
#	BOTH/R BOTHER
#	BOTH/R BOTHERS
#
# We save this in EXPANDEDPAIRS for the next step.
#
$verbose  &&  echo 'Expanding dictionary into EXPANDEDPAIRS.' 1>&2
if $CRETIN_SORT
then
    sortopts='-k 2'
else
    sortopts='+1'
fi
$ISPELL "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CRUNCHEDINPUT \
  | sort $SORTTMP $sortopts > $EXPANDEDPAIRS
checkerrorflagfile
#
# Now we want to extract the lines in EXPANDEDPAIRS in which the second field
# is *not* listed in the original dictionary EXPANDEDINPUT; these illegal
# lines contain the flags we cannot include without accepting illegal words.
# It is somewhat easier to extract those which actually are listed (with
# join), and then use comm to strip these from EXPANDEDPAIRS to get the
# illegal expansions, together with the flags that generate them (we must
# re-sort EXPANDEDPAIRS before running comm).  Sed
# gets rid of the expansion and uniq gets rid of duplicates.
# Comm then
# selects the remainder of the list from CRUNCHEDINPUT and puts it in
# LEGALFLAGLIST.  The final step is to use a sort and icombine to put
# the list into a one-entry-per-root format.
#
# BTW, I thought of using cut for the sed step (on systems that have it),
# but it turns out that sed is faster!
#
$JOIN -j1 2 -o 1.1 1.2 $SIGNED $EXPANDEDPAIRS $EXPANDEDINPUT \
  | sort $SORTTMP -u > $JOINEDPAIRS
# Stop right away if the join failed instead of munching a bogus list.
checkerrorflagfile
sort $SORTTMP -o $EXPANDEDPAIRS $EXPANDEDPAIRS
sort $SORTTMP -o $CRUNCHEDINPUT $CRUNCHEDINPUT

$verbose  &&  echo 'Creating list of legal roots/flags.' 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1f,1 -k 1'
else
    sortopts='+0f -1 +0'
fi
# Each intermediate file is removed as soon as the pipeline stage that
# reads it has finished.
comm -13 $JOINEDPAIRS $EXPANDEDPAIRS \
  | (sed -e 's; .*$;;' ; rm -f $JOINEDPAIRS $EXPANDEDPAIRS) \
  | uniq \
  | (comm -13 - $CRUNCHEDINPUT ; rm -f $CRUNCHEDINPUT) \
  | sort $SORTTMP -u "-t$flagmarker" $sortopts \
  | $COMBINE "$wchars" $langtabs > $LEGALFLAGLIST
checkerrorflagfile

#
# LEGALFLAGLIST now contains root/flag combinations that, when expanded,
# produce only words from EXPANDEDPAIRS.  However, there is still a
# problem if the language tables have any cross-product flags.  A legal
# root may appear in LEGALFLAGLIST with two flags that participate
# in cross-products.  When such a dictionary entry is expanded,
# the cross-products will generate some extra words that may not
# be in EXPANDEDPAIRS.  We need to remove these from LEGALFLAGLIST.
#
# The first step is to collect the names of the flags that participate
# in cross-products.  Ispell will dump the language tables for us, and
# sed is a pretty handy way to strip out extra information.  We use
# uniq -c and a numerical sort to put the flags in approximate order of how
# "productive" they are (in terms of how likely they are to generate a lot
# of output words).  The least-productive flags are given last and will
# be removed first.
#
$verbose \
  &&  echo 'Creating list of flags that participate in cross-products.' 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1rn,1 -k 3'
else
    sortopts='+0rn -1 +2'
fi
# NOTE(review): the exact number of blanks that "ispell -D" prints
# before "flag" could not be recovered from this copy of the script, so
# the patterns below accept any number of leading blanks.  Likewise the
# first argument of the tr step is assumed to have been a tab (uniq -c
# may separate the count from the line with a tab; everything after
# this point expects blanks) -- confirm against a pristine copy.
$ISPELL -D -d $FAKEHASH \
  | sed -n -e '1,$s/:.*$//
	/^flagmarker/d
	/^prefixes/,/^suffixes/s/^ *flag \*/p /p
	/^suffixes/,$s/^ *flag \*/s /p' \
  | sort $SORTTMP \
  | uniq -c \
  | tr "${backslash}011" ' ' \
  | sort $SORTTMP $sortopts > $PRODUCTLIST
checkerrorflagfile

# Cross products are only possible when there is at least one
# cross-product prefix flag AND at least one cross-product suffix flag.
if grep -a -F ' p ' $PRODUCTLIST > /dev/null \
  &&  grep -a -F ' s ' $PRODUCTLIST > /dev/null
then
    #
    # The language tables allow cross products.  See if LEGALFLAGLIST has
    # any roots with multiple cross-product flags.  Put them in CROSSROOTS.
    #
    $verbose  &&  echo 'Finding prefix and suffix flags.' 1>&2
    # The argument of "tr -d" is a literal newline: the flags, one per
    # line in PRODUCTLIST, are glued together into a single string.
    preflags=`sed -n -e 's/^[ 0-9]*p //p' $PRODUCTLIST | tr -d '
'`
    sufflags=`sed -n -e 's/^[ 0-9]*s //p' $PRODUCTLIST | tr -d '
'`
    # grep exits non-zero when nothing matches; that is not an error.
    grep -a -E "$flagmarker.*[$preflags].*[$sufflags]|$flagmarker.*[$sufflags].*[$preflags]" \
	  $LEGALFLAGLIST \
      > $CROSSROOTS  ||  :

    #
    # We will need an awk script; it's so big that it core-dumps my shell
    # under certain conditions.  The rationale behind the script is commented
    # where the script is used.  Note that you may want to change this
    # script for languages other than English.
    #
    case "$flagmarker" in
	/)
	    sedchar=:
	    ;;
	*)
	    sedchar=/
	    ;;
    esac
    $verbose  &&  echo 'Creating awk script.' 1>&2
    # NOTE(review): the flag lists are interpolated into the sed
    # expressions unescaped; a flag character of "/", "&" or a
    # backslash would corrupt the substitution.
    sed -e "s/PREFLAGS/$preflags/" -e "s/SUFFLAGS/$sufflags/" \
      -e "s;ILLEGALCOMBOS;$ILLEGALCOMBOS;" \
      -e "s${sedchar}FLAGMARKER${sedchar}$flagmarker${sedchar}" \
      > $AWKSCRIPT << 'ENDOFAWKSCRIPT'
BEGIN {
    preflags = "PREFLAGS"
    sufflags = "SUFFLAGS"
    illegalcombos = "ILLEGALCOMBOS"
    flagmarker = "FLAGMARKER"
    pflaglen = length (preflags)
    for (i = 1;  i <= pflaglen;  i++)
	pflags[i] = substr (preflags, i, 1);
    sflaglen = length (sufflags)
    for (i = 1;  i <= sflaglen;  i++)
	sflags[i] = substr (sufflags, i, 1);
}
{
    len = length ($2)
    pnew2 = ""
    snew2 = ""
    pbad = ""
    sbad = ""
    sufs = 0
    pres = 0
    for (i = 1;  i <= len;  i++) {
	curflag = substr ($2, i, 1)
	for (j = 1;  j <= pflaglen;  j++) {
	    if (pflags[j] == curflag) {
		pres++
		pnew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
		pbad = curflag
	    }
	}
	for (j = 1;  j <= sflaglen;  j++) {
	    if (sflags[j] == curflag) {
		sufs++
		snew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
		sbad = curflag
	    }
	}
    }
    if (pres == 1) {
	print $1 flagmarker pnew2
	print $1 flagmarker pbad >> illegalcombos
    } else if (sufs == 1) {
	print $1 flagmarker snew2
	print $1 flagmarker sbad >> illegalcombos
    } else if (pres > 0) {
	print $1 flagmarker pnew2
	print $1 flagmarker pbad >> illegalcombos
    } else {
	print $1 flagmarker snew2
	print $1 flagmarker sbad >> illegalcombos
    }
}
ENDOFAWKSCRIPT
    : > $ILLEGALCOMBOS
    dbnum=1
    while [ -s $CROSSROOTS ]
    do
	#
	# CROSSROOTS contains the roots whose cross-product expansions
	# might be illegal.  We now need to locate the actual illegal ones.
	# We do this in much the same way we created LEGALFLAGLIST from
	# CRUNCHEDINPUT.  First we make CROSSEXPANDED, which is analogous
	# to EXPANDEDPAIRS.
	#
	$verbose  &&  echo "Creating cross expansions (pass $dbnum)." 1>&2
	if $CRETIN_SORT
	then
	    sortopts='-k 2'
	else
	    sortopts='+1'
	fi
	$ISPELL "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CROSSROOTS \
	  | sort $SORTTMP $sortopts > $CROSSEXPANDED
	checkerrorflagfile
	#
	# Now we join CROSSEXPANDED against EXPANDEDINPUT to produce
	# CROSSPAIRS, and then comm that against CROSSEXPANDED to
	# get CROSSILLEGAL, the list of illegal cross-product flag
	# combinations.
	#
	$JOIN -j1 2 -o 1.1 1.2 $SIGNED $CROSSEXPANDED $EXPANDEDINPUT \
	  | sort $SORTTMP -u > $CROSSPAIRS
	checkerrorflagfile

	sort $SORTTMP -u -o $CROSSEXPANDED $CROSSEXPANDED

	$verbose \
	  &&  echo "Finding illegal cross expansions (pass $dbnum)." 1>&2
	comm -13 $CROSSPAIRS $CROSSEXPANDED \
	  | sed -e 's; .*$;;' \
	  | uniq > $CROSSILLEGAL

	if [ "$debug" = yes ]
	then
	    # Keep a per-pass snapshot of each working file.
	    mv $CROSSROOTS $DBDIR/CROSSROOTS.$dbnum
	    ln $CROSSEXPANDED $DBDIR/CROSSEXP.$dbnum
	    ln $CROSSPAIRS $DBDIR/CROSSPAIRS.$dbnum
	    ln $CROSSILLEGAL $DBDIR/CROSSILLEGAL.$dbnum
	fi
	#
	# Now it is time to try to clear up the illegalities.  For
	# each word in the illegal list, remove one of the cross-product
	# flags.  The flag chosen is selected in an attempt to cure the
	# problem quickly, as follows:  (1) if there is only one suffix
	# flag or only one prefix flag, we remove that.  (2) If there is
	# a prefix flag, we remove the "least desirable" (according to
	# the order of preflags).  (This may be pro-English prejudice,
	# and you might want to change this if your language is prefix-heavy).
	# (3) Otherwise we remove the least-desirable suffix flag
	#
	# The output of the awk script becomes the new CROSSROOTS.  In
	# addition, we add the rejected flags to ILLEGALCOMBOS (this is done
	# inside the awk script) so they can be removed from LEGALFLAGLIST
	# later.
	#
	awk "-F$flagmarker" -f $AWKSCRIPT $CROSSILLEGAL > $CROSSROOTS
	if [ "$debug" = yes ]
	then
	    # Unlink the working names so that the next pass creates
	    # fresh files rather than overwriting the snapshots that
	    # were hard-linked into DBDIR above.
	    rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL
	fi
	dbnum=`expr $dbnum + 1`
	if [ $dbnum -gt 100 ]
	then
	    echo "Too many passes, aborting cross-product loop.  Munchlist failed." 1>&2
	    if [ "X$MUNCHMAIL" != X ]
	    then
		(
		ls -ld ${DBDIR}/[A-Z]*
		cat ${MAILDEBUGDIR}/munchlist.mail
		) | mail -s 'Munchlist debug output' "$MUNCHMAIL"
		rm -f ${MAILDEBUGDIR}/munchlist.mail
	    fi
	    rm -rf "$MUNCHDIR"
	    exit 1
	fi
    done
    rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $AWKSCRIPT
    #
    # Now we have, in ILLEGALCOMBOS, a list of root/flag combinations
    # that must be removed from LEGALFLAGLIST to get the final list
    # of truly legal flags.  ILLEGALCOMBOS has one flag per line, so
    # by turning LEGALFLAGLIST into this form (sed), it's an
    # easy task for comm.  We have to recombine flags again after the
    # extraction, to get all flags for a given root on the same line so that
    # cross-products will come out right.
    #
    if [ -s $ILLEGALCOMBOS ]
    then
	sort $SORTTMP -u -o $ILLEGALCOMBOS $ILLEGALCOMBOS
	$verbose  &&  echo 'Finding roots of cross expansions.' 1>&2
	if $CRETIN_SORT
	then
	    sortopts='-k 1f,1 -k 1'
	else
	    sortopts='+0f -1 +0'
	fi
	# The sed script splits "root/XYZ" into "root/X", "root/Y" and
	# "root/Z", one per line; the backslash-newline inside the
	# replacement inserts the line break.
	# NOTE(review): this sed script hard-codes "/" as the flag
	# marker instead of using $flagmarker.
	sort $SORTTMP $LEGALFLAGLIST \
	  | sed -e '/\/../{
		s;^\(.*\)/\(.\)\(.*\);\1/\2\
\1/\3;
		P
		D
		}' \
	  | comm -23 - $ILLEGALCOMBOS \
	  | sort $SORTTMP -u "-t$flagmarker" $sortopts \
	  | $COMBINE "$wchars" $langtabs > $CROSSROOTS
	checkerrorflagfile
	mv $CROSSROOTS $LEGALFLAGLIST
	if [ "$debug" = yes ]
	then
	    rm -f ${DBDIR}/LEGALFLAGLIST1
	    ln $LEGALFLAGLIST ${DBDIR}/LEGALFLAGLIST1
	fi
    fi
fi
rm -f $PRODUCTLIST $CROSSROOTS $ILLEGALCOMBOS $EXPANDEDINPUT
#
# We now have (in LEGALFLAGLIST) a list of roots and flags which will
# accept words taken from EXPANDEDINPUT and no others (though some of
# EXPANDEDINPUT is not covered by this list).  However, many of the
# expanded words can be generated in more than one way.  For example,
# "bather" can be generated from "bath/R" and "bathe/R".  This wastes
# unnecessary space in the raw dictionary and, in some cases, in the
# hash file as well.  The solution is to list the various ways of
# getting a given word and choose exactly one.  All other things being
# equal, we want to choose the one with the highest expansion length
# to root length ratio.
# The $ISPELL -e4 option takes care of this by
# providing us with a field to sort on.
#
# The ispell/awk combination is similar to the ispell/sed pipe used to
# generate EXPANDEDPAIRS, except that ispell adds an extra field
# giving the sort order.  The first sort gets things in order so the
# first root listed is the one we want, and the second sort (-um) then
# selects that first root.  Sed strips the expansion from the root,
# and a final sort -u generates MINIMALAFFIXES, the final list of
# affixes that (more or less) minimally covers what it can from
# EXPANDEDINPUT.
#
$verbose  &&  echo 'Eliminating non-optimal affixes.' 1>&2
if $CRETIN_SORT
then
    # The old-style key "+2rn -3" is the third field alone, i.e.
    # "-k 3rn,3".  (A key of "-k 2rn,3" would start at the expanded
    # word, which is not a number, so the ratio would never be
    # compared.)
    sortopts1='-k 2,2 -k 3rn,3 -k 1,1'
    sortopts2='-k 2,2'
    sortopts3='-k 1f,1 -k 1'
else
    sortopts1='+1 -2 +2rn -3 +0 -1'
    sortopts2='+1 -2'
    sortopts3='+0f -1 +0'
fi
$ISPELL "$wchars" -e4 -d $FAKEHASH -p /dev/null < $LEGALFLAGLIST \
  | sort $SORTTMP $sortopts1 \
  | sort $SORTTMP -um $sortopts2 \
  | sed -e 's; .*$;;' \
  | sort $SORTTMP -u "-t$flagmarker" $sortopts3 > $MINIMALAFFIXES
# Stop right away if ispell failed instead of building a dictionary
# from a truncated list.
checkerrorflagfile
rm -f $LEGALFLAGLIST
#
# Now we're almost done.  MINIMALAFFIXES covers some (with luck, most)
# of the words in STRIPPEDINPUT.  Now we must create a list of the remaining
# words (those omitted by MINIMALAFFIXES) and add it to MINIMALAFFIXES.
# The best way to do this is to actually build a partial dictionary from
# MINIMALAFFIXES in FAKEHASH, and then use $ISPELL -l to list the words that
# are not covered by this dictionary.  This must then be combined with the
# reduced version of MINIMALAFFIXES and sorted to produce the final result.
#
$verbose  &&  echo "Generating output word list." 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1f,1 -k 1'
else
    sortopts='+0f -1 +0'
fi
if [ -s $MINIMALAFFIXES ]
then
    $BUILDHASH -s $MINIMALAFFIXES $langtabs $FAKEHASH > /dev/null \
      ||  (echo "Couldn't create intermediate hash file" 1>&2;
	    rm -rf "$MUNCHDIR"; exit 1) \
      ||  exit 1
    if [ "$debug" = yes ]
    then
	# Preserve the statistics file under the same name that is
	# removed just before, so that a rerun with a persistent debug
	# directory does not abort on an already-existing link.  The
	# file is only linked if it actually exists.
	rm -f ${DBDIR}/MINIMALAFFIXES.stat
	if [ -r "$MINIMALAFFIXES.stat" ]
	then
	    ln "$MINIMALAFFIXES.stat" ${DBDIR}/MINIMALAFFIXES.stat
	fi
    fi
    ($ISPELL "$wchars" -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT; \
	$COMBINE "$wchars" $langtabs < $MINIMALAFFIXES) \
      | sort $SORTTMP "-t$flagmarker" -u $sortopts
else
    # MINIMALAFFIXES is empty; just produce a sorted version of STRIPPEDINPUT
    sort $SORTTMP "-t$flagmarker" -u $sortopts $STRIPPEDINPUT
fi
checkerrorflagfile
if [ "X$MUNCHMAIL" != X ]
then
    # Mail the list of debug files and the collected shell trace.
    (
    ls -ld ${DBDIR}/[A-Z]*
    cat ${MAILDEBUGDIR}/munchlist.mail
    ) | mail -s 'Munchlist debug output' "$MUNCHMAIL"
    rm -f ${MAILDEBUGDIR}/munchlist.mail
fi
rm -rf "$MUNCHDIR"
exit 0