source: trunk/third/ispell/munchlist.X @ 10894

Revision 10894, 25.5 KB checked in by ghudson, 27 years ago (diff)
/usr/tmp -> /var/tmp
  • Property svn:executable set to *
Line 
1: Use /bin/sh
2#
3# $Id: munchlist.X,v 1.2 1997-12-13 16:07:36 ghudson Exp $
4#
5# Copyright 1987, 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11#
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. All modifications to the source code must be clearly marked as
18#    such.  Binary redistributions based on modified source code
19#    must be clearly marked as modified versions in the documentation
20#    and/or other materials provided with the distribution.
21# 4. All advertising materials mentioning features or use of this software
22#    must display the following acknowledgment:
23#      This product includes software developed by Geoff Kuenning and
24#      other unpaid contributors.
25# 5. The name of Geoff Kuenning may not be used to endorse or promote
26#    products derived from this software without specific prior
27#    written permission.
28#
29# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
30# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
33# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39# SUCH DAMAGE.
40#
41#       Given a list of words for ispell, generate a reduced list
42#       in which all possible affixes have been collapsed.  The reduced
43#       list will match the same list as the original.
44#
45#       Usage:
46#
47#       munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] \
48#         [-w chars] [-v] [file] ...
49#
50#       Options:
51#
52#       -l lang Specifies the language table to be used.  The default
53#               is "$LIBDIR/!!DEFLANG!!".
54#       -c lang Specifies "conversion" language table.  If this option is
55#               given, the input file(s) will be assumed to be described by
56#               this table, rather than the table given in the -l option.
57#               This may be used to convert between incompatible language
58#               tables.  (When in doubt, use this option -- it doesn't
59#               hurt, and it may save you from creating a dictionary that has
60#               illegal words in it).  The default is no conversion.
61#       -T suff Specifies that the source word lists are in the format
62#               of a "suff"-suffixed file, rather than in the
63#               canonical form.  For example, "-T tex" specifies that
64#               string characters in the word lists are in TeX format.
65#               The string character conversions are taken from the language
66#               table specified by the "-l" switch.
67#       -s      Remove any words that are already covered by the
68#               dictionary in 'hashfile'.  The words will be removed
69#               only if all affixes are covered.  This option should not be
70#               specified when the main dictionary is being munched.
71#               'Hashfile' must have been created with the language
72#               table given in the -l option, but this is not checked.
73#       -D      Leave temporary files for debugging purposes
74#       -w      Passed on to ispell (specify chars that are part of a word)
75#               Unfortunately, special characters must be quoted twice
76#               rather than once when invoking this script.  Also, since
77#               buildhash doesn't accept this option, the final ispell -l
78#               step ignores it, making it somewhat less than useful.
79#       -v      Report progress to stderr.
80#
81#       The given input files are merged, then processed by 'ispell -c'
82#       to generate possible affix lists;  these are then combined
83#       and reduced.  The final result is written to standard output.
84#
85#       For portability to older systems, I have avoided getopt.
86#
87#               Geoff Kuenning
88#               2/28/87
89#
90# $Log: not supported by cvs2svn $
91# Revision 1.1.1.1  1997/09/03 21:08:12  ghudson
92# Import of ispell 3.1.20
93#
94# Revision 1.53  1995/01/08  23:23:36  geoff
95# Support variable hashfile suffixes for DOS purposes.
96#
97# Revision 1.52  1994/12/27  23:08:46  geoff
98# Dynamically determine how to pass backslashes to 'tr' so that it'll
99# work on any machine.  Define LC_CTYPE to work around yet more
100# internationalized sort programs.  Work around a bug in GNU uniq that
101# uses the wrong separator between counts and duplicated lines.
102#
103# Revision 1.51  1994/11/21  07:02:54  geoff
104# Correctly quote the arguments to 'tr' when detecting systems with
105# unsigned sorts.  Be sure to provide a zero exit status on all systems,
106# even if MUNCHDEBUG is not set.
107#
108# Revision 1.50  1994/10/25  05:46:05  geoff
109# Export values for LANG and LOCALE in an attempt to override some
110# stupidly-internationalized sort programs.
111#
112# Revision 1.49  1994/10/04  03:51:30  geoff
113# Add the MUNCHMAIL feature.  If the MUNCHMAIL environment variable is
114# set to an email address, debugging information about the munchlist run
115# will automatically be collected and mailed to that address.
116#
117# Revision 1.48  1994/05/17  06:32:06  geoff
118# Don't look for affix tables in LIBDIR if the name contains a slash
119#
120# Revision 1.47  1994/04/27  02:50:48  geoff
121# Fix some cosmetic flaws in the verbose-mode messages.
122#
123# Revision 1.46  1994/01/25  07:11:59  geoff
124# Get rid of all old RCS log lines in preparation for the 3.1 release.
125#
126#
#
# MUNCHMAIL debugging aid.  When the MUNCHMAIL environment variable is
# non-empty, standard error is diverted to a scratch file, the command
# line is recorded there, and shell tracing is switched on so that the
# entire run is logged.
#
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        exec 2> /tmp/munchlist.mail
        echo "munchlist $*" 1>&2
        set -vx
        ;;
esac
#
# Installation-dependent settings.  The !!...!! markers look like
# placeholders that are filled in when the script is installed
# (NOTE(review): confirm against the build rules).
#
LIBDIR=!!LIBDIR!!
# Scratch directory: honor TMPDIR when it is set, else use /var/tmp.
TDIR=${TMPDIR-/var/tmp}
# Common prefix of every temporary file created by this run.
TMP=${TDIR}/munch$$
SORTTMP="-T ${TDIR}"                    # !!SORTTMP!!
#
# Helper programs: use a copy in the current directory when one is
# readable there, otherwise rely on the search path.
#
COMBINE=icombine
JOIN=ijoin
if [ -r ./icombine ]; then COMBINE=./icombine; fi
if [ -r ./ijoin ]; then JOIN=./ijoin; fi
149
#
# The following is necessary so that internationalized versions of
# sort(1) (and the other text utilities used below) don't confuse things
# by ordering their output in a nonstandard, locale-specific way.
#
# Collation is governed by LC_COLLATE, and LC_ALL overrides both LANG and
# every individual LC_* category.  Setting only LANG and LC_CTYPE would
# therefore be defeated by an LC_ALL or LC_COLLATE inherited from the
# caller's environment, so those two are forced to "C" as well.
#
LANG=C
LOCALE=C
LC_CTYPE=C
LC_COLLATE=C
LC_ALL=C
export LANG LOCALE LC_CTYPE LC_COLLATE LC_ALL
158
#
# Option defaults.
#
debug=no
verbose=false
strip=no
dictopt=
icflags=
convtabs=
langtabs=${LIBDIR}/!!DEFLANG!!
# Ispell must never be handed a null argument, so when -w is not given we
# pass "-wA" instead.  As long as "A" already belongs to the character
# set, ispell ignores it.
wchars=-wA
#
# Hand-rolled option scan (getopt is avoided for portability).  Options
# that take a value consume it with an extra "shift" inside their arm;
# the "shift" at the bottom of the loop then discards the option itself.
# Scanning stops at "--", at a lone "-", or at the first non-option word.
#
until [ $# -eq 0 ]
do
    case "$1" in
        -D)
            debug=yes
            ;;
        -v)
            verbose=true
            ;;
        -l)
            # A name containing a slash, or one that is readable as
            # given, is used unchanged; anything else is looked up in
            # LIBDIR.
            langtabs=$2
            case "$2" in
                */*)
                    ;;
                *)
                    [ -r "$2" ]  ||  langtabs="${LIBDIR}/$2"
                    ;;
            esac
            if [ ! -r "$langtabs" ]
            then
                echo "Can't open language table '$2'" 1>&2
                exit 1
            fi
            shift
            ;;
        -c)
            # Try the name as given first, then fall back to LIBDIR.
            convtabs="$2"
            if [ ! -r "$convtabs" ]
            then
                convtabs="${LIBDIR}/$2"
                if [ ! -r "$convtabs" ]
                then
                    echo "Can't open conversion language table '$2'" 1>&2
                    exit 1
                fi
            fi
            shift
            ;;
        -s)
            strip=yes
            dictopt="-d $2"
            shift
            ;;
        -T)
            icflags="-T $2"
            shift
            ;;
        -w)
            wchars="-w$2"
            shift
            ;;
        --)
            # Explicit end of options: drop the marker and stop.
            shift
            break
            ;;
        -)
            # A lone "-" is left in place as the first file argument.
            break
            ;;
        -*)
            echo 'Usage: munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [-v] [file] ...' 1>&2
            exit 2
            ;;
        *)
            break
            ;;
    esac
    shift
done
#
# A MUNCHMAIL run is always verbose and always keeps its debugging files,
# whatever options were given.
#
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        verbose=true
        debug=yes
        ;;
esac
#
# On hangup, interrupt, broken pipe, or termination, remove every
# temporary file of this run and exit with a failure status.
#
trap "/bin/rm -f ${TMP}*; exit 1" HUP INT PIPE TERM
#
# Temporary files: one suffix letter per intermediate result.  The
# symbolic names exist only to make the rest of the script readable.
#
EXPANDEDINPUT=${TMP}a
STRIPPEDINPUT=${TMP}b
CRUNCHEDINPUT=${TMP}c
PRODUCTLIST=${TMP}d
EXPANDEDPAIRS=${TMP}e
LEGALFLAGLIST=${TMP}f
JOINEDPAIRS=${TMP}g
MINIMALAFFIXES=${TMP}h
CROSSROOTS=${TMP}i
CROSSEXPANDED=${TMP}j
CROSSPAIRS=${TMP}k
CROSSILLEGAL=${TMP}l
ILLEGALCOMBOS=${TMP}m
FAKEDICT=${TMP}n
# Ispell insists that hash files have a "!!HASHSUFFIX!!" suffix
FAKEHASH=${TMP}o!!HASHSUFFIX!!
AWKSCRIPT=${TMP}p
#
# In debugging mode every temporary file is created up front and given a
# second, fixed name in TDIR via a hard link, so its contents remain
# available under that name after the munch$$ names are removed.  Links
# left behind by an earlier run are removed first.
#
case "$debug" in
    yes)
        # Link temporary file $1 to the fixed name $2 inside TDIR.
        munch_debuglink ()
        {
            ln $1 ${TDIR}/$2
        }
        touch $EXPANDEDINPUT $STRIPPEDINPUT $CRUNCHEDINPUT $PRODUCTLIST \
          $EXPANDEDPAIRS $LEGALFLAGLIST $JOINEDPAIRS $MINIMALAFFIXES \
          $CROSSROOTS $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL \
          $ILLEGALCOMBOS $FAKEDICT $FAKEHASH $AWKSCRIPT
        rm -f ${TDIR}/EXPANDEDINPUT ${TDIR}/STRIPPEDINPUT \
          ${TDIR}/CRUNCHEDINPUT ${TDIR}/PRODUCTLIST ${TDIR}/EXPANDEDPAIRS \
          ${TDIR}/LEGALFLAGLIST ${TDIR}/JOINEDPAIRS ${TDIR}/MINIMALAFFIXES \
          ${TDIR}/CROSSROOTS ${TDIR}/CROSSEXPANDED ${TDIR}/CROSSPAIRS \
          ${TDIR}/CROSSILLEGAL ${TDIR}/ILLEGALCOMBOS ${TDIR}/FAKEDICT \
          ${TDIR}/FAKEHASH!!HASHSUFFIX!! ${TDIR}/AWKSCRIPT \
          ${TDIR}/CROSSROOTS.[0-9]* ${TDIR}/CROSSEXP.[0-9]* \
          ${TDIR}/CROSSPAIRS.[0-9]* ${TDIR}/CROSSILLEGAL.[0-9]*
        munch_debuglink $EXPANDEDINPUT EXPANDEDINPUT
        munch_debuglink $STRIPPEDINPUT STRIPPEDINPUT
        munch_debuglink $CRUNCHEDINPUT CRUNCHEDINPUT
        munch_debuglink $PRODUCTLIST PRODUCTLIST
        munch_debuglink $EXPANDEDPAIRS EXPANDEDPAIRS
        munch_debuglink $LEGALFLAGLIST LEGALFLAGLIST
        munch_debuglink $JOINEDPAIRS JOINEDPAIRS
        munch_debuglink $MINIMALAFFIXES MINIMALAFFIXES
        munch_debuglink $CROSSROOTS CROSSROOTS
        munch_debuglink $CROSSEXPANDED CROSSEXPANDED
        munch_debuglink $CROSSPAIRS CROSSPAIRS
        munch_debuglink $CROSSILLEGAL CROSSILLEGAL
        munch_debuglink $ILLEGALCOMBOS ILLEGALCOMBOS
        munch_debuglink $FAKEDICT FAKEDICT
        munch_debuglink $FAKEHASH FAKEHASH!!HASHSUFFIX!!
        munch_debuglink $AWKSCRIPT AWKSCRIPT
        ;;
esac
#
# Build a throwaway one-word dictionary whose only purpose is to carry a
# compiled copy of a language table.  At this stage it is compiled from
# the conversion table when one was given with -c; otherwise the regular
# language table doubles as the conversion table.
#
if [ "X$convtabs" = X ]
then
    convtabs="$langtabs"
fi
echo 'QQQQQQQQ' > $FAKEDICT
if buildhash -s $FAKEDICT $convtabs $FAKEHASH
then
    :
else
    # Without the hash file nothing else can work: clean up and give up.
    echo "Couldn't create fake hash file" 1>&2
    /bin/rm -f ${TMP}*
    exit 1
fi
#
# Work out how 'sort' orders characters that have the top bit set, so that
# ijoin can be told to compare the same way.  SIGNED ends up as "-s" if
# such characters sort before ordinary ones, and as "-u" if they sort
# after.
#
# The probe feeds the two lines "-s" and "-u" through a pipe.  The first
# "tr" rewrites the "s" as the byte with octal code 365, the lines are
# sorted, and the second "tr" changes that byte back to "s".  "sed 1q"
# then keeps whichever line sorted first: "-s" if the high-bit byte came
# first, "-u" otherwise.
#
# Systems disagree about how many backslashes are needed to get an octal
# escape through to 'tr', so that is determined first: keep doubling the
# backslash string (at most four tries) until "\141" is understood as
# the letter "a".
#
backslash=\\
for attempt in 1 2 3 4
do
    probe=`echo a | tr "${backslash}141" b`
    case "$probe" in
        b)
            break
            ;;
    esac
    backslash="$backslash$backslash"
done
SIGNED=`printf '%s\n' -s -u |
    tr s "${backslash}365" | sort | tr "${backslash}365" s | sed 1q`
#
# Collect all the input (the named files, or standard input when no files
# were given), expand all the affix options (ispell -e1), and put one word
# on each line.  The result is preserved, sorted, in EXPANDEDINPUT for
# later joining.  The icombine step is to make sure that unneeded
# capitalizations (e.g., Farmer and farmer) are weeded out.  The first
# sort must be folded for icombine;  the second must be unfolded for join.
#
# The sort key is written with -k:  "-k1f,1 -k1" means "first field,
# case-folded, then the whole line unfolded".  The obsolete "+0f -1 +0"
# spelling of the same key is taken to be a list of file names by sort
# implementations that follow POSIX.1-2001.
#
$verbose  &&  echo "Collecting input." 1>&2
if [ $# -eq 0 ]
then
    ispell "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
else
    cat "$@" | ispell "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
fi \
  | sort $SORTTMP -u -k1f,1 -k1 \
  | $COMBINE $icflags $langtabs \
  | sort $SORTTMP -u > $EXPANDEDINPUT
#
# The input has now been read.  If it was interpreted through a separate
# conversion table, the fake hash file has to be rebuilt from the "real"
# language table before going any further.  (The unquoted $langtabs is
# deliberately left as a case pattern.)
#
case "$convtabs" in
    $langtabs)
        ;;
    *)
        if buildhash -s $FAKEDICT $langtabs $FAKEHASH
        then
            :
        else
            echo "Couldn't create fake hash file" 1>&2
            /bin/rm -f ${TMP}*
            exit 1
        fi
        ;;
esac
# The one-word dictionary (and anything sharing its name prefix) is no
# longer needed.
/bin/rm -f ${FAKEDICT}*
#
# Produce STRIPPEDINPUT, the final list of expanded words that the output
# dictionary must cover.  Without -s this is just EXPANDEDINPUT under a
# second name (a hard link).  With -s, ispell -l is run against the
# dictionary named by the option, leaving only the words which that
# dictionary does not already cover.
#
case "X$strip" in
    Xno)
        rm -f $STRIPPEDINPUT
        ln $EXPANDEDINPUT $STRIPPEDINPUT
        case "$debug" in
            yes)
                # Re-point the debugging name at the new link.
                rm -f ${TDIR}/STRIPPEDINPUT
                ln $STRIPPEDINPUT ${TDIR}/STRIPPEDINPUT
                ;;
        esac
        ;;
    *)
        $verbose  &&  echo "Stripping words already in the dictionary." 1>&2
        ispell "$wchars" -l $dictopt -p /dev/null < $EXPANDEDINPUT \
          > $STRIPPEDINPUT
        ;;
esac
#
# Ask ispell to dump the language table (-D) and pull the flag-marking
# character out of its "flagmarker" line.  If the value is reported with
# a leading backslash, keep only the character that follows it.
#
$verbose  &&  echo "Finding flag marker." 1>&2
flagmarker=`ispell -D -d $FAKEHASH | sed -n -e '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*) flagmarker=`expr "$flagmarker" : '.\(.\)'` ;;
esac
#
# Munch the input to generate roots and affixes (ispell -c), one entry
# per line.  We are only interested in words that have at least one affix
# (egrep $flagmarker);  a later step will pick up the rest.  Some of the
# roots are illegal.  We use join to restrict the output to those root
# words that are found in the original dictionary (EXPANDEDINPUT).  The
# result is left in CRUNCHEDINPUT.
#
# The sort key "-k1,1 -k2" (root, then flags, with the flag marker as the
# field separator) replaces the obsolete "+0 -1 +1" spelling, which sort
# implementations that follow POSIX.1-2001 treat as file names.
#
$verbose  &&  echo "Generating roots and affixes." 1>&2
ispell "$wchars" -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \
  | tr " " '
' \
  | egrep "$flagmarker" | sort $SORTTMP -u "-t$flagmarker" -k1,1 -k2 \
  | $JOIN $SIGNED "-t$flagmarker" - $EXPANDEDINPUT > $CRUNCHEDINPUT
#
# We now have a list of legal roots, and of affixes that apply to the
# root words.  However, it is possible for some affix flags to generate more
# than one output word.  For example, with the flag table entry
#
#       flag R: . > ER
#               . > ERS
#
# the input "BOTHER" will generate an entry "BOTH/R" in CRUNCHEDINPUT.  But
# this will accept "BOTHER" and "BOTHERS" in the dictionary, which is
# wrong (in this case, though it's good English).
#
# To cure this problem, we first have to know which flags generate which
# expansions.  We use ispell -e3 to expand the flags in a form that keeps
# the root and flag in the output alongside each expansion, and so get
# pairs suitable for joining.  In the example above, we would get
#
#       BOTH/R BOTHER
#       BOTH/R BOTHERS
#
# We save this in EXPANDEDPAIRS for the next step, sorted on the second
# field (the expansion).  "-k2" replaces the obsolete "+1" key, which
# sort implementations that follow POSIX.1-2001 treat as a file name.
#
$verbose  &&  echo 'Expanding dictionary into EXPANDEDPAIRS.' 1>&2
ispell "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CRUNCHEDINPUT \
  | sort $SORTTMP -k2 > $EXPANDEDPAIRS
#
# Now we want to extract the lines in EXPANDEDPAIRS in which the second field
# is *not* listed in the original dictionary EXPANDEDINPUT;  these illegal
# lines contain the flags we cannot include without accepting illegal words.
# It is somewhat easier to extract those which actually are listed (with
# join), and then use comm to strip these from EXPANDEDPAIRS to get the
# illegal expansions, together with the flags that generate them (we must
# re-sort EXPANDEDPAIRS before running comm).  Sed
# gets rid of the expansion and uniq gets rid of duplicates.  Comm then
# selects the remainder of the list from CRUNCHEDINPUT and puts it in
# LEGALFLAGLIST.  The final step is to use a sort and icombine to put
# the list into a one-entry-per-root format.
#
# The parenthesized pipeline stages remove their input files as soon as
# the command reading them has finished.
#
# BTW, I thought of using cut for the sed step (on systems that have it),
# but it turns out that sed is faster!
#
# The final sort key "-k1f,1 -k1" (root case-folded, then the whole line)
# replaces the obsolete "+0f -1 +0" spelling, which sort implementations
# that follow POSIX.1-2001 treat as file names.
#
$JOIN -j1 2 -o 1.1 1.2 $SIGNED $EXPANDEDPAIRS $EXPANDEDINPUT \
  | sort $SORTTMP -u > $JOINEDPAIRS

sort $SORTTMP -o $EXPANDEDPAIRS $EXPANDEDPAIRS
sort $SORTTMP -o $CRUNCHEDINPUT $CRUNCHEDINPUT

$verbose  &&  echo 'Creating list of legal roots/flags.' 1>&2
comm -13 $JOINEDPAIRS $EXPANDEDPAIRS \
  | (sed -e 's; .*$;;' ; /bin/rm -f $JOINEDPAIRS $EXPANDEDPAIRS) \
  | uniq \
  | (comm -13 - $CRUNCHEDINPUT ; /bin/rm -f $CRUNCHEDINPUT) \
  | sort $SORTTMP -u "-t$flagmarker" -k1f,1 -k1 \
  | $COMBINE $langtabs > $LEGALFLAGLIST
469
#
# LEGALFLAGLIST now contains root/flag combinations that, when expanded,
# produce only words from EXPANDEDPAIRS.  However, there is still a
# problem if the language tables have any cross-product flags.  A legal
# root may appear in LEGALFLAGLIST with two flags that participate
# in cross-products.  When such a dictionary entry is expanded,
# the cross-products will generate some extra words that may not
# be in EXPANDEDPAIRS.  We need to remove these from LEGALFLAGLIST.
#
# The first step is to collect the names of the flags that participate
# in cross-products.  Ispell will dump the language tables for us, and
# sed is a pretty handy way to strip out extra information:  it cuts each
# line at its first colon, drops the flagmarker line, and rewrites the
# "  flag *" lines as "p <flag>" in the prefix section and "s <flag>" in
# the suffix section.  We use uniq -c and a numerical sort to put the
# flags in approximate order of how "productive" they are (in terms of
# how likely they are to generate a lot of output words).  The
# least-productive flags are given last and will be removed first.
#
# Some versions of uniq put a tab rather than a space between the count
# and the line, so any tab is turned into a space before the final sort.
# The tab is written as an octal escape using the backslash convention
# that was determined earlier for 'tr'.  (NOTE(review): the listing this
# was taken from showed a run of blanks as the first 'tr' operand,
# presumably an expanded literal tab -- confirm against the repository.)
#
# The final sort key "-k1rn,1 -k3" (count, numeric and descending, then
# the flag) replaces the obsolete "+0rn -1 +2" spelling, which sort
# implementations that follow POSIX.1-2001 treat as file names.
#
$verbose \
  &&  echo 'Creating list of flags that participate in cross-products.' 1>&2
ispell -D -d $FAKEHASH \
  | sed -n '1,$s/:.*$//
    /^flagmarker/d
    /^prefixes/,/^suffixes/s/^  flag \*/p /p
    /^suffixes/,$s/^  flag \*/s /p' \
  | sort $SORTTMP \
  | uniq -c \
  | tr "${backslash}011" ' ' \
  | sort $SORTTMP -k1rn,1 -k3 > $PRODUCTLIST
498
#
# Cross-product repair.  This section runs only when PRODUCTLIST names at
# least one prefix flag (" p ") and at least one suffix flag (" s ").
#
if [ `egrep ' p ' $PRODUCTLIST | wc -l` -gt 0 \
  -a `egrep ' s ' $PRODUCTLIST | wc -l` -gt 0 ]
then
    #
    # The language tables allow cross products.  See if LEGALFLAGLIST has
    # any roots that carry both a prefix and a suffix cross-product flag
    # (in either order).  Put them in CROSSROOTS.  preflags and sufflags
    # are the flag letters, run together into one string each, in the
    # order in which PRODUCTLIST lists them.
    #
    $verbose  &&  echo 'Finding prefix and suffix flags.' 1>&2
    preflags=`sed -n 's/^[ 0-9]*p //p' $PRODUCTLIST | tr -d '
'`
    sufflags=`sed -n 's/^[ 0-9]*s //p' $PRODUCTLIST | tr -d '
'`
    egrep "$flagmarker.*[$preflags].*[$sufflags]|$flagmarker.*[$sufflags].*[$preflags]" \
      $LEGALFLAGLIST \
      > $CROSSROOTS

    #
    # We will need an awk script;  it's so big that it core-dumps my shell
    # under certain conditions.  The rationale behind the script is commented
    # where the script is used.  Note that you may want to change this
    # script for languages other than English.
    #
    # The script text below is a template:  sed fills in the PREFLAGS,
    # SUFFLAGS, ILLEGALCOMBOS, and FLAGMARKER words before it is written
    # to AWKSCRIPT.  sedchar is a sed delimiter chosen so that it cannot
    # collide with the flag marker.
    #
    case "$flagmarker" in
        /)
            sedchar=:
            ;;
        *)
            sedchar=/
            ;;
    esac
    $verbose  &&  echo 'Creating awk script.' 1>&2
    sed -e "s/PREFLAGS/$preflags/" -e "s/SUFFLAGS/$sufflags/" \
      -e "s;ILLEGALCOMBOS;$ILLEGALCOMBOS;" \
      -e "s${sedchar}FLAGMARKER${sedchar}$flagmarker${sedchar}" \
      > $AWKSCRIPT << 'ENDOFAWKSCRIPT'
        BEGIN \
            {
            preflags = "PREFLAGS"
            sufflags = "SUFFLAGS"
            illegalcombos = "ILLEGALCOMBOS"
            flagmarker = "FLAGMARKER"
            pflaglen = length (preflags)
            for (i = 1;  i <= pflaglen;  i++)
                pflags[i] = substr (preflags, i, 1);
            sflaglen = length (sufflags)
            for (i = 1;  i <= sflaglen;  i++)
                sflags[i] = substr (sufflags, i, 1);
            }
            {
            len = length ($2)
            pnew2 = ""
            snew2 = ""
            pbad = ""
            sbad = ""
            sufs = 0
            pres = 0
            for (i = 1;  i <= len;  i++)
                {
                curflag = substr ($2, i, 1)
                for (j = 1;  j <= pflaglen;  j++)
                    {
                    if (pflags[j] == curflag)
                        {
                        pres++
                        pnew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
                        pbad = curflag
                        }
                    }
                for (j = 1;  j <= sflaglen;  j++)
                    {
                    if (sflags[j] == curflag)
                        {
                        sufs++
                        snew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
                        sbad = curflag
                        }
                    }
                }
            if (pres == 1)
                {
                print $1 flagmarker pnew2
                print $1 flagmarker pbad >> illegalcombos
                }
            else if (sufs == 1)
                {
                print $1 flagmarker snew2
                print $1 flagmarker sbad >> illegalcombos
                }
            else if (pres > 0)
                {
                print $1 flagmarker pnew2
                print $1 flagmarker pbad >> illegalcombos
                }
            else
                {
                print $1 flagmarker snew2
                print $1 flagmarker sbad >> illegalcombos
                }
            }
ENDOFAWKSCRIPT
    # Start with an empty ILLEGALCOMBOS;  the awk script appends to it.
    : > $ILLEGALCOMBOS
    dbnum=0
    #
    # Repeat until a pass leaves no suspect roots in CROSSROOTS.
    #
    while [ -s $CROSSROOTS ]
    do
        #
        # CROSSROOTS contains the roots whose cross-product expansions
        # might be illegal.  We now need to locate the actual illegal ones.
        # We do this in much the same way we created LEGALFLAGLIST from
        # CRUNCHEDINPUT.  First we make CROSSEXPANDED, which is analogous
        # to EXPANDEDPAIRS (sorted on its second field;  "-k2" replaces
        # the obsolete "+1" key).
        #
        $verbose  &&  echo "Creating cross expansions (pass $dbnum)." 1>&2
        ispell "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CROSSROOTS \
          | sort $SORTTMP -k2 > $CROSSEXPANDED
        #
        # Now we join CROSSEXPANDED against EXPANDEDINPUT to produce
        # CROSSPAIRS, and then comm that against CROSSEXPANDED to
        # get CROSSILLEGAL, the list of illegal cross-product flag
        # combinations.
        #
        $JOIN -j1 2 -o 1.1 1.2 $SIGNED $CROSSEXPANDED $EXPANDEDINPUT \
          | sort $SORTTMP -u > $CROSSPAIRS

        sort $SORTTMP -u -o $CROSSEXPANDED $CROSSEXPANDED

        $verbose \
          &&  echo "Finding illegal cross expansions (pass $dbnum)." 1>&2
        comm -13 $CROSSPAIRS $CROSSEXPANDED \
          | sed -e 's; .*$;;' \
          | uniq > $CROSSILLEGAL

        #
        # In debugging mode, keep this pass's files under numbered names
        # in TDIR.
        #
        if [ "$debug" = yes ]
        then
            mv $CROSSROOTS $TDIR/CROSSROOTS.$dbnum
            ln $CROSSEXPANDED $TDIR/CROSSEXP.$dbnum
            ln $CROSSPAIRS $TDIR/CROSSPAIRS.$dbnum
            ln $CROSSILLEGAL $TDIR/CROSSILLEGAL.$dbnum
        fi
        #
        # Now it is time to try to clear up the illegalities.  For
        # each word in the illegal list, remove one of the cross-product
        # flags.  The flag chosen is selected in an attempt to cure the
        # problem quickly, as follows:  (1) if there is only one suffix
        # flag or only one prefix flag, we remove that.  (2) If there is
        # a prefix flag, we remove the "least desirable" (according to
        # the order of preflags). (This may be pro-English prejudice,
        # and you might want to change this if your language is prefix-heavy).
        # (3) Otherwise we remove the least-desirable suffix flag
        #
        # The output of the awk script becomes the new CROSSROOTS.  In
        # addition, we add the rejected flags to ILLEGALCOMBOS (this is done
        # inside the awk script) so they can be removed from LEGALFLAGLIST
        # later.
        #
        awk "-F$flagmarker" -f $AWKSCRIPT $CROSSILLEGAL > $CROSSROOTS
        if [ "$debug" = yes ]
        then
            # The temporary names are hard-linked to the numbered copies
            # made above;  removing them here presumably keeps the next
            # pass from overwriting the saved contents.
            /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL
        fi
        dbnum=`expr $dbnum + 1`
    done
    /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $AWKSCRIPT
    #
    # Now we have, in ILLEGALCOMBOS, a list of root/flag combinations
    # that must be removed from LEGALFLAGLIST to get the final list
    # of truly legal flags.  ILLEGALCOMBOS has one flag per line, so
    # by turning LEGALFLAGLIST into this form (sed), it's an
    # easy task for comm.  We have to recombine flags again after the
    # extraction, to get all flags for a given root on the same line so that
    # cross-products will come out right.
    #
    # The sed script peels one flag at a time off any entry that still
    # has two or more flags after the flag marker:  it rewrites the entry
    # as "root<marker>firstflag" followed by "root<marker>remaining
    # flags", prints the first of those lines (P), and starts over on the
    # remainder (D).  It is built from $flagmarker, with $sedchar as the
    # delimiter, so that it works for whatever marker the language table
    # declares rather than only for "/".  The final sort key
    # "-k1f,1 -k1" replaces the obsolete "+0f -1 +0" spelling.
    #
    if [ -s $ILLEGALCOMBOS ]
    then
        sort $SORTTMP -u -o $ILLEGALCOMBOS $ILLEGALCOMBOS
        $verbose  &&  echo 'Finding roots of cross expansions.' 1>&2
        sort $SORTTMP $LEGALFLAGLIST \
          | sed "\\${sedchar}${flagmarker}..${sedchar}{
              s${sedchar}^\\(.*\\)${flagmarker}\\(.\\)\\(.*\\)${sedchar}\\1${flagmarker}\\2\\
\\1${flagmarker}\\3${sedchar}
              P
              D
              }" \
          | comm -23 - $ILLEGALCOMBOS \
          | sort $SORTTMP -u "-t$flagmarker" -k1f,1 -k1 \
          | $COMBINE $langtabs > $CROSSROOTS
        mv $CROSSROOTS $LEGALFLAGLIST
        if [ "$debug" = yes ]
        then
            rm -f ${TDIR}/LEGALFLAGLIST1
            ln $LEGALFLAGLIST ${TDIR}/LEGALFLAGLIST1
        fi
    fi
fi
# The cross-product work files and EXPANDEDINPUT are no longer needed.
/bin/rm -f $PRODUCTLIST $CROSSROOTS $ILLEGALCOMBOS $EXPANDEDINPUT
#

# We now have (in LEGALFLAGLIST) a list of roots and flags which will
# accept words taken from EXPANDEDINPUT and no others (though some of
# EXPANDEDINPUT is not covered by this list).  However, many of the
# expanded words can be generated in more than one way.  For example,
# "bather" can be generated from "bath/R" and "bathe/R".  This wastes
# unnecessary space in the raw dictionary and, in some cases, in the
# hash file as well.  The solution is to list the various ways of
# getting a given word and choose exactly one.  All other things being
# equal, we want to choose the one with the highest expansion length
# to root length ratio.  The ispell -e4 option takes care of this by
# providing us with a field to sort on.
#
# The pipeline is similar to the one used to generate EXPANDEDPAIRS,
# except that ispell adds an extra field giving the sort order.  The
# first sort (by expansion, then by that extra field in descending
# numeric order, then by root) gets things in order so the first root
# listed is the one we want, and the second sort (-um, keyed on the
# expansion alone) then selects that first root.  Sed strips the
# expansion from the root, and a final sort -u generates MINIMALAFFIXES,
# the final list of affixes that (more or less) minimally covers what it
# can from EXPANDEDINPUT.
#
# All keys are written with -k;  the obsolete "+POS -POS" spellings they
# replace ("+1 -2 +2rn -3 +0 -1", "+1 -2", and "+0f -1 +0") are treated
# as file names by sort implementations that follow POSIX.1-2001.
#
$verbose  &&  echo 'Eliminating non-optimal affixes.' 1>&2
ispell "$wchars" -e4 -d $FAKEHASH -p /dev/null < $LEGALFLAGLIST \
  | sort $SORTTMP -k2,2 -k3rn,3 -k1,1 \
  | sort $SORTTMP -um -k2,2 \
  | sed -e 's; .*$;;' \
  | sort $SORTTMP -u "-t$flagmarker" -k1f,1 -k1 > $MINIMALAFFIXES
/bin/rm -f $LEGALFLAGLIST
#
# Now we're almost done.  MINIMALAFFIXES covers some (with luck, most)
# of the words in STRIPPEDINPUT.  Now we must create a list of the remaining
# words (those omitted by MINIMALAFFIXES) and add it to MINIMALAFFIXES.
# The best way to do this is to actually build a partial dictionary from
# MINIMALAFFIXES in FAKEHASH, and then use ispell -l to list the words that
# are not covered by this dictionary.  This must then be combined with the
# reduced version of MINIMALAFFIXES and sorted to produce the final result,
# which goes to standard output.
#
# The sort key "-k1f,1 -k1" (root case-folded, then the whole line)
# replaces the obsolete "+0f -1 +0" spelling, which sort implementations
# that follow POSIX.1-2001 treat as file names.
#
$verbose  &&  echo "Generating output word list." 1>&2
if [ -s $MINIMALAFFIXES ]
then
    buildhash -s $MINIMALAFFIXES $langtabs $FAKEHASH > /dev/null \
      ||  (echo "Couldn't create intermediate hash file" 1>&2;
        /bin/rm -f ${TMP}*;
        exit 1) \
      ||  exit 1
    if [ "$debug" = yes ]
    then
        # Keep the two side files, named after MINIMALAFFIXES, under
        # fixed names in TDIR.
        rm -f ${TDIR}/MINAFFIXES.!!COUNTSUFFIX!! \
          ${TDIR}/MINAFFIXES!!STATSUFFIX!!
        ln $MINIMALAFFIXES.!!COUNTSUFFIX!! ${TDIR}/MINAFFIXES.!!COUNTSUFFIX!!
        ln $MINIMALAFFIXES!!STATSUFFIX!! ${TDIR}/MINAFFIXES!!STATSUFFIX!!
    fi
    (ispell "$wchars" -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT; \
        $COMBINE $langtabs < $MINIMALAFFIXES) \
      | sort $SORTTMP "-t$flagmarker" -u -k1f,1 -k1
else
    # MINIMALAFFIXES is empty;  just produce a sorted version of STRIPPEDINPUT
    sort $SORTTMP "-t$flagmarker" -u -k1f,1 -k1 $STRIPPEDINPUT
fi
#
# Final cleanup.  Every temporary file of this run is removed.  For a
# MUNCHMAIL run, a listing of the capital-letter-named files in TDIR,
# followed by the captured stderr log, is then mailed to the MUNCHMAIL
# address, and the log file is deleted.  The script always finishes with
# a zero exit status.
#
/bin/rm -f ${TMP}*
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        (
        ls -ld ${TDIR}/[A-Z]*
        cat /tmp/munchlist.mail
        ) | mail "$MUNCHMAIL"
        /bin/rm -f /tmp/munchlist.mail
        ;;
esac
exit 0
Note: See TracBrowser for help on using the repository browser.