source: trunk/third/ispell/subset.X @ 22599

Revision 22599, 7.7 KB checked in by ghudson, 18 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r22598, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1!!POUNDBANG!!
2#
3# $Id: subset.X,v 1.1.1.2 2007-02-01 19:50:19 ghudson Exp $
4#
5# Copyright 1992, 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11#
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. All modifications to the source code must be clearly marked as
18#    such.  Binary redistributions based on modified source code
19#    must be clearly marked as modified versions in the documentation
20#    and/or other materials provided with the distribution.
21# 4. The code that causes the 'ispell -v' command to display a prominent
22#    link to the official ispell Web site may not be removed.
23# 5. The name of Geoff Kuenning may not be used to endorse or promote
24#    products derived from this software without specific prior
25#    written permission.
26#
27# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
28# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
31# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37# SUCH DAMAGE.
38#
39#       Combine and resolve various dictionaries so they are proper
40#       subsets of one another, and so that maximal use is made of
41#       flags in the smaller ones.
42#
43#       Usage:
44#
45#       subset [-b base] [-l langfile] small-dict bigger-dict ... biggest-dict
46#
47#       The output is an equal number of successively-larger
48#       dictionaries.  The smallest is written to "dict.0".  Successive
49#       files are named "dict.1", "dict.2", and so forth, and each contains
50#       a list of words which should be added to the previous files to
51#       generate a dictionary.  Words which are in smaller dictionaries are
52#       effectively propagated to the larger ones, so that the smaller ones
53#       are proper subsets of their siblings.  If dictionaries are
54#       completely disjoint, this may result in an empty output dictionary.
55#       Affix flags are propagated to the smallest dictionary containing
56#       the root word;  this expands the effectiveness of small dictionaries
57#       at no cost in hash table space.
58#
59#       The -b switch is used to specify a different base name for the
60#       output files than "dict".  (In other words, "-b english" would
61#       produce output in english.0, english.1, etc.).
62#
63#       If the -l switch is specified, the language tables are gotten
64#       from the specified file;  otherwise they come from $LIBDIR/!!DEFLANG!!.
65#
66#       Input dictionaries should be "clean";  if non-word characters
67#       appear in the dictionaries, the script may produce incorrect output.
68#
69# $Log: not supported by cvs2svn $
70# Revision 1.22  2005/04/27 01:18:35  geoff
71# Work around idiotic POSIX incompatibilities in sort.  Add secure
72# temp-file handling.
73#
74# Revision 1.21  2005/04/14 14:39:33  geoff
75# Use /tmp as the default temp directory
76#
77# Revision 1.20  2005/04/14 14:38:23  geoff
78# Update license.  Protect against modernized (i.e., incompatible) and
79# internationalized sort commands.  Fix a place where the affix table
80# wasn't passed to munchlist.
81#
82# Revision 1.19  2001/09/06 00:30:29  geoff
83# Many changes from Eli Zaretskii to support DJGPP compilation.
84#
85# Revision 1.18  2001/07/25 21:51:46  geoff
86# Minor license update.
87#
88# Revision 1.17  2001/07/23 20:24:04  geoff
89# Update the copyright and the license.
90#
91# Revision 1.16  1999/01/07 01:57:42  geoff
92# Update the copyright.
93#
94# Revision 1.15  1995/01/08  23:23:47  geoff
95# Support variable hashfile suffixes for DOS purposes.
96#
97# Revision 1.14  1994/01/25  07:12:10  geoff
98# Get rid of all old RCS log lines in preparation for the 3.1 release.
99#
100#
101
#
# Force the "C" locale for every program this script runs.  Some
# internationalized versions of sort(1) otherwise sort into a
# nonstandard order, which confuses the tools that consume sorted data.
#
# LC_ALL has to be exported together with the others: it takes
# precedence over every other LC_* variable, and a plain assignment is
# not passed to child processes unless the variable is exported.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but it can't hurt to set
# them to a sensible value as well.
#
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME
121
# NOTE(review): the !!NAME!! tokens look like placeholders substituted
# when the script is installed -- confirm against the build rules
# before reformatting any line that carries one.
LIBDIR=!!LIBDIR!!
TDIR=${TMPDIR-/tmp}
#
# Prefer a private scratch directory from mktemp(1).  If mktemp is
# missing or fails, fall back to the shared directory itself; the
# PID-based prefix in TMP is then the only thing separating our files
# from everyone else's.  The template is quoted so that a TMPDIR
# containing whitespace does not make mktemp fail needlessly.
#
TEMPDIR=`mktemp -d "${TDIR}/ssetXXXXXXXXXX" 2>/dev/null`  ||  TEMPDIR="$TDIR"
TMP="${TEMPDIR}/sset$$"
#
# TOREMOVE is what cleanup passes (unquoted) to "rm -rf": either a glob
# matching our PID-prefixed files in the shared directory, or the whole
# private directory.
#
if [ "$TEMPDIR" = "$TDIR" ]
then
    TOREMOVE="${TMP}*"
else
    TOREMOVE="$TEMPDIR"
fi
SORTTMP="-T ${TDIR}"                    # !!SORTTMP!!
USAGE="Usage:  subset [-b base] [-l langfile] dict-0 dict-1 ..."
134
#
# Parse the leading options:
#   -b base      base name for the output files (default "dict")
#   -l langfile  language tables to use (default ${LIBDIR}/!!DEFLANG!!)
# Parsing stops at the first argument that does not start with "-".
# At least two dictionaries must remain afterwards.
#
langtabs=${LIBDIR}/!!DEFLANG!!
outbase=dict
while :
do
    case "$1" in
        -b)
            # An option given without its value is a usage error;
            # don't rely on "shift" failing, which aborts some shells
            # with an unhelpful message.
            [ $# -ge 2 ]  ||  { echo "$USAGE" 1>&2; exit 1; }
            outbase="$2"
            shift; shift
            ;;
        -l)
            [ $# -ge 2 ]  ||  { echo "$USAGE" 1>&2; exit 1; }
            langtabs="$2"
            shift; shift
            ;;
        -*)
            echo "$USAGE" 1>&2
            exit 1
            ;;
        *)
            break
            ;;
    esac
done

if [ $# -lt 2 ]
then
    echo "$USAGE" 1>&2
    exit 1
fi
163
# Temp files (all share the ${TMP} prefix so cleanup can find them)
MUNCHOUTPUT=${TMP}a
MISSINGWORDS=${TMP}b
TEMPDICT=${TMP}c
FAKEDICT=${TMP}d
FAKEHASH=${TMP}e!!HASHSUFFIX!!

#
# Clean up on hangup, interrupt, and terminate (exit status 1), and on
# a broken pipe (exit status 0).  The commands are single-quoted so
# that $TOREMOVE is expanded when the trap fires rather than being
# pasted into the command text now, where a path containing shell
# metacharacters would be re-parsed as code.  It stays unquoted inside
# the trap because it may hold a glob pattern.
#
trap 'rm -rf $TOREMOVE; exit 1' 1 2 15
trap 'rm -rf $TOREMOVE; exit 0' 13
173
#
# Create a dummy dictionary to hold a compiled copy of the language
# tables.  Paths are quoted so that a language file or temporary
# directory whose name contains whitespace is passed as one argument.
#
echo 'QQQQQQQQ' > "$FAKEDICT"
buildhash -s "$FAKEDICT" "$langtabs" "$FAKEHASH"  ||  {
    echo "Couldn't create fake hash file" 1>&2
    # $TOREMOVE is deliberately unquoted: it may hold a glob pattern.
    rm -rf $TOREMOVE
    exit 1
}
# Remove the dummy dictionary and any files sharing its name as a prefix.
rm -f "${FAKEDICT}"*
#
# Figure out what the flag-marking character is.
#
flagmarker=`ispell -D -d "$FAKEHASH" \
  | sed -n -e '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*)
        # The value starts with a backslash; keep only the character
        # that follows it.
        flagmarker=`expr "$flagmarker" : '.\(.\)'`
        ;;
esac
#
#       (1) Use munchlist to create a list of roots and maximal suffixes.
#
# $SORTTMP is deliberately left unquoted here and below: it expands to
# an option plus its argument and must be split into separate words.
#
munchlist -l "$langtabs" "$@" | sort $SORTTMP > "$MUNCHOUTPUT"
#
#       (2) Use join to add the maximal suffixes to each dictionary's roots.
#           Re-expand this, combine with the original, and save for later.
#
#           The result for the Nth input dictionary (counting from 0)
#           is written to ${TEMPDICT}.N as a sorted list without
#           duplicates.
#
newline='
'
dictno=0
for dictfile
do
    ispell -e -d "$FAKEHASH" < "$dictfile" | tr ' ' "$newline" \
      | sort -u $SORTTMP | join "-t$flagmarker" -a1 - "$MUNCHOUTPUT" \
      | ispell -e -d "$FAKEHASH" | tr ' ' "$newline" \
      | sort -u $SORTTMP > "${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
rm -f "$MUNCHOUTPUT"
#
#       (3) For each adjacent pair of dictionaries, use comm to find words
#           in the smaller that are missing from the larger, and add them
#           to the larger.
#
# Drop the first argument so that the loop runs once per remaining
# dictionary.  The loop variable itself is not used; the expanded lists
# are addressed by number.
#
shift
lastdict="${TEMPDICT}.0"
dictno=1
for dictfile
do
    # Lines only in the smaller list, i.e. words the larger one lacks.
    comm -23 "$lastdict" "${TEMPDICT}.$dictno" > "$MISSINGWORDS.$dictno"
    if [ -s "$MISSINGWORDS.$dictno" ]
    then
        # Merge the missing words in, keeping the list sorted.
        # $SORTTMP is deliberately unquoted: it expands to an option
        # plus its argument.
        sort $SORTTMP -o "${TEMPDICT}.$dictno" \
          "${TEMPDICT}.$dictno" "$MISSINGWORDS.$dictno"
    fi
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
rm -f "$MISSINGWORDS".*
#
#       (4) For each pair of dictionaries, use comm to eliminate words in
#           the smaller from the larger, and shrink the result with munchlist.
#           Once the first output file has been written, we ignore
#           interrupts.
#
# Output names are quoted so that a -b base name containing whitespace
# is a valid redirection target.
#
munchlist -l "$langtabs" "${TEMPDICT}.0" > "$outbase.0"
lastdict="${TEMPDICT}.0"
dictno=1
# Ignore hangup, interrupt, broken pipe, and terminate from here on.
trap "" 1 2 13 15
for dictfile
do
    # Lines only in the larger list, i.e. words new at this level.
    comm -13 "$lastdict" "${TEMPDICT}.$dictno" \
      | munchlist -l "$langtabs" > "$outbase.$dictno"
    rm -f "$lastdict"
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
# $TOREMOVE is deliberately unquoted: it may hold a glob pattern.
rm -rf $TOREMOVE
Note: See TracBrowser for help on using the repository browser.