source: trunk/third/ispell/findaffix.X @ 10334

Revision 10334, 9.6 KB checked in by ghudson, 27 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r10333, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1: Use /bin/sh
2#
3# $Id: findaffix.X,v 1.1.1.1 1997-09-03 21:08:08 ghudson Exp $
4#
5# Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11#
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. All modifications to the source code must be clearly marked as
18#    such.  Binary redistributions based on modified source code
19#    must be clearly marked as modified versions in the documentation
20#    and/or other materials provided with the distribution.
21# 4. All advertising materials mentioning features or use of this software
22#    must display the following acknowledgment:
23#      This product includes software developed by Geoff Kuenning and
24#      other unpaid contributors.
25# 5. The name of Geoff Kuenning may not be used to endorse or promote
26#    products derived from this software without specific prior
27#    written permission.
28#
29# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
30# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32# ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
33# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39# SUCH DAMAGE.
40#
41#       Find possible affixes for use with ispell
42#
43#       Usage:
44#
45#       findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \
46#         [-t tabchar] [files]
47#
48#       Each common prefix (-p) or suffix (-s, default) is presented, along
49#       with statistics to indicate how useful such an affix might be in
50#       reducing the size of the input file.  Only those affixes which
51#       produce a legal root (one found in the original input) are reported.
52#
53#       If the "-c" option is not given, the output lines are in the
54#       following format:
55#
56#               strip/add/count/bytes
57#
58#       where "strip" is the string that should be stripped from a root
59#       word before adding the affix, "add" is the affix to be added, "count"
60#       is a count of the number of times that this "strip/add" combination
61#       appears, and "bytes" is an estimate of the number of bytes that
62#       will be saved in the raw dictionary file if this combination is
63#       added to the affix file.  The field separator in the output will
64#       normally be the tab character specified by the "-t" switch;  the
65#       default is a slash ("/").
66#
67#       If the "-c" ("clean output") option is given, the appearance of
68#       the output is made cleaner by changing it to:
69#
70#               -strip+add<tab>count<tab>bytes
71#
72#       where "strip," "add," "count," and "bytes" are as before, and "<tab>"
73#       represents the ASCII tab character.
74#
75#       The method used to generate possible affixes will also generate
76#       longer affixes which have common headers or trailers.  For example,
77#       the two words "moth" and "mother" will generate not only the obvious
#       substitution "+er" but also "-h+her" and "-th+ther" (and possibly
79#       even longer ones, depending on the value of "min").  To prevent
#       cluttering the output with such affixes, any affix pair that shares
#       a common header (or, for prefixes, trailer) string of at least
#       "elim" characters (default 1) will be suppressed.  You may want to
83#       set "elim" to a value greater than 1 if your language has string
84#       characters;  usually the need for this parameter will become obvious
85#       when you examine the output of your findaffix run.
86#
87#       Normally, the output is sorted on the "bytes" field.  If the "-f"
88#       flag is given, the output is sorted according to the "count" field.
89#
90#       No affix longer than "max" characters (default 8) will be reported.
91#       Smaller values of "max" will make the script run faster.
92#
93#       Affixes which appear fewer than "low" times (default 10) are
94#       suppressed.  This significantly reduces the size of the output file.
95#
96#       Affixes which generate stems shorter than "min" characters (default 3)
97#       are suppressed.  (A stem is the word after the "strip" string has
98#       been removed, and before the "add" string has been added.)  This
99#       reduces both the running time and the size of the output file.  "Min"
100#       should only be set to 1 if you have a *lot* of free time and disk
101#       space.
102#
103#       The script requires a non-blank field-separator character for internal
104#       use.  Normally, this character is a slash ("/"), but if the slash
105#       appears as a character in the input word list, a different character
106#       can be specified with the "-t" switch.
107#
108#       If the input files are ispell dictionaries, they should be expanded
109#       before being fed to this script.
110#
#       If the input files contain characters other than [A-Za-z], they
112#       should be translated to lowercase before being fed to this script.
113#
114# $Log: not supported by cvs2svn $
115# Revision 1.15  1994/01/25  07:11:29  geoff
116# Get rid of all old RCS log lines in preparation for the 3.1 release.
117#
118#
#
# Scratch-file location.  $TMPDIR wins when it is set and non-empty;
# otherwise the traditional /usr/tmp is used if it exists, and /tmp if
# it does not.  (An empty $TMPDIR used to put scratch files in "/".)
#
TDIR=${TMPDIR-}
if [ -z "$TDIR" ]
then
    if [ -d /usr/tmp ]
    then
        TDIR=/usr/tmp
    else
        TDIR=/tmp
    fi
fi
# Prefix for scratch files; users append a one-letter suffix.
TMP=${TDIR}/faff$$
# Expanded unquoted on purpose so that it splits into "-T" and the directory.
SORTTMP="-T ${TDIR}"                    # !!SORTTMP!!
USAGE='Usage:  findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'
#
# awk fragment (suffix mode, the default) that splits each word into
# every stem/affix pair allowed by minstem and maxlim, and also emits the
# whole word with an empty affix.  Output fields: stem, affix, word length.
#
LOOP='
    i = len - maxlim + 1
    if (i < minstem + 1)
        i = minstem + 1
    for (  ;  i <= len;  i++)
        print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
    print $0 tabch tabch len'
#
# awk fragment (suffix mode) that drops strip/add pairs which are
# identical or whose first elimlen characters are the same.
#
ELIM='$1!=$2 \
    {
    if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
        print
    }'
maxlim=8
minstem=3
elimlen=1
lowcount=10
cleanout=no
#
# Final ordering: bytes saved (field 4) descending, then count (field 3)
# descending, then add and strip strings.  Written with POSIX -k keys;
# the old "+3rn -4" form is rejected by current sort implementations.
#
finalsortopts='-k 4,4rn -k 3,3rn -k 2,2 -k 1,1'
tabch=/
#
# Print the usage message on stderr and give up.
#
usage()
{
    echo "$USAGE" 1>&2
    exit 1
}

#
# Parse the leading command-line switches.
#
# Arguments: the script's arguments ("$@").
# Sets:      LOOP, ELIM (-p), finalsortopts (-f), cleanout (-c),
#            elimlen (-e), minstem (-m), maxlim (-M), lowcount (-l),
#            tabch (-t), and optshift, the number of arguments consumed.
# Exits:     with status 1 after printing $USAGE for an unknown switch,
#            a missing or non-numeric value for -e/-m/-M/-l, or a -t
#            value that is not exactly one non-blank character.  These
#            values are later handed to awk and sort, where a bad one
#            used to surface only as an obscure downstream error.
#
parse_options()
{
    optshift=$#
    while [ $# -gt 0 ]
    do
        case "$1" in
            -p)
                # Prefix mode: peel characters off the front of the word.
                LOOP='
                    lim = len - minstem
                    if (lim > maxlim)
                        lim = maxlim
                    for (i = 1;  i <= lim;  i++)
                        print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
                    print $0 tabch tabch len'
                # Compare the LAST elimlen characters of the two affixes.
                # (Starting the substr at length() itself can only ever
                # yield one character, which made "-e N" a no-op here.)
                ELIM='$1!=$2 \
                    {
                    if (substr ($1, length ($1) - elimlen + 1) \
                      != substr ($2, length ($2) - elimlen + 1))
                        print
                    }'
                shift
                ;;
            -s)
                # Suffix mode is the default; nothing to change.
                shift
                ;;
            -f)
                # Sort on count (field 3) first, then on bytes (field 4).
                finalsortopts='-k 3,3rn -k 4,4rn -k 2,2 -k 1,1'
                shift
                ;;
            -c)
                cleanout=yes
                shift
                ;;
            -e|-m|-M|-l)
                # Each of these takes a non-negative integer.
                case "${2-}" in
                    ''|*[!0-9]*)
                        usage
                        ;;
                esac
                case "$1" in
                    -e) elimlen=$2 ;;
                    -m) minstem=$2 ;;
                    -M) maxlim=$2 ;;
                    -l) lowcount=$2 ;;
                esac
                shift; shift
                ;;
            -t)
                # The field separator must be one non-blank character.
                case "${2-}" in
                    ''|[[:space:]])
                        usage
                        ;;
                    ?)
                        tabch=$2
                        ;;
                    *)
                        usage
                        ;;
                esac
                shift; shift
                ;;
            -*)
                usage
                ;;
            *)
                # First non-switch argument: the rest are input files.
                break
                ;;
        esac
    done
    optshift=$((optshift - $#))
}

parse_options "$@"
# Discard the switches so that "$@" holds only the input file names.
shift $optshift
#
# Remove the scratch files if a signal arrives.  Signals 1, 2 and 15
# end the run with a failure status; signal 13 (SIGPIPE on most systems)
# ends it with success.
#
cleanup="/bin/rm -f ${TMP}*"
trap "$cleanup; exit 1" 1 2 15
trap "$cleanup; exit 0" 13
#
# Gather the word list, either from the named files or from standard
# input, fold it to lowercase, and store a sorted copy without duplicates
# in the first scratch file.
#
case $# in
    0)
        tr '[A-Z]' '[a-z]'
        ;;
    *)
        cat "$@" | tr '[A-Z]' '[a-z]'
        ;;
esac \
  | sort -u $SORTTMP > ${TMP}a
214#
215# Now the monstrous pipeline.  The awk command produces several lines for
216# each input word.  Each line contains a possible stem (first field),
217# a possible affix, and the length of the original word.  The loop which
218# does this was placed into the LOOP variable by the code above (q.v.).
219#
220# The first sort puts this output into an order appropriate for feeding
221# to 'join'.  The join command then combines stems and affixes, and for
222# each puts out an affix to strip, an affix to add, and the length of
223# the word before and after modification.
224#
# From here on out the job is relatively easy.  The second 'awk' gets rid
# of lines that have the same strip and add affixes, and also eliminates
# lines where the strip and add affix have a common leading (for suffixes)
# or trailing (for prefixes) substring (this is all done by the $ELIM
# variable, which is also set up by the code above; note that $ELIM as
# written does not compare the lengths of the two affixes).
# The second sort collects identical affixes;
231# the third 'awk' functions like 'uniq -c', replacing duplicate affixes
232# with a count and summing the estimate of bytes saved.  It also eliminates
233# any affixes which appear less frequently than the minimum ("lowcount").
234# Finally, the third sort ($finalsortopts) rearranges the list in the chosen
235# sort order.
236#
237awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"}
238    {
239    len = length ($0)
240    if (len < 2)
241        next
242    '"$LOOP"'
243    }' < ${TMP}a \
244  | sort "-t$tabch" +0 -1 +1 $SORTTMP -o ${TMP}a
#
# Filter strip/add/length lines on stdin through the $ELIM awk program,
# which discards pairs that are identical or share a common header or
# trailer.
#
prune_pairs()
{
    awk "-F$tabch" -v elimlen="$elimlen" "$ELIM"
}

#
# Collapse runs of identical strip/add pairs on stdin (which must arrive
# grouped) into "strip add count bytes" lines, summing the third field
# as the byte estimate.  Pairs seen fewer than $lowcount times are dropped.
#
tally_affixes()
{
    awk "-F$tabch" -v tabch="$tabch" -v lowcount="$lowcount" '
        {
        if ($1 == last1  &&  $2 == last2)
            {
            count++
            totchars += $3
            }
        else
            {
            if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount + 0)
                print last1 tabch last2 tabch count tabch totchars
            count = 1
            last1 = $1
            last2 = $2
            totchars = $3
            }
        }
    END {
        if ((last1 != ""  ||  last2 != "")  &&  count >= lowcount + 0)
            print last1 tabch last2 tabch count tabch totchars
        }'
}

#
# Copy stdin to stdout.  With -c ("clean" output) each line is rewritten
# from strip/add/count/bytes to -strip+add<tab>count<tab>bytes, leaving
# out the "-strip" or "+add" part when that string is empty.
#
format_output()
{
    if [ "$cleanout" != "yes" ]
    then
        cat
        return
    fi
    # Generate the tab at run time so that the sed scripts cannot be
    # damaged by editors or tools that turn literal tabs into spaces.
    tab=$(printf '\t')
    case "$tabch" in
        /)
            # "/" cannot serve as both pattern and delimiter.
            sedsub=/
            sedsep=';'
            ;;
        .|\*|\[|\^|\$|\\)
            # Characters that are special in a sed pattern need a backslash.
            sedsub="\\$tabch"
            sedsep=/
            ;;
        *)
            sedsub="$tabch"
            sedsep=/
            ;;
    esac
    sed -e "s$sedsep$sedsub$sedsep$tab${sedsep}g" \
      -e "s/$tab/+/" -e 's/^/-/' \
      -e 's/^-+/+/' -e "s/+$tab/$tab/"
}

#
# Join the split list with itself on the stem to pair up affixes, prune
# and count the pairs, and print them in the requested order.  Sort keys
# and the join field list use POSIX syntax (-k, and a single comma-
# separated -o list).  $finalsortopts and $SORTTMP are left unquoted on
# purpose so that they split into separate arguments.
#
join "-t$tabch" -o 1.2,2.2,2.3 "${TMP}a" "${TMP}a" \
  | prune_pairs \
  | sort "-t$tabch" -k 2,2 -k 1,1 $SORTTMP \
  | tally_affixes \
  | sort "-t$tabch" $finalsortopts $SORTTMP \
  | format_output
# Remove the scratch files; never expand the glob from an empty prefix.
if [ -n "$TMP" ]
then
    /bin/rm -f "${TMP}"?
fi
Note: See TracBrowser for help on using the repository browser.