1 | : Use /bin/sh |
---|
2 | # |
---|
3 | # $Id: findaffix.X,v 1.1.1.1 1997-09-03 21:08:08 ghudson Exp $ |
---|
4 | # |
---|
5 | # Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA |
---|
6 | # All rights reserved. |
---|
7 | # |
---|
8 | # Redistribution and use in source and binary forms, with or without |
---|
9 | # modification, are permitted provided that the following conditions |
---|
10 | # are met: |
---|
11 | # |
---|
12 | # 1. Redistributions of source code must retain the above copyright |
---|
13 | # notice, this list of conditions and the following disclaimer. |
---|
14 | # 2. Redistributions in binary form must reproduce the above copyright |
---|
15 | # notice, this list of conditions and the following disclaimer in the |
---|
16 | # documentation and/or other materials provided with the distribution. |
---|
17 | # 3. All modifications to the source code must be clearly marked as |
---|
18 | # such. Binary redistributions based on modified source code |
---|
19 | # must be clearly marked as modified versions in the documentation |
---|
20 | # and/or other materials provided with the distribution. |
---|
21 | # 4. All advertising materials mentioning features or use of this software |
---|
22 | # must display the following acknowledgment: |
---|
23 | # This product includes software developed by Geoff Kuenning and |
---|
24 | # other unpaid contributors. |
---|
25 | # 5. The name of Geoff Kuenning may not be used to endorse or promote |
---|
26 | # products derived from this software without specific prior |
---|
27 | # written permission. |
---|
28 | # |
---|
29 | # THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND |
---|
30 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
31 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
32 | # ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE |
---|
33 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
34 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
35 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
36 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
37 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
38 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
39 | # SUCH DAMAGE. |
---|
40 | # |
---|
41 | # Find possible affixes for use with ispell |
---|
42 | # |
---|
43 | # Usage: |
---|
44 | # |
---|
45 | # findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \ |
---|
46 | # [-t tabchar] [files] |
---|
47 | # |
---|
48 | # Each common prefix (-p) or suffix (-s, default) is presented, along |
---|
49 | # with statistics to indicate how useful such an affix might be in |
---|
50 | # reducing the size of the input file. Only those affixes which |
---|
51 | # produce a legal root (one found in the original input) are reported. |
---|
52 | # |
---|
53 | # If the "-c" option is not given, the output lines are in the |
---|
54 | # following format: |
---|
55 | # |
---|
56 | # strip/add/count/bytes |
---|
57 | # |
---|
58 | # where "strip" is the string that should be stripped from a root |
---|
59 | # word before adding the affix, "add" is the affix to be added, "count" |
---|
60 | # is a count of the number of times that this "strip/add" combination |
---|
61 | # appears, and "bytes" is an estimate of the number of bytes that |
---|
62 | # will be saved in the raw dictionary file if this combination is |
---|
63 | # added to the affix file. The field separator in the output will |
---|
64 | # normally be the tab character specified by the "-t" switch; the |
---|
65 | # default is a slash ("/"). |
---|
66 | # |
---|
67 | # If the "-c" ("clean output") option is given, the appearance of |
---|
68 | # the output is made cleaner by changing it to: |
---|
69 | # |
---|
70 | # -strip+add<tab>count<tab>bytes |
---|
71 | # |
---|
72 | # where "strip," "add," "count," and "bytes" are as before, and "<tab>" |
---|
73 | # represents the ASCII tab character. |
---|
74 | # |
---|
75 | # The method used to generate possible affixes will also generate |
---|
76 | # longer affixes which have common headers or trailers. For example, |
---|
77 | # the two words "moth" and "mother" will generate not only the obvious |
---|
78 | # substitution "+er" but also "-h+her" and "-th+ther" (and possibly |
---|
79 | # even longer ones, depending on the value of "min"). To prevent |
---|
80 | # cluttering the output with such affixes, any affix pair that shares |
---|
81 | # a common header (or, for prefixes, trailer) string longer than |
---|
82 | # "elim" characters (default 1) will be suppressed. You may want to |
---|
83 | # set "elim" to a value greater than 1 if your language has string |
---|
84 | # characters; usually the need for this parameter will become obvious |
---|
85 | # when you examine the output of your findaffix run. |
---|
86 | # |
---|
87 | # Normally, the output is sorted on the "bytes" field. If the "-f" |
---|
88 | # flag is given, the output is sorted according to the "count" field. |
---|
89 | # |
---|
90 | # No affix longer than "max" characters (default 8) will be reported. |
---|
91 | # Smaller values of "max" will make the script run faster. |
---|
92 | # |
---|
93 | # Affixes which appear fewer than "low" times (default 10) are |
---|
94 | # suppressed. This significantly reduces the size of the output file. |
---|
95 | # |
---|
96 | # Affixes which generate stems shorter than "min" characters (default 3) |
---|
97 | # are suppressed. (A stem is the word after the "strip" string has |
---|
98 | # been removed, and before the "add" string has been added.) This |
---|
99 | # reduces both the running time and the size of the output file. "Min" |
---|
100 | # should only be set to 1 if you have a *lot* of free time and disk |
---|
101 | # space. |
---|
102 | # |
---|
103 | # The script requires a non-blank field-separator character for internal |
---|
104 | # use. Normally, this character is a slash ("/"), but if the slash |
---|
105 | # appears as a character in the input word list, a different character |
---|
106 | # can be specified with the "-t" switch. |
---|
107 | # |
---|
108 | # If the input files are ispell dictionaries, they should be expanded |
---|
109 | # before being fed to this script. |
---|
110 | # |
---|
111 | # If the input files contain characters other than [A-Za-z], they |
---|
112 | # should be translated to lowercase before being fed to this script. |
---|
113 | # |
---|
114 | # $Log: not supported by cvs2svn $ |
---|
115 | # Revision 1.15 1994/01/25 07:11:29 geoff |
---|
116 | # Get rid of all old RCS log lines in preparation for the 3.1 release. |
---|
117 | # |
---|
118 | # |
---|
#
# Scratch-file locations.  $TMPDIR is honored whenever it is set; otherwise
# the traditional /usr/tmp is used if it exists, and /tmp if it does not
# (many current systems no longer provide /usr/tmp).
#
if [ "${TMPDIR+set}" = set ]
then
    TDIR=$TMPDIR
elif [ -d /usr/tmp ]
then
    TDIR=/usr/tmp
else
    TDIR=/tmp
fi
TMP=${TDIR}/faff$$
# NOTE(review): the trailing marker looks like a build-time substitution
# hook; it is kept verbatim -- confirm against the Makefile before editing.
SORTTMP="-T ${TDIR}"	# !!SORTTMP!!
USAGE='Usage: findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'
#
# LOOP is the body of the first awk program (suffix mode by default).  For a
# word of length "len" it prints one "stem<tabch>suffix<tabch>len" line for
# every split point that leaves a stem of at least "minstem" characters and
# a suffix of at most "maxlim" characters, followed by a line for the whole
# word with an empty suffix.
#
LOOP='
	i = len - maxlim + 1
	if (i < minstem + 1)
	    i = minstem + 1
	for ( ; i <= len; i++)
	    print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
	print $0 tabch tabch len'
#
# ELIM is the second awk program (suffix mode by default).  It passes only
# those records whose first two fields differ and whose first "elimlen"
# characters of those two fields also differ.
#
ELIM='$1!=$2 \
	{
	if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
	    print
	}'
# Tunables; each can be overridden by a command-line switch.
maxlim=8
minstem=3
elimlen=1
lowcount=10
cleanout=no
#
# Key list for the final sort: bytes (field 4) descending, then count
# (field 3) descending, then the add and strip strings.  Written with -k
# because the older "+3rn -4" notation is rejected by current sort
# implementations.
#
finalsortopts='-k 4,4rn -k 3,3rn -k 2,2 -k 1,1'
tabch=/
# Print the usage message on stderr and abandon the run.
usage_error()
{
    echo "$USAGE" 1>&2
    exit 1
}

# Succeed only when $1 is a non-empty string of decimal digits.
is_count()
{
    case "$1" in
	''|*[!0-9]*)
	    return 1
	    ;;
    esac
    return 0
}

#
# parse_options ARG...
#
# Consume the leading switches of the argument list, updating LOOP, ELIM,
# finalsortopts, cleanout, elimlen, minstem, maxlim, lowcount and tabch.
# Parsing stops at the first argument that does not begin with "-".  The
# number of arguments consumed is left in fa_nshift so that the caller can
# shift them away.  An unknown switch, a switch whose argument is missing,
# a non-numeric value for -e/-m/-M/-l, or a -t value that is not exactly
# one character produces the usage message and exit status 1.
#
parse_options()
{
    fa_argc=$#
    while :
    do
	case "$1" in
	    -p)
		# Prefix mode: the "stem" is the tail of the word and the
		# affix is its head.
		LOOP='
		    lim = len - minstem
		    if (lim > maxlim)
			lim = maxlim
		    for (i = 1; i <= lim; i++)
			print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
		    print $0 tabch tabch len'
		# Compare the LAST elimlen characters of the two affixes.
		# Starting the substr at length() alone would only ever
		# look at the final character, whatever elimlen is.
		ELIM='$1!=$2 \
		    {
		    if (substr ($1, length ($1) - elimlen + 1, elimlen) \
		      != substr ($2, length ($2) - elimlen + 1, elimlen))
			print
		    }'
		shift
		;;
	    -s)
		# Suffix mode is the default; nothing to change.
		shift
		;;
	    -f)
		# Sort on count (field 3) first, then bytes (field 4).
		finalsortopts='-k 3,3rn -k 4,4rn -k 2,2 -k 1,1'
		shift
		;;
	    -c)
		cleanout=yes
		shift
		;;
	    -e)
		if [ $# -lt 2 ] || ! is_count "$2"
		then
		    usage_error
		fi
		elimlen=$2
		shift; shift
		;;
	    -m)
		if [ $# -lt 2 ] || ! is_count "$2"
		then
		    usage_error
		fi
		minstem=$2
		shift; shift
		;;
	    -M)
		if [ $# -lt 2 ] || ! is_count "$2"
		then
		    usage_error
		fi
		maxlim=$2
		shift; shift
		;;
	    -l)
		if [ $# -lt 2 ] || ! is_count "$2"
		then
		    usage_error
		fi
		lowcount=$2
		shift; shift
		;;
	    -t)
		if [ $# -lt 2 ]
		then
		    usage_error
		fi
		# The separator is handed to sort -t and join -t, which
		# need exactly one character.
		case "$2" in
		    ?)
			tabch="$2"
			;;
		    *)
			usage_error
			;;
		esac
		shift; shift
		;;
	    -*)
		usage_error
		;;
	    *)
		break
		;;
	esac
    done
    fa_nshift=$((fa_argc - $#))
}

parse_options "$@"
shift $fa_nshift
#
# If the run is cut short, remove every scratch file whose name starts with
# ${TMP}.  Signals 1, 2 and 15 end the script with status 1; signal 13 ends
# it with status 0.  The command text is built once, so ${TMP} is expanded
# here rather than when a signal arrives.
#
fa_cleanup="/bin/rm -f ${TMP}*"
trap "$fa_cleanup; exit 1" 1 2 15
trap "$fa_cleanup; exit 0" 13
203 | # |
---|
204 | # We are ready to do the work. First, we collect all input, translate it |
---|
205 | # to lowercase, sort it (dropping duplications), and save it for later. |
---|
206 | # |
---|
#
# Read the word list from the named files, or from standard input when no
# file names remain, fold A-Z to a-z, and store the sorted, duplicate-free
# result in ${TMP}a.
#
case $# in
    0)
	tr '[A-Z]' '[a-z]'
	;;
    *)
	cat "$@" | tr '[A-Z]' '[a-z]'
	;;
esac \
  | sort -u $SORTTMP > ${TMP}a
214 | # |
---|
215 | # Now the monstrous pipeline. The awk command produces several lines for |
---|
216 | # each input word. Each line contains a possible stem (first field), |
---|
217 | # a possible affix, and the length of the original word. The loop which |
---|
218 | # does this was placed into the LOOP variable by the code above (q.v.). |
---|
219 | # |
---|
220 | # The first sort puts this output into an order appropriate for feeding |
---|
221 | # to 'join'. The join command then combines stems and affixes, and for |
---|
222 | # each puts out an affix to strip, an affix to add, and the length of |
---|
223 | # the word before and after modification. |
---|
224 | # |
---|
225 | # From here on out the job is relatively easy. The second 'awk' gets rid |
---|
226 | # of lines that have the same strip and add affixes, and also eliminates |
---|
227 | # lines where the strip and add affix have a common leading (for suffixes) |
---|
228 | # or trailing (for prefixes) substring, or where the strip affix is longer |
---|
229 | # than the add affix (this is all done by the $ELIM variable, which is also |
---|
230 | # set up by the code above). The second sort collects identical affixes; |
---|
231 | # the third 'awk' functions like 'uniq -c', replacing duplicate affixes |
---|
232 | # with a count and summing the estimate of bytes saved. It also eliminates |
---|
233 | # any affixes which appear less frequently than the minimum ("lowcount"). |
---|
234 | # Finally, the third sort ($finalsortopts) rearranges the list in the chosen |
---|
235 | # sort order. |
---|
236 | # |
---|
# A literal tab, generated here so that the clean-output sed commands do not
# depend on invisible whitespace surviving in this file.
fa_tab=$(printf '\t')

#
# split_words: read a sorted word list on stdin and write the candidate
# "stem<tabch>affix<tabch>length" lines produced by $LOOP.  Words shorter
# than two characters are skipped.  Shell values reach awk through -v
# rather than being spliced into the program text.
#
split_words()
{
    awk -v minstem="$minstem" -v maxlim="$maxlim" -v tabch="$tabch" '
	{
	len = length ($0)
	if (len < 2)
	    next
	'"$LOOP"'
	}'
}

#
# tally_affixes: stdin holds "strip<tabch>add<tabch>length" lines grouped so
# that identical strip/add pairs are adjacent.  Each group is replaced by a
# single "strip<tabch>add<tabch>count<tabch>total" line, where total is the
# sum of the third field; groups with fewer than $lowcount members are
# dropped.
#
tally_affixes()
{
    awk -F "$tabch" -v tabch="$tabch" -v lowcount="$lowcount" '
	{
	if ($1 == last1 && $2 == last2)
	    {
	    count++
	    totchars += $3
	    }
	else
	    {
	    if ((last1 != "" || last2 != "") && count >= lowcount)
		print last1 tabch last2 tabch count tabch totchars
	    count = 1
	    last1 = $1
	    last2 = $2
	    totchars = $3
	    }
	}
    END {
	if ((last1 != "" || last2 != "") && count >= lowcount)
	    print last1 tabch last2 tabch count tabch totchars
	}'
}

#
# format_output: copy stdin to stdout unchanged unless "clean" output was
# requested, in which case "strip/add/count/bytes" is rewritten as
# "-strip+add<tab>count<tab>bytes" (an empty strip or add part is omitted
# together with its sign).
#
format_output()
{
    if [ "$cleanout" = "yes" ]
    then
	# Choose a sed delimiter that cannot collide with the separator,
	# and escape separators that are special in a sed pattern.
	case "$tabch" in
	    /)
		sedsub=/
		sedsep=';'
		;;
	    .|\*|\[|\^|\$|\\)
		sedsub="\\$tabch"
		sedsep=/
		;;
	    *)
		sedsub="$tabch"
		sedsep=/
		;;
	esac
	# Separators -> tabs; first tab -> "+"; prefix "-"; then drop the
	# "-" of an empty strip string and the "+" of an empty add string.
	sed -e "s$sedsep$sedsub$sedsep${fa_tab}${sedsep}g" \
	    -e "s/${fa_tab}/+/" -e 's/^/-/' \
	    -e 's/^-+/+/' -e "s/+${fa_tab}/${fa_tab}/"
    else
	cat
    fi
}

#
# find_affixes FILE
#
# FILE holds the sorted, duplicate-free word list; it is overwritten with
# the sorted stem/affix table and then joined with itself on the stem, so
# that every pair of affixes sharing a stem yields a
# "strip<tabch>add<tabch>length" record.  $ELIM filters those records, the
# second sort groups identical pairs for tally_affixes, and the last sort
# puts the report in the order selected by $finalsortopts.  All sort keys
# use -k notation because the older "+0 -1" form is rejected by current
# sort implementations; $SORTTMP and $finalsortopts are deliberately left
# unquoted so that they split into separate arguments.
#
find_affixes()
{
    split_words < "$1" \
      | sort -t "$tabch" -k 1,1 -k 2 $SORTTMP -o "$1"
    join -t "$tabch" -o 1.2,2.2,2.3 "$1" "$1" \
      | awk -F "$tabch" -v elimlen="$elimlen" "$ELIM" \
      | sort -t "$tabch" -k 2,2 -k 1,1 $SORTTMP \
      | tally_affixes \
      | sort -t "$tabch" $finalsortopts $SORTTMP \
      | format_output
}

find_affixes "${TMP}a"
/bin/rm -f "${TMP}a"