#ifndef lint
/* RCS identification string for this file; omitted when `lint` is defined. */
static char Rcs_Id[] = "$Id: lookup.c,v 1.1.1.1 1997-09-03 21:08:12 ghudson Exp $";
#endif
---|
5 | |
---|
6 | /* |
---|
7 | * lookup.c - see if a word appears in the dictionary |
---|
8 | * |
---|
9 | * Pace Willisson, 1983 |
---|
10 | * |
---|
11 | * Copyright 1987, 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA |
---|
12 | * All rights reserved. |
---|
13 | * |
---|
14 | * Redistribution and use in source and binary forms, with or without |
---|
15 | * modification, are permitted provided that the following conditions |
---|
16 | * are met: |
---|
17 | * |
---|
18 | * 1. Redistributions of source code must retain the above copyright |
---|
19 | * notice, this list of conditions and the following disclaimer. |
---|
20 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
21 | * notice, this list of conditions and the following disclaimer in the |
---|
22 | * documentation and/or other materials provided with the distribution. |
---|
23 | * 3. All modifications to the source code must be clearly marked as |
---|
24 | * such. Binary redistributions based on modified source code |
---|
25 | * must be clearly marked as modified versions in the documentation |
---|
26 | * and/or other materials provided with the distribution. |
---|
27 | * 4. All advertising materials mentioning features or use of this software |
---|
28 | * must display the following acknowledgment: |
---|
29 | * This product includes software developed by Geoff Kuenning and |
---|
30 | * other unpaid contributors. |
---|
31 | * 5. The name of Geoff Kuenning may not be used to endorse or promote |
---|
32 | * products derived from this software without specific prior |
---|
33 | * written permission. |
---|
34 | * |
---|
35 | * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND |
---|
36 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
38 | * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE |
---|
39 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
40 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
41 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
42 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
43 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
44 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
45 | * SUCH DAMAGE. |
---|
46 | */ |
---|
47 | |
---|
48 | /* |
---|
49 | * $Log: not supported by cvs2svn $ |
---|
50 | * Revision 1.42 1995/01/08 23:23:42 geoff |
---|
51 | * Support MSDOS_BINARY_OPEN when opening the hash file to read it in. |
---|
52 | * |
---|
53 | * Revision 1.41 1994/01/25 07:11:51 geoff |
---|
54 | * Get rid of all old RCS log lines in preparation for the 3.1 release. |
---|
55 | * |
---|
56 | */ |
---|
57 | |
---|
58 | #include "config.h" |
---|
59 | #include "ispell.h" |
---|
60 | #include "proto.h" |
---|
61 | #include "msgs.h" |
---|
62 | |
---|
63 | int linit P ((void)); |
---|
64 | #ifdef INDEXDUMP |
---|
65 | static void dumpindex P ((struct flagptr * indexp, int depth)); |
---|
66 | #endif /* INDEXDUMP */ |
---|
67 | struct dent * lookup P ((ichar_t * word, int dotree)); |
---|
68 | |
---|
/* Nonzero once linit () has completed successfully; makes later calls no-ops. */
static int inited = 0;
---|
70 | |
---|
71 | int linit () |
---|
72 | { |
---|
73 | int hashfd; |
---|
74 | register int i; |
---|
75 | register struct dent * dp; |
---|
76 | struct flagent * entry; |
---|
77 | struct flagptr * ind; |
---|
78 | int nextchar; |
---|
79 | int viazero; |
---|
80 | register ichar_t * cp; |
---|
81 | |
---|
82 | if (inited) |
---|
83 | return 0; |
---|
84 | |
---|
85 | if ((hashfd = open (hashname, 0 | MSDOS_BINARY_OPEN)) < 0) |
---|
86 | { |
---|
87 | (void) fprintf (stderr, CANT_OPEN, hashname); |
---|
88 | return (-1); |
---|
89 | } |
---|
90 | |
---|
91 | hashsize = read (hashfd, (char *) &hashheader, sizeof hashheader); |
---|
92 | if (hashsize < sizeof hashheader) |
---|
93 | { |
---|
94 | if (hashsize < 0) |
---|
95 | (void) fprintf (stderr, LOOKUP_C_CANT_READ, hashname); |
---|
96 | else if (hashsize == 0) |
---|
97 | (void) fprintf (stderr, LOOKUP_C_NULL_HASH, hashname); |
---|
98 | else |
---|
99 | (void) fprintf (stderr, |
---|
100 | LOOKUP_C_SHORT_HASH (hashname, hashsize, |
---|
101 | (int) sizeof hashheader)); |
---|
102 | return (-1); |
---|
103 | } |
---|
104 | else if (hashheader.magic != MAGIC) |
---|
105 | { |
---|
106 | (void) fprintf (stderr, |
---|
107 | LOOKUP_C_BAD_MAGIC (hashname, (unsigned int) MAGIC, |
---|
108 | (unsigned int) hashheader.magic)); |
---|
109 | return (-1); |
---|
110 | } |
---|
111 | else if (hashheader.magic2 != MAGIC) |
---|
112 | { |
---|
113 | (void) fprintf (stderr, |
---|
114 | LOOKUP_C_BAD_MAGIC2 (hashname, (unsigned int) MAGIC, |
---|
115 | (unsigned int) hashheader.magic2)); |
---|
116 | return (-1); |
---|
117 | } |
---|
118 | else if (hashheader.compileoptions != COMPILEOPTIONS |
---|
119 | || hashheader.maxstringchars != MAXSTRINGCHARS |
---|
120 | || hashheader.maxstringcharlen != MAXSTRINGCHARLEN) |
---|
121 | { |
---|
122 | (void) fprintf (stderr, |
---|
123 | LOOKUP_C_BAD_OPTIONS ((unsigned int) hashheader.compileoptions, |
---|
124 | hashheader.maxstringchars, hashheader.maxstringcharlen, |
---|
125 | (unsigned int) COMPILEOPTIONS, MAXSTRINGCHARS, MAXSTRINGCHARLEN)); |
---|
126 | return (-1); |
---|
127 | } |
---|
128 | if (nodictflag) |
---|
129 | { |
---|
130 | /* |
---|
131 | * Dictionary is not needed - create an empty dummy table. We |
---|
132 | * actually have to have one entry since the hash |
---|
133 | * algorithm involves a divide by the table size |
---|
134 | * (actually modulo, but zero is still unacceptable). |
---|
135 | * So we create an empty entry. |
---|
136 | */ |
---|
137 | hashsize = 1; /* This prevents divides by zero */ |
---|
138 | hashtbl = (struct dent *) calloc (1, sizeof (struct dent)); |
---|
139 | if (hashtbl == NULL) |
---|
140 | { |
---|
141 | (void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE); |
---|
142 | return (-1); |
---|
143 | } |
---|
144 | hashtbl[0].word = NULL; |
---|
145 | hashtbl[0].next = NULL; |
---|
146 | hashtbl[0].flagfield &= ~(USED | KEEP); |
---|
147 | /* The flag bits don't matter, but calloc cleared them. */ |
---|
148 | hashstrings = (char *) malloc ((unsigned) hashheader.lstringsize); |
---|
149 | } |
---|
150 | else |
---|
151 | { |
---|
152 | hashtbl = |
---|
153 | (struct dent *) |
---|
154 | malloc ((unsigned) hashheader.tblsize * sizeof (struct dent)); |
---|
155 | hashsize = hashheader.tblsize; |
---|
156 | hashstrings = (char *) malloc ((unsigned) hashheader.stringsize); |
---|
157 | } |
---|
158 | numsflags = hashheader.stblsize; |
---|
159 | numpflags = hashheader.ptblsize; |
---|
160 | sflaglist = (struct flagent *) |
---|
161 | malloc ((numsflags + numpflags) * sizeof (struct flagent)); |
---|
162 | if (hashtbl == NULL || hashstrings == NULL || sflaglist == NULL) |
---|
163 | { |
---|
164 | (void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE); |
---|
165 | return (-1); |
---|
166 | } |
---|
167 | pflaglist = sflaglist + numsflags; |
---|
168 | |
---|
169 | if (nodictflag) |
---|
170 | { |
---|
171 | /* |
---|
172 | * Read just the strings for the language table, and |
---|
173 | * skip over the rest of the strings and all of the |
---|
174 | * hash table. |
---|
175 | */ |
---|
176 | if (read (hashfd, hashstrings, (unsigned) hashheader.lstringsize) |
---|
177 | != hashheader.lstringsize) |
---|
178 | { |
---|
179 | (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT); |
---|
180 | return (-1); |
---|
181 | } |
---|
182 | (void) lseek (hashfd, |
---|
183 | (long) hashheader.stringsize - (long) hashheader.lstringsize |
---|
184 | + (long) hashheader.tblsize * (long) sizeof (struct dent), |
---|
185 | 1); |
---|
186 | } |
---|
187 | else |
---|
188 | { |
---|
189 | if (read (hashfd, hashstrings, (unsigned) hashheader.stringsize) |
---|
190 | != hashheader.stringsize |
---|
191 | || read (hashfd, (char *) hashtbl, |
---|
192 | (unsigned) hashheader.tblsize * sizeof (struct dent)) |
---|
193 | != hashheader.tblsize * sizeof (struct dent)) |
---|
194 | { |
---|
195 | (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT); |
---|
196 | return (-1); |
---|
197 | } |
---|
198 | } |
---|
199 | if (read (hashfd, (char *) sflaglist, |
---|
200 | (unsigned) (numsflags + numpflags) * sizeof (struct flagent)) |
---|
201 | != (numsflags + numpflags) * sizeof (struct flagent)) |
---|
202 | { |
---|
203 | (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT); |
---|
204 | return (-1); |
---|
205 | } |
---|
206 | (void) close (hashfd); |
---|
207 | |
---|
208 | if (!nodictflag) |
---|
209 | { |
---|
210 | for (i = hashsize, dp = hashtbl; --i >= 0; dp++) |
---|
211 | { |
---|
212 | if (dp->word == (char *) -1) |
---|
213 | dp->word = NULL; |
---|
214 | else |
---|
215 | dp->word = &hashstrings [ (int)(dp->word) ]; |
---|
216 | if (dp->next == (struct dent *) -1) |
---|
217 | dp->next = NULL; |
---|
218 | else |
---|
219 | dp->next = &hashtbl [ (int)(dp->next) ]; |
---|
220 | } |
---|
221 | } |
---|
222 | |
---|
223 | for (i = numsflags + numpflags, entry = sflaglist; --i >= 0; entry++) |
---|
224 | { |
---|
225 | if (entry->stripl) |
---|
226 | entry->strip = (ichar_t *) &hashstrings[(int) entry->strip]; |
---|
227 | else |
---|
228 | entry->strip = NULL; |
---|
229 | if (entry->affl) |
---|
230 | entry->affix = (ichar_t *) &hashstrings[(int) entry->affix]; |
---|
231 | else |
---|
232 | entry->affix = NULL; |
---|
233 | } |
---|
234 | /* |
---|
235 | ** Warning - 'entry' and 'i' are reset in the body of the loop |
---|
236 | ** below. Don't try to optimize it by (e.g.) moving the decrement |
---|
237 | ** of i into the loop condition. |
---|
238 | */ |
---|
239 | for (i = numsflags, entry = sflaglist; i > 0; i--, entry++) |
---|
240 | { |
---|
241 | if (entry->affl == 0) |
---|
242 | { |
---|
243 | cp = NULL; |
---|
244 | ind = &sflagindex[0]; |
---|
245 | viazero = 1; |
---|
246 | } |
---|
247 | else |
---|
248 | { |
---|
249 | cp = entry->affix + entry->affl - 1; |
---|
250 | ind = &sflagindex[*cp]; |
---|
251 | viazero = 0; |
---|
252 | while (ind->numents == 0 && ind->pu.fp != NULL) |
---|
253 | { |
---|
254 | if (cp == entry->affix) |
---|
255 | { |
---|
256 | ind = &ind->pu.fp[0]; |
---|
257 | viazero = 1; |
---|
258 | } |
---|
259 | else |
---|
260 | { |
---|
261 | ind = &ind->pu.fp[*--cp]; |
---|
262 | viazero = 0; |
---|
263 | } |
---|
264 | } |
---|
265 | } |
---|
266 | if (ind->numents == 0) |
---|
267 | ind->pu.ent = entry; |
---|
268 | ind->numents++; |
---|
269 | /* |
---|
270 | ** If this index entry has more than MAXSEARCH flags in |
---|
271 | ** it, we will split it into subentries to reduce the |
---|
272 | ** searching. However, the split doesn't make sense in |
---|
273 | ** two cases: (a) if we are already at the end of the |
---|
274 | ** current affix, or (b) if all the entries in the list |
---|
275 | ** have identical affixes. Since the list is sorted, (b) |
---|
276 | ** is true if the first and last affixes in the list |
---|
277 | ** are identical. |
---|
278 | */ |
---|
279 | if (!viazero && ind->numents >= MAXSEARCH |
---|
280 | && icharcmp (entry->affix, ind->pu.ent->affix) != 0) |
---|
281 | { |
---|
282 | /* Sneaky trick: back up and reprocess */ |
---|
283 | entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ |
---|
284 | i = numsflags - (entry - sflaglist); |
---|
285 | ind->pu.fp = |
---|
286 | (struct flagptr *) |
---|
287 | calloc ((unsigned) (SET_SIZE + hashheader.nstrchars), |
---|
288 | sizeof (struct flagptr)); |
---|
289 | if (ind->pu.fp == NULL) |
---|
290 | { |
---|
291 | (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); |
---|
292 | return (-1); |
---|
293 | } |
---|
294 | ind->numents = 0; |
---|
295 | } |
---|
296 | } |
---|
297 | /* |
---|
298 | ** Warning - 'entry' and 'i' are reset in the body of the loop |
---|
299 | ** below. Don't try to optimize it by (e.g.) moving the decrement |
---|
300 | ** of i into the loop condition. |
---|
301 | */ |
---|
302 | for (i = numpflags, entry = pflaglist; i > 0; i--, entry++) |
---|
303 | { |
---|
304 | if (entry->affl == 0) |
---|
305 | { |
---|
306 | cp = NULL; |
---|
307 | ind = &pflagindex[0]; |
---|
308 | viazero = 1; |
---|
309 | } |
---|
310 | else |
---|
311 | { |
---|
312 | cp = entry->affix; |
---|
313 | ind = &pflagindex[*cp++]; |
---|
314 | viazero = 0; |
---|
315 | while (ind->numents == 0 && ind->pu.fp != NULL) |
---|
316 | { |
---|
317 | if (*cp == 0) |
---|
318 | { |
---|
319 | ind = &ind->pu.fp[0]; |
---|
320 | viazero = 1; |
---|
321 | } |
---|
322 | else |
---|
323 | { |
---|
324 | ind = &ind->pu.fp[*cp++]; |
---|
325 | viazero = 0; |
---|
326 | } |
---|
327 | } |
---|
328 | } |
---|
329 | if (ind->numents == 0) |
---|
330 | ind->pu.ent = entry; |
---|
331 | ind->numents++; |
---|
332 | /* |
---|
333 | ** If this index entry has more than MAXSEARCH flags in |
---|
334 | ** it, we will split it into subentries to reduce the |
---|
335 | ** searching. However, the split doesn't make sense in |
---|
336 | ** two cases: (a) if we are already at the end of the |
---|
337 | ** current affix, or (b) if all the entries in the list |
---|
338 | ** have identical affixes. Since the list is sorted, (b) |
---|
339 | ** is true if the first and last affixes in the list |
---|
340 | ** are identical. |
---|
341 | */ |
---|
342 | if (!viazero && ind->numents >= MAXSEARCH |
---|
343 | && icharcmp (entry->affix, ind->pu.ent->affix) != 0) |
---|
344 | { |
---|
345 | /* Sneaky trick: back up and reprocess */ |
---|
346 | entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ |
---|
347 | i = numpflags - (entry - pflaglist); |
---|
348 | ind->pu.fp = |
---|
349 | (struct flagptr *) calloc (SET_SIZE + hashheader.nstrchars, |
---|
350 | sizeof (struct flagptr)); |
---|
351 | if (ind->pu.fp == NULL) |
---|
352 | { |
---|
353 | (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); |
---|
354 | return (-1); |
---|
355 | } |
---|
356 | ind->numents = 0; |
---|
357 | } |
---|
358 | } |
---|
359 | #ifdef INDEXDUMP |
---|
360 | (void) fprintf (stderr, "Prefix index table:\n"); |
---|
361 | dumpindex (pflagindex, 0); |
---|
362 | (void) fprintf (stderr, "Suffix index table:\n"); |
---|
363 | dumpindex (sflagindex, 0); |
---|
364 | #endif |
---|
365 | if (hashheader.nstrchartype == 0) |
---|
366 | chartypes = NULL; |
---|
367 | else |
---|
368 | { |
---|
369 | chartypes = (struct strchartype *) |
---|
370 | malloc (hashheader.nstrchartype * sizeof (struct strchartype)); |
---|
371 | if (chartypes == NULL) |
---|
372 | { |
---|
373 | (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); |
---|
374 | return (-1); |
---|
375 | } |
---|
376 | for (i = 0, nextchar = hashheader.strtypestart; |
---|
377 | i < hashheader.nstrchartype; |
---|
378 | i++) |
---|
379 | { |
---|
380 | chartypes[i].name = &hashstrings[nextchar]; |
---|
381 | nextchar += strlen (chartypes[i].name) + 1; |
---|
382 | chartypes[i].deformatter = &hashstrings[nextchar]; |
---|
383 | nextchar += strlen (chartypes[i].deformatter) + 1; |
---|
384 | chartypes[i].suffixes = &hashstrings[nextchar]; |
---|
385 | while (hashstrings[nextchar] != '\0') |
---|
386 | nextchar += strlen (&hashstrings[nextchar]) + 1; |
---|
387 | nextchar++; |
---|
388 | } |
---|
389 | } |
---|
390 | inited = 1; |
---|
391 | return (0); |
---|
392 | } |
---|
393 | |
---|
394 | #ifdef INDEXDUMP |
---|
395 | static void dumpindex (indexp, depth) |
---|
396 | register struct flagptr * indexp; |
---|
397 | register int depth; |
---|
398 | { |
---|
399 | register int i; |
---|
400 | int j; |
---|
401 | int k; |
---|
402 | char stripbuf[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; |
---|
403 | |
---|
404 | for (i = 0; i < SET_SIZE + hashheader.nstrchars; i++, indexp++) |
---|
405 | { |
---|
406 | if (indexp->numents == 0 && indexp->pu.fp != NULL) |
---|
407 | { |
---|
408 | for (j = depth; --j >= 0; ) |
---|
409 | (void) putc (' ', stderr); |
---|
410 | if (i >= ' ' && i <= '~') |
---|
411 | (void) putc (i, stderr); |
---|
412 | else |
---|
413 | (void) fprintf (stderr, "0x%x", i); |
---|
414 | (void) putc ('\n', stderr); |
---|
415 | dumpindex (indexp->pu.fp, depth + 1); |
---|
416 | } |
---|
417 | else if (indexp->numents) |
---|
418 | { |
---|
419 | for (j = depth; --j >= 0; ) |
---|
420 | (void) putc (' ', stderr); |
---|
421 | if (i >= ' ' && i <= '~') |
---|
422 | (void) putc (i, stderr); |
---|
423 | else |
---|
424 | (void) fprintf (stderr, "0x%x", i); |
---|
425 | (void) fprintf (stderr, " -> %d entries\n", indexp->numents); |
---|
426 | for (k = 0; k < indexp->numents; k++) |
---|
427 | { |
---|
428 | for (j = depth; --j >= 0; ) |
---|
429 | (void) putc (' ', stderr); |
---|
430 | if (indexp->pu.ent[k].stripl) |
---|
431 | { |
---|
432 | (void) ichartostr (stripbuf, indexp->pu.ent[k].strip, |
---|
433 | sizeof stripbuf, 1); |
---|
434 | (void) fprintf (stderr, " entry %d (-%s,%s)\n", |
---|
435 | &indexp->pu.ent[k] - sflaglist, |
---|
436 | stripbuf, |
---|
437 | indexp->pu.ent[k].affl |
---|
438 | ? ichartosstr (indexp->pu.ent[k].affix, 1) : "-"); |
---|
439 | } |
---|
440 | else |
---|
441 | (void) fprintf (stderr, " entry %d (%s)\n", |
---|
442 | &indexp->pu.ent[k] - sflaglist, |
---|
443 | ichartosstr (indexp->pu.ent[k].affix, 1)); |
---|
444 | } |
---|
445 | } |
---|
446 | } |
---|
447 | } |
---|
448 | #endif |
---|
449 | |
---|
450 | /* n is length of s */ |
---|
451 | struct dent * lookup (s, dotree) |
---|
452 | register ichar_t * s; |
---|
453 | int dotree; |
---|
454 | { |
---|
455 | register struct dent * dp; |
---|
456 | register char * s1; |
---|
457 | char schar[INPUTWORDLEN + MAXAFFIXLEN]; |
---|
458 | |
---|
459 | dp = &hashtbl[hash (s, hashsize)]; |
---|
460 | if (ichartostr (schar, s, sizeof schar, 1)) |
---|
461 | (void) fprintf (stderr, WORD_TOO_LONG (schar)); |
---|
462 | for ( ; dp != NULL; dp = dp->next) |
---|
463 | { |
---|
464 | /* quick strcmp, but only for equality */ |
---|
465 | s1 = dp->word; |
---|
466 | if (s1 && s1[0] == schar[0] && strcmp (s1 + 1, schar + 1) == 0) |
---|
467 | return dp; |
---|
468 | #ifndef NO_CAPITALIZATION_SUPPORT |
---|
469 | while (dp->flagfield & MOREVARIANTS) /* Skip variations */ |
---|
470 | dp = dp->next; |
---|
471 | #endif |
---|
472 | } |
---|
473 | if (dotree) |
---|
474 | { |
---|
475 | dp = treelookup (s); |
---|
476 | return dp; |
---|
477 | } |
---|
478 | else |
---|
479 | return NULL; |
---|
480 | } |
---|