source: trunk/third/ispell/makedent.c @ 10334

Revision 10334, 31.2 KB checked in by ghudson, 27 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r10333, which included commits to RCS files with non-trunk default branches.
Line 
1#ifndef lint
2static char Rcs_Id[] =
3        "$Id: makedent.c,v 1.1.1.1 1997-09-03 21:08:12 ghudson Exp $";
4#endif
5
6/*
7 * Copyright 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All modifications to the source code must be clearly marked as
20 *    such.  Binary redistributions based on modified source code
21 *    must be clearly marked as modified versions in the documentation
22 *    and/or other materials provided with the distribution.
23 * 4. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgment:
25 *      This product includes software developed by Geoff Kuenning and
26 *      other unpaid contributors.
27 * 5. The name of Geoff Kuenning may not be used to endorse or promote
28 *    products derived from this software without specific prior
29 *    written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 */
43
44/*
45 * $Log: not supported by cvs2svn $
46 * Revision 1.45  1994/12/27  23:08:52  geoff
47 * Add code to makedent to reject words that contain non-word characters.
48 * This helps protect people who use ISO 8-bit characters when ispell
49 * isn't configured for that option.
50 *
51 * Revision 1.44  1994/10/25  05:46:20  geoff
52 * Fix some incorrect declarations in the lint versions of some routines.
53 *
54 * Revision 1.43  1994/09/16  03:32:34  geoff
55 * Issue an error message for bad affix flags
56 *
57 * Revision 1.42  1994/02/07  04:23:43  geoff
58 * Correctly identify the deformatter when changing file types
59 *
60 * Revision 1.41  1994/01/25  07:11:55  geoff
61 * Get rid of all old RCS log lines in preparation for the 3.1 release.
62 *
63 */
64
65#include "config.h"
66#include "ispell.h"
67#include "proto.h"
68#include "msgs.h"
69
70int             makedent P ((char * lbuf, int lbuflen, struct dent * ent));
71#ifndef NO_CAPITALIZATION_SUPPORT
72long            whatcap P ((ichar_t * word));
73#endif
74int             addvheader P ((struct dent * ent));
75int             combinecaps P ((struct dent * hdr, struct dent * newent));
76#ifndef NO_CAPITALIZATION_SUPPORT
77static void     forcevheader P ((struct dent * hdrp, struct dent * oldp,
78                  struct dent * newp));
79#endif /* NO_CAPITALIZATION_SUPPORT */
80static int      combine_two_entries P ((struct dent * hdrp,
81                  struct dent * oldp, struct dent * newp));
82static int      acoversb P ((struct dent * enta, struct dent * entb));
83void            upcase P ((ichar_t * string));
84void            lowcase P ((ichar_t * string));
85void            chupcase P ((char * s));
86static int      issubset P ((struct dent * ent1, struct dent * ent2));
87static void     combineaffixes P ((struct dent * ent1, struct dent * ent2));
88void            toutent P ((FILE * outfile, struct dent * hent,
89                  int onlykeep));
90static void     toutword P ((FILE * outfile, char * word,
91                  struct dent * cent));
92static void     flagout P ((FILE * outfile, int flag));
93int             stringcharlen P ((char * bufp, int canonical));
94int             strtoichar P ((ichar_t * out, char * in, int outlen,
95                  int canonical));
96int             ichartostr P ((char * out, ichar_t * in, int outlen,
97                  int canonical));
98ichar_t *       strtosichar P ((char * in, int canonical));
99char *          ichartosstr P ((ichar_t * in, int canonical));
100char *          printichar P ((int in));
101#ifndef ICHAR_IS_CHAR
102ichar_t *       icharcpy P ((ichar_t * out, ichar_t * in));
103int             icharlen P ((ichar_t * str));
104int             icharcmp P ((ichar_t * s1, ichar_t * s2));
105int             icharncmp P ((ichar_t * s1, ichar_t * s2, int n));
106#endif /* ICHAR_IS_CHAR */
107int             findfiletype P ((char * name, int searchnames,
108                  int * deformatter));
109
110static int      has_marker;
111
112/*
113 * Fill in a directory entry, including setting the capitalization flags, and
114 * allocate and initialize memory for the d->word field.  Returns -1
115 * if there was trouble.  The input word must be in canonical form.
116 */
117
118int makedent (lbuf, lbuflen, d)
119    char *              lbuf;
120    int                 lbuflen;
121    struct dent *       d;
122    {
123    ichar_t             ibuf[INPUTWORDLEN + MAXAFFIXLEN];
124    ichar_t *           ip;
125    char *              p;
126    int                 bit;
127    int                 len;
128
129    /* Strip off any trailing newline */
130    len = strlen (lbuf) - 1;
131    if (lbuf[len] == '\n')
132        lbuf[len] = '\0';
133
134    d->next = NULL;
135    /* WARNING:  flagfield might be the same as mask! See ispell.h. */
136    d->flagfield = 0;
137    (void) bzero ((char *) d->mask, sizeof (d->mask));
138    d->flagfield |= USED;
139    d->flagfield &= ~KEEP;
140
141    p = index (lbuf, hashheader.flagmarker);
142    if (p != NULL)
143        *p = 0;
144
145    /*
146    ** Convert the word to an ichar_t and back;  this makes sure that
147    ** it is in canonical form and thus that the length is correct.
148    */
149    if (strtoichar (ibuf, lbuf, INPUTWORDLEN * sizeof (ichar_t), 1)
150      ||  ichartostr (lbuf, ibuf, lbuflen, 1))
151        {
152        (void) fprintf (stderr, WORD_TOO_LONG (lbuf));
153        return (-1);
154        }
155    /*
156    ** Make sure the word is well-formed (contains only legal characters).
157    */
158    for (ip = ibuf;  *ip != 0;  ip++)
159        {
160        if (!iswordch (*ip))
161            {
162            /* Boundary characters are legal as long as they're not at edges */
163            if (!isboundarych (*ip)
164              ||  ip == ibuf  ||  ip[1] == 0)
165                {
166                (void) fprintf (stderr, MAKEDENT_C_BAD_WORD_CHAR, lbuf);
167                return -1;
168                }
169            }
170        }
171    len = strlen (lbuf);
172#ifndef NO_CAPITALIZATION_SUPPORT
173    /*
174    ** Figure out the capitalization rules from the capitalization of
175    ** the sample entry.
176    */
177    d->flagfield |= whatcap (ibuf);
178#endif
179
180    if (len > INPUTWORDLEN - 1)
181        {
182        (void) fprintf (stderr, WORD_TOO_LONG (lbuf));
183        return (-1);
184        }
185
186    d->word = mymalloc ((unsigned) len + 1);
187    if (d->word == NULL)
188        {
189        (void) fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, lbuf);
190        return -1;
191        }
192
193    (void) strcpy (d->word, lbuf);
194#ifdef NO_CAPITALIZATION_SUPPORT
195    chupcase (d->word);
196#else /* NO_CAPITALIZATION_SUPPORT */
197    if (captype (d->flagfield) != FOLLOWCASE)
198        chupcase (d->word);
199#endif /* NO_CAPITALIZATION_SUPPORT */
200    if (p == NULL)
201        return (0);
202
203    p++;
204    while (*p != '\0'  &&  *p != '\n')
205        {
206        bit = CHARTOBIT ((unsigned char) *p);
207        if (bit >= 0  &&  bit <= LARGESTFLAG)
208            SETMASKBIT (d->mask, bit);
209        else
210            (void) fprintf (stderr, BAD_FLAG, (unsigned char) *p);
211        p++;
212        if (*p == hashheader.flagmarker)
213            p++;                /* Handle old-format dictionaries too */
214        }
215    return (0);
216    }
217
218#ifndef NO_CAPITALIZATION_SUPPORT
219/*
220** Classify the capitalization of a sample entry.  Returns one of the
221** four capitalization codes ANYCASE, ALLCAPS, CAPITALIZED, or FOLLOWCASE.
222*/
223
224long whatcap (word)
225    register ichar_t *  word;
226    {
227    register ichar_t *  p;
228
229    for (p = word;  *p;  p++)
230        {
231        if (mylower (*p))
232            break;
233        }
234    if (*p == '\0')
235        return ALLCAPS;
236    else
237        {
238        for (  ;  *p;  p++)
239            {
240            if (myupper (*p))
241                break;
242            }
243        if (*p == '\0')
244            {
245            /*
246            ** No uppercase letters follow the lowercase ones.
247            ** If there is more than one uppercase letter, it's
248            ** "followcase". If only the first one is capitalized,
249            ** it's "capitalize".  If there are no capitals
250            ** at all, it's ANYCASE.
251            */
252            if (myupper (word[0]))
253                {
254                for (p = word + 1;  *p != '\0';  p++)
255                    {
256                    if (myupper (*p))
257                        return FOLLOWCASE;
258                    }
259                return CAPITALIZED;
260                }
261            else
262                return ANYCASE;
263            }
264        else
265            return FOLLOWCASE;  /* .../lower/upper */
266        }
267    }
268
269/*
270** Add a variant-capitalization header to a word.  This routine may be
271** called even for a followcase word that doesn't yet have a header.
272**
273** Returns 0 if all was ok, -1 if allocation error.
274*/
275int addvheader (dp)
276    register struct dent *      dp;     /* Entry to update */
277    {
278    register struct dent *      tdent; /* Copy of entry */
279
280    /*
281    ** Add a second entry with the correct capitalization, and then make
282    ** dp into a special dummy entry.
283    */
284    tdent = (struct dent *) mymalloc (sizeof (struct dent));
285    if (tdent == NULL)
286        {
287        (void) fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
288        return -1;
289        }
290    *tdent = *dp;
291    if (captype (tdent->flagfield) != FOLLOWCASE)
292        tdent->word = NULL;
293    else
294        {
295        /* Followcase words need a copy of the capitalization */
296        tdent->word = mymalloc ((unsigned int) strlen (tdent->word) + 1);
297        if (tdent->word == NULL)
298            {
299            (void) fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
300            myfree ((char *) tdent);
301            return -1;
302            }
303        (void) strcpy (tdent->word, dp->word);
304        }
305    chupcase (dp->word);
306    dp->next = tdent;
307    dp->flagfield &= ~CAPTYPEMASK;
308    dp->flagfield |= (ALLCAPS | MOREVARIANTS);
309    return 0;
310    }
311#endif /* NO_CAPITALIZATION_SUPPORT */
312
313/*
314** Combine and resolve the entries describing two capitalizations of the same
315** word.  This may require allocating yet more entries.
316**
317** Hdrp is a pointer into a hash table.  If the word covered by hdrp has
318** variations, hdrp must point to the header.  Newp is a pointer to temporary
319** storage, and space is malloc'ed if newp is to be kept.  The newp->word
320** field must have been allocated with mymalloc, so that this routine may free
321** the space if it keeps newp but not the word.
322**
323** Return value:  0 if the word was added, 1 if the word was combined
324** with an existing entry, and -1 if trouble occurred (e.g., malloc).
325** If 1 is returned, newp->word may have been freed using myfree.
326**
327** Life is made much more difficult by the KEEP flag's possibilities.  We
328** must ensure that a !KEEP word doesn't find its way into the personal
329** dictionary as a result of this routine's actions.  However, a !KEEP
330** word that has affixes must have come from the main dictionary, so it
331** is acceptable to combine entries in that case (got that?).
332**
333** The net result of all this is a set of rules that is a bloody pain
334** to figure out.  Basically, we want to choose one of the following actions:
335**
336**      (1) Add newp's affixes and KEEP flag to oldp, and discard newp.
337**      (2) Add oldp's affixes and KEEP flag to newp, replace oldp with
338**          newp, and discard newp.
339#ifndef NO_CAPITALIZATION_SUPPORT
340**      (3) Insert newp as a new entry in the variants list.  If there is
341**          currently no variant header, this requires adding one.  Adding a
342**          header splits into two sub-cases:
343**
344**          (3a) If oldp is ALLCAPS and the KEEP flags match, just turn it
345**              into the header.
346**          (3b) Otherwise, add a new entry to serve as the header.
347**              To ease list linking, this is done by copying oldp into
348**              the new entry, and then performing (3a).
349**
350**          After newp has been added as a variant, its affixes and KEEP
351**          flag are OR-ed into the variant header.
352#endif
353**
354** So how to choose which?  The default is always case (3), which adds newp
355** as a new entry in the variants list.  Cases (1) and (2) are symmetrical
356** except for which entry is discarded.  We can use case (1) or (2) whenever
357** one entry "covers" the other.  "Covering" is defined as follows:
358**
359**      (4) For entries with matching capitalization types, A covers B
360**          if:
361**
362**          (4a) B's affix flags are a subset of A's, or the KEEP flags
363**               match, and
364**          (4b) either the KEEP flags match, or A's KEEP flag is set.
365**              (Since A has more suffixes, combining B with it won't
366**              cause any extra suffixes to be added to the dictionary.)
367**          (4c) If the words are FOLLOWCASE, the capitalizations match
368**              exactly.
369**
370#ifndef NO_CAPITALIZATION_SUPPORT
371**      (5) For entries with mismatched capitalization types, A covers B
372**          if (4a) and (4b) are true, and:
373**
374**          (5a) B is ALLCAPS, or
375**          (5b) A is ANYCASE, and B is CAPITALIZED.
376#endif
377**
378** For any "hdrp" without variants, oldp is the same as hdrp.  Otherwise,
379** the above tests are applied using each variant in turn for oldp.
380*/
381int combinecaps (hdrp, newp)
382    struct dent *       hdrp;   /* Header of entry currently in dictionary */
383    register struct dent *
384                        newp;   /* Entry to add */
385    {
386    register struct dent *
387                        oldp;   /* Current "oldp" entry */
388#ifndef NO_CAPITALIZATION_SUPPORT
389    register struct dent *
390                        tdent; /* Entry we'll add to the dictionary */
391#endif /* NO_CAPITALIZATION_SUPPORT */
392    register int        retval = 0; /* Return value from combine_two_entries */
393
394    /*
395    ** First, see if we can combine the two entries (cases 1 and 2).  If
396    ** combine_two_entries does so, it will return 1.  If it has trouble,
397    ** it will return zero.
398    */
399    oldp = hdrp;
400#ifdef NO_CAPITALIZATION_SUPPORT
401    retval = combine_two_entries (hdrp, oldp, newp);
402#else /* NO_CAPITALIZATION_SUPPORT */
403    if ((oldp->flagfield & (CAPTYPEMASK | MOREVARIANTS))
404      == (ALLCAPS | MOREVARIANTS))
405        {
406        while (oldp->flagfield & MOREVARIANTS)
407            {
408            oldp = oldp->next;
409            retval = combine_two_entries (hdrp, oldp, newp);
410            if (retval != 0)            /* Did we combine them? */
411                break;
412            }
413        }
414    else
415        retval = combine_two_entries (hdrp, oldp, newp);
416    if (retval == 0)
417        {
418        /*
419        ** Couldn't combine the two entries.  Add a new variant.  For
420        ** ease, we'll stick it right behind the header, rather than
421        ** at the end of the list.
422        */
423        forcevheader (hdrp, oldp, newp);
424        tdent = (struct dent *) mymalloc (sizeof (struct dent));
425        if (tdent == NULL)
426            {
427            (void) fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, newp->word);
428            return -1;
429            }
430        *tdent = *newp;
431        tdent->next = hdrp->next;
432        hdrp->next = tdent;
433        tdent->flagfield |= (hdrp->flagfield & MOREVARIANTS);
434        hdrp->flagfield |= MOREVARIANTS;
435        combineaffixes (hdrp, newp);
436        hdrp->flagfield |= (newp->flagfield & KEEP);
437        if (captype (newp->flagfield) == FOLLOWCASE)
438            tdent->word = newp->word;
439        else
440            {
441            tdent->word = NULL;
442            myfree (newp->word);                /* newp->word isn't needed */
443            }
444        }
445#endif /* NO_CAPITALIZATION_SUPPORT */
446    return retval;
447    }
448
449#ifndef NO_CAPITALIZATION_SUPPORT
450/*
451** The following routine implements steps 3a and 3b in the commentary
452** for "combinecaps".
453*/
454static void forcevheader (hdrp, oldp, newp)
455    register struct dent *      hdrp;
456    struct dent *               oldp;
457    struct dent *               newp;
458    {
459
460    if ((hdrp->flagfield & (CAPTYPEMASK | MOREVARIANTS)) == ALLCAPS
461      &&  ((oldp->flagfield ^ newp->flagfield) & KEEP) == 0)
462        return;                 /* Caller will set MOREVARIANTS */
463    else if ((hdrp->flagfield & (CAPTYPEMASK | MOREVARIANTS))
464      != (ALLCAPS | MOREVARIANTS))
465        (void) addvheader (hdrp);
466    }
467#endif /* NO_CAPITALIZATION_SUPPORT */
468
469/*
470** This routine implements steps 4 and 5 of the commentary for "combinecaps".
471**
472** Returns 1 if newp can be discarded, 0 if nothing done.
473*/
474static int combine_two_entries (hdrp, oldp, newp)
475    struct dent *       hdrp;   /* (Possible) header of variant chain */
476    register struct dent *
477                        oldp;   /* Pre-existing dictionary entry */
478    register struct dent *
479                        newp;   /* Entry to possibly combine */
480    {
481
482    if (acoversb (oldp, newp))
483        {
484        /* newp is superfluous.  Drop it, preserving affixes and keep flag */
485        combineaffixes (oldp, newp);
486        oldp->flagfield |= (newp->flagfield & KEEP);
487        hdrp->flagfield |= (newp->flagfield & KEEP);
488        myfree (newp->word);
489        return 1;
490        }
491    else if (acoversb (newp, oldp))
492        {
493        /*
494        ** oldp is superfluous.  Replace it with newp, preserving affixes and
495        ** the keep flag.
496        */
497        combineaffixes (newp, oldp);
498#ifdef NO_CAPITALIZATION_SUPPORT
499        newp->flagfield |= (oldp->flagfield & KEEP);
500#else /* NO_CAPITALIZATION_SUPPORT */
501        newp->flagfield |= (oldp->flagfield & (KEEP | MOREVARIANTS));
502#endif /* NO_CAPITALIZATION_SUPPORT */
503        hdrp->flagfield |= (newp->flagfield & KEEP);
504        newp->next = oldp->next;
505        /*
506        ** We really want to free oldp->word, but that might be part of
507        ** "hashstrings".  So we'll futz around to arrange things so we can
508        ** free newp->word instead.  This depends very much on the fact
509        ** that both words are the same length.
510        */
511        if (oldp->word != NULL)
512            (void) strcpy (oldp->word, newp->word);
513        myfree (newp->word);    /* No longer needed */
514        newp->word = oldp->word;
515        *oldp = *newp;
516#ifndef NO_CAPITALIZATION_SUPPORT
517        /* We may need to add a header if newp is followcase */
518        if (captype (newp->flagfield) == FOLLOWCASE
519          &&  (hdrp->flagfield & (CAPTYPEMASK | MOREVARIANTS))
520            != (ALLCAPS | MOREVARIANTS))
521            (void) addvheader (hdrp);
522#endif /* NO_CAPITALIZATION_SUPPORT */
523        return 1;
524        }
525    else
526        return 0;
527    }
528
529/*
530** Determine if enta covers entb, according to the rules in steps 4 and 5
531** of the commentary for "combinecaps".
532*/
533static int acoversb (enta, entb)
534    register struct dent *      enta;   /* "A" in the rules */
535    register struct dent *      entb;   /* "B" in the rules */
536    {
537    int                         subset; /* NZ if entb is a subset of enta */
538
539    if ((subset = issubset (entb, enta)) != 0)
540        {
541        /* entb is a subset of enta;  thus enta might cover entb */
542        if (((enta->flagfield ^ entb->flagfield) & KEEP) != 0
543          &&  (enta->flagfield & KEEP) == 0)    /* Inverse of condition (4b) */
544            return 0;
545        }
546    else
547        {
548        /* not a subset;  KEEP flags must match exactly (both (4a) and (4b)) */
549        if (((enta->flagfield ^ entb->flagfield) & KEEP) != 0)
550            return 0;
551        }
552
553    /* Rules (4a) and (4b) are satisfied;  check for capitalization match */
554#ifdef NO_CAPITALIZATION_SUPPORT
555#ifdef lint
556    return subset;                              /* Just so it gets used */
557#else /* lint */
558    return 1;                                   /* All words match */
559#endif /* lint */
560#else /* NO_CAPITALIZATION_SUPPORT */
561    if (((enta->flagfield ^ entb->flagfield) & CAPTYPEMASK) == 0)
562        {
563        if (captype (enta->flagfield) != FOLLOWCASE     /* Condition (4c) */
564          ||  strcmp (enta->word, entb->word) == 0)
565            return 1;                           /* Perfect match */
566        else
567            return 0;
568        }
569    else if (subset == 0)                       /* No flag subset, refuse */
570        return 0;                               /* ..near matches */
571    else if (captype (entb->flagfield) == ALLCAPS)
572        return 1;
573    else if (captype (enta->flagfield) == ANYCASE
574      &&  captype (entb->flagfield) == CAPITALIZED)
575        return 1;
576    else
577        return 0;
578#endif /* NO_CAPITALIZATION_SUPPORT */
579    }
580
581void upcase (s)
582    register ichar_t *  s;
583    {
584
585    while (*s)
586        {
587        *s = mytoupper (*s);
588        s++;
589        }
590    }
591
592void lowcase (s)
593    register ichar_t *  s;
594    {
595
596    while (*s)
597        {
598        *s = mytolower (*s);
599        s++;
600        }
601    }
602
603/*
604 * Upcase variant that works on normal strings.  Note that it is a lot
605 * slower than the normal upcase.  The input must be in canonical form.
606 */
607void chupcase (s)
608    char *      s;
609    {
610    ichar_t *   is;
611
612    is = strtosichar (s, 1);
613    upcase (is);
614    (void) ichartostr (s, is, strlen (s) + 1, 1);
615    }
616
617/*
618** See if one affix field is a subset of another.  Returns NZ if ent1
619** is a subset of ent2.  The KEEP flag is not taken into consideration.
620*/
621static int issubset (ent1, ent2)
622    register struct dent *      ent1;
623    register struct dent *      ent2;
624    {
625/* The following is really testing for MASKSIZE > 1, but cpp can't do that */
626#if MASKBITS > 32
627    register int                flagword;
628
629#ifdef FULLMASKSET
630#define MASKMAX MASKSIZE
631#else
632#define MASKMAX MASKSIZE - 1
633#endif /* FULLMASKSET */
634    for (flagword = MASKMAX;  --flagword >= 0;  )
635        {
636        if ((ent1->mask[flagword] & ent2->mask[flagword])
637          != ent1->mask[flagword])
638            return 0;
639        }
640#endif /* MASKBITS > 32 */
641#ifdef FULLMASKSET
642    return ((ent1->mask[MASKSIZE - 1] & ent2->mask[MASKSIZE - 1])
643      == ent1->mask[MASKSIZE - 1]);
644#else
645    if (((ent1->mask[MASKSIZE - 1] & ent2->mask[MASKSIZE - 1])
646      ^ ent1->mask[MASKSIZE - 1]) & ~ALLFLAGS)
647        return 0;
648    else
649        return 1;
650#endif /* FULLMASKSET */
651    }
652
653/*
654** Add ent2's affix flags to ent1.
655*/
656static void combineaffixes (ent1, ent2)
657    register struct dent *      ent1;
658    register struct dent *      ent2;
659    {
660/* The following is really testing for MASKSIZE > 1, but cpp can't do that */
661#if MASKBITS > 32
662    register int                flagword;
663
664    if (ent1 == ent2)
665        return;
666    /* MASKMAX is defined in issubset, just above */
667    for (flagword = MASKMAX;  --flagword >= 0;  )
668        ent1->mask[flagword] |= ent2->mask[flagword];
669#endif /* MASKBITS > 32 */
670#ifndef FULLMASKSET
671    ent1->mask[MASKSIZE - 1] |= ent2->mask[MASKSIZE - 1] & ~ALLFLAGS;
672#endif
673    }
674
675/*
676** Write out a dictionary entry, including capitalization variants.
677** If onlykeep is true, only those variants with KEEP set will be
678** written.
679*/
680void toutent (toutfile, hent, onlykeep)
681    register FILE *     toutfile;
682    struct dent *       hent;
683    register int        onlykeep;
684    {
685#ifdef NO_CAPITALIZATION_SUPPORT
686    if (!onlykeep  ||  (hent->flagfield & KEEP))
687        toutword (toutfile, hent->word, hent);
688#else
689    register struct dent * cent;
690    ichar_t             wbuf[INPUTWORDLEN + MAXAFFIXLEN];
691
692    cent = hent;
693    if (strtoichar (wbuf, cent->word, INPUTWORDLEN, 1))
694        (void) fprintf (stderr, WORD_TOO_LONG (cent->word));
695    for (  ;  ;  )
696        {
697        if (!onlykeep  ||  (cent->flagfield & KEEP))
698            {
699            switch (captype (cent->flagfield))
700                {
701                case ANYCASE:
702                    lowcase (wbuf);
703                    toutword (toutfile, ichartosstr (wbuf, 1), cent);
704                    break;
705                case ALLCAPS:
706                    if ((cent->flagfield & MOREVARIANTS) == 0
707                      ||  cent != hent)
708                        {
709                        upcase (wbuf);
710                        toutword (toutfile, ichartosstr (wbuf, 1), cent);
711                        }
712                    break;
713                case CAPITALIZED:
714                    lowcase (wbuf);
715                    wbuf[0] = mytoupper (wbuf[0]);
716                    toutword (toutfile, ichartosstr (wbuf, 1), cent);
717                    break;
718                case FOLLOWCASE:
719                    toutword (toutfile, cent->word, cent);
720                    break;
721                }
722            }
723        if (cent->flagfield & MOREVARIANTS)
724            cent = cent->next;
725        else
726            break;
727        }
728#endif
729    }
730               
731static void toutword (toutfile, word, cent)
732    register FILE *     toutfile;
733    char *              word;
734    register struct dent * cent;
735    {
736    register int        bit;
737
738    has_marker = 0;
739    (void) fprintf (toutfile, "%s", word);
740    for (bit = 0;  bit < LARGESTFLAG;  bit++)
741        {
742        if (TSTMASKBIT (cent->mask, bit))
743          flagout (toutfile, BITTOCHAR (bit));
744        }
745    (void) fprintf (toutfile, "\n");
746    }
747
748static void flagout (toutfile, flag)
749    register FILE *     toutfile;
750    int                 flag;
751    {
752    if (!has_marker)
753        (void) putc (hashheader.flagmarker, toutfile);
754    has_marker = 1;
755    (void) putc (flag, toutfile);
756    }
757
758/*
759 * If the string under the given pointer begins with a string character,
760 * return the length of that "character".  If not, return 0.
761 * May be called any time, but it's best if "isstrstart" is first
762 * used to filter out unnecessary calls.
763 *
764 * As a side effect, "laststringch" is set to the number of the string
765 * found, or to -1 if none was found.  This can be useful for such things
766 * as case conversion.
767 */
768int stringcharlen (bufp, canonical)
769    char *              bufp;
770    int                 canonical;      /* NZ if input is in canonical form */
771    {
772#ifdef SLOWMULTIPLY
773    static char *       sp[MAXSTRINGCHARS];
774    static int          inited = 0;
775#endif /* SLOWMULTIPLY */
776    register char *     bufcur;
777    register char *     stringcur;
778    register int        stringno;
779    register int        lowstringno;
780    register int        highstringno;
781    int                 dupwanted;
782
783#ifdef SLOWMULTIPLY
784    if (!inited)
785        {
786        inited = 1;
787        for (stringno = 0;  stringno < MAXSTRINGCHARS;  stringno++)
788            sp[stringno] = &hashheader.stringchars[stringno][0];
789        }
790#endif /* SLOWMULTIPLY */
791    lowstringno = 0;
792    highstringno = hashheader.nstrchars - 1;
793    dupwanted = canonical ? 0 : defdupchar;
794    while (lowstringno <= highstringno)
795        {
796        stringno = (lowstringno + highstringno) >> 1;
797#ifdef SLOWMULTIPLY
798        stringcur = sp[stringno];
799#else /* SLOWMULTIPLY */
800        stringcur = &hashheader.stringchars[stringno][0];
801#endif /* SLOWMULTIPLY */
802        bufcur = bufp;
803        while (*stringcur)
804            {
805#ifdef NO8BIT
806            if (((*bufcur++ ^ *stringcur) & 0x7F) != 0)
807#else /* NO8BIT */
808            if (*bufcur++ != *stringcur)
809#endif /* NO8BIT */
810                break;
811            /*
812            ** We can't use autoincrement above because of the
813            ** test below.
814            */
815            stringcur++;
816            }
817        if (*stringcur == '\0')
818            {
819            if (hashheader.dupnos[stringno] == dupwanted)
820                {
821                /* We have a match */
822                laststringch = hashheader.stringdups[stringno];
823#ifdef SLOWMULTIPLY
824                return stringcur - sp[stringno];
825#else /* SLOWMULTIPLY */
826                return stringcur - &hashheader.stringchars[stringno][0];
827#endif /* SLOWMULTIPLY */
828                }
829            else
830                --stringcur;
831            }
832        /* No match - choose which side to search on */
833#ifdef NO8BIT
834        if ((*--bufcur & 0x7F) < (*stringcur & 0x7F))
835            highstringno = stringno - 1;
836        else if ((*bufcur & 0x7F) > (*stringcur & 0x7F))
837            lowstringno = stringno + 1;
838#else /* NO8BIT */
839        if (*--bufcur < *stringcur)
840            highstringno = stringno - 1;
841        else if (*bufcur > *stringcur)
842            lowstringno = stringno + 1;
843#endif /* NO8BIT */
844        else if (dupwanted < hashheader.dupnos[stringno])
845            highstringno = stringno - 1;
846        else
847            lowstringno = stringno + 1;
848        }
849    laststringch = -1;
850    return 0;                   /* Not a string character */
851    }
852
853/*
854 * Convert an external string to an ichar_t string.  If necessary, the parity
855 * bit is stripped off as part of the process.
856 *
857 * Returns NZ if the output string overflowed.
858 */
859int strtoichar (out, in, outlen, canonical)
860    register ichar_t *  out;            /* Where to put result */
861    register char *     in;             /* String to convert */
862    int                 outlen;         /* Size of output buffer, *BYTES* */
863    int                 canonical;      /* NZ if input is in canonical form */
864    {
865    register int        len;            /* Length of next character */
866
867    outlen /= sizeof (ichar_t);         /* Convert to an ichar_t count */
868    for (  ;  --outlen > 0  &&  *in != '\0';  in += len)
869        {
870        if (l1_isstringch (in, len, canonical))
871            *out++ = SET_SIZE + laststringch;
872        else
873            *out++ = *in & NOPARITY;
874        }
875    *out = 0;
876    return outlen <= 0;
877    }
878
879/*
880 * Convert an ichar_t string to an external string.
881 *
882 * WARNING: the resulting string may wind up being longer than the
883 * original.  In fact, even the sequence strtoichar->ichartostr may
884 * produce a result longer than the original, because the output form
885 * may use a different string type set than the original input form.
886 *
887 * Returns NZ if the output string overflowed.
888 */
889int ichartostr (out, in, outlen, canonical)
890    register char *     out;            /* Where to put result */
891    register ichar_t *  in;             /* String to convert */
892    int                 outlen;         /* Size of output buffer, bytes */
893    int                 canonical;      /* NZ for canonical form */
894    {
895    register int        ch;             /* Next character to store */
896    register int        i;              /* Index into duplicates list */
897    register char *     scharp;         /* Pointer into a string char */
898
899    while (--outlen > 0  &&  (ch = *in++) != 0)
900        {
901        if (ch < SET_SIZE)
902            *out++ = (char) ch;
903        else
904            {
905            ch -= SET_SIZE;
906            if (!canonical)
907                {
908                for (i = hashheader.nstrchars;  --i >= 0;  )
909                    {
910                    if (hashheader.dupnos[i] == defdupchar
911                      &&  hashheader.stringdups[i] == ch)
912                        {
913                        ch = i;
914                        break;
915                        }
916                    }
917                }
918            scharp = hashheader.stringchars[(unsigned) ch];
919            while ((*out++ = *scharp++) != '\0')
920                ;
921            out--;
922            }
923        }
924    *out = '\0';
925    return outlen <= 0;
926    }
927
928/*
929 * Convert a string to an ichar_t, storing the result in a static area.
930 */
931ichar_t * strtosichar (in, canonical)
932    char *              in;             /* String to convert */
933    int                 canonical;      /* NZ if input is in canonical form */
934    {
935    static ichar_t      out[STRTOSICHAR_SIZE / sizeof (ichar_t)];
936
937    if (strtoichar (out, in, sizeof out, canonical))
938        (void) fprintf (stderr, WORD_TOO_LONG (in));
939    return out;
940    }
941
942/*
943 * Convert an ichar_t to a string, storing the result in a static area.
944 */
945char * ichartosstr (in, canonical)
946    ichar_t *           in;             /* Internal string to convert */
947    int                 canonical;      /* NZ for canonical conversion */
948    {
949    static char         out[ICHARTOSSTR_SIZE];
950
951    if (ichartostr (out, in, sizeof out, canonical))
952        (void) fprintf (stderr, WORD_TOO_LONG (out));
953    return out;
954    }
955
956/*
957 * Convert a single ichar to a printable string, storing the result in
958 * a static area.
959 */
960char * printichar (in)
961    int                 in;
962    {
963    static char         out[MAXSTRINGCHARLEN + 1];
964
965    if (in < SET_SIZE)
966        {
967        out[0] = (char) in;
968        out[1] = '\0';
969        }
970    else
971        (void) strcpy (out, hashheader.stringchars[(unsigned) in - SET_SIZE]);
972    return out;
973    }
974
975#ifndef ICHAR_IS_CHAR
976/*
977 * Copy an ichar_t.
978 */
979ichar_t * icharcpy (out, in)
980    register ichar_t *  out;            /* Destination */
981    register ichar_t *  in;             /* Source */
982    {
983    ichar_t *           origout;        /* Copy of destination for return */
984
985    origout = out;
986    while ((*out++ = *in++) != 0)
987        ;
988    return origout;
989    }
990
991/*
992 * Return the length of an ichar_t.
993 */
994int icharlen (in)
995    register ichar_t *  in;             /* String to count */
996    {
997    register int        len;            /* Length so far */
998
999    for (len = 0;  *in++ != 0;  len++)
1000        ;
1001    return len;
1002    }
1003
1004/*
1005 * Compare two ichar_t's.
1006 */
1007int icharcmp (s1, s2)
1008    register ichar_t *  s1;
1009    register ichar_t *  s2;
1010    {
1011
1012    while (*s1 != 0)
1013        {
1014        if (*s1++ != *s2++)
1015            return *--s1 - *--s2;
1016        }
1017    return *s1 - *s2;
1018    }
1019
1020/*
1021 * Strncmp for two ichar_t's.
1022 */
1023int icharncmp (s1, s2, n)
1024    register ichar_t *  s1;
1025    register ichar_t *  s2;
1026    register int        n;
1027    {
1028
1029    while (--n >= 0  &&  *s1 != 0)
1030        {
1031        if (*s1++ != *s2++)
1032            return *--s1 - *--s2;
1033        }
1034    if (n < 0)
1035        return 0;
1036    else
1037        return *s1 - *s2;
1038    }
1039
1040#endif /* ICHAR_IS_CHAR */
1041
1042int findfiletype (name, searchnames, deformatter)
1043    char *              name;           /* Name to look up in suffix table */
1044    int                 searchnames;    /* NZ to search name field of table */
1045    int *               deformatter;    /* Where to set deformatter type */
1046    {
1047    char *              cp;             /* Pointer into suffix list */
1048    int                 cplen;          /* Length of current suffix */
1049    register int        i;              /* Index into type table */
1050    int                 len;            /* Length of the name */
1051
1052    /*
1053     * Note:  for now, the deformatter is set to 1 for tex, 0 for nroff.
1054     * Further, we assume that it's one or the other, so that a test
1055     * for tex is sufficient.  This needs to be generalized.
1056     */
1057    len = strlen (name);
1058    if (searchnames)
1059        {
1060        for (i = 0;  i < hashheader.nstrchartype;  i++)
1061            {
1062            if (strcmp (name, chartypes[i].name) == 0)
1063                {
1064                if (deformatter != NULL)
1065                    *deformatter =
1066                      (strcmp (chartypes[i].deformatter, "tex") == 0);
1067                return i;
1068                }
1069            }
1070        }
1071    for (i = 0;  i < hashheader.nstrchartype;  i++)
1072        {
1073        for (cp = chartypes[i].suffixes;  *cp != '\0';  cp += cplen + 1)
1074            {
1075            cplen = strlen (cp);
1076            if (len >= cplen  &&  strcmp (&name[len - cplen], cp) == 0)
1077                {
1078                if (deformatter != NULL)
1079                    *deformatter =
1080                      (strcmp (chartypes[i].deformatter, "tex") == 0);
1081                return i;
1082                }
1083            }
1084        }
1085    return -1;
1086    }
1087
/*
 * The following routines are all dummies for the benefit of lint.
 * They are compiled only when "lint" is defined, and each one merely
 * touches its arguments so that lint sees them used.
 */
#ifdef lint
int TSTMASKBIT (mask, bit)
    MASKTYPE *          mask;
    int                 bit;
    {
    return bit + (int) *mask;
    }

void CLRMASKBIT (mask, bit)
    MASKTYPE *          mask;
    int                 bit;
    {
    bit += (int) *mask;
    }

void SETMASKBIT (mask, bit)
    MASKTYPE *          mask;
    int                 bit;
    {
    bit += (int) *mask;
    }

int BITTOCHAR (bit)
    int                 bit;
    {
    return bit;
    }

int CHARTOBIT (ch)
    int                 ch;
    {
    return ch;
    }

int myupper (ch)
    unsigned int        ch;
    {
    return (int) ch;
    }

int mylower (ch)
    unsigned int        ch;
    {
    return (int) ch;
    }

int myspace (ch)
    unsigned int        ch;
    {
    return (int) ch;
    }

int iswordch (ch)
    unsigned int        ch;
    {
    return (int) ch;
    }

int isboundarych (ch)
    unsigned int        ch;
    {
    return (int) ch;
    }

int isstringstart (ch)
    unsigned int        ch;
    {
    return ch;
    }

ichar_t mytolower (ch)
    unsigned int        ch;
    {
    return (ichar_t) ch;
    }

ichar_t mytoupper (ch)
    unsigned int        ch;
    {
    return (ichar_t) ch;
    }
#endif /* lint */
Note: See TracBrowser for help on using the repository browser.