source: trunk/third/ispell/ijoin.c @ 10334

Revision 10334, 13.8 KB checked in by ghudson, 27 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r10333, which included commits to RCS files with non-trunk default branches.
Line 
#ifndef lint
/* RCS identification string for this file; left out when "lint" is defined */
static char Rcs_Id[] = "$Id: ijoin.c,v 1.1.1.1 1997-09-03 21:08:09 ghudson Exp $";
#endif /* lint */
5
6/*
7 * Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All modifications to the source code must be clearly marked as
20 *    such.  Binary redistributions based on modified source code
21 *    must be clearly marked as modified versions in the documentation
22 *    and/or other materials provided with the distribution.
23 * 4. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgment:
25 *      This product includes software developed by Geoff Kuenning and
26 *      other unpaid contributors.
27 * 5. The name of Geoff Kuenning may not be used to endorse or promote
28 *    products derived from this software without specific prior
29 *    written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 */
43
44/*
45 * "Join" command for ispell.
46 *
47 * This command is a complete reimplementation of the UNIX "join"
48 * command, except that fields cannot be separated by a newline, it
49 * can handle lines of unlimited length, and the preceding sort can
50 * treat characters as either signed or unsigned.
51 *
52 * Usage:
53 *
54 *      ijoin [options] file1 file2
55 *
56 * See the UNIX "join" manual page for option descriptions.  Only
57 * nonstandard options are described here.
58 *
59 * Either file1 or file2 may be "-", in which case the standard input
60 * is used for that file.
61 *
62 * Normally, ijoin uses "strcmp" to compare fields.  This is the
63 * correct thing to do on most systems if you are using the
64 * system-provided "sort" command to sort the input files before
65 * feeding them to ijoin.  In some cases, however, the sort command
66 * you use will disagree with strcmp about how to handle characters
67 * with the high bit set.  If this is the case, you can specify the
68 * "-s" (signed comparisons) or "-u" (unsigned comparisons) option to
69 * force ijoin to conform to the method used by the sort program.
70 * This is only necessary if one of the input files contains 8-bit
71 * characters in the field that is being joined on.
72 *
73 * On some older machines with non-ANSI compilers, the "-s" option
74 * will be ineffective because characters default to unsigned.
75 * However, this option should be unnecessary on those machines, so no
76 * harm will be done.
77 */
78
79/*
80 * $Log: not supported by cvs2svn $
81 * Revision 1.6  1994/10/18  04:03:21  geoff
82 * Fix a couple of bugs, one where the last field on a line could be
83 * output incorrectly, and one where fields from the wrong file could be
84 * output.
85 *
86 * Revision 1.5  1994/01/25  07:11:36  geoff
87 * Get rid of all old RCS log lines in preparation for the 3.1 release.
88 *
89 */
90
91#include <stdio.h>
92#include "config.h"
93#include "ispell.h"
94#include "proto.h"
95#include "fields.h"
96
/*
 * SIGNED expands to the "signed" keyword when the compiler defines
 * __STDC__, and to nothing otherwise, so that declarations written as
 * "SIGNED char" still compile on compilers without that keyword.
 */
#ifndef __STDC__
#define SIGNED
#else /* __STDC__ */
#define SIGNED  signed
#endif /* __STDC__ */
102
103int             main P ((int argc, char * argv[])); /* Join files */
104static void     usage P ((void));       /* Issue a usage message */
105static void     dojoin P ((void));      /* Actually perform the join */
106static void     full_output P ((field_t * line1, field_t * line2));
107                                        /* Output everything from both lines */
108static void     selected_output P ((field_t * line1, field_t * line2));
109                                        /* Output selected fields */
110static int      strscmp P ((SIGNED char * a, SIGNED char * b));
111                                        /* Signed version of strcmp */
112static int      strucmp P ((unsigned char * a, unsigned char * b));
113                                        /* Unsigned version of strcmp */
114
/*
 * Output description list entry.  Each entry names one field of one
 * input file; main() fills an array of these from the "-o" arguments,
 * storing the file number as 1 or 2 and the field number zero-based.
 */
typedef struct
    {
    int         file;       /* Number of file to output from */
    int         field;      /* Number of field to output */
    } outlist_t;
121
122static int              (*compare) () = strcmp; /* Comparison function */
123static char *           emptyfield = ""; /* Use this to replace empty fields */
124static FILE *           file1;          /* First file to join */
125static FILE *           file2;          /* Second file to join */
126static int              join1field = 0; /* Field to join file 1 on */
127static int              join2field = 0; /* Field to join file 2 on */
128static int              maxf[2] = {0, 0}; /* Max field to parse in each file */
129static outlist_t *      outlist = NULL; /* List of fields to write */
130static int              outlistsize;    /* Number of elements in outlist */
131static int              runs = FLD_RUNS; /* Set if runs of tabchar same as 1 */
132static char *           tabchar = " \t"; /* Field separator character(s) */
133static int              unpairable1 = 0; /* NZ if -a1 */
134static int              unpairable2 = 0; /* NZ if -a2 */
135
136extern int      strcmp ();
137
138int main (argc, argv)                   /* Join files */
139    int                 argc;           /* Argument count */
140    char *              argv[];         /* Argument vector */
141    {
142   
143    while (argc > 3  &&  argv[1][0] == '-')
144        {
145        argc--;
146        argv++;
147        switch (argv[0][1])
148            {
149            case 'a':                   /* produce output for unpairables */
150                if (argv[0][2] == '1')
151                    unpairable1 = 1;
152                else if (argv[0][2] == '2')
153                    unpairable2 = 1;
154                else if (argv[0][2] == '\0')
155                    unpairable1 = unpairable2 = 1;
156                else
157                    usage ();
158                break;
159            case 'e':                   /* Replace empty fields with this */
160                argc--;
161                argv++;
162                emptyfield = *argv;
163                break;
164            case 'j':                   /* Specify field to join on */
165                if (argv[0][2] == '1')
166                    join1field = atoi (argv[1]) - 1;
167                else if (argv[0][2] == '2')
168                    join2field = atoi (argv[1]) - 1;
169                else if (argv[0][2] == '\0')
170                    join1field = join2field = atoi (argv[1]) - 1;
171                else
172                    usage ();
173                argc--;
174                argv++;
175                break;
176            case 'o':                   /* Specify output list */
177                /*
178                 * We will assume that all remaining switch arguments
179                 * are used to describe the output list.  This will
180                 * occasionally result in malloc'ing a few too many
181                 * elements, but no real harm will be done.
182                 */
183                outlist =
184                  (outlist_t *) malloc ((argc - 3) * sizeof (outlist_t));
185                if (outlist == NULL)
186                    {
187                    (void) fprintf (stderr, "ijoin:  out of memory!\n");
188                    return 1;
189                    }
190                for (outlistsize = 0, argc--, argv++;
191                  argc > 2  &&  (argv[0][0] == '1'  ||  argv[0][0] == '2')
192                    &&  argv[0][1] == '.';
193                  argc--, argv++, outlistsize++)
194                    {
195                    outlist[outlistsize].file = argv[0][0] - '0';
196                    outlist[outlistsize].field = atoi (&argv[0][2]) - 1;
197                    if (maxf[outlist[outlistsize].file - 1]
198                      <= outlist[outlistsize].field)
199                        maxf[outlist[outlistsize].file - 1] =
200                          outlist[outlistsize].field + 1;
201                    }
202                argc++;                 /* Un-do arg that stopped us */
203                argv--;
204                break;
205            case 't':
206                tabchar = &argv[0][2];
207                runs &= ~FLD_RUNS;
208                break;
209            case 's':
210                compare = strscmp;
211                break;
212            case 'u':
213                compare = strucmp;
214                break;
215            default:
216                usage ();
217                break;
218            }
219        }
220    if (argc != 3)
221        usage ();
222    if (strcmp (argv[1], "-") == 0)
223        file1 = stdin;
224    else
225        {
226        file1 = fopen (argv[1], "r");
227        if (file1 == NULL)
228            perror (argv[1]);
229        }
230    file2 = fopen (argv[2], "r");
231    if (file2 == NULL)
232        perror (argv[2]);
233    if (file1 == NULL  ||  file2 == NULL)
234        return 1;
235    dojoin ();
236    return 0;
237    }
238
/*
 * Write a one-line summary of the command syntax to stderr and
 * terminate the program with exit status 1.  The summary lists every
 * switch that main accepts, including -s and -u.
 */
static void usage ()                    /* Issue a usage message */
    {

    (void) fprintf (stderr,
      "Usage:  ijoin [-an] [-e s] [-jn m] [-o n.m ...] [-s] [-u] [-tc] file1 file2\n");
    exit (1);
    }
246
247static void dojoin ()                   /* Actually perform the join */
248    {
249    int                 comparison;     /* Result of comparing the lines */
250    long                file2pos;       /* Position file 2 started at */
251    register field_t *  line1;          /* Line from file 1 */
252    register field_t *  line2;          /* Line from file 2 */
253    int                 pairable;       /* NZ if lines can be paired */
254    int                 skip2;          /* No. of "unpairable" 2's to skip */
255
256    runs |= FLD_NOSHRINK;               /* Don't do excessive reallocations */
257    field_line_inc = BUFSIZ;            /* Allocate line bfr in huge chunks */
258    line1 = fieldread (file1, tabchar, runs, maxf[0]);
259    file2pos = ftell (file2);
260    skip2 = 0;
261    if (file2pos == -1)
262        {
263        (void) fprintf (stderr, "ijoin:  Can't seek file ");
264        perror ("2");
265        exit (1);
266        }
267    line2 = fieldread (file2, tabchar, runs, maxf[1]);
268    while (line1 != NULL  ||  line2 != NULL)
269        {
270        /*
271         * Do a little work to reduce the number of calls to realloc, at
272         * the expense of slightly-increased memory usage.
273         */
274        if (line1 != NULL  &&  line1->nfields >= field_field_inc)
275            field_field_inc = line1->nfields + 1;
276        if (line2 != NULL  &&  line2->nfields >= field_field_inc)
277            field_field_inc = line2->nfields + 1;
278        /*
279         * Determine if the lines can be paired.
280         */
281        pairable = 1;
282        if (line1 == NULL)
283            {
284            pairable = 0;
285            comparison = 1;             /* This causes file 2 to advance */
286            }
287        else if (join1field >= line1->nfields)
288            {
289            pairable = 0;
290            comparison = -1;            /* This causes file 1 to advance */
291            }
292        if (line2 == NULL)
293            {
294            pairable = 0;
295            comparison = -1;            /* This causes file 1 to advance */
296            }
297        else if (join2field >= line2->nfields)
298            {
299            pairable = 0;
300            comparison = 1;             /* This causes file 2 to advance */
301            }
302        if (pairable)
303            {
304            comparison = (*compare) (line1->fields[join1field],
305              line2->fields[join2field]);
306            pairable = (comparison == 0);
307            }
308        if (pairable)
309            {
310            /*
311             * The two lines can be paired.  Produce output.
312             */
313            if (outlist == NULL)
314                full_output (line1, line2);
315            else
316                selected_output (line1, line2);
317            }
318        /*
319         * Advance through the files
320         */
321        if (comparison < 0)
322            {
323            if (unpairable1)
324                {
325                if (outlist == NULL)
326                    (void) fieldwrite (stdout, line1, tabchar[0]);
327                else
328                    selected_output (line1, (field_t *) NULL);
329                }
330            fieldfree (line1);
331            line1 = fieldread (file1, tabchar, runs, maxf[0]);
332            }
333        else if (comparison > 0)
334            {
335            if (skip2 > 0)
336                skip2--;
337            else if (unpairable2)
338                {
339                if (outlist == NULL)
340                    (void) fieldwrite (stdout, line2, tabchar[0]);
341                else
342                    selected_output ((field_t *) NULL, line2);
343                }
344            fieldfree (line2);
345            file2pos = ftell (file2);
346            line2 = fieldread (file2, tabchar, runs, maxf[1]);
347            }
348        else
349            {
350            /*
351             * Here's the tricky part.  We have to advance file 2
352             * until comparisons fail, and then back it up and advance
353             * file 1.
354             */
355            skip2++;
356            fieldfree (line2);
357            line2 = fieldread (file2, tabchar, runs, maxf[1]);
358            if (line2 == NULL
359              ||  join2field >= line2->nfields
360              ||  (*compare) (line1->fields[join1field],
361                  line2->fields[join2field])
362                != 0)
363                {
364                (void) fseek (file2, file2pos, 0);
365                fieldfree (line2);
366                line2 = fieldread (file2, tabchar, runs, maxf[1]);
367                fieldfree (line1);
368                line1 = fieldread (file1, tabchar, runs, maxf[0]);
369                if (line1 != NULL  &&  line2 != NULL
370                  &&  join1field < line1->nfields
371                  &&  join2field < line2->nfields
372                  &&  (*compare) (line1->fields[join1field],
373                        line2->fields[join2field])
374                    == 0)
375                    skip2 = 0;
376                }
377            }
378        }
379    }
380
381static void full_output (line1, line2)  /* Output everything from both lines */
382    register field_t *  line1;          /* Line from file 1 */
383    register field_t *  line2;          /* Line from file 2 */
384    {
385    register int        fieldno;        /* Number of field being handled */
386
387    (void) fputs (line1->fields[join1field], stdout);
388    for (fieldno = 0;  fieldno < line1->nfields;  fieldno++)
389        {
390        if (fieldno == join1field)
391            continue;
392        (void) putchar (tabchar[0]);
393        if (line1->fields[fieldno][0] == '\0')
394            (void) fputs (emptyfield, stdout);
395        else
396            (void) fputs (line1->fields[fieldno], stdout);
397        }
398    for (fieldno = 0;  fieldno < line2->nfields;  fieldno++)
399        {
400        if (fieldno == join2field)
401            continue;
402        (void) putchar (tabchar[0]);
403        if (line2->fields[fieldno][0] == '\0')
404            (void) fputs (emptyfield, stdout);
405        else
406            (void) fputs (line2->fields[fieldno], stdout);
407        }
408    (void) putchar ('\n');
409    }
410
411static void     selected_output (line1, line2) /* Output selected fields */
412    field_t *           line1;          /* Line from file 1 */
413    field_t *           line2;          /* Line from file 2 */
414    {
415    register field_t *  cline;          /* Current line being handled */
416    register int        listno;         /* Number of output list entry */
417   
418    for (listno = 0;  listno < outlistsize;  listno++)
419        {
420        if (listno != 0)
421            (void) putchar (tabchar[0]);
422        if (outlist[listno].file == 1)
423            cline = line1;
424        else
425            cline = line2;
426        if (cline == NULL
427          ||  outlist[listno].field >= cline->nfields
428          ||  cline->fields[outlist[listno].field][0] == '\0')
429            (void) fputs (emptyfield, stdout);
430        else
431            (void) fputs (cline->fields[outlist[listno].field], stdout);
432        }
433    (void) putchar ('\n');
434    }
435
436static int strscmp (a, b)               /* Compare signed strings */
437    register SIGNED char * a;           /* First string to compare */
438    register SIGNED char * b;           /* Second string to compare */
439    {
440
441    while (*a != '\0')
442        {
443        if (*a++ != *b++)
444            return *--a - *--b;
445        }
446    return *a - *b;
447    }
448
/*
 * Compare two NUL-terminated strings as sequences of unsigned chars.
 * Returns zero if they are equal; otherwise returns the difference
 * between the first pair of characters that differ (the terminating
 * NUL takes part in the comparison).
 */
static int strucmp (a, b)               /* Compare unsigned strings */
    register unsigned char * a;         /* First string to compare */
    register unsigned char * b;         /* Second string to compare */
    {

    /* Step over the common prefix, stopping at the end of a */
    for ( ;  *a != '\0'  &&  *a == *b;  a++, b++)
        ;
    return *a - *b;
    }
Note: See TracBrowser for help on using the repository browser.