source: trunk/third/pcre/pcretest.c @ 19309

Revision 19309, 33.0 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r19308, which included commits to RCS files with non-trunk default branches.
Line 
1/*************************************************
2*             PCRE testing program               *
3*************************************************/
4
5#include <ctype.h>
6#include <stdio.h>
7#include <string.h>
8#include <stdlib.h>
9#include <time.h>
10#include <locale.h>
11
12/* Use the internal info for displaying the results of pcre_study(). */
13
14#include "internal.h"
15
16/* It is possible to compile this test program without including support for
17testing the POSIX interface, though this is not available via the standard
18Makefile. */
19
20#if !defined NOPOSIX
21#include "pcreposix.h"
22#endif
23
24#ifndef CLOCKS_PER_SEC
25#ifdef CLK_TCK
26#define CLOCKS_PER_SEC CLK_TCK
27#else
28#define CLOCKS_PER_SEC 100
29#endif
30#endif
31
32#define LOOPREPEAT 20000
33
34
35static FILE *outfile;
36static int log_store = 0;
37static size_t gotten_store;
38
39
40
/* Tables used by the UTF-8 conversion functions in this file. Entry i of each
table belongs to the encoding that uses i additional bytes after the first:

  utf8_table1[i]   largest character value that fits in i+1 bytes
  utf8_table2[i]   marker bits that are ORed into the first byte
  utf8_table3[i]   mask that selects the data bits of the first byte

The tables are only ever read, so they are declared const. */

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer (1 to 6)
             -1 if input character is negative
             0 if input character is positive but too big (only when
             int is longer than 32 bits)

Nothing is written to the buffer when the result is 0 or -1.
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int i, j;

if (cvalue < 0) return -1;

/* Find the shortest encoding that can hold the value; i becomes the number
of additional bytes that are needed. */

for (i = 0; i < table_size; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= table_size) return 0;

/* Fill in the additional bytes from the last one backwards, taking six bits
of the value for each; what is left over goes into the first byte together
with its marker bits. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}
86
87
88/*************************************************
89*            Convert UTF-8 string to value       *
90*************************************************/
91
92/* This function takes one or more bytes that represents a UTF-8 character,
93and returns the value of the character.
94
95Argument:
96  buffer   a pointer to the byte vector
97  vptr     a pointer to an int to receive the value
98
99Returns:   >  0 => the number of bytes consumed
100           -6 to 0 => malformed UTF-8 character at offset = (-return)
101*/
102
103int
104utf82ord(unsigned char *buffer, int *vptr)
105{
106int c = *buffer++;
107int d = c;
108int i, j, s;
109
110for (i = -1; i < 6; i++)               /* i is number of additional bytes */
111  {
112  if ((d & 0x80) == 0) break;
113  d <<= 1;
114  }
115
116if (i == -1) { *vptr = c; return 1; }  /* ascii character */
117if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */
118
119/* i now has a value in the range 1-5 */
120
121s = 6*i;
122d = (c & utf8_table3[i]) << s;
123
124for (j = 0; j < i; j++)
125  {
126  c = *buffer++;
127  if ((c & 0xc0) != 0x80) return -(j+1);
128  s -= 6;
129  d |= (c & 0x3f) << s;
130  }
131
132/* Check that encoding was the correct unique one */
133
134for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
135  if (d <= utf8_table1[j]) break;
136if (j != i) return -(i+1);
137
138/* Valid value */
139
140*vptr = d;
141return i+1;
142}
143
144
145
146
147
148
/* Text that print_internals() shows for each item of a compiled pattern. The
table is indexed directly by the opcode byte, so the order of the strings has
to follow the numbering of the opcodes (presumably the OP_xxx definitions in
internal.h - confirm there before changing anything). The number at the start
of each row is the index of the first string in that row. */

static const char *OP_names[] = {
  /*  0 */ "End",
  /*  1 */ "\\A", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", "\\W", "\\w",
  /* 10 */ "\\Z", "\\z",
  /* 12 */ "Opt", "^", "$", "Any", "chars", "not",
  /* 18 */ "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  /* 27 */ "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  /* 36 */ "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  /* 45 */ "*", "*?", "+", "+?", "?", "??", "{", "{",
  /* 53 */ "class", "Ref", "Recurse",
  /* 56 */ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  /* 62 */ "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  /* 68 */ "Brazero", "Braminzero", "Branumber", "Bra"
};
165
166
167static void print_internals(pcre *re)
168{
169unsigned char *code = ((real_pcre *)re)->code;
170
171fprintf(outfile, "------------------------------------------------------------------\n");
172
173for(;;)
174  {
175  int c;
176  int charlength;
177
178  fprintf(outfile, "%3d ", (int)(code - ((real_pcre *)re)->code));
179
180  if (*code >= OP_BRA)
181    {
182    if (*code - OP_BRA > EXTRACT_BASIC_MAX)
183      fprintf(outfile, "%3d Bra extra", (code[1] << 8) + code[2]);
184    else
185      fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
186    code += 2;
187    }
188
189  else switch(*code)
190    {
191    case OP_END:
192    fprintf(outfile, "    %s\n", OP_names[*code]);
193    fprintf(outfile, "------------------------------------------------------------------\n");
194    return;
195
196    case OP_OPT:
197    fprintf(outfile, " %.2x %s", code[1], OP_names[*code]);
198    code++;
199    break;
200
201    case OP_CHARS:
202    charlength = *(++code);
203    fprintf(outfile, "%3d ", charlength);
204    while (charlength-- > 0)
205      if (isprint(c = *(++code))) fprintf(outfile, "%c", c);
206        else fprintf(outfile, "\\x%02x", c);
207    break;
208
209    case OP_KETRMAX:
210    case OP_KETRMIN:
211    case OP_ALT:
212    case OP_KET:
213    case OP_ASSERT:
214    case OP_ASSERT_NOT:
215    case OP_ASSERTBACK:
216    case OP_ASSERTBACK_NOT:
217    case OP_ONCE:
218    case OP_COND:
219    case OP_BRANUMBER:
220    case OP_REVERSE:
221    case OP_CREF:
222    fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
223    code += 2;
224    break;
225
226    case OP_STAR:
227    case OP_MINSTAR:
228    case OP_PLUS:
229    case OP_MINPLUS:
230    case OP_QUERY:
231    case OP_MINQUERY:
232    case OP_TYPESTAR:
233    case OP_TYPEMINSTAR:
234    case OP_TYPEPLUS:
235    case OP_TYPEMINPLUS:
236    case OP_TYPEQUERY:
237    case OP_TYPEMINQUERY:
238    if (*code >= OP_TYPESTAR)
239      fprintf(outfile, "    %s", OP_names[code[1]]);
240    else if (isprint(c = code[1])) fprintf(outfile, "    %c", c);
241      else fprintf(outfile, "    \\x%02x", c);
242    fprintf(outfile, "%s", OP_names[*code++]);
243    break;
244
245    case OP_EXACT:
246    case OP_UPTO:
247    case OP_MINUPTO:
248    if (isprint(c = code[3])) fprintf(outfile, "    %c{", c);
249      else fprintf(outfile, "    \\x%02x{", c);
250    if (*code != OP_EXACT) fprintf(outfile, ",");
251    fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
252    if (*code == OP_MINUPTO) fprintf(outfile, "?");
253    code += 3;
254    break;
255
256    case OP_TYPEEXACT:
257    case OP_TYPEUPTO:
258    case OP_TYPEMINUPTO:
259    fprintf(outfile, "    %s{", OP_names[code[3]]);
260    if (*code != OP_TYPEEXACT) fprintf(outfile, "0,");
261    fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
262    if (*code == OP_TYPEMINUPTO) fprintf(outfile, "?");
263    code += 3;
264    break;
265
266    case OP_NOT:
267    if (isprint(c = *(++code))) fprintf(outfile, "    [^%c]", c);
268      else fprintf(outfile, "    [^\\x%02x]", c);
269    break;
270
271    case OP_NOTSTAR:
272    case OP_NOTMINSTAR:
273    case OP_NOTPLUS:
274    case OP_NOTMINPLUS:
275    case OP_NOTQUERY:
276    case OP_NOTMINQUERY:
277    if (isprint(c = code[1])) fprintf(outfile, "    [^%c]", c);
278      else fprintf(outfile, "    [^\\x%02x]", c);
279    fprintf(outfile, "%s", OP_names[*code++]);
280    break;
281
282    case OP_NOTEXACT:
283    case OP_NOTUPTO:
284    case OP_NOTMINUPTO:
285    if (isprint(c = code[3])) fprintf(outfile, "    [^%c]{", c);
286      else fprintf(outfile, "    [^\\x%02x]{", c);
287    if (*code != OP_NOTEXACT) fprintf(outfile, ",");
288    fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
289    if (*code == OP_NOTMINUPTO) fprintf(outfile, "?");
290    code += 3;
291    break;
292
293    case OP_REF:
294    fprintf(outfile, "    \\%d", (code[1] << 8) | code[2]);
295    code += 3;
296    goto CLASS_REF_REPEAT;
297
298    case OP_CLASS:
299      {
300      int i, min, max;
301      code++;
302      fprintf(outfile, "    [");
303
304      for (i = 0; i < 256; i++)
305        {
306        if ((code[i/8] & (1 << (i&7))) != 0)
307          {
308          int j;
309          for (j = i+1; j < 256; j++)
310            if ((code[j/8] & (1 << (j&7))) == 0) break;
311          if (i == '-' || i == ']') fprintf(outfile, "\\");
312          if (isprint(i)) fprintf(outfile, "%c", i); else fprintf(outfile, "\\x%02x", i);
313          if (--j > i)
314            {
315            fprintf(outfile, "-");
316            if (j == '-' || j == ']') fprintf(outfile, "\\");
317            if (isprint(j)) fprintf(outfile, "%c", j); else fprintf(outfile, "\\x%02x", j);
318            }
319          i = j;
320          }
321        }
322      fprintf(outfile, "]");
323      code += 32;
324
325      CLASS_REF_REPEAT:
326
327      switch(*code)
328        {
329        case OP_CRSTAR:
330        case OP_CRMINSTAR:
331        case OP_CRPLUS:
332        case OP_CRMINPLUS:
333        case OP_CRQUERY:
334        case OP_CRMINQUERY:
335        fprintf(outfile, "%s", OP_names[*code]);
336        break;
337
338        case OP_CRRANGE:
339        case OP_CRMINRANGE:
340        min = (code[1] << 8) + code[2];
341        max = (code[3] << 8) + code[4];
342        if (max == 0) fprintf(outfile, "{%d,}", min);
343        else fprintf(outfile, "{%d,%d}", min, max);
344        if (*code == OP_CRMINRANGE) fprintf(outfile, "?");
345        code += 4;
346        break;
347
348        default:
349        code--;
350        }
351      }
352    break;
353
354    /* Anything else is just a one-node item */
355
356    default:
357    fprintf(outfile, "    %s", OP_names[*code]);
358    break;
359    }
360
361  code++;
362  fprintf(outfile, "\n");
363  }
364}
365
366
367
368/* Character string printing function. A "normal" and a UTF-8 version. */
369
370static void pchars(unsigned char *p, int length, int utf8)
371{
372int c;
373while (length-- > 0)
374  {
375  if (utf8)
376    {
377    int rc = utf82ord(p, &c);
378    if (rc > 0)
379      {
380      length -= rc - 1;
381      p += rc;
382      if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
383        else fprintf(outfile, "\\x{%02x}", c);
384      continue;
385      }
386    }
387
388   /* Not UTF-8, or malformed UTF-8  */
389
390  if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
391    else fprintf(outfile, "\\x%02x", c);
392  }
393}
394
395
396
397/* Alternative malloc function, to test functionality and show the size of the
398compiled re. */
399
400static void *new_malloc(size_t size)
401{
402gotten_store = size;
403if (log_store)
404  fprintf(outfile, "Memory allocation (code space): %d\n",
405    (int)((int)size - offsetof(real_pcre, code[0])));
406return malloc(size);
407}
408
409
410
411
412/* Get one piece of information from the pcre_fullinfo() function */
413
414static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
415{
416int rc;
417if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
418  fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
419}
420
421
422
423
424/* Read lines from named file or stdin and write to named file or stdout; lines
425consist of a regular expression, in delimiters and optionally followed by
426options, followed by a set of test data, terminated by an empty line. */
427
428int main(int argc, char **argv)
429{
430FILE *infile = stdin;
431int options = 0;
432int study_options = 0;
433int op = 1;
434int timeit = 0;
435int showinfo = 0;
436int showstore = 0;
437int size_offsets = 45;
438int size_offsets_max;
439int *offsets;
440#if !defined NOPOSIX
441int posix = 0;
442#endif
443int debug = 0;
444int done = 0;
445unsigned char buffer[30000];
446unsigned char dbuffer[1024];
447
448/* Static so that new_malloc can use it. */
449
450outfile = stdout;
451
452/* Scan options */
453
454while (argc > 1 && argv[op][0] == '-')
455  {
456  char *endptr;
457
458  if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
459    showstore = 1;
460  else if (strcmp(argv[op], "-t") == 0) timeit = 1;
461  else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
462  else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
463  else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
464      ((size_offsets = (int)strtoul(argv[op+1], &endptr, 10)), *endptr == 0))
465    {
466    op++;
467    argc--;
468    }
469#if !defined NOPOSIX
470  else if (strcmp(argv[op], "-p") == 0) posix = 1;
471#endif
472  else
473    {
474    printf("** Unknown or malformed option %s\n", argv[op]);
475    printf("Usage:   pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
476    printf("  -d     debug: show compiled code; implies -i\n"
477           "  -i     show information about compiled pattern\n"
478           "  -o <n> set size of offsets vector to <n>\n");
479#if !defined NOPOSIX
480    printf("  -p     use POSIX interface\n");
481#endif
482    printf("  -s     output store information\n"
483           "  -t     time compilation and execution\n");
484    return 1;
485    }
486  op++;
487  argc--;
488  }
489
490/* Get the store for the offsets vector, and remember what it was */
491
492size_offsets_max = size_offsets;
493offsets = malloc(size_offsets_max * sizeof(int));
494if (offsets == NULL)
495  {
496  printf("** Failed to get %d bytes of memory for offsets vector\n",
497    size_offsets_max * sizeof(int));
498  return 1;
499  }
500
501/* Sort out the input and output files */
502
503if (argc > 1)
504  {
505  infile = fopen(argv[op], "r");
506  if (infile == NULL)
507    {
508    printf("** Failed to open %s\n", argv[op]);
509    return 1;
510    }
511  }
512
513if (argc > 2)
514  {
515  outfile = fopen(argv[op+1], "w");
516  if (outfile == NULL)
517    {
518    printf("** Failed to open %s\n", argv[op+1]);
519    return 1;
520    }
521  }
522
523/* Set alternative malloc function */
524
525pcre_malloc = new_malloc;
526
527/* Heading line, then prompt for first regex if stdin */
528
529fprintf(outfile, "PCRE version %s\n\n", pcre_version());
530
531/* Main loop */
532
533while (!done)
534  {
535  pcre *re = NULL;
536  pcre_extra *extra = NULL;
537
538#if !defined NOPOSIX  /* There are still compilers that require no indent */
539  regex_t preg;
540  int do_posix = 0;
541#endif
542
543  const char *error;
544  unsigned char *p, *pp, *ppp;
545  const unsigned char *tables = NULL;
546  int do_study = 0;
547  int do_debug = debug;
548  int do_G = 0;
549  int do_g = 0;
550  int do_showinfo = showinfo;
551  int do_showrest = 0;
552  int utf8 = 0;
553  int erroroffset, len, delimiter;
554
555  if (infile == stdin) printf("  re> ");
556  if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
557  if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
558
559  p = buffer;
560  while (isspace(*p)) p++;
561  if (*p == 0) continue;
562
563  /* Get the delimiter and seek the end of the pattern; if it isn't
564  complete, read more. */
565
566  delimiter = *p++;
567
568  if (isalnum(delimiter) || delimiter == '\\')
569    {
570    fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
571    goto SKIP_DATA;
572    }
573
574  pp = p;
575
576  for(;;)
577    {
578    while (*pp != 0)
579      {
580      if (*pp == '\\' && pp[1] != 0) pp++;
581        else if (*pp == delimiter) break;
582      pp++;
583      }
584    if (*pp != 0) break;
585
586    len = sizeof(buffer) - (pp - buffer);
587    if (len < 256)
588      {
589      fprintf(outfile, "** Expression too long - missing delimiter?\n");
590      goto SKIP_DATA;
591      }
592
593    if (infile == stdin) printf("    > ");
594    if (fgets((char *)pp, len, infile) == NULL)
595      {
596      fprintf(outfile, "** Unexpected EOF\n");
597      done = 1;
598      goto CONTINUE;
599      }
600    if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
601    }
602
603  /* If the first character after the delimiter is backslash, make
604  the pattern end with backslash. This is purely to provide a way
605  of testing for the error message when a pattern ends with backslash. */
606
607  if (pp[1] == '\\') *pp++ = '\\';
608
609  /* Terminate the pattern at the delimiter */
610
611  *pp++ = 0;
612
613  /* Look for options after final delimiter */
614
615  options = 0;
616  study_options = 0;
617  log_store = showstore;  /* default from command line */
618
619  while (*pp != 0)
620    {
621    switch (*pp++)
622      {
623      case 'g': do_g = 1; break;
624      case 'i': options |= PCRE_CASELESS; break;
625      case 'm': options |= PCRE_MULTILINE; break;
626      case 's': options |= PCRE_DOTALL; break;
627      case 'x': options |= PCRE_EXTENDED; break;
628
629      case '+': do_showrest = 1; break;
630      case 'A': options |= PCRE_ANCHORED; break;
631      case 'D': do_debug = do_showinfo = 1; break;
632      case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
633      case 'G': do_G = 1; break;
634      case 'I': do_showinfo = 1; break;
635      case 'M': log_store = 1; break;
636
637#if !defined NOPOSIX
638      case 'P': do_posix = 1; break;
639#endif
640
641      case 'S': do_study = 1; break;
642      case 'U': options |= PCRE_UNGREEDY; break;
643      case 'X': options |= PCRE_EXTRA; break;
644      case '8': options |= PCRE_UTF8; utf8 = 1; break;
645
646      case 'L':
647      ppp = pp;
648      while (*ppp != '\n' && *ppp != ' ') ppp++;
649      *ppp = 0;
650      if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
651        {
652        fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
653        goto SKIP_DATA;
654        }
655      tables = pcre_maketables();
656      pp = ppp;
657      break;
658
659      case '\n': case ' ': break;
660      default:
661      fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
662      goto SKIP_DATA;
663      }
664    }
665
666  /* Handle compiling via the POSIX interface, which doesn't support the
667  timing, showing, or debugging options, nor the ability to pass over
668  local character tables. */
669
670#if !defined NOPOSIX
671  if (posix || do_posix)
672    {
673    int rc;
674    int cflags = 0;
675    if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
676    if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
677    rc = regcomp(&preg, (char *)p, cflags);
678
679    /* Compilation failed; go back for another re, skipping to blank line
680    if non-interactive. */
681
682    if (rc != 0)
683      {
684      (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
685      fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
686      goto SKIP_DATA;
687      }
688    }
689
690  /* Handle compiling via the native interface */
691
692  else
693#endif  /* !defined NOPOSIX */
694
695    {
696    if (timeit)
697      {
698      register int i;
699      clock_t time_taken;
700      clock_t start_time = clock();
701      for (i = 0; i < LOOPREPEAT; i++)
702        {
703        re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
704        if (re != NULL) free(re);
705        }
706      time_taken = clock() - start_time;
707      fprintf(outfile, "Compile time %.3f milliseconds\n",
708        ((double)time_taken * 1000.0) /
709        ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
710      }
711
712    re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
713
714    /* Compilation failed; go back for another re, skipping to blank line
715    if non-interactive. */
716
717    if (re == NULL)
718      {
719      fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
720      SKIP_DATA:
721      if (infile != stdin)
722        {
723        for (;;)
724          {
725          if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
726            {
727            done = 1;
728            goto CONTINUE;
729            }
730          len = (int)strlen((char *)buffer);
731          while (len > 0 && isspace(buffer[len-1])) len--;
732          if (len == 0) break;
733          }
734        fprintf(outfile, "\n");
735        }
736      goto CONTINUE;
737      }
738
739    /* Compilation succeeded; print data if required. There are now two
740    info-returning functions. The old one has a limited interface and
741    returns only limited data. Check that it agrees with the newer one. */
742
743    if (do_showinfo)
744      {
745      unsigned long int get_options;
746      int old_first_char, old_options, old_count;
747      int count, backrefmax, first_char, need_char;
748      size_t size;
749
750      if (do_debug) print_internals(re);
751
752      new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
753      new_info(re, NULL, PCRE_INFO_SIZE, &size);
754      new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
755      new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
756      new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
757      new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
758
759      old_count = pcre_info(re, &old_options, &old_first_char);
760      if (count < 0) fprintf(outfile,
761        "Error %d from pcre_info()\n", count);
762      else
763        {
764        if (old_count != count) fprintf(outfile,
765          "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
766            old_count);
767
768        if (old_first_char != first_char) fprintf(outfile,
769          "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
770            first_char, old_first_char);
771
772        if (old_options != (int)get_options) fprintf(outfile,
773          "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
774            get_options, old_options);
775        }
776
777      if (size != gotten_store) fprintf(outfile,
778        "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
779        size, gotten_store);
780
781      fprintf(outfile, "Capturing subpattern count = %d\n", count);
782      if (backrefmax > 0)
783        fprintf(outfile, "Max back reference = %d\n", backrefmax);
784      if (get_options == 0) fprintf(outfile, "No options\n");
785        else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
786          ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
787          ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
788          ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
789          ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
790          ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
791          ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
792          ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
793          ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
794          ((get_options & PCRE_UTF8) != 0)? " utf8" : "");
795
796      if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
797        fprintf(outfile, "Case state changes\n");
798
799      if (first_char == -1)
800        {
801        fprintf(outfile, "First char at start or follows \\n\n");
802        }
803      else if (first_char < 0)
804        {
805        fprintf(outfile, "No first char\n");
806        }
807      else
808        {
809        if (isprint(first_char))
810          fprintf(outfile, "First char = \'%c\'\n", first_char);
811        else
812          fprintf(outfile, "First char = %d\n", first_char);
813        }
814
815      if (need_char < 0)
816        {
817        fprintf(outfile, "No need char\n");
818        }
819      else
820        {
821        if (isprint(need_char))
822          fprintf(outfile, "Need char = \'%c\'\n", need_char);
823        else
824          fprintf(outfile, "Need char = %d\n", need_char);
825        }
826      }
827
828    /* If /S was present, study the regexp to generate additional info to
829    help with the matching. */
830
831    if (do_study)
832      {
833      if (timeit)
834        {
835        register int i;
836        clock_t time_taken;
837        clock_t start_time = clock();
838        for (i = 0; i < LOOPREPEAT; i++)
839          extra = pcre_study(re, study_options, &error);
840        time_taken = clock() - start_time;
841        if (extra != NULL) free(extra);
842        fprintf(outfile, "  Study time %.3f milliseconds\n",
843          ((double)time_taken * 1000.0)/
844          ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
845        }
846
847      extra = pcre_study(re, study_options, &error);
848      if (error != NULL)
849        fprintf(outfile, "Failed to study: %s\n", error);
850      else if (extra == NULL)
851        fprintf(outfile, "Study returned NULL\n");
852
853      else if (do_showinfo)
854        {
855        uschar *start_bits = NULL;
856        new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
857        if (start_bits == NULL)
858          fprintf(outfile, "No starting character set\n");
859        else
860          {
861          int i;
862          int c = 24;
863          fprintf(outfile, "Starting character set: ");
864          for (i = 0; i < 256; i++)
865            {
866            if ((start_bits[i/8] & (1<<(i%8))) != 0)
867              {
868              if (c > 75)
869                {
870                fprintf(outfile, "\n  ");
871                c = 2;
872                }
873              if (isprint(i) && i != ' ')
874                {
875                fprintf(outfile, "%c ", i);
876                c += 2;
877                }
878              else
879                {
880                fprintf(outfile, "\\x%02x ", i);
881                c += 5;
882                }
883              }
884            }
885          fprintf(outfile, "\n");
886          }
887        }
888      }
889    }
890
891  /* Read data lines and test them */
892
893  for (;;)
894    {
895    unsigned char *q;
896    unsigned char *bptr = dbuffer;
897    int *use_offsets = offsets;
898    int use_size_offsets = size_offsets;
899    int count, c;
900    int copystrings = 0;
901    int getstrings = 0;
902    int getlist = 0;
903    int gmatched = 0;
904    int start_offset = 0;
905    int g_notempty = 0;
906
907    options = 0;
908
909    if (infile == stdin) printf("data> ");
910    if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
911      {
912      done = 1;
913      goto CONTINUE;
914      }
915    if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
916
917    len = (int)strlen((char *)buffer);
918    while (len > 0 && isspace(buffer[len-1])) len--;
919    buffer[len] = 0;
920    if (len == 0) break;
921
922    p = buffer;
923    while (isspace(*p)) p++;
924
925    q = dbuffer;
926    while ((c = *p++) != 0)
927      {
928      int i = 0;
929      int n = 0;
930      if (c == '\\') switch ((c = *p++))
931        {
932        case 'a': c =    7; break;
933        case 'b': c = '\b'; break;
934        case 'e': c =   27; break;
935        case 'f': c = '\f'; break;
936        case 'n': c = '\n'; break;
937        case 'r': c = '\r'; break;
938        case 't': c = '\t'; break;
939        case 'v': c = '\v'; break;
940
941        case '0': case '1': case '2': case '3':
942        case '4': case '5': case '6': case '7':
943        c -= '0';
944        while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
945          c = c * 8 + *p++ - '0';
946        break;
947
948        case 'x':
949
950        /* Handle \x{..} specially - new Perl thing for utf8 */
951
952        if (*p == '{')
953          {
954          unsigned char *pt = p;
955          c = 0;
956          while (isxdigit(*(++pt)))
957            c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
958          if (*pt == '}')
959            {
960            unsigned char buffer[8];
961            int ii, utn;
962            utn = ord2utf8(c, buffer);
963            for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
964            c = buffer[ii];   /* Last byte */
965            p = pt + 1;
966            break;
967            }
968          /* Not correct form; fall through */
969          }
970
971        /* Ordinary \x */
972
973        c = 0;
974        while (i++ < 2 && isxdigit(*p))
975          {
976          c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
977          p++;
978          }
979        break;
980
981        case 0:   /* Allows for an empty line */
982        p--;
983        continue;
984
985        case 'A':  /* Option setting */
986        options |= PCRE_ANCHORED;
987        continue;
988
989        case 'B':
990        options |= PCRE_NOTBOL;
991        continue;
992
993        case 'C':
994        while(isdigit(*p)) n = n * 10 + *p++ - '0';
995        copystrings |= 1 << n;
996        continue;
997
998        case 'G':
999        while(isdigit(*p)) n = n * 10 + *p++ - '0';
1000        getstrings |= 1 << n;
1001        continue;
1002
1003        case 'L':
1004        getlist = 1;
1005        continue;
1006
1007        case 'N':
1008        options |= PCRE_NOTEMPTY;
1009        continue;
1010
1011        case 'O':
1012        while(isdigit(*p)) n = n * 10 + *p++ - '0';
1013        if (n > size_offsets_max)
1014          {
1015          size_offsets_max = n;
1016          free(offsets);
1017          use_offsets = offsets = malloc(size_offsets_max * sizeof(int));
1018          if (offsets == NULL)
1019            {
1020            printf("** Failed to get %d bytes of memory for offsets vector\n",
1021              size_offsets_max * sizeof(int));
1022            return 1;
1023            }
1024          }
1025        use_size_offsets = n;
1026        if (n == 0) use_offsets = NULL;
1027        continue;
1028
1029        case 'Z':
1030        options |= PCRE_NOTEOL;
1031        continue;
1032        }
1033      *q++ = c;
1034      }
1035    *q = 0;
1036    len = q - dbuffer;
1037
1038    /* Handle matching via the POSIX interface, which does not
1039    support timing. */
1040
1041#if !defined NOPOSIX
1042    if (posix || do_posix)
1043      {
1044      int rc;
1045      int eflags = 0;
1046      regmatch_t *pmatch = malloc(sizeof(regmatch_t) * use_size_offsets);
1047      if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1048      if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1049
1050      rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1051
1052      if (rc != 0)
1053        {
1054        (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
1055        fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1056        }
1057      else
1058        {
1059        size_t i;
1060        for (i = 0; i < use_size_offsets; i++)
1061          {
1062          if (pmatch[i].rm_so >= 0)
1063            {
1064            fprintf(outfile, "%2d: ", (int)i);
1065            pchars(dbuffer + pmatch[i].rm_so,
1066              pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1067            fprintf(outfile, "\n");
1068            if (i == 0 && do_showrest)
1069              {
1070              fprintf(outfile, " 0+ ");
1071              pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1072              fprintf(outfile, "\n");
1073              }
1074            }
1075          }
1076        }
1077      free(pmatch);
1078      }
1079
1080    /* Handle matching via the native interface - repeats for /g and /G */
1081
1082    else
1083#endif  /* !defined NOPOSIX */
1084
1085    for (;; gmatched++)    /* Loop for /g or /G */
1086      {
1087      if (timeit)
1088        {
1089        register int i;
1090        clock_t time_taken;
1091        clock_t start_time = clock();
1092        for (i = 0; i < LOOPREPEAT; i++)
1093          count = pcre_exec(re, extra, (char *)bptr, len,
1094            start_offset, options | g_notempty, use_offsets, use_size_offsets);
1095        time_taken = clock() - start_time;
1096        fprintf(outfile, "Execute time %.3f milliseconds\n",
1097          ((double)time_taken * 1000.0)/
1098          ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
1099        }
1100
1101      count = pcre_exec(re, extra, (char *)bptr, len,
1102        start_offset, options | g_notempty, use_offsets, use_size_offsets);
1103
1104      if (count == 0)
1105        {
1106        fprintf(outfile, "Matched, but too many substrings\n");
1107        count = use_size_offsets/3;
1108        }
1109
1110      /* Matched */
1111
1112      if (count >= 0)
1113        {
1114        int i;
1115        for (i = 0; i < count * 2; i += 2)
1116          {
1117          if (use_offsets[i] < 0)
1118            fprintf(outfile, "%2d: <unset>\n", i/2);
1119          else
1120            {
1121            fprintf(outfile, "%2d: ", i/2);
1122            pchars(bptr + use_offsets[i], use_offsets[i+1] - use_offsets[i], utf8);
1123            fprintf(outfile, "\n");
1124            if (i == 0)
1125              {
1126              if (do_showrest)
1127                {
1128                fprintf(outfile, " 0+ ");
1129                pchars(bptr + use_offsets[i+1], len - use_offsets[i+1], utf8);
1130                fprintf(outfile, "\n");
1131                }
1132              }
1133            }
1134          }
1135
1136        for (i = 0; i < 32; i++)
1137          {
1138          if ((copystrings & (1 << i)) != 0)
1139            {
1140            char copybuffer[16];
1141            int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1142              i, copybuffer, sizeof(copybuffer));
1143            if (rc < 0)
1144              fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1145            else
1146              fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1147            }
1148          }
1149
1150        for (i = 0; i < 32; i++)
1151          {
1152          if ((getstrings & (1 << i)) != 0)
1153            {
1154            const char *substring;
1155            int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1156              i, &substring);
1157            if (rc < 0)
1158              fprintf(outfile, "get substring %d failed %d\n", i, rc);
1159            else
1160              {
1161              fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1162              /* free((void *)substring); */
1163              pcre_free_substring(substring);
1164              }
1165            }
1166          }
1167
1168        if (getlist)
1169          {
1170          const char **stringlist;
1171          int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1172            &stringlist);
1173          if (rc < 0)
1174            fprintf(outfile, "get substring list failed %d\n", rc);
1175          else
1176            {
1177            for (i = 0; i < count; i++)
1178              fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1179            if (stringlist[i] != NULL)
1180              fprintf(outfile, "string list not terminated by NULL\n");
1181            /* free((void *)stringlist); */
1182            pcre_free_substring_list(stringlist);
1183            }
1184          }
1185        }
1186
1187      /* Failed to match. If this is a /g or /G loop and we previously set
1188      g_notempty after a null match, this is not necessarily the end.
1189      We want to advance the start offset, and continue. Fudge the offset
1190      values to achieve this. We won't be at the end of the string - that
1191      was checked before setting g_notempty. */
1192
1193      else
1194        {
1195        if (g_notempty != 0)
1196          {
1197          use_offsets[0] = start_offset;
1198          use_offsets[1] = start_offset + 1;
1199          }
1200        else
1201          {
1202          if (gmatched == 0)   /* Error if no previous matches */
1203            {
1204            if (count == -1) fprintf(outfile, "No match\n");
1205              else fprintf(outfile, "Error %d\n", count);
1206            }
1207          break;  /* Out of the /g loop */
1208          }
1209        }
1210
1211      /* If not /g or /G we are done */
1212
1213      if (!do_g && !do_G) break;
1214
1215      /* If we have matched an empty string, first check to see if we are at
1216      the end of the subject. If so, the /g loop is over. Otherwise, mimic
1217      what Perl's /g option does. This turns out to be rather cunning. First
1218      we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1219      same point. If this fails (picked up above) we advance to the next
1220      character. */
1221
1222      g_notempty = 0;
1223      if (use_offsets[0] == use_offsets[1])
1224        {
1225        if (use_offsets[0] == len) break;
1226        g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1227        }
1228
1229      /* For /g, update the start offset, leaving the rest alone */
1230
1231      if (do_g) start_offset = use_offsets[1];
1232
1233      /* For /G, update the pointer and length */
1234
1235      else
1236        {
1237        bptr += use_offsets[1];
1238        len -= use_offsets[1];
1239        }
1240      }  /* End of loop for /g and /G */
1241    }    /* End of loop for data lines */
1242
1243  CONTINUE:
1244
1245#if !defined NOPOSIX
1246  if (posix || do_posix) regfree(&preg);
1247#endif
1248
1249  if (re != NULL) free(re);
1250  if (extra != NULL) free(extra);
1251  if (tables != NULL)
1252    {
1253    free((void *)tables);
1254    setlocale(LC_CTYPE, "C");
1255    }
1256  }
1257
1258fprintf(outfile, "\n");
1259return 0;
1260}
1261
1262/* End */
Note: See TracBrowser for help on using the repository browser.