source: trunk/third/pcre/study.c @ 19309

Revision 19309, 10.7 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r19308, which included commits to RCS files with non-trunk default branches.
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/*
6This is a library of functions to support regular expressions whose syntax
7and semantics are as close as possible to those of the Perl 5 language. See
8the file Tech.Notes for some information on the internals.
9
10Written by: Philip Hazel <ph10@cam.ac.uk>
11
12           Copyright (c) 1997-2001 University of Cambridge
13
14-----------------------------------------------------------------------------
15Permission is granted to anyone to use this software for any purpose on any
16computer system, and to redistribute it freely, subject to the following
17restrictions:
18
191. This software is distributed in the hope that it will be useful,
20   but WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
232. The origin of this software must not be misrepresented, either by
24   explicit claim or by omission.
25
263. Altered versions must be plainly marked as such, and must not be
27   misrepresented as being the original software.
28
294. If PCRE is embedded in any software that is released under the GNU
30   General Purpose Licence (GPL), then the terms of that licence shall
31   supersede any condition above with which it is incompatible.
32-----------------------------------------------------------------------------
33*/
34
35
36/* Include the internals header, which itself includes Standard C headers plus
37the external pcre header. */
38
39#include "internal.h"
40
41
42
43/*************************************************
44*      Set a bit and maybe its alternate case    *
45*************************************************/
46
47/* Given a character, set its bit in the table, and also the bit for the other
48version of a letter if we are caseless.
49
50Arguments:
51  start_bits    points to the bit map
52  c             is the character
53  caseless      the caseless flag
54  cd            the block with char table pointers
55
56Returns:        nothing
57*/
58
59static void
60set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61{
62start_bits[c/8] |= (1 << (c&7));
63if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64  start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65}
66
67
68
69/*************************************************
70*          Create bitmap of starting chars       *
71*************************************************/
72
73/* This function scans a compiled unanchored expression and attempts to build a
74bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75goes by, we may be able to get more clever at doing this.
76
77Arguments:
78  code         points to an expression
79  start_bits   points to a 32-byte table, initialized to 0
80  caseless     the current state of the caseless flag
81  cd           the block with char table pointers
82
83Returns:       TRUE if table built, FALSE otherwise
84*/
85
86static BOOL
87set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88  compile_data *cd)
89{
90register int c;
91
92/* This next statement and the later reference to dummy are here in order to
93trick the optimizer of the IBM C compiler for OS/2 into generating correct
94code. Apparently IBM isn't going to fix the problem, and we would rather not
95disable optimization (in this module it actually makes a big difference, and
96the pcre module can use all the optimization it can get). */
97
98volatile int dummy;
99
100do
101  {
102  const uschar *tcode = code + 3;
103  BOOL try_next = TRUE;
104
105  while (try_next)
106    {
107    /* If a branch starts with a bracket or a positive lookahead assertion,
108    recurse to set bits from within them. That's all for this branch. */
109
110    if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
111      {
112      if (!set_start_bits(tcode, start_bits, caseless, cd))
113        return FALSE;
114      try_next = FALSE;
115      }
116
117    else switch(*tcode)
118      {
119      default:
120      return FALSE;
121
122      /* Skip over extended extraction bracket number */
123
124      case OP_BRANUMBER:
125      tcode += 3;
126      break;
127
128      /* Skip over lookbehind and negative lookahead assertions */
129
130      case OP_ASSERT_NOT:
131      case OP_ASSERTBACK:
132      case OP_ASSERTBACK_NOT:
133      do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
134      tcode += 3;
135      break;
136
137      /* Skip over an option setting, changing the caseless flag */
138
139      case OP_OPT:
140      caseless = (tcode[1] & PCRE_CASELESS) != 0;
141      tcode += 2;
142      break;
143
144      /* BRAZERO does the bracket, but carries on. */
145
146      case OP_BRAZERO:
147      case OP_BRAMINZERO:
148      if (!set_start_bits(++tcode, start_bits, caseless, cd))
149        return FALSE;
150      dummy = 1;
151      do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
152      tcode += 3;
153      break;
154
155      /* Single-char * or ? sets the bit and tries the next item */
156
157      case OP_STAR:
158      case OP_MINSTAR:
159      case OP_QUERY:
160      case OP_MINQUERY:
161      set_bit(start_bits, tcode[1], caseless, cd);
162      tcode += 2;
163      break;
164
165      /* Single-char upto sets the bit and tries the next */
166
167      case OP_UPTO:
168      case OP_MINUPTO:
169      set_bit(start_bits, tcode[3], caseless, cd);
170      tcode += 4;
171      break;
172
173      /* At least one single char sets the bit and stops */
174
175      case OP_EXACT:       /* Fall through */
176      tcode++;
177
178      case OP_CHARS:       /* Fall through */
179      tcode++;
180
181      case OP_PLUS:
182      case OP_MINPLUS:
183      set_bit(start_bits, tcode[1], caseless, cd);
184      try_next = FALSE;
185      break;
186
187      /* Single character type sets the bits and stops */
188
189      case OP_NOT_DIGIT:
190      for (c = 0; c < 32; c++)
191        start_bits[c] |= ~cd->cbits[c+cbit_digit];
192      try_next = FALSE;
193      break;
194
195      case OP_DIGIT:
196      for (c = 0; c < 32; c++)
197        start_bits[c] |= cd->cbits[c+cbit_digit];
198      try_next = FALSE;
199      break;
200
201      case OP_NOT_WHITESPACE:
202      for (c = 0; c < 32; c++)
203        start_bits[c] |= ~cd->cbits[c+cbit_space];
204      try_next = FALSE;
205      break;
206
207      case OP_WHITESPACE:
208      for (c = 0; c < 32; c++)
209        start_bits[c] |= cd->cbits[c+cbit_space];
210      try_next = FALSE;
211      break;
212
213      case OP_NOT_WORDCHAR:
214      for (c = 0; c < 32; c++)
215        start_bits[c] |= ~cd->cbits[c+cbit_word];
216      try_next = FALSE;
217      break;
218
219      case OP_WORDCHAR:
220      for (c = 0; c < 32; c++)
221        start_bits[c] |= cd->cbits[c+cbit_word];
222      try_next = FALSE;
223      break;
224
225      /* One or more character type fudges the pointer and restarts, knowing
226      it will hit a single character type and stop there. */
227
228      case OP_TYPEPLUS:
229      case OP_TYPEMINPLUS:
230      tcode++;
231      break;
232
233      case OP_TYPEEXACT:
234      tcode += 3;
235      break;
236
237      /* Zero or more repeats of character types set the bits and then
238      try again. */
239
240      case OP_TYPEUPTO:
241      case OP_TYPEMINUPTO:
242      tcode += 2;               /* Fall through */
243
244      case OP_TYPESTAR:
245      case OP_TYPEMINSTAR:
246      case OP_TYPEQUERY:
247      case OP_TYPEMINQUERY:
248      switch(tcode[1])
249        {
250        case OP_NOT_DIGIT:
251        for (c = 0; c < 32; c++)
252          start_bits[c] |= ~cd->cbits[c+cbit_digit];
253        break;
254
255        case OP_DIGIT:
256        for (c = 0; c < 32; c++)
257          start_bits[c] |= cd->cbits[c+cbit_digit];
258        break;
259
260        case OP_NOT_WHITESPACE:
261        for (c = 0; c < 32; c++)
262          start_bits[c] |= ~cd->cbits[c+cbit_space];
263        break;
264
265        case OP_WHITESPACE:
266        for (c = 0; c < 32; c++)
267          start_bits[c] |= cd->cbits[c+cbit_space];
268        break;
269
270        case OP_NOT_WORDCHAR:
271        for (c = 0; c < 32; c++)
272          start_bits[c] |= ~cd->cbits[c+cbit_word];
273        break;
274
275        case OP_WORDCHAR:
276        for (c = 0; c < 32; c++)
277          start_bits[c] |= cd->cbits[c+cbit_word];
278        break;
279        }
280
281      tcode += 2;
282      break;
283
284      /* Character class: set the bits and either carry on or not,
285      according to the repeat count. */
286
287      case OP_CLASS:
288        {
289        tcode++;
290        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
291        tcode += 32;
292        switch (*tcode)
293          {
294          case OP_CRSTAR:
295          case OP_CRMINSTAR:
296          case OP_CRQUERY:
297          case OP_CRMINQUERY:
298          tcode++;
299          break;
300
301          case OP_CRRANGE:
302          case OP_CRMINRANGE:
303          if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
304            else try_next = FALSE;
305          break;
306
307          default:
308          try_next = FALSE;
309          break;
310          }
311        }
312      break; /* End of class handling */
313
314      }      /* End of switch */
315    }        /* End of try_next loop */
316
317  code += (code[1] << 8) + code[2];   /* Advance to next branch */
318  }
319while (*code == OP_ALT);
320return TRUE;
321}
322
323
324
325/*************************************************
326*          Study a compiled expression           *
327*************************************************/
328
329/* This function is handed a compiled expression that it must study to produce
330information that will speed up the matching. It returns a pcre_extra block
331which then gets handed back to pcre_exec().
332
333Arguments:
334  re        points to the compiled expression
335  options   contains option bits
336  errorptr  points to where to place error messages;
337            set NULL unless error
338
339Returns:    pointer to a pcre_extra block,
340            NULL on error or if no optimization possible
341*/
342
343pcre_extra *
344pcre_study(const pcre *external_re, int options, const char **errorptr)
345{
346uschar start_bits[32];
347real_pcre_extra *extra;
348const real_pcre *re = (const real_pcre *)external_re;
349compile_data compile_block;
350
351*errorptr = NULL;
352
353if (re == NULL || re->magic_number != MAGIC_NUMBER)
354  {
355  *errorptr = "argument is not a compiled regular expression";
356  return NULL;
357  }
358
359if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
360  {
361  *errorptr = "unknown or incorrect option bit(s) set";
362  return NULL;
363  }
364
365/* For an anchored pattern, or an unchored pattern that has a first char, or a
366multiline pattern that matches only at "line starts", no further processing at
367present. */
368
369if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
370  return NULL;
371
372/* Set the character tables in the block which is passed around */
373
374compile_block.lcc = re->tables + lcc_offset;
375compile_block.fcc = re->tables + fcc_offset;
376compile_block.cbits = re->tables + cbits_offset;
377compile_block.ctypes = re->tables + ctypes_offset;
378
379/* See if we can find a fixed set of initial characters for the pattern. */
380
381memset(start_bits, 0, 32 * sizeof(uschar));
382if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
383  &compile_block)) return NULL;
384
385/* Get an "extra" block and put the information therein. */
386
387extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
388
389if (extra == NULL)
390  {
391  *errorptr = "failed to get memory";
392  return NULL;
393  }
394
395extra->options = PCRE_STUDY_MAPPED;
396memcpy(extra->start_bits, start_bits, sizeof(start_bits));
397
398return (pcre_extra *)extra;
399}
400
401/* End of study.c */
Note: See TracBrowser for help on using the repository browser.