1 | /************************************************* |
---|
2 | * Perl-Compatible Regular Expressions * |
---|
3 | *************************************************/ |
---|
4 | |
---|
5 | |
---|
6 | /* This is a library of functions to support regular expressions whose syntax |
---|
7 | and semantics are as close as possible to those of the Perl 5 language. See |
---|
8 | the file Tech.Notes for some information on the internals. |
---|
9 | |
---|
10 | Written by: Philip Hazel <ph10@cam.ac.uk> |
---|
11 | |
---|
12 | Copyright (c) 1997-2001 University of Cambridge |
---|
13 | |
---|
14 | ----------------------------------------------------------------------------- |
---|
15 | Permission is granted to anyone to use this software for any purpose on any |
---|
16 | computer system, and to redistribute it freely, subject to the following |
---|
17 | restrictions: |
---|
18 | |
---|
19 | 1. This software is distributed in the hope that it will be useful, |
---|
20 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
---|
22 | |
---|
23 | 2. The origin of this software must not be misrepresented, either by |
---|
24 | explicit claim or by omission. |
---|
25 | |
---|
26 | 3. Altered versions must be plainly marked as such, and must not be |
---|
27 | misrepresented as being the original software. |
---|
28 | |
---|
29 | 4. If PCRE is embedded in any software that is released under the GNU |
---|
30 | General Purpose Licence (GPL), then the terms of that licence shall |
---|
31 | supersede any condition above with which it is incompatible. |
---|
32 | ----------------------------------------------------------------------------- |
---|
33 | */ |
---|
34 | |
---|
35 | /* This header contains definitions that are shared between the different |
---|
36 | modules, but which are not relevant to the outside. */ |
---|
37 | |
---|
38 | /* Get the definitions provided by running "configure" */ |
---|
39 | |
---|
40 | #include "config.h" |
---|
41 | |
---|
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). Calls to
memmove in PCRE move strings upwards in store, but the emulation copes with
overlapping moves in either direction. */

#if ! HAVE_MEMMOVE
#undef  memmove        /* some systems may have a macro */
#if HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else

/* The standard headers are only included further down this file, so make sure
size_t is declared before it is used in the prototype below. */

#include <stddef.h>

/* Emulation of memmove() for byte strings.

Arguments:
  dest    where to copy to
  src     where to copy from
  n       number of bytes to copy

Returns:  the original value of dest, as memmove() does

The function is static because it is defined in a header: an external
definition would be duplicated in every module that includes this file. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
  size_t i;

  if (dest > src)
    {
    /* Moving upwards: copy from the top down so that an overlapping source is
    not overwritten before it has been read. After n steps dest is back at its
    starting value. */
    dest += n;
    src += n;
    for (i = 0; i < n; ++i) *(--dest) = *(--src);
    }
  else
    {
    /* Moving downwards (or not at all): a plain forward copy is safe. */
    for (i = 0; i < n; ++i) dest[i] = src[i];
    }

  return dest;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif
#endif
---|
65 | |
---|
66 | /* Standard C headers plus the external interface definition */ |
---|
67 | |
---|
68 | #include <ctype.h> |
---|
69 | #include <limits.h> |
---|
70 | #include <stddef.h> |
---|
71 | #include <stdio.h> |
---|
72 | #include <stdlib.h> |
---|
73 | #include <string.h> |
---|
74 | #include "pcre.h" |
---|
75 | |
---|
/* Fallback for a system whose headers do not supply offsetof(); any proper
Standard C system defines it already, in which case this is skipped. The
fallback takes the address of the field inside an imaginary structure placed at
address zero and converts that address to a size_t. */

#if !defined(offsetof)
#define offsetof(type, member) ((size_t)&(((type *)0)->member))
#endif
---|
82 | |
---|
/* The subset of the public options whose setting can change while a match is
in progress: caseless, multiline and dotall. */

#define PCRE_IMS (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL)
---|
86 | |
---|
/* Private option flags. These are allocated downwards from the most
significant end of the four-byte options word, leaving the very top bit unused
so that plain ints can hold them without ever going negative. The public
options in pcre.h are allocated upwards from the least significant end; the two
sets must not overlap, though with four bytes available there is plenty of
room between them. */

#define PCRE_FIRSTSET   0x40000000   /* first_char is set */
#define PCRE_REQCHSET   0x20000000   /* req_char is set */
#define PCRE_STARTLINE  0x10000000   /* start after \n for multiline */
#define PCRE_INGROUP    0x08000000   /* compiling inside a group */
#define PCRE_ICHANGED   0x04000000   /* i option changes within regex */
---|
98 | |
---|
/* Flag bits used in the options field of the "extra" block that pcre_study()
produces. */

#define PCRE_STUDY_MAPPED  0x01   /* a map of starting chars exists */
---|
102 | |
---|
/* Masks of the public option bits that are permitted in each of three
contexts: at compile time, at run time, and at study time. */

#define PUBLIC_OPTIONS \
  (PCRE_CASELESS | \
   PCRE_EXTENDED | \
   PCRE_ANCHORED | \
   PCRE_MULTILINE | \
   PCRE_DOTALL | \
   PCRE_DOLLAR_ENDONLY | \
   PCRE_EXTRA | \
   PCRE_UNGREEDY | \
   PCRE_UTF8)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY)

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
---|
114 | |
---|
/* A magic number that gives a small degree of protection against being handed
junk. Its four bytes are the character codes of 'P', 'C', 'R' and 'E'. */

#define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
---|
118 | |
---|
/* Miscellaneous definitions: a boolean type, which is just an int, and names
for its two truth values. */

typedef int BOOL;

#define TRUE   1
#define FALSE  0
---|
125 | |
---|
/* Escapes that simply stand for one particular data value. Each definition is
a default that takes effect only if the name has not already been defined. The
value for ESC_N is itself another macro, NEWLINE, which config.h sets to either
\n (the default) or \r (which some people want). */

#if !defined(ESC_E)
#define ESC_E 27
#endif

#if !defined(ESC_F)
#define ESC_F '\f'
#endif

#if !defined(ESC_N)
#define ESC_N NEWLINE
#endif

#if !defined(ESC_R)
#define ESC_R '\r'
#endif

#if !defined(ESC_T)
#define ESC_T '\t'
#endif
---|
149 | |
---|
/* Escapes that are more than an encoding of a single data value such as \n.
The rules for this list are:

  . Every value must be non-zero, because check_escape() returns the negation
    of the value.

  . Up to and including ESC_z, the order must be the same as the order of the
    corresponding entries in the opcode definitions below.

  . ESC_REF must be the last entry, because the values that follow it are used
    for \1, \2, \3, etc.

The code contains a test for an escape greater than ESC_b and less than ESC_Z
in order to detect the types that may be repeated. If a new escape that does
not consume a character is ever inserted in that range, that test will have to
change. */

enum {
  ESC_A = 1,
  ESC_B,
  ESC_b,
  ESC_D,
  ESC_d,
  ESC_S,
  ESC_s,
  ESC_W,
  ESC_w,
  ESC_Z,
  ESC_z,
  ESC_REF
};
---|
161 | |
---|
/* The opcode table. Two ordering constraints apply to the table as a whole.
Firstly, OP_BRA has to be the final entry, because every value >= OP_BRA stands
for a bracket that extracts a substring. Secondly, counting from 1 (that is,
from the entry after OP_END), the entries up to OP_EOD have to correspond, in
order, to the list of escapes immediately above. */

enum {
  OP_END = 0,            /* End of pattern */

  /* Opcodes for the backslashed metacharacters */

  OP_SOD,                /* \A: start of data */
  OP_NOT_WORD_BOUNDARY,  /* \B */
  OP_WORD_BOUNDARY,      /* \b */
  OP_NOT_DIGIT,          /* \D */
  OP_DIGIT,              /* \d */
  OP_NOT_WHITESPACE,     /* \S */
  OP_WHITESPACE,         /* \s */
  OP_NOT_WORDCHAR,       /* \W */
  OP_WORDCHAR,           /* \w */
  OP_EODN,               /* \Z: end of data, or \n at end of data */
  OP_EOD,                /* \z: end of data */

  OP_OPT,                /* Set runtime options */
  OP_CIRC,               /* Start of line - varies with multiline switch */
  OP_DOLL,               /* End of line - varies with multiline switch */
  OP_ANY,                /* Match any character */
  OP_CHARS,              /* Match string of characters */
  OP_NOT,                /* Match anything but the following char */

  /* Repeats of a single character. In this set and in each of the repeat sets
  that follow, the maximizing and minimizing versions of an opcode must come as
  a pair, with the minimizing one second. */

  OP_STAR,
  OP_MINSTAR,
  OP_PLUS,
  OP_MINPLUS,
  OP_QUERY,
  OP_MINQUERY,
  OP_UPTO,               /* From 0 to n matches */
  OP_MINUPTO,
  OP_EXACT,              /* Exactly n matches */

  /* Repeats of a "not" single character, paired in the same way. */

  OP_NOTSTAR,
  OP_NOTMINSTAR,
  OP_NOTPLUS,
  OP_NOTMINPLUS,
  OP_NOTQUERY,
  OP_NOTMINQUERY,
  OP_NOTUPTO,            /* From 0 to n matches */
  OP_NOTMINUPTO,
  OP_NOTEXACT,           /* Exactly n matches */

  /* Repeats of a character type such as \d. These must be in exactly the same
  order as the sets above. */

  OP_TYPESTAR,
  OP_TYPEMINSTAR,
  OP_TYPEPLUS,
  OP_TYPEMINPLUS,
  OP_TYPEQUERY,
  OP_TYPEMINQUERY,
  OP_TYPEUPTO,           /* From 0 to n matches */
  OP_TYPEMINUPTO,
  OP_TYPEEXACT,          /* Exactly n matches */

  /* Repeats of character classes and back references. The first six must be in
  exactly the same order as in the sets above; the final pair, the ranges, are
  different to the three sets above. */

  OP_CRSTAR,
  OP_CRMINSTAR,
  OP_CRPLUS,
  OP_CRMINPLUS,
  OP_CRQUERY,
  OP_CRMINQUERY,
  OP_CRRANGE,
  OP_CRMINRANGE,

  OP_CLASS,              /* Match a character class */
  OP_REF,                /* Match a back reference */
  OP_RECURSE,            /* Match this pattern recursively */

  OP_ALT,                /* Start of alternation */
  OP_KET,                /* End of group that doesn't have an unbounded repeat */

  /* The next two are for groups that repeat for ever. They must remain
  together and in this order. */

  OP_KETRMAX,
  OP_KETRMIN,

  /* The assertions, which must come before ONCE and COND */

  OP_ASSERT,             /* Positive lookahead */
  OP_ASSERT_NOT,         /* Negative lookahead */
  OP_ASSERTBACK,         /* Positive lookbehind */
  OP_ASSERTBACK_NOT,     /* Negative lookbehind */
  OP_REVERSE,            /* Move pointer back - used in lookbehind assertions */

  /* ONCE and COND have to follow the assertions, and ONCE has to be the first
  of the two, because the code tests for >= ONCE to recognize a subpattern
  that isn't an assertion. */

  OP_ONCE,               /* Once matched, don't back up into the subpattern */
  OP_COND,               /* Conditional group */
  OP_CREF,               /* Used to hold an extraction string number (cond ref) */

  /* The next two must remain together and in this order. */

  OP_BRAZERO,
  OP_BRAMINZERO,

  OP_BRANUMBER,          /* Used for extracting brackets whose number is
                            greater than can fit into an opcode. */

  OP_BRA                 /* This and greater values are used for brackets that
                            extract substrings up to a basic limit. After that,
                            use is made of OP_BRANUMBER. */
};
---|
263 | |
---|
/* The highest extraction number that can be handled before additional bytes
have to be used. (Originally PCRE had no support for extraction counts higher
than this number.) The limit comes from the number of opcode values that remain
after OP_BRA, that is, 255 - OP_BRA. The value actually chosen is a bit lower
than that, which leaves room for additional opcodes. */

#define EXTRACT_BASIC_MAX  150
---|
271 | |
---|
/* Texts of the compile-time error messages. They are macros so that the POSIX
wrapper can get at them and convert them into error codes. Yes, error codes
could have been used in the first place, but it didn't seem worth changing just
to accommodate the POSIX wrapper. */

#define ERR1   "\\ at end of pattern"
#define ERR2   "\\c at end of pattern"
#define ERR3   "unrecognized character follows \\"
#define ERR4   "numbers out of order in {} quantifier"
#define ERR5   "number too big in {} quantifier"
#define ERR6   "missing terminating ] for character class"
#define ERR7   "invalid escape sequence in character class"
#define ERR8   "range out of order in character class"
#define ERR9   "nothing to repeat"
#define ERR10  "operand of unlimited repeat could match the empty string"
#define ERR11  "internal error: unexpected repeat"
#define ERR12  "unrecognized character after (?"
#define ERR13  "unused error"
#define ERR14  "missing )"
#define ERR15  "back reference to non-existent subpattern"
#define ERR16  "erroffset passed as NULL"
#define ERR17  "unknown option bit(s) set"
#define ERR18  "missing ) after comment"
#define ERR19  "parentheses nested too deeply"
#define ERR20  "regular expression too large"
#define ERR21  "failed to get memory"
#define ERR22  "unmatched parentheses"
#define ERR23  "internal error: code overflow"
#define ERR24  "unrecognized character after (?<"
#define ERR25  "lookbehind assertion is not fixed length"
#define ERR26  "malformed number after (?("
#define ERR27  "conditional group contains more than two branches"
#define ERR28  "assertion expected after (?("
#define ERR29  "(?p must be followed by )"
#define ERR30  "unknown POSIX class name"
#define ERR31  "POSIX collating elements are not supported"
#define ERR32  "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33  "characters with values > 255 are not yet supported in classes"
#define ERR34  "character value in \\x{...} sequence is too large"
#define ERR35  "invalid condition (?(0)"
---|
312 | |
---|
/* Characters are always handled internally as unsigned values; if they were
not, top-bit-set characters would cause trouble with functions such as
isspace(). The interface to the outside world nevertheless stays as char *,
since that should be easier for callers. This typedef is a short name for
unsigned char that saves a lot of typing. The name "uchar" was tried first, but
it clashes with a definition in sys/types on Digital Unix, hence "uschar". */

typedef unsigned char uschar;
---|
321 | |
---|
322 | /* The real format of the start of the pcre block; the actual code vector |
---|
323 | runs on as long as necessary after the end. */ |
---|
324 | |
---|
325 | typedef struct real_pcre { |
---|
326 | unsigned long int magic_number; |
---|
327 | size_t size; |
---|
328 | const unsigned char *tables; |
---|
329 | unsigned long int options; |
---|
330 | unsigned short int top_bracket; |
---|
331 | unsigned short int top_backref; |
---|
332 | uschar first_char; |
---|
333 | uschar req_char; |
---|
334 | uschar code[1]; |
---|
335 | } real_pcre; |
---|
336 | |
---|
337 | /* The real format of the extra block returned by pcre_study(). */ |
---|
338 | |
---|
339 | typedef struct real_pcre_extra { |
---|
340 | uschar options; |
---|
341 | uschar start_bits[32]; |
---|
342 | } real_pcre_extra; |
---|
343 | |
---|
344 | |
---|
345 | /* Structure for passing "static" information around between the functions |
---|
346 | doing the compiling, so that they are thread-safe. */ |
---|
347 | |
---|
348 | typedef struct compile_data { |
---|
349 | const uschar *lcc; /* Points to lower casing table */ |
---|
350 | const uschar *fcc; /* Points to case-flipping table */ |
---|
351 | const uschar *cbits; /* Points to character type table */ |
---|
352 | const uschar *ctypes; /* Points to table of type maps */ |
---|
353 | } compile_data; |
---|
354 | |
---|
355 | /* Structure for passing "static" information around between the functions |
---|
356 | doing the matching, so that they are thread-safe. */ |
---|
357 | |
---|
358 | typedef struct match_data { |
---|
359 | int errorcode; /* As it says */ |
---|
360 | int *offset_vector; /* Offset vector */ |
---|
361 | int offset_end; /* One past the end */ |
---|
362 | int offset_max; /* The maximum usable for return data */ |
---|
363 | const uschar *lcc; /* Points to lower casing table */ |
---|
364 | const uschar *ctypes; /* Points to table of type maps */ |
---|
365 | BOOL offset_overflow; /* Set if too many extractions */ |
---|
366 | BOOL notbol; /* NOTBOL flag */ |
---|
367 | BOOL noteol; /* NOTEOL flag */ |
---|
368 | BOOL utf8; /* UTF8 flag */ |
---|
369 | BOOL endonly; /* Dollar not before final \n */ |
---|
370 | BOOL notempty; /* Empty string match not wanted */ |
---|
371 | const uschar *start_pattern; /* For use when recursing */ |
---|
372 | const uschar *start_subject; /* Start of the subject string */ |
---|
373 | const uschar *end_subject; /* End of the subject string */ |
---|
374 | const uschar *start_match; /* Start of this match attempt */ |
---|
375 | const uschar *end_match_ptr; /* Subject position at end match */ |
---|
376 | int end_offset_top; /* Highwater mark at end of match */ |
---|
377 | } match_data; |
---|
378 | |
---|
/* The meaning of each bit in an entry of the pcre_ctypes table. */

#define ctype_space    0x01
#define ctype_letter   0x02
#define ctype_digit    0x04
#define ctype_xdigit   0x08
#define ctype_word     0x10   /* alphameric or '_' */
#define ctype_meta     0x80   /* regexp meta char or zero (end pattern) */
---|
387 | |
---|
/* Where each bitmap table starts within pcre_cbits. Every table holds the set
of bits for one class map, and successive tables start 32 apart. Some classes
are not held directly but are built by combining these tables. */

#define cbit_space      0   /* [:space:] or \s */
#define cbit_xdigit    32   /* [:xdigit:] */
#define cbit_digit     64   /* [:digit:] or \d */
#define cbit_upper     96   /* [:upper:] */
#define cbit_lower    128   /* [:lower:] */
#define cbit_word     160   /* [:word:] or \w */
#define cbit_graph    192   /* [:graph:] */
#define cbit_print    224   /* [:print:] */
#define cbit_punct    256   /* [:punct:] */
#define cbit_cntrl    288   /* [:cntrl:] */
#define cbit_length   320   /* Length of the cbits table */
---|
402 | |
---|
/* Where each table starts, measured from the base tables pointer, followed by
the total length of all the tables together. The ctypes table comes straight
after the cbits table, and 256 is allowed for it. */

#define lcc_offset      0
#define fcc_offset      256
#define cbits_offset    512
#define ctypes_offset   (cbits_offset + cbit_length)
#define tables_length   (ctypes_offset + 256)
---|
411 | |
---|
412 | /* End of internal.h */ |
---|