source: trunk/third/perl/regcomp.h @ 14545

Revision 14545, 11.2 KB checked in by ghudson, 25 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r14544, which included commits to RCS files with non-trunk default branches.
Line 
1/*    regcomp.h
2 */
3
4typedef OP OP_4tree;                    /* Will be redefined later. */
5
6/*
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases.  They are:
10 *
11 * regstart     sv that must begin a match; Nullch if none obvious
12 * reganch      is the match anchored (at beginning-of-line only)?
13 * regmust      string (pointer into program) that match must include, or NULL
14 *  [regmust changed to SV* for bminstr()--law]
15 * regmlen      length of regmust string
16 *  [regmlen not used currently]
17 *
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot.  Regmust permits fast rejection
20 * of lines that cannot possibly match.  The regmust tests are costly enough
21 * that pregcomp() supplies a regmust only if the r.e. contains something
22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
24 * supplied because the test in pregexec() needs it and pregcomp() is computing
25 * it anyway.
26 * [regmust is now supplied always.  The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
28 *
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort.  The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
34 */
35
36/*
37 * Structure for regexp "program".  This is essentially a linear encoding
38 * of a nondeterministic finite-state machine (aka syntax charts or
39 * "railroad normal form" in parsing technology).  Each node is an opcode
40 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
41 * all nodes except BRANCH implement concatenation; a "next" pointer with
42 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
43 * have one of the subtle syntax dependencies:  an individual BRANCH (as
44 * opposed to a collection of them) is never concatenated with anything
45 * because of operator precedence.)  The operand of some types of node is
46 * a literal string; for others, it is a node leading into a sub-FSM.  In
47 * particular, the operand of a BRANCH node is the first node of the branch.
48 * (NB this is *not* a tree structure:  the tail of the branch connects
49 * to the thing following the set of BRANCHes.)  The opcodes are:
50 */
51
52/*
53 * A node is one char of opcode followed by two chars of "next" pointer.
54 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
55 * value is a positive offset from the opcode of the node containing it.
56 * An operand, if any, simply follows the node.  (Note that much of the
57 * code generation knows about this implicit relationship.)
58 *
59 * Using two bytes for the "next" pointer is vast overkill for most things,
60 * but allows patterns to get big without disasters.
61 *
62 * [The "next" pointer is always aligned on an even
63 * boundary, and reads the offset directly as a short.  Also, there is no
64 * special test to reverse the sign of BACK pointers since the offset is
65 * stored negative.]
66 */
67
68struct regnode_string {
69    U8  str_len;
70    U8  type;
71    U16 next_off;
72    char string[1];
73};
74
75struct regnode_1 {
76    U8  flags;
77    U8  type;
78    U16 next_off;
79    U32 arg1;
80};
81
82struct regnode_2 {
83    U8  flags;
84    U8  type;
85    U16 next_off;
86    U16 arg1;
87    U16 arg2;
88};
89
90#define ANYOF_BITMAP_SIZE       32      /* 256 b/(8 b/B) */
91#define ANYOF_CLASSBITMAP_SIZE   4
92
93struct regnode_charclass {
94    U8  flags;
95    U8  type;
96    U16 next_off;
97    char bitmap[ANYOF_BITMAP_SIZE];
98};
99
100struct regnode_charclass_class {
101    U8  flags;
102    U8  type;
103    U16 next_off;
104    char bitmap[ANYOF_BITMAP_SIZE];
105    char classflags[ANYOF_CLASSBITMAP_SIZE];
106};
107
108/* XXX fix this description.
109   Impose a limit of REG_INFTY on various pattern matching operations
110   to limit stack growth and to avoid "infinite" recursions.
111*/
112/* The default size for REG_INFTY is I16_MAX, which is the same as
113   SHORT_MAX (see perl.h).  Unfortunately I16 isn't necessarily 16 bits
114   (see handy.h).  On the Cray C90, sizeof(short)==4 and hence I16_MAX is
115   ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
116   ((1<<63)-1).  To limit stack growth to reasonable sizes, supply a
117   smaller default.
118        --Andy Dougherty  11 June 1998
119*/
120#if SHORTSIZE > 2
121#  ifndef REG_INFTY
122#    define REG_INFTY ((1<<15)-1)
123#  endif
124#endif
125
126#ifndef REG_INFTY
127#  define REG_INFTY I16_MAX
128#endif
129
130#define ARG_VALUE(arg) (arg)
131#define ARG__SET(arg,val) ((arg) = (val))
132
133#define ARG(p) ARG_VALUE(ARG_LOC(p))
134#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
135#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
136#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
137#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
138#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
139
140#ifndef lint
141#  define NEXT_OFF(p) ((p)->next_off)
142#  define NODE_ALIGN(node)
143#  define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
144#else /* lint */
145#  define NEXT_OFF(p) 0
146#  define NODE_ALIGN(node)
147#  define NODE_ALIGN_FILL(node)
148#endif /* lint */
149
150#define SIZE_ALIGN NODE_ALIGN
151
152#define OP(p)           ((p)->type)
153#define OPERAND(p)      (((struct regnode_string *)p)->string)
154#define MASK(p)         ((char*)OPERAND(p))
155#define STR_LEN(p)      (((struct regnode_string *)p)->str_len)
156#define STRING(p)       (((struct regnode_string *)p)->string)
157#define STR_SZ(l)       ((l + sizeof(regnode) - 1) / sizeof(regnode))
158#define NODE_SZ_STR(p)  (STR_SZ(STR_LEN(p))+1)
159
160#define NODE_ALIGN(node)
161#define ARG_LOC(p)      (((struct regnode_1 *)p)->arg1)
162#define ARG1_LOC(p)     (((struct regnode_2 *)p)->arg1)
163#define ARG2_LOC(p)     (((struct regnode_2 *)p)->arg2)
164#define NODE_STEP_REGNODE       1       /* sizeof(regnode)/sizeof(regnode) */
165#define EXTRA_STEP_2ARGS        EXTRA_SIZE(struct regnode_2)
166
167#define NODE_STEP_B     4
168
169#define NEXTOPER(p)     ((p) + NODE_STEP_REGNODE)
170#define PREVOPER(p)     ((p) - NODE_STEP_REGNODE)
171
172#define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
173    (ptr)->type = op;    (ptr)->next_off = 0;   (ptr)++; } STMT_END
174#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
175    ARG_SET(ptr, arg);  FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
176
177#define REG_MAGIC 0234
178
179#define SIZE_ONLY (PL_regcode == &PL_regdummy)
180
181/* Flags for node->flags of ANYOF */
182
183#define ANYOF_CLASS     0x08
184#define ANYOF_INVERT    0x04
185#define ANYOF_FOLD      0x02
186#define ANYOF_LOCALE    0x01
187
188/* Used for regstclass only */
189#define ANYOF_EOS       0x10            /* Can match an empty string too */
190
191/* Character classes for node->classflags of ANYOF */
192/* Should be synchronized with a table in regprop() */
193/* 2n should pair with 2n+1 */
194
195#define ANYOF_ALNUM      0      /* \w, utf8::IsWord, isALNUM() */
196#define ANYOF_NALNUM     1
197#define ANYOF_SPACE      2
198#define ANYOF_NSPACE     3
199#define ANYOF_DIGIT      4
200#define ANYOF_NDIGIT     5
201#define ANYOF_ALNUMC     6      /* isalnum(3), utf8::IsAlnum, isALNUMC() */
202#define ANYOF_NALNUMC    7
203#define ANYOF_ALPHA      8
204#define ANYOF_NALPHA     9
205#define ANYOF_ASCII     10
206#define ANYOF_NASCII    11
207#define ANYOF_CNTRL     12
208#define ANYOF_NCNTRL    13
209#define ANYOF_GRAPH     14
210#define ANYOF_NGRAPH    15
211#define ANYOF_LOWER     16
212#define ANYOF_NLOWER    17
213#define ANYOF_PRINT     18
214#define ANYOF_NPRINT    19
215#define ANYOF_PUNCT     20
216#define ANYOF_NPUNCT    21
217#define ANYOF_UPPER     22
218#define ANYOF_NUPPER    23
219#define ANYOF_XDIGIT    24
220#define ANYOF_NXDIGIT   25
221
222#define ANYOF_MAX       31
223
224/* Backward source code compatibility. */
225
226#define ANYOF_ALNUML     ANYOF_ALNUM
227#define ANYOF_NALNUML    ANYOF_NALNUM
228#define ANYOF_SPACEL     ANYOF_SPACE
229#define ANYOF_NSPACEL    ANYOF_NSPACE
230
231/* Utility macros for the bitmap and classes of ANYOF */
232
233#define ANYOF_SIZE              (sizeof(struct regnode_charclass))
234#define ANYOF_CLASS_SIZE        (sizeof(struct regnode_charclass_class))
235
236#define ANYOF_FLAGS(p)          ((p)->flags)
237#define ANYOF_FLAGS_ALL         0xff
238
239#define ANYOF_BIT(c)            (1 << ((c) & 7))
240
241#define ANYOF_CLASS_BYTE(p, c)  (((struct regnode_charclass_class*)(p))->classflags[((c) >> 3) & 3])
242#define ANYOF_CLASS_SET(p, c)   (ANYOF_CLASS_BYTE(p, c) |=  ANYOF_BIT(c))
243#define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c))
244#define ANYOF_CLASS_TEST(p, c)  (ANYOF_CLASS_BYTE(p, c) &   ANYOF_BIT(c))
245
246#define ANYOF_CLASS_ZERO(ret)   Zero(((struct regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char)
247#define ANYOF_BITMAP_ZERO(ret)  Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
248
249#define ANYOF_BITMAP(p)         (((struct regnode_charclass*)(p))->bitmap)
250#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[((c) >> 3) & 31])
251#define ANYOF_BITMAP_SET(p, c)  (ANYOF_BITMAP_BYTE(p, c) |=  ANYOF_BIT(c))
252#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c))
253#define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) &   ANYOF_BIT(c))
254
255#define ANYOF_SKIP              ((ANYOF_SIZE - 1)/sizeof(regnode))
256#define ANYOF_CLASS_SKIP        ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode))
257#define ANYOF_CLASS_ADD_SKIP    (ANYOF_CLASS_SKIP - ANYOF_SKIP)
258
259/*
260 * Utility definitions.
261 */
262#ifndef lint
263#ifndef CHARMASK
264#define UCHARAT(p)      ((int)*(U8*)(p))
265#else
266#define UCHARAT(p)      ((int)*(p)&CHARMASK)
267#endif
268#else /* lint */
269#define UCHARAT(p)      PL_regdummy
270#endif /* lint */
271
272#define FAIL(m) \
273    STMT_START {                                                        \
274        if (!SIZE_ONLY)                                                 \
275            SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx);            \
276        Perl_croak(aTHX_ "/%.127s/: %s",  PL_regprecomp,m);             \
277    } STMT_END
278
279#define FAIL2(pat,m) \
280    STMT_START {                                                        \
281        if (!SIZE_ONLY)                                                 \
282            SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx);            \
283        S_re_croak2(aTHX_ "/%.127s/: ",pat,PL_regprecomp,m);            \
284    } STMT_END
285
286#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
287
288#define REG_SEEN_ZERO_LEN       1
289#define REG_SEEN_LOOKBEHIND     2
290#define REG_SEEN_GPOS           4
291#define REG_SEEN_EVAL           8
292
293START_EXTERN_C
294
295#include "regnodes.h"
296
297/* The following have no fixed length. U8 so we can do strchr() on it. */
298#ifndef DOINIT
299EXTCONST U8 PL_varies[];
300#else
301EXTCONST U8 PL_varies[] = {
302    BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
303    WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0
304};
305#endif
306
307/* The following always have a length of 1. U8 we can do strchr() on it. */
308/* (Note that length 1 means "one character" under UTF8, not "one octet".) */
309#ifndef DOINIT
310EXTCONST U8 PL_simple[];
311#else
312EXTCONST U8 PL_simple[] = {
313    REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
314    ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
315    NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
316    SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
317    NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
318    DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0
319};
320#endif
321
322END_EXTERN_C
323
324typedef struct re_scream_pos_data_s
325{
326    char **scream_olds;         /* match pos */
327    I32 *scream_pos;            /* Internal iterator of scream. */
328} re_scream_pos_data;
329
330struct reg_data {
331    U32 count;
332    U8 *what;
333    void* data[1];
334};
335
336struct reg_substr_datum {
337    I32 min_offset;
338    I32 max_offset;
339    SV *substr;
340};
341
342struct reg_substr_data {
343    struct reg_substr_datum data[3];    /* Actual array */
344};
345
346#define anchored_substr substrs->data[0].substr
347#define anchored_offset substrs->data[0].min_offset
348#define float_substr substrs->data[1].substr
349#define float_min_offset substrs->data[1].min_offset
350#define float_max_offset substrs->data[1].max_offset
351#define check_substr substrs->data[2].substr
352#define check_offset_min substrs->data[2].min_offset
353#define check_offset_max substrs->data[2].max_offset
Note: See TracBrowser for help on using the repository browser.