1 | /* classes: h_files */ |
---|
2 | |
---|
3 | #ifndef RXSUPERH |
---|
4 | #define RXSUPERH |
---|
5 | |
---|
6 | /* Copyright (C) 1995, 1996 Tom Lord |
---|
7 | * |
---|
8 | * This program is free software; you can redistribute it and/or modify |
---|
9 | * it under the terms of the GNU Library General Public License as published by |
---|
10 | * the Free Software Foundation; either version 2, or (at your option) |
---|
11 | * any later version. |
---|
12 | * |
---|
13 | * This program is distributed in the hope that it will be useful, |
---|
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
16 | * GNU Library General Public License for more details. |
---|
17 | * |
---|
18 | * You should have received a copy of the GNU Library General Public License |
---|
19 | * along with this software; see the file COPYING. If not, write to |
---|
20 | * the Free Software Foundation, 59 Temple Place - Suite 330, |
---|
21 | * Boston, MA 02111-1307, USA. |
---|
22 | */ |
---|
23 | |
---|
24 | /* lord Sun May 7 12:40:17 1995 */ |
---|
25 | |
---|
26 | |
---|
27 | |
---|
28 | #include "rxnfa.h" |
---|
29 | |
---|
30 | |
---|
31 | |
---|
32 | /* This begins the description of the superstate NFA. |
---|
33 | * |
---|
34 | * The superstate NFA corresponds to the NFA in these ways: |
---|
35 | * |
---|
36 | * Superstate states correspond to sets of NFA states (nfa_states(SUPER)), |
---|
37 | * |
---|
38 | * Superstate edges correspond to NFA paths. |
---|
39 | * |
---|
40 | * The superstate has no epsilon transitions; |
---|
41 | * every edge has a character label, and a (possibly empty) side |
---|
42 | * effect label. The side effect label corresponds to a list of |
---|
43 | * side effects that occur in the NFA. These parts are referred |
---|
44 | * to as: superedge_character(EDGE) and superedge_sides(EDGE). |
---|
45 | * |
---|
46 | * For a superstate edge EDGE starting in some superstate SUPER, |
---|
47 | * the following is true (in pseudo-notation :-): |
---|
48 | * |
---|
49 | * exists DEST in nfa_states s.t. |
---|
50 | * exists nfaEDGE in nfa_edges s.t. |
---|
51 | * origin (nfaEDGE) == DEST |
---|
52 | * && origin (nfaEDGE) is a member of nfa_states(SUPER) |
---|
53 | * && exists PF in possible_futures(dest(nfaEDGE)) s.t. |
---|
54 | * sides_of_possible_future (PF) == superedge_sides (EDGE) |
---|
55 | * |
---|
56 | * also: |
---|
57 | * |
---|
58 | * let SUPER2 := superedge_destination(EDGE) |
---|
59 | * nfa_states(SUPER2) |
---|
60 | * == union of all nfa state sets S s.t. |
---|
61 | * exists PF in possible_futures(dest(nfaEDGE)) s.t. |
---|
62 | * sides_of_possible_future (PF) == superedge_sides (EDGE) |
---|
63 | * && S == dests_of_possible_future (PF) } |
---|
64 | * |
---|
65 | * Or in english, every superstate is a set of nfa states. A given |
---|
66 | * character and a superstate implies many transitions in the NFA -- |
---|
67 | * those that begin with an edge labeled with that character from a |
---|
68 | * state in the set corresponding to the superstate. |
---|
69 | * |
---|
70 | * The destinations of those transitions each have a set of possible |
---|
71 | * futures. A possible future is a list of side effects and a set of |
---|
72 | * destination NFA states. Two sets of possible futures can be |
---|
73 | * `merged' by combining all pairs of possible futures that have the |
---|
74 | * same side effects. A pair is combined by creating a new future |
---|
75 | * with the same side effect but the union of the two destination sets. |
---|
76 | * In this way, all the possible futures suggested by a superstate |
---|
77 | * and a character can be merged into a set of possible futures where |
---|
78 | * no two elements of the set have the same set of side effects. |
---|
79 | * |
---|
80 | * The destination of a possible future, being a set of NFA states, |
---|
81 | * corresponds to a supernfa state. So, the merged set of possible |
---|
82 | * futures we just created can serve as a set of edges in the |
---|
83 | * supernfa. |
---|
84 | * |
---|
85 | * The representation of the superstate nfa and the nfa is critical. |
---|
86 | * The nfa has to be compact, but has to facilitate the rapid |
---|
87 | * computation of missing superstates. The superstate nfa has to |
---|
88 | * be fast to interpret, lazilly constructed, and bounded in space. |
---|
89 | * |
---|
90 | * To facilitate interpretation, the superstate data structures are |
---|
91 | * peppered with `instruction frames'. There is an instruction set |
---|
92 | * defined below which matchers using the supernfa must be able to |
---|
93 | * interpret. |
---|
94 | * |
---|
95 | * We'd like to make it possible but not mandatory to use code |
---|
96 | * addresses to represent instructions (c.f. gcc's computed goto). |
---|
97 | * Therefore, we define an enumerated type of opcodes, and when |
---|
98 | * writing one of these instructions into a data structure, use |
---|
99 | * the opcode as an index into a table of instruction values. |
---|
100 | * |
---|
101 | * Below are the opcodes that occur in the superstate nfa. |
---|
102 | * |
---|
103 | * The descriptions of the opcodes refer to data structures that are |
---|
104 | * described further below. |
---|
105 | */ |
---|
106 | |
---|
107 | enum rx_opcode |
---|
108 | { |
---|
109 | /* |
---|
110 | * BACKTRACK_POINT is invoked when a character transition in |
---|
111 | * a superstate leads to more than one edge. In that case, |
---|
112 | * the edges have to be explored independently using a backtracking |
---|
113 | * strategy. |
---|
114 | * |
---|
115 | * A BACKTRACK_POINT instruction is stored in a superstate's |
---|
116 | * transition table for some character when it is known that that |
---|
117 | * character crosses more than one edge. On encountering this |
---|
118 | * instruction, the matcher saves enough state to backtrack to this |
---|
119 | * point later in the match. |
---|
120 | */ |
---|
121 | rx_backtrack_point = 0, /* data is (struct transition_class *) */ |
---|
122 | |
---|
123 | /* |
---|
124 | * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. |
---|
125 | * There is one occurence of this instruction per rx_distinct_future. |
---|
126 | * This instruction is skipped if a rx_distinct_future has no side effects. |
---|
127 | */ |
---|
128 | rx_do_side_effects = rx_backtrack_point + 1, |
---|
129 | |
---|
130 | /* data is (struct rx_distinct_future *) */ |
---|
131 | |
---|
132 | /* |
---|
133 | * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose |
---|
134 | * destination superstate has been reclaimed (or was never built). |
---|
135 | * It recomputes the destination superstate. |
---|
136 | * RX_CACHE_MISS is also stored in a superstate transition table before |
---|
137 | * any of its edges have been built. |
---|
138 | */ |
---|
139 | rx_cache_miss = rx_do_side_effects + 1, |
---|
140 | /* data is (struct rx_distinct_future *) */ |
---|
141 | |
---|
142 | /* |
---|
143 | * RX_NEXT_CHAR is called to consume the next character and take the |
---|
144 | * corresponding transition. This is the only instruction that uses |
---|
145 | * the DATA field of the instruction frame instead of DATA_2. |
---|
146 | * The comments about rx_inx explain this further. |
---|
147 | */ |
---|
148 | rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ |
---|
149 | |
---|
150 | /* RX_BACKTRACK indicates that a transition fails. Don't |
---|
151 | * confuse this with rx_backtrack_point. |
---|
152 | */ |
---|
153 | rx_backtrack = rx_next_char + 1, /* no data */ |
---|
154 | |
---|
155 | /* |
---|
156 | * RX_ERROR_INX is stored only in places that should never be executed. |
---|
157 | */ |
---|
158 | rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ |
---|
159 | |
---|
160 | rx_num_instructions = rx_error_inx + 1 |
---|
161 | }; |
---|
162 | |
---|
163 | /* The heart of the matcher is a `word-code-interpreter' |
---|
164 | * (like a byte-code interpreter, except that instructions |
---|
165 | * are a full word wide). |
---|
166 | * |
---|
167 | * Instructions are not stored in a vector of code, instead, |
---|
168 | * they are scattered throughout the data structures built |
---|
169 | * by the regexp compiler and the matcher. One word-code instruction, |
---|
170 | * together with the arguments to that instruction, constitute |
---|
171 | * an instruction frame (struct rx_inx). |
---|
172 | * |
---|
173 | * This structure type is padded by hand to a power of 2 because |
---|
174 | * in one of the dominant cases, we dispatch by indexing a table |
---|
175 | * of instruction frames. If that indexing can be accomplished |
---|
176 | * by just a shift of the index, we're happy. |
---|
177 | * |
---|
178 | * Instructions take at most one argument, but there are two |
---|
179 | * slots in an instruction frame that might hold that argument. |
---|
180 | * These are called data and data_2. The data slot is only |
---|
181 | * used for one instruction (RX_NEXT_CHAR). For all other |
---|
182 | * instructions, data should be set to 0. |
---|
183 | * |
---|
184 | * RX_NEXT_CHAR is the most important instruction by far. |
---|
185 | * By reserving the data field for its exclusive use, |
---|
186 | * instruction dispatch is sped up in that case. There is |
---|
187 | * no need to fetch both the instruction and the data, |
---|
188 | * only the data is needed. In other words, a `cycle' begins |
---|
189 | * by fetching the field data. If that is non-0, then it must |
---|
190 | * be the destination state of a next_char transition, so |
---|
191 | * make that value the current state, advance the match position |
---|
192 | * by one character, and start a new cycle. On the other hand, |
---|
193 | * if data is 0, fetch the instruction and do a more complicated |
---|
194 | * dispatch on that. |
---|
195 | */ |
---|
196 | |
---|
197 | struct rx_inx |
---|
198 | { |
---|
199 | void * data; |
---|
200 | void * data_2; |
---|
201 | void * inx; |
---|
202 | void * fnord; |
---|
203 | }; |
---|
204 | |
---|
205 | #ifndef RX_TAIL_ARRAY |
---|
206 | #define RX_TAIL_ARRAY 1 |
---|
207 | #endif |
---|
208 | |
---|
209 | /* A superstate corresponds to a set of nfa states. Those sets are |
---|
210 | * represented by STRUCT RX_SUPERSET. The constructors |
---|
211 | * guarantee that only one (shared) structure is created for a given set. |
---|
212 | */ |
---|
213 | struct rx_superset |
---|
214 | { |
---|
215 | int refs; /* This is a reference counted structure. */ |
---|
216 | |
---|
217 | /* We keep these sets in a cache because (in an unpredictable way), |
---|
218 | * the same set is often created again and again. |
---|
219 | * |
---|
220 | * When an NFA is destroyed, some of the supersets for that NFA |
---|
221 | * may still exist. This can lead to false cache hits -- an apparent cache |
---|
222 | * hit on a superset that properly belongs to an already free NFA. |
---|
223 | * |
---|
224 | * When a cache hit appears to occur, we will have in hand the |
---|
225 | * nfa for which it may have happened. Every nfa is given |
---|
226 | * its own sequence number. The cache is validated |
---|
227 | * by comparing the nfa sequence number to this field: |
---|
228 | */ |
---|
229 | int id; |
---|
230 | |
---|
231 | struct rx_nfa_state * car; /* May or may not be a valid addr. */ |
---|
232 | struct rx_superset * cdr; |
---|
233 | |
---|
234 | /* If the corresponding superstate exists: */ |
---|
235 | struct rx_superstate * superstate; |
---|
236 | |
---|
237 | /* That is_final field of the constiuent nfa states which has the greatest magnitude. */ |
---|
238 | int is_final; |
---|
239 | |
---|
240 | /* The OR of the corresponding fields of the constiuent nfa states. */ |
---|
241 | int has_cset_edges; |
---|
242 | |
---|
243 | |
---|
244 | /* There is another bookkeeping problem. It is expensive to |
---|
245 | * compute the starting nfa state set for an nfa. So, once computed, |
---|
246 | * it is cached in the `struct rx'. |
---|
247 | * |
---|
248 | * But, the state set can be flushed from the superstate cache. |
---|
249 | * When that happens, the cached value in the `struct rx' has |
---|
250 | * to be flushed. |
---|
251 | */ |
---|
252 | struct rx * starts_for; |
---|
253 | |
---|
254 | /* This is used to link into a hash bucket so these objects can |
---|
255 | * be `hash-consed'. |
---|
256 | */ |
---|
257 | struct rx_hash_item hash_item; |
---|
258 | }; |
---|
259 | |
---|
260 | #define rx_protect_superset(RX,CON) (++(CON)->refs) |
---|
261 | |
---|
262 | /* The terminology may be confusing (rename this structure?). |
---|
263 | * Every character occurs in at most one rx_super_edge per super-state. |
---|
264 | * But, that structure might have more than one option, indicating a point |
---|
265 | * of non-determinism. |
---|
266 | * |
---|
267 | * In other words, this structure holds a list of superstate edges |
---|
268 | * sharing a common starting state and character label. The edges |
---|
269 | * are in the field OPTIONS. All superstate edges sharing the same |
---|
270 | * starting state and character are in this list. |
---|
271 | */ |
---|
272 | struct rx_super_edge |
---|
273 | { |
---|
274 | struct rx_super_edge *next; |
---|
275 | struct rx_inx rx_backtrack_frame; |
---|
276 | int cset_size; |
---|
277 | rx_Bitset cset; |
---|
278 | struct rx_distinct_future *options; |
---|
279 | }; |
---|
280 | |
---|
281 | /* A superstate is a set of nfa states (RX_SUPERSET) along |
---|
282 | * with a transition table. Superstates are built on demand and reclaimed |
---|
283 | * without warning. To protect a superstate from this ghastly fate, |
---|
284 | * use LOCK_SUPERSTATE. |
---|
285 | */ |
---|
286 | struct rx_superstate |
---|
287 | { |
---|
288 | int rx_id; /* c.f. the id field of rx_superset */ |
---|
289 | int locks; /* protection from reclamation */ |
---|
290 | |
---|
291 | /* Within a superstate cache, all the superstates are kept in a big |
---|
292 | * queue. The tail of the queue is the state most likely to be |
---|
293 | * reclaimed. The *recyclable fields hold the queue position of |
---|
294 | * this state. |
---|
295 | */ |
---|
296 | struct rx_superstate * next_recyclable; |
---|
297 | struct rx_superstate * prev_recyclable; |
---|
298 | |
---|
299 | /* The supernfa edges that exist in the cache and that have |
---|
300 | * this state as their destination are kept in this list: |
---|
301 | */ |
---|
302 | struct rx_distinct_future * transition_refs; |
---|
303 | |
---|
304 | /* The list of nfa states corresponding to this superstate: */ |
---|
305 | struct rx_superset * contents; |
---|
306 | |
---|
307 | /* The list of edges in the cache beginning from this state. */ |
---|
308 | struct rx_super_edge * edges; |
---|
309 | |
---|
310 | /* A tail of the recyclable queue is marked as semifree. A semifree |
---|
311 | * state has no incoming next_char transitions -- any transition |
---|
312 | * into a semifree state causes a complex dispatch with the side |
---|
313 | * effect of rescuing the state from its semifree state into a |
---|
314 | * fully free state at the head of the queue. |
---|
315 | * |
---|
316 | * An alternative to this might be to make next_char more expensive, |
---|
317 | * and to move a state to the head of the recyclable queue whenever |
---|
318 | * it is entered. That way, popular states would never be recycled. |
---|
319 | * |
---|
320 | * But unilaterally making next_char more expensive actually loses. |
---|
321 | * So, incoming transitions are only made expensive for states near |
---|
322 | * the tail of the recyclable queue. The more cache contention |
---|
323 | * there is, the more frequently a state will have to prove itself |
---|
324 | * and be moved back to the front of the queue. If there is less |
---|
325 | * contention, then popular states just aggregate in the front of |
---|
326 | * the queue and stay there. |
---|
327 | */ |
---|
328 | int is_semifree; |
---|
329 | |
---|
330 | |
---|
331 | /* This keeps track of the size of the transition table for this |
---|
332 | * state. There is a half-hearted attempt to support variable sized |
---|
333 | * superstates. |
---|
334 | */ |
---|
335 | int trans_size; |
---|
336 | |
---|
337 | /* Indexed by characters... */ |
---|
338 | struct rx_inx transitions[RX_TAIL_ARRAY]; |
---|
339 | }; |
---|
340 | |
---|
341 | |
---|
342 | /* A list of distinct futures define the edges that leave from a |
---|
343 | * given superstate on a given character. c.f. rx_super_edge. |
---|
344 | */ |
---|
345 | struct rx_distinct_future |
---|
346 | { |
---|
347 | struct rx_distinct_future * next_same_super_edge[2]; |
---|
348 | struct rx_distinct_future * next_same_dest; |
---|
349 | struct rx_distinct_future * prev_same_dest; |
---|
350 | struct rx_superstate * present; /* source state */ |
---|
351 | struct rx_superstate * future; /* destination state */ |
---|
352 | struct rx_super_edge * edge; |
---|
353 | |
---|
354 | |
---|
355 | /* The future_frame holds the instruction that should be executed |
---|
356 | * after all the side effects are done, when it is time to complete |
---|
357 | * the transition to the next state. |
---|
358 | * |
---|
359 | * Normally this is a next_char instruction, but it may be a |
---|
360 | * cache_miss instruction as well, depending on whether or not |
---|
361 | * the superstate is in the cache and semifree. |
---|
362 | * |
---|
363 | * If this is the only future for a given superstate/char, and |
---|
364 | * if there are no side effects to be performed, this frame is |
---|
365 | * not used (directly) at all. Instead, its contents are copied |
---|
366 | * into the transition table of the starting state of this dist. future |
---|
367 | * (a sort of goto elimination). |
---|
368 | */ |
---|
369 | struct rx_inx future_frame; |
---|
370 | |
---|
371 | struct rx_inx side_effects_frame; |
---|
372 | struct rx_se_list * effects; |
---|
373 | }; |
---|
374 | |
---|
375 | #define rx_lock_superstate(R,S) ((S)->locks++) |
---|
376 | #define rx_unlock_superstate(R,S) (--(S)->locks) |
---|
377 | |
---|
378 | struct rx_cache; |
---|
379 | |
---|
380 | #ifdef __STDC__ |
---|
381 | typedef void (*rx_morecore_fn)(struct rx_cache *); |
---|
382 | #else |
---|
383 | typedef void (*rx_morecore_fn)(); |
---|
384 | #endif |
---|
385 | |
---|
386 | struct rx_cache |
---|
387 | { |
---|
388 | struct rx_hash_rules superset_hash_rules; |
---|
389 | |
---|
390 | struct rx_superstate * lru_superstate; |
---|
391 | struct rx_superstate * semifree_superstate; |
---|
392 | |
---|
393 | struct rx_superset * empty_superset; |
---|
394 | |
---|
395 | int superstates; |
---|
396 | int semifree_superstates; |
---|
397 | int hits; |
---|
398 | int misses; |
---|
399 | |
---|
400 | int bytes_allowed; |
---|
401 | int bytes_used; |
---|
402 | |
---|
403 | int local_cset_size; |
---|
404 | void ** instruction_table; |
---|
405 | |
---|
406 | struct rx_hash superset_table; |
---|
407 | }; |
---|
408 | |
---|
409 | #ifndef RX_DEFAULT_DFA_CACHE_SIZE |
---|
410 | /* This is an upper bound on the number of bytes that may (normally) |
---|
411 | * be allocated for DFA states. If this threshold would be exceeded, |
---|
412 | * Rx tries to flush some DFA states from the cache. |
---|
413 | * |
---|
414 | * This value is used whenever the rx_default_cache is used (for example, |
---|
415 | * with the Posix entry points). |
---|
416 | */ |
---|
417 | #define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19) |
---|
418 | #endif |
---|
419 | |
---|
420 | extern struct rx_cache * rx_default_cache; |
---|
421 | |
---|
422 | |
---|
423 | #ifdef __STDC__ |
---|
424 | extern char * rx_cache_malloc (struct rx_cache * cache, int size); |
---|
425 | extern void rx_cache_free (struct rx_cache * cache, int size, char * mem); |
---|
426 | extern void rx_release_superset (struct rx *rx, |
---|
427 | struct rx_superset *set); |
---|
428 | extern struct rx_superset * rx_superset_cons (struct rx * rx, |
---|
429 | struct rx_nfa_state *car, struct rx_superset *cdr); |
---|
430 | extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ; |
---|
431 | extern struct rx_superstate * rx_superstate (struct rx *rx, |
---|
432 | struct rx_superset *set); |
---|
433 | extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ; |
---|
434 | |
---|
435 | #else /* STDC */ |
---|
436 | extern char * rx_cache_malloc (); |
---|
437 | extern void rx_cache_free (); |
---|
438 | extern void rx_release_superset (); |
---|
439 | extern struct rx_superset * rx_superset_cons (); |
---|
440 | extern struct rx_superset * rx_superstate_eclosure_union (); |
---|
441 | extern struct rx_superstate * rx_superstate (); |
---|
442 | extern struct rx_inx * rx_handle_cache_miss (); |
---|
443 | |
---|
444 | #endif /* STDC */ |
---|
445 | |
---|
446 | #endif /* RXSUPERH */ |
---|