source: trunk/third/rx/rx/rxsuper.h @ 10430

Revision 10430, 16.0 KB checked in by ghudson, 27 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r10429, which included commits to RCS files with non-trunk default branches.
Line 
1/* classes: h_files */
2
3#ifndef RXSUPERH
4#define RXSUPERH
5
6/*      Copyright (C) 1995, 1996 Tom Lord
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Library General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this software; see the file COPYING.  If not, write to
20 * the Free Software Foundation, 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 */
23
24/*  lord        Sun May  7 12:40:17 1995        */
25
26
27
28#include "rxnfa.h"
29
30
31
32/* This begins the description of the superstate NFA.
33 *
34 * The superstate NFA corresponds to the NFA in these ways:
35 *
36 * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
37 *
38 * Superstate edges correspond to NFA paths.
39 *
40 * The superstate has no epsilon transitions;
41 * every edge has a character label, and a (possibly empty) side
42 * effect label.   The side effect label corresponds to a list of
43 * side effects that occur in the NFA.  These parts are referred
44 * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
45 *
46 * For a superstate edge EDGE starting in some superstate SUPER,
47 * the following is true (in pseudo-notation :-):
48 *
49 *       exists DEST in nfa_states s.t.
50 *         exists nfaEDGE in nfa_edges s.t.
51 *                 origin (nfaEDGE) == DEST
52 *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
53 *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
54 *                      sides_of_possible_future (PF) == superedge_sides (EDGE)
55 *
56 * also:
57 *
58 *      let SUPER2 := superedge_destination(EDGE)
59 *          nfa_states(SUPER2)
60 *           == union of all nfa state sets S s.t.
61 *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
62 *                             sides_of_possible_future (PF) == superedge_sides (EDGE)
63 *                          && S == dests_of_possible_future (PF) }
64 *
65 * Or in english, every superstate is a set of nfa states.  A given
66 * character and a superstate implies many transitions in the NFA --
67 * those that begin with an edge labeled with that character from a
68 * state in the set corresponding to the superstate.
69 *
70 * The destinations of those transitions each have a set of possible
71 * futures.  A possible future is a list of side effects and a set of
72 * destination NFA states.  Two sets of possible futures can be
73 * `merged' by combining all pairs of possible futures that have the
74 * same side effects.  A pair is combined by creating a new future
75 * with the same side effect but the union of the two destination sets.
76 * In this way, all the possible futures suggested by a superstate
77 * and a character can be merged into a set of possible futures where
78 * no two elements of the set have the same set of side effects.
79 *
80 * The destination of a possible future, being a set of NFA states,
81 * corresponds to a supernfa state.  So, the merged set of possible
82 * futures we just created can serve as a set of edges in the
83 * supernfa.
84 *
85 * The representation of the superstate nfa and the nfa is critical.
86 * The nfa has to be compact, but has to facilitate the rapid
87 * computation of missing superstates.  The superstate nfa has to
88 * be fast to interpret, lazilly constructed, and bounded in space.
89 *
90 * To facilitate interpretation, the superstate data structures are
91 * peppered with `instruction frames'.  There is an instruction set
92 * defined below which matchers using the supernfa must be able to
93 * interpret.
94 *
95 * We'd like to make it possible but not mandatory to use code
96 * addresses to represent instructions (c.f. gcc's computed goto).
97 * Therefore, we define an enumerated type of opcodes, and when
98 * writing one of these instructions into a data structure, use
99 * the opcode as an index into a table of instruction values.
100 *
101 * Below are the opcodes that occur in the superstate nfa.
102 *
103 * The descriptions of the opcodes refer to data structures that are
104 * described further below.
105 */
106
107enum rx_opcode
108{
109  /*
110   * BACKTRACK_POINT is invoked when a character transition in
111   * a superstate leads to more than one edge.  In that case,
112   * the edges have to be explored independently using a backtracking
113   * strategy.
114   *
115   * A BACKTRACK_POINT instruction is stored in a superstate's
116   * transition table for some character when it is known that that
117   * character crosses more than one edge.  On encountering this
118   * instruction, the matcher saves enough state to backtrack to this
119   * point later in the match.
120   */
121  rx_backtrack_point = 0,       /* data is (struct transition_class *) */
122
123  /*
124   * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
125   * There is one occurence of this instruction per rx_distinct_future.
126   * This instruction is skipped if a rx_distinct_future has no side effects.
127   */
128  rx_do_side_effects = rx_backtrack_point + 1,
129
130  /* data is (struct rx_distinct_future *) */
131
132  /*
133   * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
134   * destination superstate has been reclaimed (or was never built).
135   * It recomputes the destination superstate.
136   * RX_CACHE_MISS is also stored in a superstate transition table before
137   * any of its edges have been built.
138   */
139  rx_cache_miss = rx_do_side_effects + 1,
140  /* data is (struct rx_distinct_future *) */
141
142  /*
143   * RX_NEXT_CHAR is called to consume the next character and take the
144   * corresponding transition.  This is the only instruction that uses
145   * the DATA field of the instruction frame instead of DATA_2.
146   * The comments about rx_inx explain this further.
147   */
148  rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
149
150  /* RX_BACKTRACK indicates that a transition fails.  Don't
151   * confuse this with rx_backtrack_point.
152   */
153  rx_backtrack = rx_next_char + 1, /* no data */
154
155  /*
156   * RX_ERROR_INX is stored only in places that should never be executed.
157   */
158  rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
159
160  rx_num_instructions = rx_error_inx + 1
161};
162
163/* The heart of the matcher is a `word-code-interpreter'
164 * (like a byte-code interpreter, except that instructions
165 * are a full word wide).
166 *
167 * Instructions are not stored in a vector of code, instead,
168 * they are scattered throughout the data structures built
169 * by the regexp compiler and the matcher.  One word-code instruction,
170 * together with the arguments to that instruction, constitute
171 * an instruction frame (struct rx_inx).
172 *
173 * This structure type is padded by hand to a power of 2 because
174 * in one of the dominant cases, we dispatch by indexing a table
175 * of instruction frames.  If that indexing can be accomplished
176 * by just a shift of the index, we're happy.
177 *
178 * Instructions take at most one argument, but there are two
179 * slots in an instruction frame that might hold that argument.
180 * These are called data and data_2.  The data slot is only
181 * used for one instruction (RX_NEXT_CHAR).  For all other
182 * instructions, data should be set to 0.
183 *
184 * RX_NEXT_CHAR is the most important instruction by far.
185 * By reserving the data field for its exclusive use,
186 * instruction dispatch is sped up in that case.  There is
187 * no need to fetch both the instruction and the data,
188 * only the data is needed.  In other words, a `cycle' begins
189 * by fetching the field data.  If that is non-0, then it must
190 * be the destination state of a next_char transition, so
191 * make that value the current state, advance the match position
192 * by one character, and start a new cycle.  On the other hand,
193 * if data is 0, fetch the instruction and do a more complicated
194 * dispatch on that.
195 */
196
197struct rx_inx
198{
199  void * data;
200  void * data_2;
201  void * inx;
202  void * fnord;
203};
204
205#ifndef RX_TAIL_ARRAY
206#define RX_TAIL_ARRAY  1
207#endif
208
209/* A superstate corresponds to a set of nfa states.  Those sets are
210 * represented by STRUCT RX_SUPERSET.  The constructors
211 * guarantee that only one (shared) structure is created for a given set.
212 */
213struct rx_superset
214{
215  int refs;                     /* This is a reference counted structure. */
216
217  /* We keep these sets in a cache because (in an unpredictable way),
218   * the same set is often created again and again. 
219   *
220   * When an NFA is destroyed, some of the supersets for that NFA
221   * may still exist.  This can lead to false cache hits -- an apparent cache
222   * hit on a superset that properly belongs to an already free NFA.
223   *
224   * When a cache hit appears to occur, we will have in hand the
225   * nfa for which it may have happened.  Every nfa is given
226   * its own sequence number.  The cache is validated
227   * by comparing the nfa sequence number to this field:
228   */
229  int id;
230
231  struct rx_nfa_state * car;    /* May or may not be a valid addr. */
232  struct rx_superset * cdr;
233
234  /* If the corresponding superstate exists: */
235  struct rx_superstate * superstate;
236
237  /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
238  int is_final;
239
240  /* The OR of the corresponding fields of the constiuent nfa states. */
241  int has_cset_edges;
242
243
244  /* There is another bookkeeping problem.  It is expensive to
245   * compute the starting nfa state set for an nfa.  So, once computed,
246   * it is cached in the `struct rx'.
247   *
248   * But, the state set can be flushed from the superstate cache.
249   * When that happens, the cached value in the `struct rx' has
250   * to be flushed.
251   */
252  struct rx * starts_for;
253
254  /* This is used to link into a hash bucket so these objects can
255   * be `hash-consed'.
256   */
257  struct rx_hash_item hash_item;
258};
259
260#define rx_protect_superset(RX,CON) (++(CON)->refs)
261
262/* The terminology may be confusing (rename this structure?).
263 * Every character occurs in at most one rx_super_edge per super-state.
264 * But, that structure might have more than one option, indicating a point
265 * of non-determinism.
266 *
267 * In other words, this structure holds a list of superstate edges
268 * sharing a common starting state and character label.  The edges
269 * are in the field OPTIONS.  All superstate edges sharing the same
270 * starting state and character are in this list.
271 */
272struct rx_super_edge
273{
274  struct rx_super_edge *next;
275  struct rx_inx rx_backtrack_frame;
276  int cset_size;
277  rx_Bitset cset;
278  struct rx_distinct_future *options;
279};
280
281/* A superstate is a set of nfa states (RX_SUPERSET) along
282 * with a transition table.  Superstates are built on demand and reclaimed
283 * without warning.  To protect a superstate from this ghastly fate,
284 * use LOCK_SUPERSTATE.
285 */
286struct rx_superstate
287{
288  int rx_id;                    /* c.f. the id field of rx_superset */
289  int locks;                    /* protection from reclamation */
290
291  /* Within a superstate cache, all the superstates are kept in a big
292   * queue.  The tail of the queue is the state most likely to be
293   * reclaimed.  The *recyclable fields hold the queue position of
294   * this state.
295   */
296  struct rx_superstate * next_recyclable;
297  struct rx_superstate * prev_recyclable;
298
299  /* The supernfa edges that exist in the cache and that have
300   * this state as their destination are kept in this list:
301   */
302  struct rx_distinct_future * transition_refs;
303
304  /* The list of nfa states corresponding to this superstate: */
305  struct rx_superset * contents;
306
307  /* The list of edges in the cache beginning from this state. */
308  struct rx_super_edge * edges;
309
310  /* A tail of the recyclable queue is marked as semifree.  A semifree
311   * state has no incoming next_char transitions -- any transition
312   * into a semifree state causes a complex dispatch with the side
313   * effect of rescuing the state from its semifree state into a
314   * fully free state at the head of the queue.
315   *
316   * An alternative to this might be to make next_char more expensive,
317   * and to move a state to the head of the recyclable queue whenever
318   * it is entered.  That way, popular states would never be recycled.
319   *
320   * But unilaterally making next_char more expensive actually loses.
321   * So, incoming transitions are only made expensive for states near
322   * the tail of the recyclable queue.  The more cache contention
323   * there is, the more frequently a state will have to prove itself
324   * and be moved back to the front of the queue.  If there is less
325   * contention, then popular states just aggregate in the front of
326   * the queue and stay there.
327   */
328  int is_semifree;
329
330
331  /* This keeps track of the size of the transition table for this
332   * state.  There is a half-hearted attempt to support variable sized
333   * superstates.
334   */
335  int trans_size;
336
337  /* Indexed by characters... */
338  struct rx_inx transitions[RX_TAIL_ARRAY];
339};
340
341
342/* A list of distinct futures define the edges that leave from a
343 * given superstate on a given character.  c.f. rx_super_edge.
344 */
345struct rx_distinct_future
346{
347  struct rx_distinct_future * next_same_super_edge[2];
348  struct rx_distinct_future * next_same_dest;
349  struct rx_distinct_future * prev_same_dest;
350  struct rx_superstate * present;       /* source state */
351  struct rx_superstate * future;        /* destination state */
352  struct rx_super_edge * edge;
353
354
355  /* The future_frame holds the instruction that should be executed
356   * after all the side effects are done, when it is time to complete
357   * the transition to the next state.
358   *
359   * Normally this is a next_char instruction, but it may be a
360   * cache_miss instruction as well, depending on whether or not
361   * the superstate is in the cache and semifree.
362   *
363   * If this is the only future for a given superstate/char, and
364   * if there are no side effects to be performed, this frame is
365   * not used (directly) at all.  Instead, its contents are copied
366   * into the transition table of the starting state of this dist. future
367   * (a sort of goto elimination).
368   */
369  struct rx_inx future_frame;
370
371  struct rx_inx side_effects_frame;
372  struct rx_se_list * effects;
373};
374
375#define rx_lock_superstate(R,S)  ((S)->locks++)
376#define rx_unlock_superstate(R,S) (--(S)->locks)
377
378struct rx_cache;
379
380#ifdef __STDC__
381typedef void (*rx_morecore_fn)(struct rx_cache *);
382#else
383typedef void (*rx_morecore_fn)();
384#endif
385
386struct rx_cache
387{
388  struct rx_hash_rules superset_hash_rules;
389
390  struct rx_superstate * lru_superstate;
391  struct rx_superstate * semifree_superstate;
392
393  struct rx_superset * empty_superset;
394
395  int superstates;
396  int semifree_superstates;
397  int hits;
398  int misses;
399
400  int bytes_allowed;
401  int bytes_used;
402
403  int local_cset_size;
404  void ** instruction_table;
405
406  struct rx_hash superset_table;
407};
408
409#ifndef RX_DEFAULT_DFA_CACHE_SIZE
410/* This is an upper bound on the number of bytes that may (normally)
411 * be allocated for DFA states.  If this threshold would be exceeded,
412 * Rx tries to flush some DFA states from the cache.
413 *
414 * This value is used whenever the rx_default_cache is used (for example,
415 * with the Posix entry points).
416 */
417#define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
418#endif
419
420extern struct rx_cache * rx_default_cache;
421
422
423#ifdef __STDC__
424extern char * rx_cache_malloc (struct rx_cache * cache, int size);
425extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
426extern void rx_release_superset (struct rx *rx,
427                                 struct rx_superset *set);
428extern struct rx_superset * rx_superset_cons (struct rx * rx,
429                                              struct rx_nfa_state *car, struct rx_superset *cdr);
430extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
431extern struct rx_superstate * rx_superstate (struct rx *rx,
432                                             struct rx_superset *set);
433extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;
434
435#else /* STDC */
436extern char * rx_cache_malloc ();
437extern void rx_cache_free ();
438extern void rx_release_superset ();
439extern struct rx_superset * rx_superset_cons ();
440extern struct rx_superset * rx_superstate_eclosure_union ();
441extern struct rx_superstate * rx_superstate ();
442extern struct rx_inx * rx_handle_cache_miss ();
443
444#endif /* STDC */
445
446#endif  /* RXSUPERH */
Note: See TracBrowser for help on using the repository browser.