1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 This file is part of the librx library.
5 Librx is free software; you can redistribute it and/or modify it under
6 the terms of the GNU Library General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 Librx is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 You should have received a copy of the GNU Library General Public
16 License along with this software; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
20 /* NOTE!!! AIX is so losing it requires this to be the first thing in the
22 * Do not put ANYTHING before it!
24 #if !defined (__GNUC__) && defined (_AIX)
28 /* To make linux happy? */
37 const char *rx_version_string = "GNU Rx version 0.07.2";
47 #define isgraph(c) (isprint (c) && !isspace (c))
50 #define isblank(c) ((c) == ' ' || (c) == '\t')
53 #include <sys/types.h>
57 #define MAX(a, b) ((a) > (b) ? (a) : (b))
58 #define MIN(a, b) ((a) < (b) ? (a) : (b))
69 /* Emacs already defines alloca, sometimes. */
72 /* Make alloca work the best possible way. */
74 #define alloca __builtin_alloca
75 #else /* not __GNUC__ */
78 #else /* not __GNUC__ or HAVE_ALLOCA_H */
79 #ifndef _AIX /* Already did AIX, up at the top. */
82 #endif /* not HAVE_ALLOCA_H */
83 #endif /* not __GNUC__ */
85 #endif /* not alloca */
87 /* Memory management and stuff for emacs. */
90 #define remalloc(M, S) (M ? realloc (M, S) : malloc (S))
93 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
94 * use `alloca' instead of `malloc' for the backtracking stack.
96 * Emacs will die miserably if we don't do this.
100 #define REGEX_ALLOCATE malloc
101 #else /* not REGEX_MALLOC */
102 #define REGEX_ALLOCATE alloca
103 #endif /* not REGEX_MALLOC */
106 #ifdef RX_WANT_RX_DEFS
107 #define RX_DECL extern
110 #define RX_WANT_RX_DEFS
111 #define RX_DECL static
112 #define RX_DEF_QUAL static
116 #define RX_DECL RX_DEF_QUAL
122 extern char *re_syntax_table;
123 #else /* not SYNTAX_TABLE */
125 RX_DECL char re_syntax_table[CHAR_SET_SIZE];
129 init_syntax_once (void)
141 bzero (re_syntax_table, sizeof re_syntax_table);
143 for (c = 'a'; c <= 'z'; c++)
144 re_syntax_table[c] = Sword;
146 for (c = 'A'; c <= 'Z'; c++)
147 re_syntax_table[c] = Sword;
149 for (c = '0'; c <= '9'; c++)
150 re_syntax_table[c] = Sword;
152 re_syntax_table['_'] = Sword;
156 #endif /* not SYNTAX_TABLE */
157 #endif /* not emacs */
159 /* Compile with `-DRX_DEBUG' and use the following flags.
162 * rx_debug - print information as a regexp is compiled
163 * rx_debug_trace - print information as a regexp is executed
168 int rx_debug_compile = 0;
169 int rx_debug_trace = 0;
170 static struct re_pattern_buffer * dbug_rxb = 0;
173 typedef void (*side_effect_printer) (struct rx *, void *, FILE *);
175 typedef void (*side_effect_printer) ();
179 static void print_cset (struct rx *rx, rx_Bitset cset, FILE * fp);
181 static void print_cset ();
186 print_rexp (struct rx *rx,
187 struct rexp_node *node, int depth,
188 side_effect_printer seprint, FILE * fp)
191 print_rexp (rx, node, depth, seprint, fp)
193 struct rexp_node *node;
195 side_effect_printer seprint;
207 fprintf (fp, "%*s", depth, "");
208 print_cset (rx, node->params.cset, fp);
215 fprintf (fp, "%*s%s\n", depth, "",
216 node->type == r_opt ? "opt" : "star");
217 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
221 fprintf (fp, "%*s2phase star\n", depth, "");
222 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
223 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
229 fprintf (fp, "%*s%s\n", depth, "",
230 node->type == r_alternate ? "alt" : "concat");
231 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
232 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
235 fprintf (fp, "%*sSide effect: ", depth, "");
236 seprint (rx, node->params.side_effect, fp);
244 print_nfa (struct rx * rx,
245 struct rx_nfa_state * n,
246 side_effect_printer seprint, FILE * fp)
249 print_nfa (rx, n, seprint, fp)
251 struct rx_nfa_state * n;
252 side_effect_printer seprint;
258 struct rx_nfa_edge *e = n->edges;
259 struct rx_possible_future *ec = n->futures;
260 fprintf (fp, "node %d %s\n", n->id,
261 n->is_final ? "final" : (n->is_start ? "start" : ""));
264 fprintf (fp, " edge to %d, ", e->dest->id);
268 fprintf (fp, "epsilon\n");
271 fprintf (fp, "side effect ");
272 seprint (rx, e->params.side_effect, fp);
276 fprintf (fp, "cset ");
277 print_cset (rx, e->params.cset, fp);
287 struct rx_nfa_state_set * s;
288 struct rx_se_list * l;
289 fprintf (fp, " eclosure to {");
290 for (s = ec->destset; s; s = s->cdr)
291 fprintf (fp, "%d ", s->car->id);
293 for (l = ec->effects; l; l = l->cdr)
295 seprint (rx, l->car, fp);
305 static char * efnames [] =
321 "re_se_notwordbound",
328 static char * efnames2[] =
339 static char * inx_names[] =
341 "rx_backtrack_point",
342 "rx_do_side_effects",
347 "rx_num_instructions"
353 re_seprint (struct rx * rx, void * effect, FILE * fp)
356 re_seprint (rx, effect, fp)
363 fputs (efnames[-(int)effect], fp);
366 struct re_se_params * p = &dbug_rxb->se_params[(int)effect];
367 fprintf (fp, "%s(%d,%d)", efnames2[p->se], p->op1, p->op2);
370 fprintf (fp, "[complex op # %d]", (int)effect);
374 /* These are so the regex.c regression tests will compile. */
376 print_compiled_pattern (rxb)
377 struct re_pattern_buffer * rxb;
387 #endif /* RX_DEBUG */
391 /* This page: Bitsets. Completely unintersting. */
395 rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b)
398 rx_bitset_is_equal (size, a, b)
408 for (x = rx_bitset_numb_subsets(size) - 1; a[x] == b[x]; --x)
412 return !x && s == a[0];
417 rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b)
420 rx_bitset_is_subset (size, a, b)
426 int x = rx_bitset_numb_subsets(size) - 1;
427 while (x-- && (a[x] & b[x]) == a[x]);
434 rx_bitset_empty (int size, rx_Bitset set)
437 rx_bitset_empty (size, set)
443 RX_subset s = set[0];
445 for (x = rx_bitset_numb_subsets(size) - 1; !set[x]; --x)
453 rx_bitset_null (int size, rx_Bitset b)
456 rx_bitset_null (size, b)
461 bzero (b, rx_sizeof_bitset(size));
467 rx_bitset_universe (int size, rx_Bitset b)
470 rx_bitset_universe (size, b)
475 int x = rx_bitset_numb_subsets (size);
477 *b++ = ~(RX_subset)0;
483 rx_bitset_complement (int size, rx_Bitset b)
486 rx_bitset_complement (size, b)
491 int x = rx_bitset_numb_subsets (size);
502 rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b)
505 rx_bitset_assign (size, a, b)
512 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
519 rx_bitset_union (int size, rx_Bitset a, rx_Bitset b)
522 rx_bitset_union (size, a, b)
529 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
536 rx_bitset_intersection (int size,
537 rx_Bitset a, rx_Bitset b)
540 rx_bitset_intersection (size, a, b)
547 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
554 rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b)
557 rx_bitset_difference (size, a, b)
564 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
571 rx_bitset_revdifference (int size,
572 rx_Bitset a, rx_Bitset b)
575 rx_bitset_revdifference (size, a, b)
582 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
588 rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b)
591 rx_bitset_xor (size, a, b)
598 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
604 RX_DECL unsigned long
605 rx_bitset_hash (int size, rx_Bitset b)
607 RX_DECL unsigned long
608 rx_bitset_hash (size, b)
614 unsigned long hash = (unsigned long)rx_bitset_hash;
616 for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x)
617 hash ^= rx_bitset_subset_val(b, x);
623 RX_DECL RX_subset rx_subset_singletons [RX_subset_bits] =
663 print_cset (struct rx *rx, rx_Bitset cset, FILE * fp)
666 print_cset (rx, cset, fp)
674 for (x = 0; x < rx->local_cset_size; ++x)
675 if (RX_bitset_member (cset, x))
680 fprintf (fp, "\\0%o ", x);
685 #endif /* RX_DEBUG */
689 static unsigned long rx_hash_masks[4] =
700 RX_DECL struct rx_hash_item *
701 rx_hash_find (struct rx_hash * table,
704 struct rx_hash_rules * rules)
706 RX_DECL struct rx_hash_item *
707 rx_hash_find (table, hash, value, rules)
708 struct rx_hash * table;
711 struct rx_hash_rules * rules;
714 rx_hash_eq eq = rules->eq;
716 long mask = rx_hash_masks [0];
717 int bucket = (hash & mask) % 13;
719 while (table->children [bucket])
721 table = table->children [bucket];
723 mask = rx_hash_masks[maskc];
724 bucket = (hash & mask) % 13;
728 struct rx_hash_item * it = table->buckets[bucket];
730 if (eq (it->data, value))
733 it = it->next_same_hash;
741 RX_DECL struct rx_hash_item *
742 rx_hash_store (struct rx_hash * table,
745 struct rx_hash_rules * rules)
747 RX_DECL struct rx_hash_item *
748 rx_hash_store (table, hash, value, rules)
749 struct rx_hash * table;
752 struct rx_hash_rules * rules;
755 rx_hash_eq eq = rules->eq;
757 long mask = rx_hash_masks[0];
758 int bucket = (hash & mask) % 13;
761 while (table->children [bucket])
763 table = table->children [bucket];
765 mask = rx_hash_masks[maskc];
766 bucket = (hash & mask) % 13;
771 struct rx_hash_item * it = table->buckets[bucket];
773 if (eq (it->data, value))
776 it = it->next_same_hash;
781 && (table->bucket_size [bucket] >= 4))
783 struct rx_hash * newtab = ((struct rx_hash *)
784 rules->hash_alloc (rules));
787 bzero (newtab, sizeof (*newtab));
788 newtab->parent = table;
790 struct rx_hash_item * them = table->buckets[bucket];
791 unsigned long newmask = rx_hash_masks[maskc + 1];
794 struct rx_hash_item * save = them->next_same_hash;
795 int new_buck = (them->hash & newmask) % 13;
796 them->next_same_hash = newtab->buckets[new_buck];
797 newtab->buckets[new_buck] = them;
798 them->table = newtab;
800 ++newtab->bucket_size[new_buck];
803 table->refs = (table->refs - table->bucket_size[bucket] + 1);
804 table->bucket_size[bucket] = 0;
805 table->buckets[bucket] = 0;
806 table->children[bucket] = newtab;
808 bucket = (hash & newmask) % 13;
814 struct rx_hash_item * it = ((struct rx_hash_item *)
815 rules->hash_item_alloc (rules, value));
820 /* DATA and BINDING are to be set in hash_item_alloc */
821 it->next_same_hash = table->buckets [bucket];
822 table->buckets[bucket] = it;
823 ++table->bucket_size [bucket];
832 rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules)
835 rx_hash_free (it, rules)
836 struct rx_hash_item * it;
837 struct rx_hash_rules * rules;
842 struct rx_hash * table = it->table;
843 unsigned long hash = it->hash;
844 int depth = (table->parent
845 ? (table->parent->parent
846 ? (table->parent->parent->parent
851 int bucket = (hash & rx_hash_masks [depth]) % 13;
852 struct rx_hash_item ** pos = &table->buckets [bucket];
855 pos = &(*pos)->next_same_hash;
856 *pos = it->next_same_hash;
857 rules->free_hash_item (it, rules);
858 --table->bucket_size[bucket];
860 while (!table->refs && depth)
862 struct rx_hash * save = table;
863 table = table->parent;
865 bucket = (hash & rx_hash_masks [depth]) % 13;
867 table->children[bucket] = 0;
868 rules->free_hash (save, rules);
875 rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
876 struct rx_hash_rules * rules)
879 rx_free_hash_table (tab, freefn, rules)
880 struct rx_hash * tab;
881 rx_hash_freefn freefn;
882 struct rx_hash_rules * rules;
887 for (x = 0; x < 13; ++x)
888 if (tab->children[x])
890 rx_free_hash_table (tab->children[x], freefn, rules);
891 rules->free_hash (tab->children[x], rules);
895 struct rx_hash_item * them = tab->buckets[x];
898 struct rx_hash_item * that = them;
899 them = that->next_same_hash;
901 rules->free_hash_item (that, rules);
908 /* Utilities for manipulating bitset represntations of characters sets. */
912 rx_cset (struct rx *rx)
919 rx_Bitset b = (rx_Bitset) malloc (rx_sizeof_bitset (rx->local_cset_size));
921 rx_bitset_null (rx->local_cset_size, b);
928 rx_copy_cset (struct rx *rx, rx_Bitset a)
936 rx_Bitset cs = rx_cset (rx);
939 rx_bitset_union (rx->local_cset_size, cs, a);
947 rx_free_cset (struct rx * rx, rx_Bitset c)
960 /* Hash table memory allocation policy for the regexp compiler */
963 static struct rx_hash *
964 compiler_hash_alloc (struct rx_hash_rules * rules)
966 static struct rx_hash *
967 compiler_hash_alloc (rules)
968 struct rx_hash_rules * rules;
971 return (struct rx_hash *)malloc (sizeof (struct rx_hash));
976 static struct rx_hash_item *
977 compiler_hash_item_alloc (struct rx_hash_rules * rules, void * value)
979 static struct rx_hash_item *
980 compiler_hash_item_alloc (rules, value)
981 struct rx_hash_rules * rules;
985 struct rx_hash_item * it;
986 it = (struct rx_hash_item *)malloc (sizeof (*it));
998 compiler_free_hash (struct rx_hash * tab,
999 struct rx_hash_rules * rules)
1002 compiler_free_hash (tab, rules)
1003 struct rx_hash * tab;
1004 struct rx_hash_rules * rules;
1013 compiler_free_hash_item (struct rx_hash_item * item,
1014 struct rx_hash_rules * rules)
1017 compiler_free_hash_item (item, rules)
1018 struct rx_hash_item * item;
1019 struct rx_hash_rules * rules;
1022 free ((char *)item);
1026 /* This page: REXP_NODE (expression tree) structures. */
1029 RX_DECL struct rexp_node *
1030 rexp_node (struct rx *rx,
1031 enum rexp_node_type type)
1033 RX_DECL struct rexp_node *
1034 rexp_node (rx, type)
1036 enum rexp_node_type type;
1039 struct rexp_node *n;
1041 n = (struct rexp_node *)malloc (sizeof (*n));
1042 bzero (n, sizeof (*n));
1049 /* free_rexp_node assumes that the bitset passed to rx_mk_r_cset
1050 * can be freed using rx_free_cset.
1053 RX_DECL struct rexp_node *
1054 rx_mk_r_cset (struct rx * rx,
1057 RX_DECL struct rexp_node *
1058 rx_mk_r_cset (rx, b)
1063 struct rexp_node * n = rexp_node (rx, r_cset);
1071 RX_DECL struct rexp_node *
1072 rx_mk_r_concat (struct rx * rx,
1073 struct rexp_node * a,
1074 struct rexp_node * b)
1076 RX_DECL struct rexp_node *
1077 rx_mk_r_concat (rx, a, b)
1079 struct rexp_node * a;
1080 struct rexp_node * b;
1083 struct rexp_node * n = rexp_node (rx, r_concat);
1086 n->params.pair.left = a;
1087 n->params.pair.right = b;
1094 RX_DECL struct rexp_node *
1095 rx_mk_r_alternate (struct rx * rx,
1096 struct rexp_node * a,
1097 struct rexp_node * b)
1099 RX_DECL struct rexp_node *
1100 rx_mk_r_alternate (rx, a, b)
1102 struct rexp_node * a;
1103 struct rexp_node * b;
1106 struct rexp_node * n = rexp_node (rx, r_alternate);
1109 n->params.pair.left = a;
1110 n->params.pair.right = b;
1117 RX_DECL struct rexp_node *
1118 rx_mk_r_opt (struct rx * rx,
1119 struct rexp_node * a)
1121 RX_DECL struct rexp_node *
1124 struct rexp_node * a;
1127 struct rexp_node * n = rexp_node (rx, r_opt);
1130 n->params.pair.left = a;
1131 n->params.pair.right = 0;
1138 RX_DECL struct rexp_node *
1139 rx_mk_r_star (struct rx * rx,
1140 struct rexp_node * a)
1142 RX_DECL struct rexp_node *
1143 rx_mk_r_star (rx, a)
1145 struct rexp_node * a;
1148 struct rexp_node * n = rexp_node (rx, r_star);
1151 n->params.pair.left = a;
1152 n->params.pair.right = 0;
1159 RX_DECL struct rexp_node *
1160 rx_mk_r_2phase_star (struct rx * rx,
1161 struct rexp_node * a,
1162 struct rexp_node * b)
1164 RX_DECL struct rexp_node *
1165 rx_mk_r_2phase_star (rx, a, b)
1167 struct rexp_node * a;
1168 struct rexp_node * b;
1171 struct rexp_node * n = rexp_node (rx, r_2phase_star);
1174 n->params.pair.left = a;
1175 n->params.pair.right = b;
1182 RX_DECL struct rexp_node *
1183 rx_mk_r_side_effect (struct rx * rx,
1186 RX_DECL struct rexp_node *
1187 rx_mk_r_side_effect (rx, a)
1192 struct rexp_node * n = rexp_node (rx, r_side_effect);
1195 n->params.side_effect = a;
1196 n->params.pair.right = 0;
1203 RX_DECL struct rexp_node *
1204 rx_mk_r_data (struct rx * rx,
1207 RX_DECL struct rexp_node *
1208 rx_mk_r_data (rx, a)
1213 struct rexp_node * n = rexp_node (rx, r_data);
1216 n->params.pair.left = a;
1217 n->params.pair.right = 0;
1225 rx_free_rexp (struct rx * rx, struct rexp_node * node)
1228 rx_free_rexp (rx, node)
1230 struct rexp_node * node;
1238 if (node->params.cset)
1239 rx_free_cset (rx, node->params.cset);
1249 rx_free_rexp (rx, node->params.pair.left);
1250 rx_free_rexp (rx, node->params.pair.right);
1254 /* This shouldn't occur. */
1257 free ((char *)node);
1263 RX_DECL struct rexp_node *
1264 rx_copy_rexp (struct rx *rx,
1265 struct rexp_node *node)
1267 RX_DECL struct rexp_node *
1268 rx_copy_rexp (rx, node)
1270 struct rexp_node *node;
1277 struct rexp_node *n = rexp_node (rx, node->type);
1283 n->params.cset = rx_copy_cset (rx, node->params.cset);
1284 if (!n->params.cset)
1286 rx_free_rexp (rx, n);
1292 n->params.side_effect = node->params.side_effect;
1300 n->params.pair.left =
1301 rx_copy_rexp (rx, node->params.pair.left);
1302 n->params.pair.right =
1303 rx_copy_rexp (rx, node->params.pair.right);
1304 if ( (node->params.pair.left && !n->params.pair.left)
1305 || (node->params.pair.right && !n->params.pair.right))
1307 rx_free_rexp (rx, n);
1312 /* shouldn't happen */
1321 /* This page: functions to build and destroy graphs that describe nfa's */
1323 /* Constructs a new nfa node. */
1325 RX_DECL struct rx_nfa_state *
1326 rx_nfa_state (struct rx *rx)
1328 RX_DECL struct rx_nfa_state *
1333 struct rx_nfa_state * n = (struct rx_nfa_state *)malloc (sizeof (*n));
1336 bzero (n, sizeof (*n));
1337 n->next = rx->nfa_states;
1345 rx_free_nfa_state (struct rx_nfa_state * n)
1348 rx_free_nfa_state (n)
1349 struct rx_nfa_state * n;
1356 /* This looks up an nfa node, given a numeric id. Numeric id's are
1357 * assigned after the nfa has been built.
1360 RX_DECL struct rx_nfa_state *
1361 rx_id_to_nfa_state (struct rx * rx,
1364 RX_DECL struct rx_nfa_state *
1365 rx_id_to_nfa_state (rx, id)
1370 struct rx_nfa_state * n;
1371 for (n = rx->nfa_states; n; n = n->next)
1378 /* This adds an edge between two nodes, but doesn't initialize the
1383 RX_DECL struct rx_nfa_edge *
1384 rx_nfa_edge (struct rx *rx,
1385 enum rx_nfa_etype type,
1386 struct rx_nfa_state *start,
1387 struct rx_nfa_state *dest)
1389 RX_DECL struct rx_nfa_edge *
1390 rx_nfa_edge (rx, type, start, dest)
1392 enum rx_nfa_etype type;
1393 struct rx_nfa_state *start;
1394 struct rx_nfa_state *dest;
1397 struct rx_nfa_edge *e;
1398 e = (struct rx_nfa_edge *)malloc (sizeof (*e));
1401 e->next = start->edges;
1411 rx_free_nfa_edge (struct rx_nfa_edge * e)
1414 rx_free_nfa_edge (e)
1415 struct rx_nfa_edge * e;
1422 /* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
1423 * of an NFA. These are added to an nfa automaticly by eclose_nfa.
1427 static struct rx_possible_future *
1428 rx_possible_future (struct rx * rx,
1429 struct rx_se_list * effects)
1431 static struct rx_possible_future *
1432 rx_possible_future (rx, effects)
1434 struct rx_se_list * effects;
1437 struct rx_possible_future *ec;
1438 ec = (struct rx_possible_future *) malloc (sizeof (*ec));
1443 ec->effects = effects;
1450 rx_free_possible_future (struct rx_possible_future * pf)
1453 rx_free_possible_future (pf)
1454 struct rx_possible_future * pf;
1463 rx_free_nfa (struct rx *rx)
1470 while (rx->nfa_states)
1472 while (rx->nfa_states->edges)
1474 switch (rx->nfa_states->edges->type)
1477 rx_free_cset (rx, rx->nfa_states->edges->params.cset);
1483 struct rx_nfa_edge * e;
1484 e = rx->nfa_states->edges;
1485 rx->nfa_states->edges = rx->nfa_states->edges->next;
1486 rx_free_nfa_edge (e);
1488 } /* while (rx->nfa_states->edges) */
1490 /* Iterate over the partial epsilon closures of rx->nfa_states */
1491 struct rx_possible_future * pf = rx->nfa_states->futures;
1494 struct rx_possible_future * pft = pf;
1496 rx_free_possible_future (pft);
1500 struct rx_nfa_state *n;
1502 rx->nfa_states = rx->nfa_states->next;
1503 rx_free_nfa_state (n);
1510 /* This page: translating a pattern expression into an nfa and doing the
1511 * static part of the nfa->super-nfa translation.
1514 /* This is the thompson regexp->nfa algorithm.
1515 * It is modified to allow for `side-effect epsilons.' Those are
1516 * edges that are taken whenever a similar epsilon edge would be,
1517 * but which imply that some side effect occurs when the edge
1520 * Side effects are used to model parts of the pattern langauge
1521 * that are not regular (in the formal sense).
1526 rx_build_nfa (struct rx *rx,
1527 struct rexp_node *rexp,
1528 struct rx_nfa_state **start,
1529 struct rx_nfa_state **end)
1532 rx_build_nfa (rx, rexp, start, end)
1534 struct rexp_node *rexp;
1535 struct rx_nfa_state **start;
1536 struct rx_nfa_state **end;
1539 struct rx_nfa_edge *edge;
1541 /* Start & end nodes may have been allocated by the caller. */
1542 *start = *start ? *start : rx_nfa_state (rx);
1553 *end = *end ? *end : rx_nfa_state (rx);
1557 rx_free_nfa_state (*start);
1567 edge = rx_nfa_edge (rx, ne_cset, *start, *end);
1570 edge->params.cset = rx_copy_cset (rx, rexp->params.cset);
1571 if (!edge->params.cset)
1573 rx_free_nfa_edge (edge);
1579 return (rx_build_nfa (rx, rexp->params.pair.left, start, end)
1580 && rx_nfa_edge (rx, ne_epsilon, *start, *end));
1584 struct rx_nfa_state * star_start = 0;
1585 struct rx_nfa_state * star_end = 0;
1586 return (rx_build_nfa (rx, rexp->params.pair.left,
1587 &star_start, &star_end)
1590 && rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
1591 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1592 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1594 && rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
1599 struct rx_nfa_state * star_start = 0;
1600 struct rx_nfa_state * star_end = 0;
1601 struct rx_nfa_state * loop_exp_start = 0;
1602 struct rx_nfa_state * loop_exp_end = 0;
1604 return (rx_build_nfa (rx, rexp->params.pair.left,
1605 &star_start, &star_end)
1606 && rx_build_nfa (rx, rexp->params.pair.right,
1607 &loop_exp_start, &loop_exp_end)
1612 && rx_nfa_edge (rx, ne_epsilon, star_start, *end)
1613 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1614 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1616 && rx_nfa_edge (rx, ne_epsilon, star_end, loop_exp_start)
1617 && rx_nfa_edge (rx, ne_epsilon, loop_exp_end, star_start));
1623 struct rx_nfa_state *shared = 0;
1625 (rx_build_nfa (rx, rexp->params.pair.left, start, &shared)
1626 && rx_build_nfa (rx, rexp->params.pair.right, &shared, end));
1631 struct rx_nfa_state *ls = 0;
1632 struct rx_nfa_state *le = 0;
1633 struct rx_nfa_state *rs = 0;
1634 struct rx_nfa_state *re = 0;
1635 return (rx_build_nfa (rx, rexp->params.pair.left, &ls, &le)
1636 && rx_build_nfa (rx, rexp->params.pair.right, &rs, &re)
1637 && rx_nfa_edge (rx, ne_epsilon, *start, ls)
1638 && rx_nfa_edge (rx, ne_epsilon, *start, rs)
1639 && rx_nfa_edge (rx, ne_epsilon, le, *end)
1640 && rx_nfa_edge (rx, ne_epsilon, re, *end));
1644 edge = rx_nfa_edge (rx, ne_side_effect, *start, *end);
1647 edge->params.side_effect = rexp->params.side_effect;
1651 /* this should never happen */
1656 /* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon
1657 * transitions. Only these nodes can occur in super-states.
1658 * All nodes are given an integer id.
1659 * The id is non-negative if the node has non-epsilon out-transitions, negative
1660 * otherwise (this is because we want the non-negative ids to be used as
1661 * array indexes in a few places).
1666 rx_name_nfa_states (struct rx *rx)
1669 rx_name_nfa_states (rx)
1673 struct rx_nfa_state *n = rx->nfa_states;
1680 struct rx_nfa_edge *e = n->edges;
1683 n->eclosure_needed = 1;
1690 case ne_side_effect:
1694 n->id = rx->nodec++;
1696 struct rx_nfa_edge *from_n = n->edges;
1699 from_n->dest->eclosure_needed = 1;
1700 from_n = from_n->next;
1707 n->id = rx->epsnodec--;
1711 rx->epsnodec = -rx->epsnodec;
1715 /* This page: data structures for the static part of the nfa->supernfa
1718 * There are side effect lists -- lists of side effects occuring
1719 * along an uninterrupted, acyclic path of side-effect epsilon edges.
1720 * Such paths are collapsed to single edges in the course of computing
1721 * epsilon closures. Such single edges are labled with a list of all
1722 * the side effects entailed in crossing them. Like lists of side
1723 * effects are made == by the constructors below.
1725 * There are also nfa state sets. These are used to hold a list of all
1726 * states reachable from a starting state for a given type of transition
1727 * and side effect list. These are also hash-consed.
1730 /* The next several functions compare, construct, etc. lists of side
1731 * effects. See ECLOSE_NFA (below) for details.
1734 /* Ordering of rx_se_list
1735 * (-1, 0, 1 return value convention).
1740 se_list_cmp (void * va, void * vb)
1743 se_list_cmp (va, vb)
1748 struct rx_se_list * a = (struct rx_se_list *)va;
1749 struct rx_se_list * b = (struct rx_se_list *)vb;
1757 : ((long)a->car < (long)b->car
1759 : ((long)a->car > (long)b->car
1761 : se_list_cmp ((void *)a->cdr, (void *)b->cdr))))));
1767 se_list_equal (void * va, void * vb)
1770 se_list_equal (va, vb)
1775 return !(se_list_cmp (va, vb));
1778 static struct rx_hash_rules se_list_hash_rules =
1781 compiler_hash_alloc,
1783 compiler_hash_item_alloc,
1784 compiler_free_hash_item
1789 static struct rx_se_list *
1790 side_effect_cons (struct rx * rx,
1791 void * se, struct rx_se_list * list)
1793 static struct rx_se_list *
1794 side_effect_cons (rx, se, list)
1797 struct rx_se_list * list;
1800 struct rx_se_list * l;
1801 l = ((struct rx_se_list *) malloc (sizeof (*l)));
1811 static struct rx_se_list *
1812 hash_cons_se_prog (struct rx * rx,
1813 struct rx_hash * memo,
1814 void * car, struct rx_se_list * cdr)
1816 static struct rx_se_list *
1817 hash_cons_se_prog (rx, memo, car, cdr)
1819 struct rx_hash * memo;
1821 struct rx_se_list * cdr;
1824 long hash = (long)car ^ (long)cdr;
1825 struct rx_se_list template;
1830 struct rx_hash_item * it = rx_hash_store (memo, hash,
1832 &se_list_hash_rules);
1835 if (it->data == (void *)&template)
1837 struct rx_se_list * consed;
1838 consed = (struct rx_se_list *) malloc (sizeof (*consed));
1840 it->data = (void *)consed;
1842 return (struct rx_se_list *)it->data;
1848 static struct rx_se_list *
1849 hash_se_prog (struct rx * rx, struct rx_hash * memo, struct rx_se_list * prog)
1851 static struct rx_se_list *
1852 hash_se_prog (rx, memo, prog)
1854 struct rx_hash * memo;
1855 struct rx_se_list * prog;
1858 struct rx_se_list * answer = 0;
1861 answer = hash_cons_se_prog (rx, memo, prog->car, answer);
1871 nfa_set_cmp (void * va, void * vb)
1874 nfa_set_cmp (va, vb)
1879 struct rx_nfa_state_set * a = (struct rx_nfa_state_set *)va;
1880 struct rx_nfa_state_set * b = (struct rx_nfa_state_set *)vb;
1888 : (a->car->id < b->car->id
1890 : (a->car->id > b->car->id
1892 : nfa_set_cmp ((void *)a->cdr, (void *)b->cdr))))));
1897 nfa_set_equal (void * va, void * vb)
1900 nfa_set_equal (va, vb)
1905 return !nfa_set_cmp (va, vb);
1908 static struct rx_hash_rules nfa_set_hash_rules =
1911 compiler_hash_alloc,
1913 compiler_hash_item_alloc,
1914 compiler_free_hash_item
1919 static struct rx_nfa_state_set *
1920 nfa_set_cons (struct rx * rx,
1921 struct rx_hash * memo, struct rx_nfa_state * state,
1922 struct rx_nfa_state_set * set)
1924 static struct rx_nfa_state_set *
1925 nfa_set_cons (rx, memo, state, set)
1927 struct rx_hash * memo;
1928 struct rx_nfa_state * state;
1929 struct rx_nfa_state_set * set;
1932 struct rx_nfa_state_set template;
1933 struct rx_hash_item * node;
1934 template.car = state;
1936 node = rx_hash_store (memo,
1937 (((long)state) >> 8) ^ (long)set,
1938 &template, &nfa_set_hash_rules);
1941 if (node->data == &template)
1943 struct rx_nfa_state_set * l;
1944 l = (struct rx_nfa_state_set *) malloc (sizeof (*l));
1945 node->data = (void *) l;
1950 return (struct rx_nfa_state_set *)node->data;
1955 static struct rx_nfa_state_set *
1956 nfa_set_enjoin (struct rx * rx,
1957 struct rx_hash * memo, struct rx_nfa_state * state,
1958 struct rx_nfa_state_set * set)
1960 static struct rx_nfa_state_set *
1961 nfa_set_enjoin (rx, memo, state, set)
1963 struct rx_hash * memo;
1964 struct rx_nfa_state * state;
1965 struct rx_nfa_state_set * set;
1968 if (!set || state->id < set->car->id)
1969 return nfa_set_cons (rx, memo, state, set);
1970 if (state->id == set->car->id)
1974 struct rx_nfa_state_set * newcdr
1975 = nfa_set_enjoin (rx, memo, state, set->cdr);
1976 if (newcdr != set->cdr)
1977 set = nfa_set_cons (rx, memo, set->car, newcdr);
1984 /* This page: computing epsilon closures. The closures aren't total.
1985 * Each node's closures are partitioned according to the side effects entailed
1986 * along the epsilon edges. Return true on success.
1991 struct rx_se_list *prog_backwards;
1997 eclose_node (struct rx *rx, struct rx_nfa_state *outnode,
1998 struct rx_nfa_state *node, struct eclose_frame *frame)
2001 eclose_node (rx, outnode, node, frame)
2003 struct rx_nfa_state *outnode;
2004 struct rx_nfa_state *node;
2005 struct eclose_frame *frame;
2008 struct rx_nfa_edge *e = node->edges;
2010 /* For each node, we follow all epsilon paths to build the closure.
2011 * The closure omits nodes that have only epsilon edges.
2012 * The closure is split into partial closures -- all the states in
2013 * a partial closure are reached by crossing the same list of
2014 * of side effects (though not necessarily the same path).
2020 if (node->id >= 0 || node->is_final)
2022 struct rx_possible_future **ec;
2023 struct rx_se_list * prog_in_order
2024 = ((struct rx_se_list *)hash_se_prog (rx,
2026 frame->prog_backwards));
2029 ec = &outnode->futures;
2033 cmp = se_list_cmp ((void *)(*ec)->effects, (void *)prog_in_order);
2038 if (!*ec || (cmp < 0))
2040 struct rx_possible_future * saved = *ec;
2041 *ec = rx_possible_future (rx, prog_in_order);
2042 (*ec)->next = saved;
2048 (*ec)->destset = nfa_set_enjoin (rx, &rx->set_list_memo,
2049 node, (*ec)->destset);
2050 if (!(*ec)->destset)
2060 if (!eclose_node (rx, outnode, e->dest, frame))
2063 case ne_side_effect:
2065 frame->prog_backwards = side_effect_cons (rx,
2066 e->params.side_effect,
2067 frame->prog_backwards);
2068 if (!frame->prog_backwards)
2070 if (!eclose_node (rx, outnode, e->dest, frame))
2073 struct rx_se_list * dying = frame->prog_backwards;
2074 frame->prog_backwards = frame->prog_backwards->cdr;
2075 free ((char *)dying);
2091 rx_eclose_nfa (struct rx *rx)
2098 struct rx_nfa_state *n = rx->nfa_states;
2099 struct eclose_frame frame;
2100 static int rx_id = 0;
2102 frame.prog_backwards = 0;
2103 rx->rx_id = rx_id++;
2104 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2105 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2109 if (n->eclosure_needed && !eclose_node (rx, n, n, &frame))
2111 /* clear_marks (rx); */
2118 /* This deletes epsilon edges from an NFA. After running eclose_node,
2119 * we have no more need for these edges. They are removed to simplify
2120 * further operations on the NFA.
2125 rx_delete_epsilon_transitions (struct rx *rx)
2128 rx_delete_epsilon_transitions (rx)
2132 struct rx_nfa_state *n = rx->nfa_states;
2133 struct rx_nfa_edge **e;
2140 struct rx_nfa_edge *t;
2144 case ne_side_effect:
2147 rx_free_nfa_edge (t);
2160 /* This page: storing the nfa in a contiguous region of memory for
2161 * subsequent conversion to a super-nfa.
2164 /* This is for qsort on an array of nfa_states. The order
2165 * is based on state ids and goes
2166 * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0)
2167 * This way, positive ids double as array indices.
2172 nfacmp (void * va, void * vb)
2180 struct rx_nfa_state **a = (struct rx_nfa_state **)va;
2181 struct rx_nfa_state **b = (struct rx_nfa_state **)vb;
2182 return (*a == *b /* &&&& 3.18 */
2184 : (((*a)->id < 0) == ((*b)->id < 0)
2185 ? (((*a)->id < (*b)->id) ? -1 : 1)
2192 count_hash_nodes (struct rx_hash * st)
2195 count_hash_nodes (st)
2196 struct rx_hash * st;
2201 for (x = 0; x < 13; ++x)
2202 count += ((st->children[x])
2203 ? count_hash_nodes (st->children[x])
2204 : st->bucket_size[x]);
2212 se_memo_freer (struct rx_hash_item * node)
2215 se_memo_freer (node)
2216 struct rx_hash_item * node;
2219 free ((char *)node->data);
2225 nfa_set_freer (struct rx_hash_item * node)
2228 nfa_set_freer (node)
2229 struct rx_hash_item * node;
2232 free ((char *)node->data);
2236 /* This copies an entire NFA into a single malloced block of memory.
2237 * Mostly this is for compatability with regex.c, though it is convenient
2238 * to have the nfa nodes in an array.
2243 rx_compactify_nfa (struct rx *rx,
2244 void **mem, unsigned long *size)
2247 rx_compactify_nfa (rx, mem, size)
2250 unsigned long *size;
2254 struct rx_nfa_state *n;
2257 int se_list_consc = count_hash_nodes (&rx->se_list_memo);
2258 int nfa_setc = count_hash_nodes (&rx->set_list_memo);
2259 unsigned long total_size;
2261 /* This takes place in two stages. First, the total size of the
2262 * nfa is computed, then structures are copied.
2268 struct rx_nfa_edge *e = n->edges;
2269 struct rx_possible_future *ec = n->futures;
2284 total_size = (total_nodec * sizeof (struct rx_nfa_state)
2285 + edgec * rx_sizeof_bitset (rx->local_cset_size)
2286 + edgec * sizeof (struct rx_nfa_edge)
2287 + nfa_setc * sizeof (struct rx_nfa_state_set)
2288 + eclosec * sizeof (struct rx_possible_future)
2289 + se_list_consc * sizeof (struct rx_se_list)
2292 if (total_size > *size)
2294 *mem = remalloc (*mem, total_size);
2300 /* Now we've allocated the memory; this copies the NFA. */
2302 static struct rx_nfa_state **scratch = 0;
2303 static int scratch_alloc = 0;
2304 struct rx_nfa_state *state_base = (struct rx_nfa_state *) * mem;
2305 struct rx_nfa_state *new_state = state_base;
2306 struct rx_nfa_edge *new_edge =
2307 (struct rx_nfa_edge *)
2308 ((char *) state_base + total_nodec * sizeof (struct rx_nfa_state));
2309 struct rx_se_list * new_se_list =
2310 (struct rx_se_list *)
2311 ((char *)new_edge + edgec * sizeof (struct rx_nfa_edge));
2312 struct rx_possible_future *new_close =
2313 ((struct rx_possible_future *)
2314 ((char *) new_se_list
2315 + se_list_consc * sizeof (struct rx_se_list)));
2316 struct rx_nfa_state_set * new_nfa_set =
2317 ((struct rx_nfa_state_set *)
2318 ((char *)new_close + eclosec * sizeof (struct rx_possible_future)));
2320 ((char *) new_nfa_set + nfa_setc * sizeof (struct rx_nfa_state_set));
2322 struct rx_nfa_state *n;
2324 if (scratch_alloc < total_nodec)
2326 scratch = ((struct rx_nfa_state **)
2327 remalloc (scratch, total_nodec * sizeof (*scratch)));
2329 scratch_alloc = total_nodec;
2337 for (x = 0, n = rx->nfa_states; n; n = n->next)
2340 qsort (scratch, total_nodec,
2341 sizeof (struct rx_nfa_state *), (int (*)())nfacmp);
2343 for (x = 0; x < total_nodec; ++x)
2345 struct rx_possible_future *eclose = scratch[x]->futures;
2346 struct rx_nfa_edge *edge = scratch[x]->edges;
2347 struct rx_nfa_state *cn = new_state++;
2350 cn->next = (x == total_nodec - 1) ? 0 : (cn + 1);
2351 cn->id = scratch[x]->id;
2352 cn->is_final = scratch[x]->is_final;
2353 cn->is_start = scratch[x]->is_start;
2357 int indx = (edge->dest->id < 0
2358 ? (total_nodec + edge->dest->id)
2360 struct rx_nfa_edge *e = new_edge++;
2361 rx_Bitset cset = (rx_Bitset) new_bitset;
2362 new_bitset += rx_sizeof_bitset (rx->local_cset_size);
2363 rx_bitset_null (rx->local_cset_size, cset);
2364 rx_bitset_union (rx->local_cset_size, cset, edge->params.cset);
2365 e->next = cn->edges;
2367 e->type = edge->type;
2368 e->dest = state_base + indx;
2369 e->params.cset = cset;
2374 struct rx_possible_future *ec = new_close++;
2375 struct rx_hash_item * sp;
2376 struct rx_se_list ** sepos;
2377 struct rx_se_list * sesrc;
2378 struct rx_nfa_state_set * destlst;
2379 struct rx_nfa_state_set ** destpos;
2380 ec->next = cn->futures;
2382 for (sepos = &ec->effects, sesrc = eclose->effects;
2384 sesrc = sesrc->cdr, sepos = &(*sepos)->cdr)
2386 sp = rx_hash_find (&rx->se_list_memo,
2387 (long)sesrc->car ^ (long)sesrc->cdr,
2388 sesrc, &se_list_hash_rules);
2391 sesrc = (struct rx_se_list *)sp->binding;
2394 *new_se_list = *sesrc;
2395 sp->binding = (void *)new_se_list;
2396 *sepos = new_se_list;
2400 for (destpos = &ec->destset, destlst = eclose->destset;
2402 destpos = &(*destpos)->cdr, destlst = destlst->cdr)
2404 sp = rx_hash_find (&rx->set_list_memo,
2405 ((((long)destlst->car) >> 8)
2406 ^ (long)destlst->cdr),
2407 destlst, &nfa_set_hash_rules);
2410 destlst = (struct rx_nfa_state_set *)sp->binding;
2413 *new_nfa_set = *destlst;
2414 new_nfa_set->car = state_base + destlst->car->id;
2415 sp->binding = (void *)new_nfa_set;
2416 *destpos = new_nfa_set;
2420 eclose = eclose->next;
2424 rx_free_hash_table (&rx->se_list_memo, se_memo_freer, &se_list_hash_rules);
2425 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2426 rx_free_hash_table (&rx->set_list_memo, nfa_set_freer, &nfa_set_hash_rules);
2427 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2430 rx->nfa_states = (struct rx_nfa_state *)*mem;
2435 /* The functions in the next several pages define the lazy-NFA-conversion used
2436 * by matchers. The input to this construction is an NFA such as
2437 * is built by compactify_nfa (rx.c). The output is the superNFA.
2440 /* Match engines can use arbitrary values for opcodes. So, the parse tree
2441 * is built using instructions names (enum rx_opcode), but the superstate
2442 * nfa is populated with mystery opcodes (void *).
2444 * For convenience, here is an id table. The opcodes are == to their inxs
2446 * The lables in re_search_2 would make good values for instructions.
2449 void * rx_id_instruction_table[rx_num_instructions] =
2451 (void *) rx_backtrack_point,
2452 (void *) rx_do_side_effects,
2453 (void *) rx_cache_miss,
2454 (void *) rx_next_char,
2455 (void *) rx_backtrack,
2456 (void *) rx_error_inx
2461 /* Memory mgt. for superstate graphs. */
2465 rx_cache_malloc (struct rx_cache * cache, int bytes)
2468 rx_cache_malloc (cache, bytes)
2469 struct rx_cache * cache;
2473 while (cache->bytes_left < bytes)
2475 if (cache->memory_pos)
2476 cache->memory_pos = cache->memory_pos->next;
2477 if (!cache->memory_pos)
2479 cache->morecore (cache);
2480 if (!cache->memory_pos)
2483 cache->bytes_left = cache->memory_pos->bytes;
2484 cache->memory_addr = ((char *)cache->memory_pos
2485 + sizeof (struct rx_blocklist));
2487 cache->bytes_left -= bytes;
2489 char * addr = cache->memory_addr;
2490 cache->memory_addr += bytes;
2497 rx_cache_free (struct rx_cache * cache,
2498 struct rx_freelist ** freelist, char * mem)
2501 rx_cache_free (cache, freelist, mem)
2502 struct rx_cache * cache;
2503 struct rx_freelist ** freelist;
2507 struct rx_freelist * it = (struct rx_freelist *)mem;
2508 it->next = *freelist;
2513 /* The partially instantiated superstate graph has a transition
2514 * table at every node. There is one entry for every character.
2515 * This fills in the transition for a set.
2519 install_transition (struct rx_superstate *super,
2520 struct rx_inx *answer, rx_Bitset trcset)
2523 install_transition (super, answer, trcset)
2524 struct rx_superstate *super;
2525 struct rx_inx *answer;
2529 struct rx_inx * transitions = super->transitions;
2531 for (chr = 0; chr < 256; )
2539 RX_subset sub = *trcset;
2541 int bound = chr + 32;
2545 transitions [chr] = *answer;
2556 qlen (struct rx_superstate * q)
2560 struct rx_superstate * q;
2564 struct rx_superstate * it;
2567 for (it = q->next_recyclable; it != q; it = it->next_recyclable)
2574 check_cache (struct rx_cache * cache)
2578 struct rx_cache * cache;
2581 struct rx_cache * you_fucked_up = 0;
2582 int total = cache->superstates;
2583 int semi = cache->semifree_superstates;
2584 if (semi != qlen (cache->semifree_superstate))
2585 check_cache (you_fucked_up);
2586 if ((total - semi) != qlen (cache->lru_superstate))
2587 check_cache (you_fucked_up);
2590 /* When a superstate is old and neglected, it can enter a
2591 * semi-free state. A semi-free state is slated to die.
2592 * Incoming transitions to a semi-free state are re-written
2593 * to cause an (interpreted) fault when they are taken.
2594 * The fault handler revives the semi-free state, patches
2595 * incoming transitions back to normal, and continues.
2597 * The idea is basicly to free in two stages, aborting
2598 * between the two if the state turns out to be useful again.
2599 * When a free is aborted, the rescued superstate is placed
2600 * in the most-favored slot to maximize the time until it
2601 * is next semi-freed.
2606 semifree_superstate (struct rx_cache * cache)
2609 semifree_superstate (cache)
2610 struct rx_cache * cache;
2613 int disqualified = cache->semifree_superstates;
2614 if (disqualified == cache->superstates)
2616 while (cache->lru_superstate->locks)
2618 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2620 if (disqualified == cache->superstates)
2624 struct rx_superstate * it = cache->lru_superstate;
2625 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2626 it->prev_recyclable->next_recyclable = it->next_recyclable;
2627 cache->lru_superstate = (it == it->next_recyclable
2629 : it->next_recyclable);
2630 if (!cache->semifree_superstate)
2632 cache->semifree_superstate = it;
2633 it->next_recyclable = it;
2634 it->prev_recyclable = it;
2638 it->prev_recyclable = cache->semifree_superstate->prev_recyclable;
2639 it->next_recyclable = cache->semifree_superstate;
2640 it->prev_recyclable->next_recyclable = it;
2641 it->next_recyclable->prev_recyclable = it;
2644 struct rx_distinct_future *df;
2645 it->is_semifree = 1;
2646 ++cache->semifree_superstates;
2647 df = it->transition_refs;
2650 df->prev_same_dest->next_same_dest = 0;
2651 for (df = it->transition_refs; df; df = df->next_same_dest)
2653 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2654 df->future_frame.data = 0;
2655 df->future_frame.data_2 = (void *) df;
2656 /* If there are any NEXT-CHAR instruction frames that
2657 * refer to this state, we convert them to CACHE-MISS frames.
2660 && (df->edge->options->next_same_super_edge[0]
2661 == df->edge->options))
2662 install_transition (df->present, &df->future_frame,
2665 df = it->transition_refs;
2666 df->prev_same_dest->next_same_dest = df;
2675 refresh_semifree_superstate (struct rx_cache * cache,
2676 struct rx_superstate * super)
2679 refresh_semifree_superstate (cache, super)
2680 struct rx_cache * cache;
2681 struct rx_superstate * super;
2684 struct rx_distinct_future *df;
2686 if (super->transition_refs)
2688 super->transition_refs->prev_same_dest->next_same_dest = 0;
2689 for (df = super->transition_refs; df; df = df->next_same_dest)
2691 df->future_frame.inx = cache->instruction_table[rx_next_char];
2692 df->future_frame.data = (void *) super->transitions;
2693 /* CACHE-MISS instruction frames that refer to this state,
2694 * must be converted to NEXT-CHAR frames.
2697 && (df->edge->options->next_same_super_edge[0]
2698 == df->edge->options))
2699 install_transition (df->present, &df->future_frame,
2702 super->transition_refs->prev_same_dest->next_same_dest
2703 = super->transition_refs;
2705 if (cache->semifree_superstate == super)
2706 cache->semifree_superstate = (super->prev_recyclable == super
2708 : super->prev_recyclable);
2709 super->next_recyclable->prev_recyclable = super->prev_recyclable;
2710 super->prev_recyclable->next_recyclable = super->next_recyclable;
2712 if (!cache->lru_superstate)
2713 (cache->lru_superstate
2714 = super->next_recyclable
2715 = super->prev_recyclable
2719 super->next_recyclable = cache->lru_superstate;
2720 super->prev_recyclable = cache->lru_superstate->prev_recyclable;
2721 super->next_recyclable->prev_recyclable = super;
2722 super->prev_recyclable->next_recyclable = super;
2724 super->is_semifree = 0;
2725 --cache->semifree_superstates;
2730 rx_refresh_this_superstate (struct rx_cache * cache, struct rx_superstate * superstate)
2733 rx_refresh_this_superstate (cache, superstate)
2734 struct rx_cache * cache;
2735 struct rx_superstate * superstate;
2738 if (superstate->is_semifree)
2739 refresh_semifree_superstate (cache, superstate);
2740 else if (cache->lru_superstate == superstate)
2741 cache->lru_superstate = superstate->next_recyclable;
2742 else if (superstate != cache->lru_superstate->prev_recyclable)
2744 superstate->next_recyclable->prev_recyclable
2745 = superstate->prev_recyclable;
2746 superstate->prev_recyclable->next_recyclable
2747 = superstate->next_recyclable;
2748 superstate->next_recyclable = cache->lru_superstate;
2749 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
2750 superstate->next_recyclable->prev_recyclable = superstate;
2751 superstate->prev_recyclable->next_recyclable = superstate;
2757 release_superset_low (struct rx_cache * cache,
2758 struct rx_superset *set)
2761 release_superset_low (cache, set)
2762 struct rx_cache * cache;
2763 struct rx_superset *set;
2769 release_superset_low (cache, set->cdr);
2771 set->starts_for = 0;
2775 (&cache->superset_table,
2776 (unsigned long)set->car ^ set->id ^ (unsigned long)set->cdr,
2778 &cache->superset_hash_rules),
2779 &cache->superset_hash_rules);
2780 rx_cache_free (cache, &cache->free_supersets, (char *)set);
2786 rx_release_superset (struct rx *rx,
2787 struct rx_superset *set)
2790 rx_release_superset (rx, set)
2792 struct rx_superset *set;
2795 release_superset_low (rx->cache, set);
2798 /* This tries to add a new superstate to the superstate freelist.
2799 * It might, as a result, free some edge pieces or hash tables.
2800 * If nothing can be freed because too many locks are being held, fail.
2805 rx_really_free_superstate (struct rx_cache * cache)
2808 rx_really_free_superstate (cache)
2809 struct rx_cache * cache;
2812 int locked_superstates = 0;
2813 struct rx_superstate * it;
2815 if (!cache->superstates)
2819 /* This is a total guess. The idea is that we should expect as
2820 * many misses as we've recently experienced. I.e., cache->misses
2821 * should be the same as cache->semifree_superstates.
2823 while ((cache->hits + cache->misses) > cache->superstates_allowed)
2826 cache->misses >>= 1;
2828 if ( ((cache->hits + cache->misses) * cache->semifree_superstates)
2829 < (cache->superstates * cache->misses))
2831 semifree_superstate (cache);
2832 semifree_superstate (cache);
2836 while (cache->semifree_superstate && cache->semifree_superstate->locks)
2838 refresh_semifree_superstate (cache, cache->semifree_superstate);
2839 ++locked_superstates;
2840 if (locked_superstates == cache->superstates)
2844 if (cache->semifree_superstate)
2846 it = cache->semifree_superstate;
2847 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2848 it->prev_recyclable->next_recyclable = it->next_recyclable;
2849 cache->semifree_superstate = ((it == it->next_recyclable)
2851 : it->next_recyclable);
2852 --cache->semifree_superstates;
2856 while (cache->lru_superstate->locks)
2858 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2859 ++locked_superstates;
2860 if (locked_superstates == cache->superstates)
2863 it = cache->lru_superstate;
2864 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2865 it->prev_recyclable->next_recyclable = it->next_recyclable;
2866 cache->lru_superstate = ((it == it->next_recyclable)
2868 : it->next_recyclable);
2871 if (it->transition_refs)
2873 struct rx_distinct_future *df;
2874 for (df = it->transition_refs,
2875 df->prev_same_dest->next_same_dest = 0;
2877 df = df->next_same_dest)
2879 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2880 df->future_frame.data = 0;
2881 df->future_frame.data_2 = (void *) df;
2884 it->transition_refs->prev_same_dest->next_same_dest =
2885 it->transition_refs;
2888 struct rx_super_edge *tc = it->edges;
2891 struct rx_distinct_future * df;
2892 struct rx_super_edge *tct = tc->next;
2894 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
2897 struct rx_distinct_future *dft = df;
2898 df = df->next_same_super_edge[0];
2901 if (dft->future && dft->future->transition_refs == dft)
2903 dft->future->transition_refs = dft->next_same_dest;
2904 if (dft->future->transition_refs == dft)
2905 dft->future->transition_refs = 0;
2907 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
2908 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
2909 rx_cache_free (cache, &cache->free_discernable_futures,
2912 rx_cache_free (cache, &cache->free_transition_classes, (char *)tc);
2917 if (it->contents->superstate == it)
2918 it->contents->superstate = 0;
2919 release_superset_low (cache, it->contents);
2920 rx_cache_free (cache, &cache->free_superstates, (char *)it);
2921 --cache->superstates;
2927 rx_cache_get (struct rx_cache * cache,
2928 struct rx_freelist ** freelist)
2931 rx_cache_get (cache, freelist)
2932 struct rx_cache * cache;
2933 struct rx_freelist ** freelist;
2936 while (!*freelist && rx_really_free_superstate (cache))
2941 struct rx_freelist * it = *freelist;
2942 *freelist = it->next;
2949 rx_cache_malloc_or_get (struct rx_cache * cache,
2950 struct rx_freelist ** freelist, int bytes)
2953 rx_cache_malloc_or_get (cache, freelist, bytes)
2954 struct rx_cache * cache;
2955 struct rx_freelist ** freelist;
2961 char * answer = rx_cache_malloc (cache, bytes);
2966 return rx_cache_get (cache, freelist);
2971 rx_cache_get_superstate (struct rx_cache * cache)
2974 rx_cache_get_superstate (cache)
2975 struct rx_cache * cache;
2979 int bytes = ( sizeof (struct rx_superstate)
2980 + cache->local_cset_size * sizeof (struct rx_inx));
2981 if (!cache->free_superstates
2982 && (cache->superstates < cache->superstates_allowed))
2984 answer = rx_cache_malloc (cache, bytes);
2987 ++cache->superstates;
2991 answer = rx_cache_get (cache, &cache->free_superstates);
2994 answer = rx_cache_malloc (cache, bytes);
2996 ++cache->superstates_allowed;
2998 ++cache->superstates;
3006 supersetcmp (void * va, void * vb)
3009 supersetcmp (va, vb)
3014 struct rx_superset * a = (struct rx_superset *)va;
3015 struct rx_superset * b = (struct rx_superset *)vb;
3017 || (a && b && (a->car == b->car) && (a->cdr == b->cdr)));
3021 static struct rx_hash_item *
3022 superset_allocator (struct rx_hash_rules * rules, void * val)
3024 static struct rx_hash_item *
3025 superset_allocator (rules, val)
3026 struct rx_hash_rules * rules;
3030 struct rx_cache * cache
3031 = ((struct rx_cache *)
3033 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3034 struct rx_superset * template = (struct rx_superset *)val;
3035 struct rx_superset * newset
3036 = ((struct rx_superset *)
3037 rx_cache_malloc_or_get (cache,
3038 &cache->free_supersets,
3039 sizeof (*template)));
3043 newset->car = template->car;
3044 newset->id = template->car->id;
3045 newset->cdr = template->cdr;
3046 newset->superstate = 0;
3047 rx_protect_superset (rx, template->cdr);
3048 newset->hash_item.data = (void *)newset;
3049 newset->hash_item.binding = 0;
3050 return &newset->hash_item;
3054 static struct rx_hash *
3055 super_hash_allocator (struct rx_hash_rules * rules)
3057 static struct rx_hash *
3058 super_hash_allocator (rules)
3059 struct rx_hash_rules * rules;
3062 struct rx_cache * cache
3063 = ((struct rx_cache *)
3065 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3066 return ((struct rx_hash *)
3067 rx_cache_malloc_or_get (cache,
3068 &cache->free_hash, sizeof (struct rx_hash)));
3074 super_hash_liberator (struct rx_hash * hash, struct rx_hash_rules * rules)
3077 super_hash_liberator (hash, rules)
3078 struct rx_hash * hash;
3079 struct rx_hash_rules * rules;
3082 struct rx_cache * cache
3083 = ((struct rx_cache *)
3084 (char *)rules - (long)(&((struct rx_cache *)0)->superset_hash_rules));
3085 rx_cache_free (cache, &cache->free_hash, (char *)hash);
3090 superset_hash_item_liberator (struct rx_hash_item * it,
3091 struct rx_hash_rules * rules)
3094 superset_hash_item_liberator (it, rules) /* Well, it does ya know. */
3095 struct rx_hash_item * it;
3096 struct rx_hash_rules * rules;
3101 int rx_cache_bound = 128;
3102 static int rx_default_cache_got = 0;
3106 bytes_for_cache_size (int supers, int cset_size)
3109 bytes_for_cache_size (supers, cset_size)
3114 /* What the hell is this? !!!*/
3117 ( (1.03 * (float) ( rx_sizeof_bitset (cset_size)
3118 + sizeof (struct rx_super_edge)))
3119 + (1.80 * (float) sizeof (struct rx_possible_future))
3120 + (float) ( sizeof (struct rx_superstate)
3121 + cset_size * sizeof (struct rx_inx))));
3126 rx_morecore (struct rx_cache * cache)
3130 struct rx_cache * cache;
3133 if (rx_default_cache_got >= rx_cache_bound)
3136 rx_default_cache_got += 16;
3137 cache->superstates_allowed = rx_cache_bound;
3139 struct rx_blocklist ** pos = &cache->memory;
3140 int size = bytes_for_cache_size (16, cache->local_cset_size);
3142 pos = &(*pos)->next;
3143 *pos = ((struct rx_blocklist *)
3144 malloc (size + sizeof (struct rx_blocklist)));
3149 (*pos)->bytes = size;
3150 cache->memory_pos = *pos;
3151 cache->memory_addr = (char *)*pos + sizeof (**pos);
3152 cache->bytes_left = size;
3156 static struct rx_cache default_cache =
3160 super_hash_allocator,
3161 super_hash_liberator,
3163 superset_hash_item_liberator,
3189 rx_id_instruction_table,
3200 /* This adds an element to a superstate set. These sets are lists, such
3201 * that lists with == elements are ==. The empty set is returned by
3202 * superset_cons (rx, 0, 0) and is NOT equivelent to
3203 * (struct rx_superset)0.
3207 RX_DECL struct rx_superset *
3208 rx_superset_cons (struct rx * rx,
3209 struct rx_nfa_state *car, struct rx_superset *cdr)
3211 RX_DECL struct rx_superset *
3212 rx_superset_cons (rx, car, cdr)
3214 struct rx_nfa_state *car;
3215 struct rx_superset *cdr;
3218 struct rx_cache * cache = rx->cache;
3221 if (!cache->empty_superset)
3223 cache->empty_superset
3224 = ((struct rx_superset *)
3225 rx_cache_malloc_or_get (cache, &cache->free_supersets,
3226 sizeof (struct rx_superset)));
3227 if (!cache->empty_superset)
3229 bzero (cache->empty_superset, sizeof (struct rx_superset));
3230 cache->empty_superset->refs = 1000;
3232 return cache->empty_superset;
3235 struct rx_superset template;
3236 struct rx_hash_item * hit;
3239 template.id = car->id;
3240 hit = rx_hash_store (&cache->superset_table,
3241 (unsigned long)car ^ car->id ^ (unsigned long)cdr,
3243 &cache->superset_hash_rules);
3245 ? (struct rx_superset *)hit->data
3250 /* This computes a union of two NFA state sets. The sets do not have the
3251 * same representation though. One is a RX_SUPERSET structure (part
3252 * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA).
3256 RX_DECL struct rx_superset *
3257 rx_superstate_eclosure_union
3258 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl)
3260 RX_DECL struct rx_superset *
3261 rx_superstate_eclosure_union (rx, set, ecl)
3263 struct rx_superset *set;
3264 struct rx_nfa_state_set *ecl;
3271 return rx_superset_cons (rx, ecl->car,
3272 rx_superstate_eclosure_union (rx, set, ecl->cdr));
3273 if (set->car == ecl->car)
3274 return rx_superstate_eclosure_union (rx, set, ecl->cdr);
3277 struct rx_superset * tail;
3278 struct rx_nfa_state * first;
3280 if (set->car > ecl->car)
3282 tail = rx_superstate_eclosure_union (rx, set->cdr, ecl);
3287 tail = rx_superstate_eclosure_union (rx, set, ecl->cdr);
3294 struct rx_superset * answer;
3295 answer = rx_superset_cons (rx, first, tail);
3298 rx_protect_superset (rx, tail);
3299 rx_release_superset (rx, tail);
3312 * This makes sure that a list of rx_distinct_futures contains
3313 * a future for each possible set of side effects in the eclosure
3314 * of a given state. This is some of the work of filling in a
3315 * superstate transition.
3319 static struct rx_distinct_future *
3320 include_futures (struct rx *rx,
3321 struct rx_distinct_future *df, struct rx_nfa_state
3322 *state, struct rx_superstate *superstate)
3324 static struct rx_distinct_future *
3325 include_futures (rx, df, state, superstate)
3327 struct rx_distinct_future *df;
3328 struct rx_nfa_state *state;
3329 struct rx_superstate *superstate;
3332 struct rx_possible_future *future;
3333 struct rx_cache * cache = rx->cache;
3334 for (future = state->futures; future; future = future->next)
3336 struct rx_distinct_future *dfp;
3337 struct rx_distinct_future *insert_before = 0;
3339 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3340 for (dfp = df; dfp; dfp = dfp->next_same_super_edge[0])
3341 if (dfp->effects == future->effects)
3345 int order = rx->se_list_cmp (rx, dfp->effects, future->effects);
3348 insert_before = dfp;
3354 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3358 = ((struct rx_distinct_future *)
3359 rx_cache_malloc_or_get (cache, &cache->free_discernable_futures,
3360 sizeof (struct rx_distinct_future)));
3365 df = insert_before = dfp;
3366 df->next_same_super_edge[0] = df->next_same_super_edge[1] = df;
3368 else if (!insert_before)
3370 else if (insert_before == df)
3373 dfp->next_same_super_edge[0] = insert_before;
3374 dfp->next_same_super_edge[1]
3375 = insert_before->next_same_super_edge[1];
3376 dfp->next_same_super_edge[1]->next_same_super_edge[0] = dfp;
3377 dfp->next_same_super_edge[0]->next_same_super_edge[1] = dfp;
3378 dfp->next_same_dest = dfp->prev_same_dest = dfp;
3380 dfp->present = superstate;
3381 dfp->future_frame.inx = rx->instruction_table[rx_cache_miss];
3382 dfp->future_frame.data = 0;
3383 dfp->future_frame.data_2 = (void *) dfp;
3384 dfp->side_effects_frame.inx
3385 = rx->instruction_table[rx_do_side_effects];
3386 dfp->side_effects_frame.data = 0;
3387 dfp->side_effects_frame.data_2 = (void *) dfp;
3388 dfp->effects = future->effects;
3396 /* This constructs a new superstate from its state set. The only
3397 * complexity here is memory management.
3400 RX_DECL struct rx_superstate *
3401 rx_superstate (struct rx *rx,
3402 struct rx_superset *set)
3404 RX_DECL struct rx_superstate *
3405 rx_superstate (rx, set)
3407 struct rx_superset *set;
3410 struct rx_cache * cache = rx->cache;
3411 struct rx_superstate * superstate = 0;
3413 /* Does the superstate already exist in the cache? */
3414 if (set->superstate)
3416 if (set->superstate->rx_id != rx->rx_id)
3418 /* Aha. It is in the cache, but belongs to a superstate
3419 * that refers to an NFA that no longer exists.
3420 * (We know it no longer exists because it was evidently
3421 * stored in the same region of memory as the current nfa
3422 * yet it has a different id.)
3424 superstate = set->superstate;
3425 if (!superstate->is_semifree)
3427 if (cache->lru_superstate == superstate)
3429 cache->lru_superstate = superstate->next_recyclable;
3430 if (cache->lru_superstate == superstate)
3431 cache->lru_superstate = 0;
3434 superstate->next_recyclable->prev_recyclable
3435 = superstate->prev_recyclable;
3436 superstate->prev_recyclable->next_recyclable
3437 = superstate->next_recyclable;
3438 if (!cache->semifree_superstate)
3440 (cache->semifree_superstate
3441 = superstate->next_recyclable
3442 = superstate->prev_recyclable
3447 superstate->next_recyclable = cache->semifree_superstate;
3448 superstate->prev_recyclable
3449 = cache->semifree_superstate->prev_recyclable;
3450 superstate->next_recyclable->prev_recyclable
3452 superstate->prev_recyclable->next_recyclable
3454 cache->semifree_superstate = superstate;
3456 ++cache->semifree_superstates;
3459 set->superstate = 0;
3460 goto handle_cache_miss;
3463 superstate = set->superstate;
3465 rx_refresh_this_superstate (cache, superstate);
3471 /* This point reached only for cache misses. */
3474 if (rx_debug_trace > 1)
3476 struct rx_superset * setp = set;
3477 fprintf (stderr, "Building a superstet %d(%d): ", rx->rx_id, set);
3480 fprintf (stderr, "%d ", setp->id);
3483 fprintf (stderr, "(%d)\n", set);
3486 superstate = (struct rx_superstate *)rx_cache_get_superstate (cache);
3490 if (!cache->lru_superstate)
3491 (cache->lru_superstate
3492 = superstate->next_recyclable
3493 = superstate->prev_recyclable
3497 superstate->next_recyclable = cache->lru_superstate;
3498 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
3499 ( superstate->prev_recyclable->next_recyclable
3500 = superstate->next_recyclable->prev_recyclable
3503 superstate->rx_id = rx->rx_id;
3504 superstate->transition_refs = 0;
3505 superstate->locks = 0;
3506 superstate->is_semifree = 0;
3507 set->superstate = superstate;
3508 superstate->contents = set;
3509 rx_protect_superset (rx, set);
3510 superstate->edges = 0;
3513 /* None of the transitions from this superstate are known yet. */
3514 for (x = 0; x < rx->local_cset_size; ++x) /* &&&&& 3.8 % */
3516 struct rx_inx * ifr = &superstate->transitions[x];
3517 ifr->inx = rx->instruction_table [rx_cache_miss];
3518 ifr->data = ifr->data_2 = 0;
3525 /* This computes the destination set of one edge of the superstate NFA.
3526 * Note that a RX_DISTINCT_FUTURE is a superstate edge.
3527 * Returns 0 on an allocation failure.
3532 solve_destination (struct rx *rx, struct rx_distinct_future *df)
3535 solve_destination (rx, df)
3537 struct rx_distinct_future *df;
3540 struct rx_super_edge *tc = df->edge;
3541 struct rx_superset *nfa_state;
3542 struct rx_superset *nil_set = rx_superset_cons (rx, 0, 0);
3543 struct rx_superset *solution = nil_set;
3544 struct rx_superstate *dest;
3546 rx_protect_superset (rx, solution);
3547 /* Iterate over all NFA states in the state set of this superstate. */
3548 for (nfa_state = df->present->contents;
3550 nfa_state = nfa_state->cdr)
3552 struct rx_nfa_edge *e;
3553 /* Iterate over all edges of each NFA state. */
3554 for (e = nfa_state->car->edges; e; e = e->next)
3555 /* If we find an edge that is labeled with
3556 * the characters we are solving for.....
3558 if (rx_bitset_is_subset (rx->local_cset_size,
3559 tc->cset, e->params.cset))
3561 struct rx_nfa_state *n = e->dest;
3562 struct rx_possible_future *pf;
3563 /* ....search the partial epsilon closures of the destination
3564 * of that edge for a path that involves the same set of
3565 * side effects we are solving for.
3566 * If we find such a RX_POSSIBLE_FUTURE, we add members to the
3567 * stateset we are computing.
3569 for (pf = n->futures; pf; pf = pf->next)
3570 if (pf->effects == df->effects)
3572 struct rx_superset * old_sol;
3574 solution = rx_superstate_eclosure_union (rx, solution,
3578 rx_protect_superset (rx, solution);
3579 rx_release_superset (rx, old_sol);
3583 /* It is possible that the RX_DISTINCT_FUTURE we are working on has
3584 * the empty set of NFA states as its definition. In that case, this
3585 * is a failure point.
3587 if (solution == nil_set)
3589 df->future_frame.inx = (void *) rx_backtrack;
3590 df->future_frame.data = 0;
3591 df->future_frame.data_2 = 0;
3594 dest = rx_superstate (rx, solution);
3595 rx_release_superset (rx, solution);
3600 struct rx_distinct_future *dft;
3602 df->prev_same_dest->next_same_dest = 0;
3606 dft->future_frame.inx = rx->instruction_table[rx_next_char];
3607 dft->future_frame.data = (void *) dest->transitions;
3608 dft = dft->next_same_dest;
3610 df->prev_same_dest->next_same_dest = df;
3612 if (!dest->transition_refs)
3613 dest->transition_refs = df;
3616 struct rx_distinct_future *dft = dest->transition_refs->next_same_dest;
3617 dest->transition_refs->next_same_dest = df->next_same_dest;
3618 df->next_same_dest->prev_same_dest = dest->transition_refs;
3619 df->next_same_dest = dft;
3620 dft->prev_same_dest = df;
3626 /* This takes a superstate and a character, and computes some edges
3627 * from the superstate NFA. In particular, this computes all edges
3628 * that lead from SUPERSTATE given CHR. This function also
3629 * computes the set of characters that share this edge set.
3630 * This returns 0 on allocation error.
3631 * The character set and list of edges are returned through
3632 * the paramters CSETOUT and DFOUT.
3637 compute_super_edge (struct rx *rx, struct rx_distinct_future **dfout,
3638 rx_Bitset csetout, struct rx_superstate *superstate,
3642 compute_super_edge (rx, dfout, csetout, superstate, chr)
3644 struct rx_distinct_future **dfout;
3646 struct rx_superstate *superstate;
3650 struct rx_superset *stateset = superstate->contents;
3652 /* To compute the set of characters that share edges with CHR,
3653 * we start with the full character set, and subtract.
3655 rx_bitset_universe (rx->local_cset_size, csetout);
3658 /* Iterate over the NFA states in the superstate state-set. */
3659 while (stateset->car)
3661 struct rx_nfa_edge *e;
3662 for (e = stateset->car->edges; e; e = e->next)
3663 if (RX_bitset_member (e->params.cset, chr))
3665 /* If we find an NFA edge that applies, we make sure there
3666 * are corresponding edges in the superstate NFA.
3669 struct rx_distinct_future * saved;
3671 *dfout = include_futures (rx, *dfout, e->dest, superstate);
3674 struct rx_distinct_future * df;
3677 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3680 struct rx_distinct_future *dft;
3682 df = df->next_same_super_edge[0];
3684 if (dft->future && dft->future->transition_refs == dft)
3686 dft->future->transition_refs = dft->next_same_dest;
3687 if (dft->future->transition_refs == dft)
3688 dft->future->transition_refs = 0;
3690 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
3691 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
3692 rx_cache_free (rx->cache,
3693 &rx->cache->free_discernable_futures,
3699 /* We also trim the character set a bit. */
3700 rx_bitset_intersection (rx->local_cset_size,
3701 csetout, e->params.cset);
3704 /* An edge that doesn't apply at least tells us some characters
3705 * that don't share the same edge set as CHR.
3707 rx_bitset_difference (rx->local_cset_size, csetout, e->params.cset);
3708 stateset = stateset->cdr;
3714 /* This is a constructor for RX_SUPER_EDGE structures. These are
3715 * wrappers for lists of superstate NFA edges that share character sets labels.
3716 * If a transition class contains more than one rx_distinct_future (superstate
3717 * edge), then it represents a non-determinism in the superstate NFA.
3721 static struct rx_super_edge *
3722 rx_super_edge (struct rx *rx,
3723 struct rx_superstate *super, rx_Bitset cset,
3724 struct rx_distinct_future *df)
3726 static struct rx_super_edge *
3727 rx_super_edge (rx, super, cset, df)
3729 struct rx_superstate *super;
3731 struct rx_distinct_future *df;
3734 struct rx_super_edge *tc =
3735 (struct rx_super_edge *)rx_cache_malloc_or_get
3736 (rx->cache, &rx->cache->free_transition_classes,
3737 sizeof (struct rx_super_edge) + rx_sizeof_bitset (rx->local_cset_size));
3741 tc->next = super->edges;
3743 tc->rx_backtrack_frame.inx = rx->instruction_table[rx_backtrack_point];
3744 tc->rx_backtrack_frame.data = 0;
3745 tc->rx_backtrack_frame.data_2 = (void *) tc;
3747 tc->cset = (rx_Bitset) ((char *) tc + sizeof (*tc));
3748 rx_bitset_assign (rx->local_cset_size, tc->cset, cset);
3751 struct rx_distinct_future * dfp = df;
3752 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3756 dfp = dfp->next_same_super_edge[0];
3758 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3764 /* There are three kinds of cache miss. The first occurs when a
3765 * transition is taken that has never been computed during the
3766 * lifetime of the source superstate. That cache miss is handled by
3767 * calling COMPUTE_SUPER_EDGE. The second kind of cache miss
3768 * occurs when the destination superstate of a transition doesn't
3769 * exist. SOLVE_DESTINATION is used to construct the destination superstate.
3770 * Finally, the third kind of cache miss occurs when the destination
3771 * superstate of a transition is in a `semi-free state'. That case is
3772 * handled by UNFREE_SUPERSTATE.
3774 * The function of HANDLE_CACHE_MISS is to figure out which of these
3780 install_partial_transition (struct rx_superstate *super,
3781 struct rx_inx *answer,
3782 RX_subset set, int offset)
3785 install_partial_transition (super, answer, set, offset)
3786 struct rx_superstate *super;
3787 struct rx_inx *answer;
3793 int end = start + 32;
3795 struct rx_inx * transitions = super->transitions;
3800 transitions[start] = *answer;
3808 RX_DECL struct rx_inx *
3809 rx_handle_cache_miss
3810 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data)
3812 RX_DECL struct rx_inx *
3813 rx_handle_cache_miss (rx, super, chr, data)
3815 struct rx_superstate *super;
3820 int offset = chr / RX_subset_bits;
3821 struct rx_distinct_future *df = data;
3823 if (!df) /* must be the shared_cache_miss_frame */
3825 /* Perhaps this is just a transition waiting to be filled. */
3826 struct rx_super_edge *tc;
3827 RX_subset mask = rx_subset_singletons [chr % RX_subset_bits];
3829 for (tc = super->edges; tc; tc = tc->next)
3830 if (tc->cset[offset] & mask)
3832 struct rx_inx * answer;
3834 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3835 ? &tc->rx_backtrack_frame
3837 ? &df->side_effects_frame
3838 : &df->future_frame));
3839 install_partial_transition (super, answer,
3840 tc->cset [offset], offset * 32);
3843 /* Otherwise, it's a flushed or newly encountered edge. */
3845 char cset_space[1024]; /* this limit is far from unreasonable */
3847 struct rx_inx *answer;
3849 if (rx_sizeof_bitset (rx->local_cset_size) > sizeof (cset_space))
3850 return 0; /* If the arbitrary limit is hit, always fail */
3852 trcset = (rx_Bitset)cset_space;
3853 rx_lock_superstate (rx, super);
3854 if (!compute_super_edge (rx, &df, trcset, super, chr))
3856 rx_unlock_superstate (rx, super);
3859 if (!df) /* We just computed the fail transition. */
3861 static struct rx_inx
3862 shared_fail_frame = { 0, 0, (void *)rx_backtrack, 0 };
3863 answer = &shared_fail_frame;
3867 tc = rx_super_edge (rx, super, trcset, df);
3870 rx_unlock_superstate (rx, super);
3873 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3874 ? &tc->rx_backtrack_frame
3876 ? &df->side_effects_frame
3877 : &df->future_frame));
3879 install_partial_transition (super, answer,
3880 trcset[offset], offset * 32);
3881 rx_unlock_superstate (rx, super);
3885 else if (df->future) /* A cache miss on an edge with a future? Must be
3886 * a semi-free destination. */
3888 if (df->future->is_semifree)
3889 refresh_semifree_superstate (rx->cache, df->future);
3890 return &df->future_frame;
3893 /* no future superstate on an existing edge */
3895 rx_lock_superstate (rx, super);
3896 if (!solve_destination (rx, df))
3898 rx_unlock_superstate (rx, super);
3902 && (df->edge->options->next_same_super_edge[0] == df->edge->options))
3903 install_partial_transition (super, &df->future_frame,
3904 df->edge->cset[offset], offset * 32);
3905 rx_unlock_superstate (rx, super);
3906 return &df->future_frame;
3913 /* The rest of the code provides a regex.c compatable interface. */
3916 __const__ char *re_error_msg[] =
3919 "No match", /* REG_NOMATCH */
3920 "Invalid regular expression", /* REG_BADPAT */
3921 "Invalid collation character", /* REG_ECOLLATE */
3922 "Invalid character class name", /* REG_ECTYPE */
3923 "Trailing backslash", /* REG_EESCAPE */
3924 "Invalid back reference", /* REG_ESUBREG */
3925 "Unmatched [ or [^", /* REG_EBRACK */
3926 "Unmatched ( or \\(", /* REG_EPAREN */
3927 "Unmatched \\{", /* REG_EBRACE */
3928 "Invalid content of \\{\\}", /* REG_BADBR */
3929 "Invalid range end", /* REG_ERANGE */
3930 "Memory exhausted", /* REG_ESPACE */
3931 "Invalid preceding regular expression", /* REG_BADRPT */
3932 "Premature end of regular expression", /* REG_EEND */
3933 "Regular expression too big", /* REG_ESIZE */
3934 "Unmatched ) or \\)", /* REG_ERPAREN */
3940 * Macros used while compiling patterns.
3942 * By convention, PEND points just past the end of the uncompiled pattern,
3943 * P points to the read position in the pattern. `translate' is the name
3944 * of the translation table (`TRANSLATE' is the name of a macro that looks
3945 * things up in `translate').
3950 * Fetch the next character in the uncompiled pattern---translating it
3951 * if necessary. *Also cast from a signed character in the constant
3952 * string passed to us by the user to an unsigned char that we can use
3953 * as an array index (in, e.g., `translate').
3955 #define PATFETCH(c) \
3956 do {if (p == pend) return REG_EEND; \
3957 c = (unsigned char) *p++; \
3962 * Fetch the next character in the uncompiled pattern, with no
3965 #define PATFETCH_RAW(c) \
3966 do {if (p == pend) return REG_EEND; \
3967 c = (unsigned char) *p++; \
3970 /* Go backwards one character in the pattern. */
3971 #define PATUNFETCH p--
3974 #define TRANSLATE(d) translate[(unsigned char) (d)]
3976 typedef unsigned regnum_t;
3978 /* Since offsets can go either forwards or backwards, this type needs to
3979 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
3981 typedef int pattern_offset_t;
3985 struct rexp_node ** top_expression; /* was begalt */
3986 struct rexp_node ** last_expression; /* was laststart */
3987 pattern_offset_t inner_group_offset;
3989 } compile_stack_elt_t;
3993 compile_stack_elt_t *stack;
3995 unsigned avail; /* Offset of next open position. */
3996 } compile_stack_type;
3999 #define INIT_COMPILE_STACK_SIZE 32
4001 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
4002 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
4004 /* The next available element. */
4005 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
4008 /* Set the bit for character C in a list. */
4009 #define SET_LIST_BIT(c) \
4010 (b[((unsigned char) (c)) / CHARBITS] \
4011 |= 1 << (((unsigned char) c) % CHARBITS))
4013 /* Get the next unsigned number in the uncompiled pattern. */
4014 #define GET_UNSIGNED_NUMBER(num) \
4018 while (isdigit (c)) \
4022 num = num * 10 + c - '0'; \
4030 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
4032 #define IS_CHAR_CLASS(string) \
4033 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
4034 || !strcmp (string, "lower") || !strcmp (string, "digit") \
4035 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
4036 || !strcmp (string, "space") || !strcmp (string, "print") \
4037 || !strcmp (string, "punct") || !strcmp (string, "graph") \
4038 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
4041 /* These predicates are used in regex_compile. */
4043 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4044 * after an alternative or a begin-subexpression. We assume there is at
4045 * least one character before the ^.
4050 at_begline_loc_p (__const__ char *pattern, __const__ char * p, reg_syntax_t syntax)
4053 at_begline_loc_p (pattern, p, syntax)
4054 __const__ char *pattern;
4056 reg_syntax_t syntax;
4059 __const__ char *prev = p - 2;
4060 boolean prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
4064 (/* After a subexpression? */
4065 ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
4067 /* After an alternative? */
4068 ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
4072 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4073 * at least one character after the $, i.e., `P < PEND'.
4078 at_endline_loc_p (__const__ char *p, __const__ char *pend, int syntax)
4081 at_endline_loc_p (p, pend, syntax)
4083 __const__ char *pend;
4087 __const__ char *next = p;
4088 boolean next_backslash = (*next == '\\');
4089 __const__ char *next_next = (p + 1 < pend) ? (p + 1) : 0;
4093 /* Before a subexpression? */
4094 ((syntax & RE_NO_BK_PARENS)
4096 : (next_backslash && next_next && (*next_next == ')')))
4098 /* Before an alternative? */
4099 ((syntax & RE_NO_BK_VBAR)
4101 : (next_backslash && next_next && (*next_next == '|')))
4106 unsigned char rx_id_translation[256] =
4108 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4109 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
4110 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
4111 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
4112 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
4113 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
4114 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
4115 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
4116 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
4117 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
4119 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
4120 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
4121 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
4122 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
4123 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
4124 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
4125 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
4126 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
4127 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
4128 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
4130 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
4131 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
4132 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
4133 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
4134 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
4135 250, 251, 252, 253, 254, 255
4138 /* The compiler keeps an inverted translation table.
4139 * This looks up/inititalize elements.
4140 * VALID is an array of booleans that validate CACHE.
4145 inverse_translation (struct re_pattern_buffer * rxb,
4146 char * valid, rx_Bitset cache,
4147 unsigned char * translate, int c)
4150 inverse_translation (rxb, valid, cache, translate, c)
4151 struct re_pattern_buffer * rxb;
4154 unsigned char * translate;
4159 = cache + c * rx_bitset_numb_subsets (rxb->rx.local_cset_size);
4164 int c_tr = TRANSLATE(c);
4165 rx_bitset_null (rxb->rx.local_cset_size, cs);
4166 for (x = 0; x < 256; ++x) /* &&&& 13.37 */
4167 if (TRANSLATE(x) == c_tr)
4168 RX_bitset_enjoin (cs, x);
4177 /* More subroutine declarations and macros for regex_compile. */
4179 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4180 false if it's not. */
4184 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4187 group_in_compile_stack (compile_stack, regnum)
4188 compile_stack_type compile_stack;
4194 for (this_element = compile_stack.avail - 1;
4197 if (compile_stack.stack[this_element].regnum == regnum)
4205 * Read the ending character of a range (in a bracket expression) from the
4206 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
4207 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4208 * Then we set the translation of all bits between the starting and
4209 * ending characters (inclusive) in the compiled pattern B.
4211 * Return an error code.
4213 * We use these short variable names so we can use the same macros as
4214 * `regex_compile' itself.
4218 static reg_errcode_t
4219 compile_range (struct re_pattern_buffer * rxb, rx_Bitset cs,
4220 __const__ char ** p_ptr, __const__ char * pend,
4221 unsigned char * translate, reg_syntax_t syntax,
4222 rx_Bitset inv_tr, char * valid_inv_tr)
4224 static reg_errcode_t
4225 compile_range (rxb, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
4226 struct re_pattern_buffer * rxb;
4228 __const__ char ** p_ptr;
4229 __const__ char * pend;
4230 unsigned char * translate;
4231 reg_syntax_t syntax;
4233 char * valid_inv_tr;
4238 __const__ char *p = *p_ptr;
4240 unsigned char range_end;
4241 unsigned char range_start = TRANSLATE(p[-2]);
4246 PATFETCH (range_end);
4250 if (range_start > range_end)
4251 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4253 for (this_char = range_start; this_char <= range_end; this_char++)
4256 inverse_translation (rxb, valid_inv_tr, inv_tr, translate, this_char);
4257 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
4264 /* This searches a regexp for backreference side effects.
4265 * It fills in the array OUT with 1 at the index of every register pair
4266 * referenced by a backreference.
4268 * This is used to help optimize patterns for searching. The information is
4269 * useful because, if the caller doesn't want register values, backreferenced
4270 * registers are the only registers for which we need rx_backtrack.
4275 find_backrefs (char * out, struct rexp_node * rexp,
4276 struct re_se_params * params)
4279 find_backrefs (out, rexp, params)
4281 struct rexp_node * rexp;
4282 struct re_se_params * params;
4296 find_backrefs (out, rexp->params.pair.left, params);
4297 find_backrefs (out, rexp->params.pair.right, params);
4300 if ( ((long)rexp->params.side_effect >= 0)
4301 && (params [(long)rexp->params.side_effect].se == re_se_backref))
4302 out[ params [(long)rexp->params.side_effect].op1] = 1;
4309 /* Returns 0 unless the pattern can match the empty string. */
4313 compute_fastset (struct re_pattern_buffer * rxb, struct rexp_node * rexp)
4316 compute_fastset (rxb, rexp)
4317 struct re_pattern_buffer * rxb;
4318 struct rexp_node * rexp;
4329 rx_bitset_union (rxb->rx.local_cset_size,
4330 rxb->fastset, rexp->params.cset);
4334 return (compute_fastset (rxb, rexp->params.pair.left)
4335 && compute_fastset (rxb, rexp->params.pair.right));
4337 compute_fastset (rxb, rexp->params.pair.left);
4338 /* compute_fastset (rxb, rexp->params.pair.right); nope... */
4341 return !!(compute_fastset (rxb, rexp->params.pair.left)
4342 + compute_fastset (rxb, rexp->params.pair.right));
4345 compute_fastset (rxb, rexp->params.pair.left);
4351 /* this should never happen */
4357 * 1 -- yes, definately anchored by the given side effect.
4358 * 2 -- maybe anchored, maybe the empty string.
4359 * 0 -- definately not anchored
4360 * There is simply no other possibility.
4365 is_anchored (struct rexp_node * rexp, rx_side_effect se)
4368 is_anchored (rexp, se)
4369 struct rexp_node * rexp;
4383 int l = is_anchored (rexp->params.pair.left, se);
4384 return (l == 2 ? is_anchored (rexp->params.pair.right, se) : l);
4388 int l = is_anchored (rexp->params.pair.left, se);
4389 int r = l ? is_anchored (rexp->params.pair.right, se) : 0;
4393 else if ((l == 0) || (r == 0))
4400 return is_anchored (rexp->params.pair.left, se) ? 2 : 0;
4403 return ((rexp->params.side_effect == se)
4407 /* this should never happen */
4412 /* This removes register assignments that aren't required by backreferencing.
4413 * This can speed up explore_future, especially if it eliminates
4414 * non-determinism in the superstate NFA.
4416 * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS.
4417 * The non-zero elements of the array indicate which register assignments
4418 * can NOT be removed from the expression.
4422 static struct rexp_node *
4423 remove_unecessary_side_effects (struct rx * rx, char * needed,
4424 struct rexp_node * rexp,
4425 struct re_se_params * params)
4427 static struct rexp_node *
4428 remove_unecessary_side_effects (rx, needed, rexp, params)
4431 struct rexp_node * rexp;
4432 struct re_se_params * params;
4435 struct rexp_node * l;
4436 struct rexp_node * r;
4448 l = remove_unecessary_side_effects (rx, needed,
4449 rexp->params.pair.left, params);
4450 r = remove_unecessary_side_effects (rx, needed,
4451 rexp->params.pair.right, params);
4452 if ((l && r) || (rexp->type != r_concat))
4454 rexp->params.pair.left = l;
4455 rexp->params.pair.right = r;
4460 rexp->params.pair.left = rexp->params.pair.right = 0;
4461 rx_free_rexp (rx, rexp);
4466 l = remove_unecessary_side_effects (rx, needed,
4467 rexp->params.pair.left, params);
4470 rexp->params.pair.left = l;
4475 rexp->params.pair.left = 0;
4476 rx_free_rexp (rx, rexp);
4481 int se = (long)rexp->params.side_effect;
4483 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4484 || ((enum re_side_effects)params[se].se == re_se_rparen))
4485 && (params [se].op1 > 0)
4486 && (!needed [params [se].op1]))
4488 rx_free_rexp (rx, rexp);
4496 /* this should never happen */
4504 pointless_if_repeated (struct rexp_node * node, struct re_se_params * params)
4507 pointless_if_repeated (node, params)
4508 struct rexp_node * node;
4509 struct re_se_params * params;
4521 return (pointless_if_repeated (node->params.pair.left, params)
4522 && pointless_if_repeated (node->params.pair.right, params));
4525 return pointless_if_repeated (node->params.pair.left, params);
4527 switch (((long)node->params.side_effect < 0)
4528 ? (enum re_side_effects)node->params.side_effect
4529 : (enum re_side_effects)params[(long)node->params.side_effect].se)
4536 case re_se_wordbound:
4537 case re_se_notwordbound:
4547 case re_se_end_iter:
4549 case re_se_not_syntax:
4563 registers_on_stack (struct re_pattern_buffer * rxb,
4564 struct rexp_node * rexp, int in_danger,
4565 struct re_se_params * params)
4568 registers_on_stack (rxb, rexp, in_danger, params)
4569 struct re_pattern_buffer * rxb;
4570 struct rexp_node * rexp;
4572 struct re_se_params * params;
4585 return ( registers_on_stack (rxb, rexp->params.pair.left,
4587 || (registers_on_stack
4588 (rxb, rexp->params.pair.right,
4589 in_danger, params)));
4591 return registers_on_stack (rxb, rexp->params.pair.left, 0, params);
4593 return registers_on_stack (rxb, rexp->params.pair.left, 1, params);
4596 ( registers_on_stack (rxb, rexp->params.pair.left, 1, params)
4597 || registers_on_stack (rxb, rexp->params.pair.right, 1, params));
4600 int se = (long)rexp->params.side_effect;
4603 && (params [se].op1 > 0)
4604 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4605 || ((enum re_side_effects)params[se].se == re_se_rparen)))
4612 /* this should never happen */
4618 static char idempotent_complex_se[] =
4620 #define RX_WANT_SE_DEFS 1
4622 #undef RX_DEF_CPLX_SE
4623 #define RX_DEF_SE(IDEM, NAME, VALUE)
4624 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM,
4627 #undef RX_DEF_CPLX_SE
4628 #undef RX_WANT_SE_DEFS
4632 static char idempotent_se[] =
4635 #define RX_WANT_SE_DEFS 1
4637 #undef RX_DEF_CPLX_SE
4638 #define RX_DEF_SE(IDEM, NAME, VALUE) IDEM,
4639 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)
4642 #undef RX_DEF_CPLX_SE
4643 #undef RX_WANT_SE_DEFS
4652 has_any_se (struct rx * rx,
4653 struct rexp_node * rexp)
4656 has_any_se (rx, rexp)
4658 struct rexp_node * rexp;
4677 ( has_any_se (rx, rexp->params.pair.left)
4678 || has_any_se (rx, rexp->params.pair.right));
4682 return has_any_se (rx, rexp->params.pair.left);
4685 /* this should never happen */
4691 /* This must be called AFTER `convert_hard_loops' for a given REXP. */
4694 has_non_idempotent_epsilon_path (struct rx * rx,
4695 struct rexp_node * rexp,
4696 struct re_se_params * params)
4699 has_non_idempotent_epsilon_path (rx, rexp, params)
4701 struct rexp_node * rexp;
4702 struct re_se_params * params;
4717 !((long)rexp->params.side_effect > 0
4718 ? idempotent_complex_se [ params [(long)rexp->params.side_effect].se ]
4719 : idempotent_se [-(long)rexp->params.side_effect]);
4723 ( has_non_idempotent_epsilon_path (rx,
4724 rexp->params.pair.left, params)
4725 || has_non_idempotent_epsilon_path (rx,
4726 rexp->params.pair.right, params));
4731 ( has_non_idempotent_epsilon_path (rx,
4732 rexp->params.pair.left, params)
4733 && has_non_idempotent_epsilon_path (rx,
4734 rexp->params.pair.right, params));
4737 return has_non_idempotent_epsilon_path (rx,
4738 rexp->params.pair.left, params);
4741 /* this should never happen */
4747 /* This computes rougly what it's name suggests. It can (and does) go wrong
4748 * in the direction of returning spurious 0 without causing disasters.
4752 begins_with_complex_se (struct rx * rx, struct rexp_node * rexp)
4755 begins_with_complex_se (rx, rexp)
4757 struct rexp_node * rexp;
4770 return ((long)rexp->params.side_effect >= 0);
4774 ( begins_with_complex_se (rx, rexp->params.pair.left)
4775 && begins_with_complex_se (rx, rexp->params.pair.right));
4779 return has_any_se (rx, rexp->params.pair.left);
4786 /* this should never happen */
4791 /* This destructively removes some of the re_se_tv side effects from
4792 * a rexp tree. In particular, during parsing re_se_tv was inserted on the
4793 * right half of every | to guarantee that posix path preference could be
4794 * honored. This function removes some which it can be determined aren't
4800 speed_up_alt (struct rx * rx,
4801 struct rexp_node * rexp,
4805 speed_up_alt (rx, rexp, unposix)
4807 struct rexp_node * rexp;
4823 speed_up_alt (rx, rexp->params.pair.left, unposix);
4828 speed_up_alt (rx, rexp->params.pair.left, unposix);
4829 speed_up_alt (rx, rexp->params.pair.right, unposix);
4833 /* the right child is guaranteed to be (concat re_se_tv <subexp>) */
4835 speed_up_alt (rx, rexp->params.pair.left, unposix);
4836 speed_up_alt (rx, rexp->params.pair.right->params.pair.right, unposix);
4839 || (begins_with_complex_se
4840 (rx, rexp->params.pair.right->params.pair.right))
4841 || !( has_any_se (rx, rexp->params.pair.right->params.pair.right)
4842 || has_any_se (rx, rexp->params.pair.left)))
4844 struct rexp_node * conc = rexp->params.pair.right;
4845 rexp->params.pair.right = conc->params.pair.right;
4846 conc->params.pair.right = 0;
4847 rx_free_rexp (rx, conc);
4856 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
4857 Returns one of error codes defined in `regex.h', or zero for success.
4859 Assumes the `allocated' (and perhaps `buffer') and `translate'
4860 fields are set in BUFP on entry.
4862 If it succeeds, results are put in BUFP (if it returns an error, the
4863 contents of BUFP are undefined):
4864 `buffer' is the compiled pattern;
4865 `syntax' is set to SYNTAX;
4866 `used' is set to the length of the compiled pattern;
4867 `fastmap_accurate' is set to zero;
4868 `re_nsub' is set to the number of groups in PATTERN;
4869 `not_bol' and `not_eol' are set to zero.
4871 The `fastmap' and `newline_anchor' fields are neither
4872 examined nor set. */
4877 RX_DECL reg_errcode_t
4878 rx_compile (__const__ char *pattern, int size,
4879 reg_syntax_t syntax,
4880 struct re_pattern_buffer * rxb)
4882 RX_DECL reg_errcode_t
4883 rx_compile (pattern, size, syntax, rxb)
4884 __const__ char *pattern;
4886 reg_syntax_t syntax;
4887 struct re_pattern_buffer * rxb;
4891 inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4893 validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4895 /* We fetch characters from PATTERN here. Even though PATTERN is
4896 `char *' (i.e., signed), we declare these variables as unsigned, so
4897 they can be reliably used as array indices. */
4898 register unsigned char c, c1;
4900 /* A random tempory spot in PATTERN. */
4903 /* Keeps track of unclosed groups. */
4904 compile_stack_type compile_stack;
4906 /* Points to the current (ending) position in the pattern. */
4907 __const__ char *p = pattern;
4908 __const__ char *pend = pattern + size;
4910 /* How to translate the characters in the pattern. */
4911 unsigned char *translate = (rxb->translate
4913 : rx_id_translation);
4915 /* When parsing is done, this will hold the expression tree. */
4916 struct rexp_node * rexp = 0;
4918 /* In the midst of compilation, this holds onto the regexp
4919 * first parst while rexp goes on to aquire additional constructs.
4921 struct rexp_node * orig_rexp = 0;
4922 struct rexp_node * fewer_side_effects = 0;
4924 /* This and top_expression are saved on the compile stack. */
4925 struct rexp_node ** top_expression = &rexp;
4926 struct rexp_node ** last_expression = top_expression;
4928 /* Parameter to `goto append_node' */
4929 struct rexp_node * append;
4931 /* Counts open-groups as they are encountered. This is the index of the
4932 * innermost group being compiled.
4934 regnum_t regnum = 0;
4936 /* Place in the uncompiled pattern (i.e., the {) to
4937 * which to go back if the interval is invalid.
4939 __const__ char *beg_interval;
4941 struct re_se_params * params = 0;
4942 int paramc = 0; /* How many complex side effects so far? */
4944 rx_side_effect side; /* param to `goto add_side_effect' */
4946 bzero (validate_inv_tr, sizeof (validate_inv_tr));
4948 rxb->rx.instruction_table = rx_id_instruction_table;
4951 /* Initialize the compile stack. */
4952 compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
4953 if (compile_stack.stack == 0)
4956 compile_stack.size = INIT_COMPILE_STACK_SIZE;
4957 compile_stack.avail = 0;
4959 /* Initialize the pattern buffer. */
4960 rxb->rx.cache = &default_cache;
4961 rxb->syntax = syntax;
4962 rxb->fastmap_accurate = 0;
4963 rxb->not_bol = rxb->not_eol = 0;
4964 rxb->least_subs = 0;
4966 /* Always count groups, whether or not rxb->no_sub is set.
4967 * The whole pattern is implicitly group 0, so counting begins
4972 #if !defined (emacs) && !defined (SYNTAX_TABLE)
4973 /* Initialize the syntax table. */
4974 init_syntax_once ();
4977 /* Loop through the uncompiled pattern until we're at the end. */
4986 if ( /* If at start of pattern, it's an operator. */
4988 /* If context independent, it's an operator. */
4989 || syntax & RE_CONTEXT_INDEP_ANCHORS
4990 /* Otherwise, depends on what's come before. */
4991 || at_begline_loc_p (pattern, p, syntax))
4993 struct rexp_node * n
4994 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_hat);
5008 if ( /* If at end of pattern, it's an operator. */
5010 /* If context independent, it's an operator. */
5011 || syntax & RE_CONTEXT_INDEP_ANCHORS
5012 /* Otherwise, depends on what's next. */
5013 || at_endline_loc_p (p, pend, syntax))
5015 struct rexp_node * n
5016 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_dollar);
5030 if ((syntax & RE_BK_PLUS_QM)
5031 || (syntax & RE_LIMITED_OPS))
5036 /* If there is no previous pattern... */
5037 if (pointless_if_repeated (*last_expression, params))
5039 if (syntax & RE_CONTEXT_INVALID_OPS)
5041 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5046 /* 1 means zero (many) matches is allowed. */
5047 char zero_times_ok = 0, many_times_ok = 0;
5049 /* If there is a sequence of repetition chars, collapse it
5050 down to just one (the right one). We can't combine
5051 interval operators with these because of, e.g., `a{2}*',
5052 which should only match an even number of `a's. */
5056 zero_times_ok |= c != '+';
5057 many_times_ok |= c != '?';
5065 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
5068 else if (syntax & RE_BK_PLUS_QM && c == '\\')
5070 if (p == pend) return REG_EESCAPE;
5073 if (!(c1 == '+' || c1 == '?'))
5088 /* If we get here, we found another repeat character. */
5091 /* Star, etc. applied to an empty pattern is equivalent
5092 to an empty pattern. */
5093 if (!last_expression)
5096 /* Now we know whether or not zero matches is allowed
5097 * and also whether or not two or more matches is allowed.
5101 struct rexp_node * inner_exp = *last_expression;
5105 && has_non_idempotent_epsilon_path (&rxb->rx,
5108 struct rexp_node * pusher
5109 = rx_mk_r_side_effect (&rxb->rx,
5110 (rx_side_effect)re_se_pushpos);
5111 struct rexp_node * checker
5112 = rx_mk_r_side_effect (&rxb->rx,
5113 (rx_side_effect)re_se_chkpos);
5114 struct rexp_node * pushback
5115 = rx_mk_r_side_effect (&rxb->rx,
5116 (rx_side_effect)re_se_pushback);
5117 rx_Bitset cs = rx_cset (&rxb->rx);
5118 struct rexp_node * lit_t = rx_mk_r_cset (&rxb->rx, cs);
5119 struct rexp_node * fake_state
5120 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5121 struct rexp_node * phase2
5122 = rx_mk_r_concat (&rxb->rx, checker, fake_state);
5123 struct rexp_node * popper
5124 = rx_mk_r_side_effect (&rxb->rx,
5125 (rx_side_effect)re_se_poppos);
5126 struct rexp_node * star
5127 = rx_mk_r_2phase_star (&rxb->rx, inner_exp, phase2);
5128 struct rexp_node * a
5129 = rx_mk_r_concat (&rxb->rx, pusher, star);
5130 struct rexp_node * whole_thing
5131 = rx_mk_r_concat (&rxb->rx, a, popper);
5132 if (!(pusher && star && pushback && lit_t && fake_state
5133 && lit_t && phase2 && checker && popper
5134 && a && whole_thing))
5136 RX_bitset_enjoin (cs, 't');
5137 *last_expression = whole_thing;
5141 struct rexp_node * star =
5142 (many_times_ok ? rx_mk_r_star : rx_mk_r_opt)
5143 (&rxb->rx, *last_expression);
5146 *last_expression = star;
5147 need_sync = has_any_se (&rxb->rx, *last_expression);
5151 struct rexp_node * concat
5152 = rx_mk_r_concat (&rxb->rx, inner_exp,
5153 rx_copy_rexp (&rxb->rx,
5157 *last_expression = concat;
5161 int sync_se = paramc;
5163 ? ((struct re_se_params *)
5165 sizeof (*params) * (1 + paramc)))
5166 : ((struct re_se_params *)
5167 malloc (sizeof (*params))));
5171 params [sync_se].se = re_se_tv;
5172 side = (rx_side_effect)sync_se;
5173 goto add_side_effect;
5176 /* The old regex.c used to optimize `.*\n'.
5177 * Maybe rx should too?
5185 rx_Bitset cs = rx_cset (&rxb->rx);
5186 struct rexp_node * n = rx_mk_r_cset (&rxb->rx, cs);
5190 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5191 if (!(rxb->syntax & RE_DOT_NEWLINE))
5192 RX_bitset_remove (cs, '\n');
5193 if (!(rxb->syntax & RE_DOT_NOT_NULL))
5194 RX_bitset_remove (cs, 0);
5203 if (p == pend) return REG_EBRACK;
5205 boolean had_char_class = false;
5206 rx_Bitset cs = rx_cset (&rxb->rx);
5207 struct rexp_node * node = rx_mk_r_cset (&rxb->rx, cs);
5208 int is_inverted = *p == '^';
5213 /* This branch of the switch is normally exited with
5221 /* Remember the first position in the bracket expression. */
5224 /* Read in characters and ranges, setting map bits. */
5227 if (p == pend) return REG_EBRACK;
5231 /* \ might escape characters inside [...] and [^...]. */
5232 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
5234 if (p == pend) return REG_EESCAPE;
5238 rx_Bitset it = inverse_translation (rxb,
5243 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5248 /* Could be the end of the bracket expression. If it's
5249 not (i.e., when the bracket expression is `[]' so
5250 far), the ']' character bit gets set way below. */
5251 if (c == ']' && p != p1 + 1)
5252 goto finalize_class_and_append;
5254 /* Look ahead to see if it's a range when the last thing
5255 was a character class. */
5256 if (had_char_class && c == '-' && *p != ']')
5259 /* Look ahead to see if it's a range when the last thing
5260 was a character: if this is a hyphen not at the
5261 beginning or the end of a list, then it's the range
5264 && !(p - 2 >= pattern && p[-2] == '[')
5265 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
5269 = compile_range (rxb, cs, &p, pend, translate, syntax,
5270 inverse_translate, validate_inv_tr);
5271 if (ret != REG_NOERROR) return ret;
5274 else if (p[0] == '-' && p[1] != ']')
5275 { /* This handles ranges made up of characters only. */
5278 /* Move past the `-'. */
5281 ret = compile_range (rxb, cs, &p, pend, translate, syntax,
5282 inverse_translate, validate_inv_tr);
5283 if (ret != REG_NOERROR) return ret;
5286 /* See if we're at the beginning of a possible character
5289 else if ((syntax & RE_CHAR_CLASSES)
5290 && (c == '[') && (*p == ':'))
5292 char str[CHAR_CLASS_MAX_LENGTH + 1];
5297 /* If pattern is `[[:'. */
5298 if (p == pend) return REG_EBRACK;
5303 if (c == ':' || c == ']' || p == pend
5304 || c1 == CHAR_CLASS_MAX_LENGTH)
5310 /* If isn't a word bracketed by `[:' and:`]':
5311 undo the ending character, the letters, and leave
5312 the leading `:' and `[' (but set bits for them). */
5313 if (c == ':' && *p == ']')
5316 boolean is_alnum = !strcmp (str, "alnum");
5317 boolean is_alpha = !strcmp (str, "alpha");
5318 boolean is_blank = !strcmp (str, "blank");
5319 boolean is_cntrl = !strcmp (str, "cntrl");
5320 boolean is_digit = !strcmp (str, "digit");
5321 boolean is_graph = !strcmp (str, "graph");
5322 boolean is_lower = !strcmp (str, "lower");
5323 boolean is_print = !strcmp (str, "print");
5324 boolean is_punct = !strcmp (str, "punct");
5325 boolean is_space = !strcmp (str, "space");
5326 boolean is_upper = !strcmp (str, "upper");
5327 boolean is_xdigit = !strcmp (str, "xdigit");
5329 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
5331 /* Throw away the ] at the end of the character
5335 if (p == pend) return REG_EBRACK;
5337 for (ch = 0; ch < 1 << CHARBITS; ch++)
5339 if ( (is_alnum && isalnum (ch))
5340 || (is_alpha && isalpha (ch))
5341 || (is_blank && isblank (ch))
5342 || (is_cntrl && iscntrl (ch))
5343 || (is_digit && isdigit (ch))
5344 || (is_graph && isgraph (ch))
5345 || (is_lower && islower (ch))
5346 || (is_print && isprint (ch))
5347 || (is_punct && ispunct (ch))
5348 || (is_space && isspace (ch))
5349 || (is_upper && isupper (ch))
5350 || (is_xdigit && isxdigit (ch)))
5353 inverse_translation (rxb,
5358 rx_bitset_union (rxb->rx.local_cset_size,
5362 had_char_class = true;
5371 inverse_translation (rxb,
5376 rx_bitset_union (rxb->rx.local_cset_size,
5381 inverse_translation (rxb,
5386 rx_bitset_union (rxb->rx.local_cset_size,
5389 had_char_class = false;
5394 had_char_class = false;
5396 rx_Bitset it = inverse_translation (rxb,
5401 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5406 finalize_class_and_append:
5409 rx_bitset_complement (rxb->rx.local_cset_size, cs);
5410 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5411 RX_bitset_remove (cs, '\n');
5419 if (syntax & RE_NO_BK_PARENS)
5426 if (syntax & RE_NO_BK_PARENS)
5433 if (syntax & RE_NEWLINE_ALT)
5440 if (syntax & RE_NO_BK_VBAR)
5447 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5448 goto handle_interval;
5454 if (p == pend) return REG_EESCAPE;
5456 /* Do not translate the character after the \, so that we can
5457 distinguish, e.g., \B from \b, even if we normally would
5458 translate, e.g., B to b. */
5464 if (syntax & RE_NO_BK_PARENS)
5465 goto normal_backslash;
5470 if (COMPILE_STACK_FULL)
5472 ((compile_stack.stack) =
5473 (compile_stack_elt_t *) realloc (compile_stack.stack, ( compile_stack.size << 1) * sizeof (
5474 compile_stack_elt_t)));
5475 if (compile_stack.stack == 0) return REG_ESPACE;
5477 compile_stack.size <<= 1;
5480 if (*last_expression)
5482 struct rexp_node * concat
5483 = rx_mk_r_concat (&rxb->rx, *last_expression, 0);
5486 *last_expression = concat;
5487 last_expression = &concat->params.pair.right;
5491 * These are the values to restore when we hit end of this
5494 COMPILE_STACK_TOP.top_expression = top_expression;
5495 COMPILE_STACK_TOP.last_expression = last_expression;
5496 COMPILE_STACK_TOP.regnum = regnum;
5498 compile_stack.avail++;
5500 top_expression = last_expression;
5505 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
5508 /* See similar code for backslashed left paren above. */
5509 if (COMPILE_STACK_EMPTY)
5510 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
5515 /* Since we just checked for an empty stack above, this
5516 ``can't happen''. */
5519 /* We don't just want to restore into `regnum', because
5520 later groups should continue to be numbered higher,
5521 as in `(ab)c(de)' -- the second group is #2. */
5522 regnum_t this_group_regnum;
5523 struct rexp_node ** inner = top_expression;
5525 compile_stack.avail--;
5526 top_expression = COMPILE_STACK_TOP.top_expression;
5527 last_expression = COMPILE_STACK_TOP.last_expression;
5528 this_group_regnum = COMPILE_STACK_TOP.regnum;
5530 int left_se = paramc;
5531 int right_se = paramc + 1;
5534 ? ((struct re_se_params *)
5536 (paramc + 2) * sizeof (params[0])))
5537 : ((struct re_se_params *)
5538 malloc (2 * sizeof (params[0]))));
5543 params[left_se].se = re_se_lparen;
5544 params[left_se].op1 = this_group_regnum;
5545 params[right_se].se = re_se_rparen;
5546 params[right_se].op1 = this_group_regnum;
5548 struct rexp_node * left
5549 = rx_mk_r_side_effect (&rxb->rx,
5550 (rx_side_effect)left_se);
5551 struct rexp_node * right
5552 = rx_mk_r_side_effect (&rxb->rx,
5553 (rx_side_effect)right_se);
5554 struct rexp_node * c1
5556 ? rx_mk_r_concat (&rxb->rx, left, *inner) : left);
5557 struct rexp_node * c2
5558 = rx_mk_r_concat (&rxb->rx, c1, right);
5559 if (!(left && right && c1 && c2))
5567 case '|': /* `\|'. */
5568 if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
5569 goto normal_backslash;
5571 if (syntax & RE_LIMITED_OPS)
5575 struct rexp_node * alt
5576 = rx_mk_r_alternate (&rxb->rx, *top_expression, 0);
5579 *top_expression = alt;
5580 last_expression = &alt->params.pair.right;
5582 int sync_se = paramc;
5585 ? ((struct re_se_params *)
5587 (paramc + 1) * sizeof (params[0])))
5588 : ((struct re_se_params *)
5589 malloc (sizeof (params[0]))));
5594 params[sync_se].se = re_se_tv;
5596 struct rexp_node * sync
5597 = rx_mk_r_side_effect (&rxb->rx,
5598 (rx_side_effect)sync_se);
5599 struct rexp_node * conc
5600 = rx_mk_r_concat (&rxb->rx, sync, 0);
5605 *last_expression = conc;
5606 last_expression = &conc->params.pair.right;
5614 /* If \{ is a literal. */
5615 if (!(syntax & RE_INTERVALS)
5616 /* If we're at `\{' and it's not the open-interval
5618 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5619 || (p - 2 == pattern && p == pend))
5620 goto normal_backslash;
5624 /* If got here, then the syntax allows intervals. */
5626 /* At least (most) this many matches must be made. */
5627 int lower_bound = -1, upper_bound = -1;
5629 beg_interval = p - 1;
5633 if (syntax & RE_NO_BK_BRACES)
5634 goto unfetch_interval;
5639 GET_UNSIGNED_NUMBER (lower_bound);
5643 GET_UNSIGNED_NUMBER (upper_bound);
5644 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
5647 /* Interval such as `{1}' => match exactly once. */
5648 upper_bound = lower_bound;
5650 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
5651 || lower_bound > upper_bound)
5653 if (syntax & RE_NO_BK_BRACES)
5654 goto unfetch_interval;
5659 if (!(syntax & RE_NO_BK_BRACES))
5661 if (c != '\\') return REG_EBRACE;
5667 if (syntax & RE_NO_BK_BRACES)
5668 goto unfetch_interval;
5673 /* We just parsed a valid interval. */
5675 /* If it's invalid to have no preceding re. */
5676 if (pointless_if_repeated (*last_expression, params))
5678 if (syntax & RE_CONTEXT_INVALID_OPS)
5680 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5681 goto unfetch_interval;
5682 /* was: else laststart = b; */
5685 /* If the upper bound is zero, don't want to iterate
5688 if (upper_bound == 0)
5690 if (*last_expression)
5692 rx_free_rexp (&rxb->rx, *last_expression);
5693 *last_expression = 0;
5697 /* Otherwise, we have a nontrivial interval. */
5699 int iter_se = paramc;
5700 int end_se = paramc + 1;
5702 ? ((struct re_se_params *)
5704 sizeof (*params) * (2 + paramc)))
5705 : ((struct re_se_params *)
5706 malloc (2 * sizeof (*params))));
5710 params [iter_se].se = re_se_iter;
5711 params [iter_se].op1 = lower_bound;
5712 params[iter_se].op2 = upper_bound;
5714 params[end_se].se = re_se_end_iter;
5715 params[end_se].op1 = lower_bound;
5716 params[end_se].op2 = upper_bound;
5718 struct rexp_node * push0
5719 = rx_mk_r_side_effect (&rxb->rx,
5720 (rx_side_effect)re_se_push0);
5721 struct rexp_node * start_one_iter
5722 = rx_mk_r_side_effect (&rxb->rx,
5723 (rx_side_effect)iter_se);
5724 struct rexp_node * phase1
5725 = rx_mk_r_concat (&rxb->rx, start_one_iter,
5727 struct rexp_node * pushback
5728 = rx_mk_r_side_effect (&rxb->rx,
5729 (rx_side_effect)re_se_pushback);
5730 rx_Bitset cs = rx_cset (&rxb->rx);
5731 struct rexp_node * lit_t
5732 = rx_mk_r_cset (&rxb->rx, cs);
5733 struct rexp_node * phase2
5734 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5735 struct rexp_node * loop
5736 = rx_mk_r_2phase_star (&rxb->rx, phase1, phase2);
5737 struct rexp_node * push_n_loop
5738 = rx_mk_r_concat (&rxb->rx, push0, loop);
5739 struct rexp_node * final_test
5740 = rx_mk_r_side_effect (&rxb->rx,
5741 (rx_side_effect)end_se);
5742 struct rexp_node * full_exp
5743 = rx_mk_r_concat (&rxb->rx, push_n_loop, final_test);
5745 if (!(push0 && start_one_iter && phase1
5746 && pushback && lit_t && phase2
5747 && loop && push_n_loop && final_test && full_exp))
5750 RX_bitset_enjoin(cs, 't');
5752 *last_expression = full_exp;
5760 /* If an invalid interval, match the characters as literals. */
5764 /* normal_char and normal_backslash need `c'. */
5767 if (!(syntax & RE_NO_BK_BRACES))
5769 if (p > pattern && p[-1] == '\\')
5770 goto normal_backslash;
5775 /* There is no way to specify the before_dot and after_dot
5776 operators. rms says this is ok. --karl */
5778 side = (rx_side_effect)rx_se_at_dot;
5779 goto add_side_effect;
5785 rx_Bitset cs = rx_cset (&rxb->rx);
5786 struct rexp_node * set = rx_mk_r_cset (&rxb->rx, cs);
5790 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5795 enum syntaxcode code = syntax_spec_code [c];
5796 for (x = 0; x < 256; ++x)
5799 if (SYNTAX (x) == code)
5802 inverse_translation (rxb, validate_inv_tr,
5805 rx_bitset_xor (rxb->rx.local_cset_size, cs, it);
5819 rx_Bitset cs = rx_cset (&rxb->rx);
5820 struct rexp_node * n = (cs ? rx_mk_r_cset (&rxb->rx, cs) : 0);
5824 rx_bitset_universe (rxb->rx.local_cset_size ,cs);
5827 for (x = rxb->rx.local_cset_size - 1; x > 0; --x)
5828 if (SYNTAX(x) & Sword)
5829 RX_bitset_toggle (cs, x);
5836 /* With a little extra work, some of these side effects could be optimized
5837 * away (basicly by looking at what we already know about the surrounding
5841 side = (rx_side_effect)re_se_wordbeg;
5842 goto add_side_effect;
5846 side = (rx_side_effect)re_se_wordend;
5847 goto add_side_effect;
5851 side = (rx_side_effect)re_se_wordbound;
5852 goto add_side_effect;
5856 side = (rx_side_effect)re_se_notwordbound;
5857 goto add_side_effect;
5861 side = (rx_side_effect)re_se_begbuf;
5862 goto add_side_effect;
5866 side = (rx_side_effect)re_se_endbuf;
5867 goto add_side_effect;
5872 struct rexp_node * se
5873 = rx_mk_r_side_effect (&rxb->rx, side);
5881 case '1': case '2': case '3': case '4': case '5':
5882 case '6': case '7': case '8': case '9':
5883 if (syntax & RE_NO_BK_REFS)
5891 /* Can't back reference to a subexpression if inside of it. */
5892 if (group_in_compile_stack (compile_stack, c1))
5896 int backref_se = paramc;
5898 ? ((struct re_se_params *)
5900 sizeof (*params) * (1 + paramc)))
5901 : ((struct re_se_params *)
5902 malloc (sizeof (*params))));
5906 params[backref_se].se = re_se_backref;
5907 params[backref_se].op1 = c1;
5908 side = (rx_side_effect)backref_se;
5909 goto add_side_effect;
5915 if (syntax & RE_BK_PLUS_QM)
5918 goto normal_backslash;
5922 /* You might think it would be useful for \ to mean
5923 not to translate; but if we don't translate it
5924 it will never match anything. */
5932 /* Expects the character in `c'. */
5935 rx_Bitset cs = rx_cset(&rxb->rx);
5936 struct rexp_node * match = rx_mk_r_cset (&rxb->rx, cs);
5940 it = inverse_translation (rxb, validate_inv_tr,
5941 inverse_translate, translate, c);
5942 rx_bitset_union (CHAR_SET_SIZE, cs, it);
5946 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
5947 * and then parses the next character normally.
5949 if (*last_expression)
5951 struct rexp_node * concat
5952 = rx_mk_r_concat (&rxb->rx, *last_expression, append);
5955 *last_expression = concat;
5956 last_expression = &concat->params.pair.right;
5959 *last_expression = append;
5962 } /* while p != pend */
5966 int win_se = paramc;
5968 ? ((struct re_se_params *)
5970 sizeof (*params) * (1 + paramc)))
5971 : ((struct re_se_params *)
5972 malloc (sizeof (*params))));
5976 params[win_se].se = re_se_win;
5978 struct rexp_node * se
5979 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)win_se);
5980 struct rexp_node * concat
5981 = rx_mk_r_concat (&rxb->rx, rexp, se);
5982 if (!(se && concat))
5989 /* Through the pattern now. */
5991 if (!COMPILE_STACK_EMPTY)
5994 free (compile_stack.stack);
5998 if (rx_debug_compile)
6001 fputs ("\n\nCompiling ", stdout);
6002 fwrite (pattern, 1, size, stdout);
6003 fputs (":\n", stdout);
6004 rxb->se_params = params;
6005 print_rexp (&rxb->rx, orig_rexp, 2, re_seprint, stdout);
6009 rx_Bitset cs = rx_cset(&rxb->rx);
6010 rx_Bitset cs2 = rx_cset(&rxb->rx);
6011 char * se_map = (char *) alloca (paramc);
6012 struct rexp_node * new_rexp = 0;
6015 bzero (se_map, paramc);
6016 find_backrefs (se_map, rexp, params);
6017 fewer_side_effects =
6018 remove_unecessary_side_effects (&rxb->rx, se_map,
6019 rx_copy_rexp (&rxb->rx, rexp), params);
6021 speed_up_alt (&rxb->rx, rexp, 0);
6022 speed_up_alt (&rxb->rx, fewer_side_effects, 1);
6025 char * syntax_parens = rxb->syntax_parens;
6026 if (syntax_parens == (char *)0x1)
6027 rexp = remove_unecessary_side_effects
6028 (&rxb->rx, se_map, rexp, params);
6029 else if (syntax_parens)
6032 for (x = 0; x < paramc; ++x)
6033 if (( (params[x].se == re_se_lparen)
6034 || (params[x].se == re_se_rparen))
6035 && (!syntax_parens [params[x].op1]))
6037 rexp = remove_unecessary_side_effects
6038 (&rxb->rx, se_map, rexp, params);
6042 /* At least one more optimization would be nice to have here but i ran out
6043 * of time. The idea would be to delay side effects.
6044 * For examle, `(abc)' is the same thing as `abc()' except that the
6045 * left paren is offset by 3 (which we know at compile time).
6046 * (In this comment, write that second pattern `abc(:3:)'
6047 * where `(:3:' is a syntactic unit.)
6049 * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))'
6050 * (The paren nesting may be hard to follow -- that's an alternation
6051 * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens
6052 * followed by the closing paren from the original expression.)
6054 * Neither the expression tree representation nor the the nfa make
6055 * this very easy to write. :(
6058 /* What we compile is different than what the parser returns.
6059 * Suppose the parser returns expression R.
6060 * Let R' be R with unnecessary register assignments removed
6061 * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above).
6063 * What we will compile is the expression:
6065 * m{try}R{win}\|s{try}R'{win}
6067 * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE).
6069 * When trying a match, we insert an `m' at the beginning of the
6070 * string if the user wants registers to be filled, `s' if not.
6075 rx_mk_r_concat (&rxb->rx, rx_mk_r_cset (&rxb->rx, cs2), rexp),
6076 rx_mk_r_concat (&rxb->rx,
6077 rx_mk_r_cset (&rxb->rx, cs), fewer_side_effects));
6079 if (!(new_rexp && cs && cs2))
6081 RX_bitset_enjoin (cs2, '\0'); /* prefixed to the rexp used for matching. */
6082 RX_bitset_enjoin (cs, '\1'); /* prefixed to the rexp used for searching. */
6087 if (rx_debug_compile)
6089 fputs ("\n...which is compiled as:\n", stdout);
6090 print_rexp (&rxb->rx, rexp, 2, re_seprint, stdout);
6094 struct rx_nfa_state *start = 0;
6095 struct rx_nfa_state *end = 0;
6097 if (!rx_build_nfa (&rxb->rx, rexp, &start, &end))
6098 return REG_ESPACE; /* */
6101 void * mem = (void *)rxb->buffer;
6102 unsigned long size = rxb->allocated;
6105 int iterator_size = paramc * sizeof (params[0]);
6108 start->is_start = 1;
6109 rx_name_nfa_states (&rxb->rx);
6110 start_id = start->id;
6112 if (rx_debug_compile)
6114 fputs ("...giving the NFA: \n", stdout);
6116 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6119 if (!rx_eclose_nfa (&rxb->rx))
6123 rx_delete_epsilon_transitions (&rxb->rx);
6125 /* For compatability reasons, we need to shove the
6126 * compiled nfa into one chunk of malloced memory.
6128 rxb->rx.reserved = ( sizeof (params[0]) * paramc
6129 + rx_sizeof_bitset (rxb->rx.local_cset_size));
6131 if (rx_debug_compile)
6134 fputs ("...which cooks down (uncompactified) to: \n", stdout);
6135 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6138 if (!rx_compactify_nfa (&rxb->rx, &mem, &size))
6141 rxb->allocated = size;
6142 rxb->rx.buffer = mem;
6143 rxb->rx.allocated = size;
6144 perm_mem = ((char *)rxb->rx.buffer
6145 + rxb->rx.allocated - rxb->rx.reserved);
6146 rxb->se_params = ((struct re_se_params *)perm_mem);
6147 bcopy (params, rxb->se_params, iterator_size);
6148 perm_mem += iterator_size;
6149 rxb->fastset = (rx_Bitset) perm_mem;
6150 rxb->start = rx_id_to_nfa_state (&rxb->rx, start_id);
6152 rx_bitset_null (rxb->rx.local_cset_size, rxb->fastset);
6153 rxb->can_match_empty = compute_fastset (rxb, orig_rexp);
6154 rxb->match_regs_on_stack =
6155 registers_on_stack (rxb, orig_rexp, 0, params);
6156 rxb->search_regs_on_stack =
6157 registers_on_stack (rxb, fewer_side_effects, 0, params);
6158 if (rxb->can_match_empty)
6159 rx_bitset_universe (rxb->rx.local_cset_size, rxb->fastset);
6160 rxb->is_anchored = is_anchored (orig_rexp, (rx_side_effect) re_se_hat);
6161 rxb->begbuf_only = is_anchored (orig_rexp,
6162 (rx_side_effect) re_se_begbuf);
6164 rx_free_rexp (&rxb->rx, rexp);
6168 if (rx_debug_compile)
6171 fputs ("...which cooks down to: \n", stdout);
6172 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6181 /* This table gives an error message for each of the error codes listed
6182 in regex.h. Obviously the order here has to be same as there. */
6184 __const__ char * rx_error_msg[] =
6185 { 0, /* REG_NOERROR */
6186 "No match", /* REG_NOMATCH */
6187 "Invalid regular expression", /* REG_BADPAT */
6188 "Invalid collation character", /* REG_ECOLLATE */
6189 "Invalid character class name", /* REG_ECTYPE */
6190 "Trailing backslash", /* REG_EESCAPE */
6191 "Invalid back reference", /* REG_ESUBREG */
6192 "Unmatched [ or [^", /* REG_EBRACK */
6193 "Unmatched ( or \\(", /* REG_EPAREN */
6194 "Unmatched \\{", /* REG_EBRACE */
6195 "Invalid content of \\{\\}", /* REG_BADBR */
6196 "Invalid range end", /* REG_ERANGE */
6197 "Memory exhausted", /* REG_ESPACE */
6198 "Invalid preceding regular expression", /* REG_BADRPT */
6199 "Premature end of regular expression", /* REG_EEND */
6200 "Regular expression too big", /* REG_ESIZE */
6201 "Unmatched ) or \\)", /* REG_ERPAREN */
6207 char rx_slowmap [256] =
6209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6219 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6220 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6222 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6223 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6224 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6229 rx_blow_up_fastmap (struct re_pattern_buffer * rxb)
6232 rx_blow_up_fastmap (rxb)
6233 struct re_pattern_buffer * rxb;
6237 for (x = 0; x < 256; ++x) /* &&&& 3.6 % */
6238 rxb->fastmap [x] = !!RX_bitset_member (rxb->fastset, x);
6239 rxb->fastmap_accurate = 1;
6245 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6246 #define RE_SEARCH_2_FN inner_re_search_2
6247 #define RE_S2_QUAL static
6249 #define RE_SEARCH_2_FN re_search_2
6253 struct re_search_2_closure
6255 __const__ unsigned char * string1;
6257 __const__ unsigned char * string2;
6262 static __inline__ enum rx_get_burst_return
6263 re_search_2_get_burst (pos, vclosure, stop)
6264 struct rx_string_position * pos;
6268 struct re_search_2_closure * closure;
6269 closure = (struct re_search_2_closure *)vclosure;
6270 if (!closure->string2)
6274 inset = pos->pos - pos->string;
6275 if ((inset < -1) || (inset > closure->size1))
6276 return rx_get_burst_no_more;
6279 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6280 pos->string = (__const__ unsigned char *) closure->string1;
6281 pos->size = closure->size1;
6282 pos->end = ((__const__ unsigned char *)
6283 MIN(closure->string1 + closure->size1,
6284 closure->string1 + stop));
6286 return ((pos->pos < pos->end)
6288 : rx_get_burst_no_more);
6291 else if (!closure->string1)
6295 inset = pos->pos - pos->string;
6296 pos->pos = (__const__ unsigned char *) closure->string2 + inset;
6297 pos->string = (__const__ unsigned char *) closure->string2;
6298 pos->size = closure->size2;
6299 pos->end = ((__const__ unsigned char *)
6300 MIN(closure->string2 + closure->size2,
6301 closure->string2 + stop));
6303 return ((pos->pos < pos->end)
6305 : rx_get_burst_no_more);
6311 inset = pos->pos - pos->string + pos->offset;
6312 if (inset < closure->size1)
6314 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6315 pos->string = (__const__ unsigned char *) closure->string1;
6316 pos->size = closure->size1;
6317 pos->end = ((__const__ unsigned char *)
6318 MIN(closure->string1 + closure->size1,
6319 closure->string1 + stop));
6321 return rx_get_burst_ok;
6325 pos->pos = ((__const__ unsigned char *)
6326 closure->string2 + inset - closure->size1);
6327 pos->string = (__const__ unsigned char *) closure->string2;
6328 pos->size = closure->size2;
6329 pos->end = ((__const__ unsigned char *)
6330 MIN(closure->string2 + closure->size2,
6331 closure->string2 + stop - closure->size1));
6332 pos->offset = closure->size1;
6333 return ((pos->pos < pos->end)
6335 : rx_get_burst_no_more);
6341 static __inline__ enum rx_back_check_return
6342 re_search_2_back_check (pos, lparen, rparen, translate, vclosure, stop)
6343 struct rx_string_position * pos;
6346 unsigned char * translate;
6350 struct rx_string_position there;
6351 struct rx_string_position past;
6354 there.pos = there.string + lparen - there.offset;
6355 re_search_2_get_burst (&there, vclosure, stop);
6358 past.pos = past.string + rparen - there.offset;
6359 re_search_2_get_burst (&past, vclosure, stop);
6362 re_search_2_get_burst (pos, vclosure, stop);
6364 while ( (there.pos != past.pos)
6365 && (pos->pos != pos->end))
6366 if (TRANSLATE(*there.pos) != TRANSLATE(*pos->pos))
6367 return rx_back_check_fail;
6372 if (there.pos == there.end)
6373 re_search_2_get_burst (&there, vclosure, stop);
6374 if (pos->pos == pos->end)
6375 re_search_2_get_burst (pos, vclosure, stop);
6378 if (there.pos != past.pos)
6379 return rx_back_check_fail;
6381 re_search_2_get_burst (pos, vclosure, stop);
6382 return rx_back_check_pass;
6385 static __inline__ int
6386 re_search_2_fetch_char (pos, offset, app_closure, stop)
6387 struct rx_string_position * pos;
6392 struct re_search_2_closure * closure;
6393 closure = (struct re_search_2_closure *)app_closure;
6396 if (pos->pos >= pos->string)
6400 if ( (pos->string == closure->string2)
6401 && (closure->string1)
6402 && (closure->size1))
6403 return closure->string1[closure->size1 - 1];
6405 return 0; /* sure, why not. */
6408 if (pos->pos == pos->end)
6409 return *closure->string2;
6417 RE_SEARCH_2_FN (struct re_pattern_buffer *rxb,
6418 __const__ char * string1, int size1,
6419 __const__ char * string2, int size2,
6420 int startpos, int range,
6421 struct re_registers *regs,
6425 RE_SEARCH_2_FN (rxb,
6426 string1, size1, string2, size2, startpos, range, regs, stop)
6427 struct re_pattern_buffer *rxb;
6428 __const__ char * string1;
6430 __const__ char * string2;
6434 struct re_registers *regs;
6439 struct re_search_2_closure closure;
6440 closure.string1 = string1;
6441 closure.size1 = size1;
6442 closure.string2 = string2;
6443 closure.size2 = size2;
6444 answer = rx_search (rxb, startpos, range, stop, size1 + size2,
6445 re_search_2_get_burst,
6446 re_search_2_back_check,
6447 re_search_2_fetch_char,
6454 case rx_search_continuation:
6456 case rx_search_error:
6458 case rx_search_soft_fail:
6459 case rx_search_fail:
6466 /* Export rx_search to callers outside this file. */
6469 re_rx_search (rxb, startpos, range, stop, total_size,
6470 get_burst, back_check, fetch_char,
6471 app_closure, regs, resume_state, save_state)
6472 struct re_pattern_buffer * rxb;
6477 rx_get_burst_fn get_burst;
6478 rx_back_check_fn back_check;
6479 rx_fetch_char_fn fetch_char;
6481 struct re_registers * regs;
6482 struct rx_search_state * resume_state;
6483 struct rx_search_state * save_state;
6485 return rx_search (rxb, startpos, range, stop, total_size,
6486 get_burst, back_check, fetch_char, app_closure,
6487 regs, resume_state, save_state);
6490 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6493 re_search_2 (struct re_pattern_buffer *rxb,
6494 __const__ char * string1, int size1,
6495 __const__ char * string2, int size2,
6496 int startpos, int range,
6497 struct re_registers *regs,
6501 re_search_2 (rxb, string1, size1, string2, size2, startpos, range, regs, stop)
6502 struct re_pattern_buffer *rxb;
6503 __const__ char * string1;
6505 __const__ char * string2;
6509 struct re_registers *regs;
6514 ret = inner_re_search_2 (rxb, string1, size1, string2, size2, startpos,
6522 /* Like re_search_2, above, but only one string is specified, and
6523 * doesn't let you say where to stop matching.
6528 re_search (struct re_pattern_buffer * rxb, __const__ char *string,
6529 int size, int startpos, int range,
6530 struct re_registers *regs)
6533 re_search (rxb, string, size, startpos, range, regs)
6534 struct re_pattern_buffer * rxb;
6535 __const__ char * string;
6539 struct re_registers *regs;
6542 return re_search_2 (rxb, 0, 0, string, size, startpos, range, regs, size);
6547 re_match_2 (struct re_pattern_buffer * rxb,
6548 __const__ char * string1, int size1,
6549 __const__ char * string2, int size2,
6550 int pos, struct re_registers *regs, int stop)
6553 re_match_2 (rxb, string1, size1, string2, size2, pos, regs, stop)
6554 struct re_pattern_buffer * rxb;
6555 __const__ char * string1;
6557 __const__ char * string2;
6560 struct re_registers *regs;
6564 struct re_registers some_regs;
6568 int save = rxb->regs_allocated;
6569 struct re_registers * regs_to_pass = regs;
6573 some_regs.start = &start;
6574 some_regs.end = &end;
6575 some_regs.num_regs = 1;
6576 regs_to_pass = &some_regs;
6577 rxb->regs_allocated = REGS_FIXED;
6580 srch = re_search_2 (rxb, string1, size1, string2, size2,
6581 pos, 1, regs_to_pass, stop);
6582 if (regs_to_pass != regs)
6583 rxb->regs_allocated = save;
6586 return regs_to_pass->end[0] - regs_to_pass->start[0];
6589 /* re_match is like re_match_2 except it takes only a single string. */
6593 re_match (struct re_pattern_buffer * rxb,
6594 __const__ char * string,
6596 struct re_registers *regs)
6599 re_match (rxb, string, size, pos, regs)
6600 struct re_pattern_buffer * rxb;
6601 __const__ char *string;
6604 struct re_registers *regs;
6607 return re_match_2 (rxb, string, size, 0, 0, pos, regs, size);
6612 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
6613 also be assigned to arbitrarily: each pattern buffer stores its own
6614 syntax, so it can be changed between regex compilations. */
6615 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
6618 /* Specify the precise syntax of regexps for compilation. This provides
6619 for compatibility for various utilities which historically have
6620 different, incompatible syntaxes.
6622 The argument SYNTAX is a bit mask comprised of the various bits
6623 defined in regex.h. We return the old syntax. */
6627 re_set_syntax (reg_syntax_t syntax)
6630 re_set_syntax (syntax)
6631 reg_syntax_t syntax;
6634 reg_syntax_t ret = re_syntax_options;
6636 re_syntax_options = syntax;
6641 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6642 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6643 this memory for recording register information. STARTS and ENDS
6644 must be allocated using the malloc library routine, and must each
6645 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6647 If NUM_REGS == 0, then subsequent matches should allocate their own
6650 Unless this function is called, the first search or match using
6651 PATTERN_BUFFER will allocate its own register data, without
6652 freeing the old data. */
6656 re_set_registers (struct re_pattern_buffer *bufp,
6657 struct re_registers *regs,
6659 regoff_t * starts, regoff_t * ends)
6662 re_set_registers (bufp, regs, num_regs, starts, ends)
6663 struct re_pattern_buffer *bufp;
6664 struct re_registers *regs;
6672 bufp->regs_allocated = REGS_REALLOCATE;
6673 regs->num_regs = num_regs;
6674 regs->start = starts;
6679 bufp->regs_allocated = REGS_UNALLOCATED;
6681 regs->start = regs->end = (regoff_t) 0;
6690 cplx_se_sublist_len (struct rx_se_list * list)
6693 cplx_se_sublist_len (list)
6694 struct rx_se_list * list;
6700 if ((long)list->car >= 0)
6708 /* For rx->se_list_cmp */
6712 posix_se_list_order (struct rx * rx,
6713 struct rx_se_list * a, struct rx_se_list * b)
6716 posix_se_list_order (rx, a, b)
6718 struct rx_se_list * a;
6719 struct rx_se_list * b;
6722 int al = cplx_se_sublist_len (a);
6723 int bl = cplx_se_sublist_len (b);
6728 : ((a < b) ? -1 : 1));
6738 rx_side_effect * av = ((rx_side_effect *)
6739 alloca (sizeof (rx_side_effect) * (al + 1)));
6740 rx_side_effect * bv = ((rx_side_effect *)
6741 alloca (sizeof (rx_side_effect) * (bl + 1)));
6742 struct rx_se_list * ap = a;
6743 struct rx_se_list * bp = b;
6746 for (ai = al - 1; ai >= 0; --ai)
6748 while ((long)ap->car < 0)
6753 av[al] = (rx_side_effect)-2;
6754 for (bi = bl - 1; bi >= 0; --bi)
6756 while ((long)bp->car < 0)
6761 bv[bl] = (rx_side_effect)-1;
6766 while (av[x] == bv[x])
6768 ret = (((unsigned *)(av[x]) < (unsigned *)(bv[x])) ? -1 : 1);
6777 /* re_compile_pattern is the GNU regular expression compiler: it
6778 compiles PATTERN (of length SIZE) and puts the result in RXB.
6779 Returns 0 if the pattern was valid, otherwise an error string.
6781 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6782 are set in RXB on entry.
6784 We call rx_compile to do the actual compilation. */
6788 re_compile_pattern (__const__ char *pattern,
6790 struct re_pattern_buffer * rxb)
6793 re_compile_pattern (pattern, length, rxb)
6794 __const__ char *pattern;
6796 struct re_pattern_buffer * rxb;
6801 /* GNU code is written to assume at least RE_NREGS registers will be set
6802 (and at least one extra will be -1). */
6803 rxb->regs_allocated = REGS_UNALLOCATED;
6805 /* And GNU code determines whether or not to get register information
6806 by passing null for the REGS argument to re_match, etc., not by
6810 rxb->rx.local_cset_size = 256;
6812 /* Match anchors at newline. */
6813 rxb->newline_anchor = 1;
6819 rxb->rx.epsnodec = 0;
6820 rxb->rx.instruction_table = 0;
6821 rxb->rx.nfa_states = 0;
6822 rxb->rx.se_list_cmp = posix_se_list_order;
6823 rxb->rx.start_set = 0;
6825 ret = rx_compile (pattern, length, re_syntax_options, rxb);
6827 return rx_error_msg[(int) ret];
6834 re_compile_fastmap (struct re_pattern_buffer * rxb)
6837 re_compile_fastmap (rxb)
6838 struct re_pattern_buffer * rxb;
6841 rx_blow_up_fastmap (rxb);
6848 /* Entry points compatible with 4.2 BSD regex library. We don't define
6849 them if this is an Emacs or POSIX compilation. */
6851 #if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX)
6853 /* BSD has one and only one pattern buffer. */
6854 static struct re_pattern_buffer rx_comp_buf;
6858 re_comp (__const__ char *s)
6867 if (!s || (*s == '\0'))
6869 if (!rx_comp_buf.buffer)
6870 return "No previous regular expression";
6874 if (!rx_comp_buf.fastmap)
6876 rx_comp_buf.fastmap = (char *) malloc (1 << CHARBITS);
6877 if (!rx_comp_buf.fastmap)
6878 return "Memory exhausted";
6881 /* Since `rx_exec' always passes NULL for the `regs' argument, we
6882 don't need to initialize the pattern buffer fields which affect it. */
6884 /* Match anchors at newlines. */
6885 rx_comp_buf.newline_anchor = 1;
6887 rx_comp_buf.fastmap_accurate = 0;
6888 rx_comp_buf.re_nsub = 0;
6889 rx_comp_buf.start = 0;
6890 rx_comp_buf.se_params = 0;
6891 rx_comp_buf.rx.nodec = 0;
6892 rx_comp_buf.rx.epsnodec = 0;
6893 rx_comp_buf.rx.instruction_table = 0;
6894 rx_comp_buf.rx.nfa_states = 0;
6895 rx_comp_buf.rx.start = 0;
6896 rx_comp_buf.rx.se_list_cmp = posix_se_list_order;
6897 rx_comp_buf.rx.start_set = 0;
6898 rx_comp_buf.rx.local_cset_size = 256;
6900 ret = rx_compile (s, strlen (s), re_syntax_options, &rx_comp_buf);
6903 /* Yes, we're discarding `__const__' here. */
6904 return (char *) rx_error_msg[(int) ret];
6910 re_exec (__const__ char *s)
6917 __const__ int len = strlen (s);
6919 0 <= re_search (&rx_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6921 #endif /* not emacs and not _POSIX_SOURCE */
6925 /* POSIX.2 functions. Don't define these for Emacs. */
6929 /* regcomp takes a regular expression as a string and compiles it.
6931 PREG is a regex_t *. We do not expect any fields to be initialized,
6932 since POSIX says we shouldn't. Thus, we set
6934 `buffer' to the compiled pattern;
6935 `used' to the length of the compiled pattern;
6936 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6937 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6938 RE_SYNTAX_POSIX_BASIC;
6939 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6940 `fastmap' and `fastmap_accurate' to zero;
6941 `re_nsub' to the number of subexpressions in PATTERN.
6943 PATTERN is the address of the pattern string.
6945 CFLAGS is a series of bits which affect compilation.
6947 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6948 use POSIX basic syntax.
6950 If REG_NEWLINE is set, then . and [^...] don't match newline.
6951 Also, regexec will try a match beginning after every newline.
6953 If REG_ICASE is set, then we considers upper- and lowercase
6954 versions of letters to be equivalent when matching.
6956 If REG_NOSUB is set, then when PREG is passed to regexec, that
6957 routine will report only success or failure, and nothing about the
6960 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6961 the return codes and their meanings.) */
6966 regcomp (regex_t * preg, __const__ char * pattern, int cflags)
6969 regcomp (preg, pattern, cflags)
6971 __const__ char * pattern;
6977 = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6979 /* regex_compile will allocate the space for the compiled pattern. */
6981 preg->allocated = 0;
6982 preg->fastmap = malloc (256);
6985 preg->fastmap_accurate = 0;
6987 if (cflags & REG_ICASE)
6991 preg->translate = (unsigned char *) malloc (256);
6992 if (!preg->translate)
6993 return (int) REG_ESPACE;
6995 /* Map uppercase characters to corresponding lowercase ones. */
6996 for (i = 0; i < CHAR_SET_SIZE; i++)
6997 preg->translate[i] = isupper (i) ? tolower (i) : i;
7000 preg->translate = 0;
7002 /* If REG_NEWLINE is set, newlines are treated differently. */
7003 if (cflags & REG_NEWLINE)
7004 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7005 syntax &= ~RE_DOT_NEWLINE;
7006 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7007 /* It also changes the matching behavior. */
7008 preg->newline_anchor = 1;
7011 preg->newline_anchor = 0;
7013 preg->no_sub = !!(cflags & REG_NOSUB);
7015 /* POSIX says a null character in the pattern terminates it, so we
7016 can use strlen here in compiling the pattern. */
7019 preg->se_params = 0;
7020 preg->syntax_parens = 0;
7022 preg->rx.epsnodec = 0;
7023 preg->rx.instruction_table = 0;
7024 preg->rx.nfa_states = 0;
7025 preg->rx.local_cset_size = 256;
7027 preg->rx.se_list_cmp = posix_se_list_order;
7028 preg->rx.start_set = 0;
7029 ret = rx_compile (pattern, strlen (pattern), syntax, preg);
7032 /* POSIX doesn't distinguish between an unmatched open-group and an
7033 unmatched close-group: both are REG_EPAREN. */
7034 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7040 /* regexec searches for a given pattern, specified by PREG, in the
7043 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7044 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7045 least NMATCH elements, and we set them to the offsets of the
7046 corresponding matched substrings.
7048 EFLAGS specifies `execution flags' which affect matching: if
7049 REG_NOTBOL is set, then ^ does not match at the beginning of the
7050 string; if REG_NOTEOL is set, then $ does not match at the end.
7052 We return 0 if we find a match and REG_NOMATCH if not. */
7056 regexec (__const__ regex_t *preg, __const__ char *string,
7057 size_t nmatch, regmatch_t pmatch[],
7061 regexec (preg, string, nmatch, pmatch, eflags)
7062 __const__ regex_t *preg;
7063 __const__ char *string;
7065 regmatch_t pmatch[];
7070 struct re_registers regs;
7071 regex_t private_preg;
7072 int len = strlen (string);
7073 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7075 private_preg = *preg;
7077 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7078 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7080 /* The user has told us exactly how many registers to return
7081 * information about, via `nmatch'. We have to pass that on to the
7082 * matching routines.
7084 private_preg.regs_allocated = REGS_FIXED;
7088 regs.num_regs = nmatch;
7089 regs.start = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7090 regs.end = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7091 if (regs.start == 0 || regs.end == 0)
7092 return (int) REG_NOMATCH;
7095 /* Perform the searching operation. */
7096 ret = re_search (&private_preg,
7100 want_reg_info ? ®s : (struct re_registers *) 0);
7102 /* Copy the register information to the POSIX structure. */
7109 for (r = 0; r < nmatch; r++)
7111 pmatch[r].rm_so = regs.start[r];
7112 pmatch[r].rm_eo = regs.end[r];
7116 /* If we needed the temporary register info, free the space now. */
7121 /* We want zero return to mean success, unlike `re_search'. */
7122 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7126 /* Returns a message corresponding to an error code, ERRCODE, returned
7127 from either regcomp or regexec. */
7131 regerror (int errcode, __const__ regex_t *preg,
7132 char *errbuf, size_t errbuf_size)
7135 regerror (errcode, preg, errbuf, errbuf_size)
7137 __const__ regex_t *preg;
7143 = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode];
7144 size_t msg_size = strlen (msg) + 1; /* Includes the 0. */
7146 if (errbuf_size != 0)
7148 if (msg_size > errbuf_size)
7150 strncpy (errbuf, msg, errbuf_size - 1);
7151 errbuf[errbuf_size - 1] = 0;
7154 strcpy (errbuf, msg);
7161 /* Free dynamically allocated space used by PREG. */
7165 regfree (regex_t *preg)
7172 if (preg->buffer != 0)
7173 free (preg->buffer);
7175 preg->allocated = 0;
7177 if (preg->fastmap != 0)
7178 free (preg->fastmap);
7180 preg->fastmap_accurate = 0;
7182 if (preg->translate != 0)
7183 free (preg->translate);
7184 preg->translate = 0;
7187 #endif /* not emacs */