1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 This file is part of the librx library.
5 Librx is free software; you can redistribute it and/or modify it under
6 the terms of the GNU Library General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 Librx is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 You should have received a copy of the GNU Library General Public
16 License along with this software; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
20 /* NOTE!!! AIX is so losing it requires this to be the first thing in the
22 * Do not put ANYTHING before it!
24 #if !defined (__GNUC__) && defined (_AIX)
28 /* To make linux happy? */
38 char rx_version_string[] = "GNU Rx version 0.07.1";
48 #define isgraph(c) (isprint (c) && !isspace (c))
51 #define isblank(c) ((c) == ' ' || (c) == '\t')
54 #include <sys/types.h>
58 #define MAX(a, b) ((a) > (b) ? (a) : (b))
59 #define MIN(a, b) ((a) < (b) ? (a) : (b))
70 /* Emacs already defines alloca, sometimes. */
73 /* Make alloca work the best possible way. */
75 #define alloca __builtin_alloca
76 #else /* not __GNUC__ */
79 #else /* not __GNUC__ or HAVE_ALLOCA_H */
80 #ifndef _AIX /* Already did AIX, up at the top. */
83 #endif /* not HAVE_ALLOCA_H */
84 #endif /* not __GNUC__ */
86 #endif /* not alloca */
88 /* Memory management and stuff for emacs. */
91 #define remalloc(M, S) (M ? realloc (M, S) : malloc (S))
94 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
95 * use `alloca' instead of `malloc' for the backtracking stack.
97 * Emacs will die miserably if we don't do this.
101 #define REGEX_ALLOCATE malloc
102 #else /* not REGEX_MALLOC */
103 #define REGEX_ALLOCATE alloca
104 #endif /* not REGEX_MALLOC */
107 #ifdef RX_WANT_RX_DEFS
108 #define RX_DECL extern
111 #define RX_WANT_RX_DEFS
112 #define RX_DECL static
113 #define RX_DEF_QUAL static
117 #define RX_DECL RX_DEF_QUAL
123 extern char *re_syntax_table;
124 #else /* not SYNTAX_TABLE */
128 init_syntax_once (void)
140 bzero (re_syntax_table, sizeof re_syntax_table);
142 for (c = 'a'; c <= 'z'; c++)
143 re_syntax_table[c] = Sword;
145 for (c = 'A'; c <= 'Z'; c++)
146 re_syntax_table[c] = Sword;
148 for (c = '0'; c <= '9'; c++)
149 re_syntax_table[c] = Sword;
151 re_syntax_table['_'] = Sword;
155 #endif /* not SYNTAX_TABLE */
156 #endif /* not emacs */
158 /* Compile with `-DRX_DEBUG' and use the following flags.
161 * rx_debug - print information as a regexp is compiled
162 * rx_debug_trace - print information as a regexp is executed
167 int rx_debug_compile = 0;
168 int rx_debug_trace = 0;
169 static struct re_pattern_buffer * dbug_rxb = 0;
172 typedef void (*side_effect_printer) (struct rx *, void *, FILE *);
174 typedef void (*side_effect_printer) ();
178 static void print_cset (struct rx *rx, rx_Bitset cset, FILE * fp);
180 static void print_cset ();
185 print_rexp (struct rx *rx,
186 struct rexp_node *node, int depth,
187 side_effect_printer seprint, FILE * fp)
190 print_rexp (rx, node, depth, seprint, fp)
192 struct rexp_node *node;
194 side_effect_printer seprint;
206 fprintf (fp, "%*s", depth, "");
207 print_cset (rx, node->params.cset, fp);
214 fprintf (fp, "%*s%s\n", depth, "",
215 node->type == r_opt ? "opt" : "star");
216 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
220 fprintf (fp, "%*s2phase star\n", depth, "");
221 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
222 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
228 fprintf (fp, "%*s%s\n", depth, "",
229 node->type == r_alternate ? "alt" : "concat");
230 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
231 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
234 fprintf (fp, "%*sSide effect: ", depth, "");
235 seprint (rx, node->params.side_effect, fp);
243 print_nfa (struct rx * rx,
244 struct rx_nfa_state * n,
245 side_effect_printer seprint, FILE * fp)
248 print_nfa (rx, n, seprint, fp)
250 struct rx_nfa_state * n;
251 side_effect_printer seprint;
257 struct rx_nfa_edge *e = n->edges;
258 struct rx_possible_future *ec = n->futures;
259 fprintf (fp, "node %d %s\n", n->id,
260 n->is_final ? "final" : (n->is_start ? "start" : ""));
263 fprintf (fp, " edge to %d, ", e->dest->id);
267 fprintf (fp, "epsilon\n");
270 fprintf (fp, "side effect ");
271 seprint (rx, e->params.side_effect, fp);
275 fprintf (fp, "cset ");
276 print_cset (rx, e->params.cset, fp);
286 struct rx_nfa_state_set * s;
287 struct rx_se_list * l;
288 fprintf (fp, " eclosure to {");
289 for (s = ec->destset; s; s = s->cdr)
290 fprintf (fp, "%d ", s->car->id);
292 for (l = ec->effects; l; l = l->cdr)
294 seprint (rx, l->car, fp);
304 static char * efnames [] =
320 "re_se_notwordbound",
327 static char * efnames2[] =
338 static char * inx_names[] =
340 "rx_backtrack_point",
341 "rx_do_side_effects",
346 "rx_num_instructions"
352 re_seprint (struct rx * rx, void * effect, FILE * fp)
355 re_seprint (rx, effect, fp)
362 fputs (efnames[-(int)effect], fp);
365 struct re_se_params * p = &dbug_rxb->se_params[(int)effect];
366 fprintf (fp, "%s(%d,%d)", efnames2[p->se], p->op1, p->op2);
369 fprintf (fp, "[complex op # %d]", (int)effect);
373 /* These are so the regex.c regression tests will compile. */
375 print_compiled_pattern (rxb)
376 struct re_pattern_buffer * rxb;
386 #endif /* RX_DEBUG */
390 /* This page: Bitsets. Completely unintersting. */
394 rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b)
397 rx_bitset_is_equal (size, a, b)
407 for (x = rx_bitset_numb_subsets(size) - 1; a[x] == b[x]; --x)
411 return !x && s == a[0];
416 rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b)
419 rx_bitset_is_subset (size, a, b)
425 int x = rx_bitset_numb_subsets(size) - 1;
426 while (x-- && (a[x] & b[x]) == a[x]);
433 rx_bitset_empty (int size, rx_Bitset set)
436 rx_bitset_empty (size, set)
442 RX_subset s = set[0];
444 for (x = rx_bitset_numb_subsets(size) - 1; !set[x]; --x)
452 rx_bitset_null (int size, rx_Bitset b)
455 rx_bitset_null (size, b)
460 bzero (b, rx_sizeof_bitset(size));
466 rx_bitset_universe (int size, rx_Bitset b)
469 rx_bitset_universe (size, b)
474 int x = rx_bitset_numb_subsets (size);
476 *b++ = ~(RX_subset)0;
482 rx_bitset_complement (int size, rx_Bitset b)
485 rx_bitset_complement (size, b)
490 int x = rx_bitset_numb_subsets (size);
501 rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b)
504 rx_bitset_assign (size, a, b)
511 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
518 rx_bitset_union (int size, rx_Bitset a, rx_Bitset b)
521 rx_bitset_union (size, a, b)
528 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
535 rx_bitset_intersection (int size,
536 rx_Bitset a, rx_Bitset b)
539 rx_bitset_intersection (size, a, b)
546 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
553 rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b)
556 rx_bitset_difference (size, a, b)
563 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
570 rx_bitset_revdifference (int size,
571 rx_Bitset a, rx_Bitset b)
574 rx_bitset_revdifference (size, a, b)
581 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
587 rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b)
590 rx_bitset_xor (size, a, b)
597 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
603 RX_DECL unsigned long
604 rx_bitset_hash (int size, rx_Bitset b)
606 RX_DECL unsigned long
607 rx_bitset_hash (size, b)
613 unsigned long hash = (unsigned long)rx_bitset_hash;
615 for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x)
616 hash ^= rx_bitset_subset_val(b, x);
622 RX_DECL RX_subset rx_subset_singletons [RX_subset_bits] =
662 print_cset (struct rx *rx, rx_Bitset cset, FILE * fp)
665 print_cset (rx, cset, fp)
673 for (x = 0; x < rx->local_cset_size; ++x)
674 if (RX_bitset_member (cset, x))
679 fprintf (fp, "\\0%o ", x);
684 #endif /* RX_DEBUG */
688 static unsigned long rx_hash_masks[4] =
699 RX_DECL struct rx_hash_item *
700 rx_hash_find (struct rx_hash * table,
703 struct rx_hash_rules * rules)
705 RX_DECL struct rx_hash_item *
706 rx_hash_find (table, hash, value, rules)
707 struct rx_hash * table;
710 struct rx_hash_rules * rules;
713 rx_hash_eq eq = rules->eq;
715 long mask = rx_hash_masks [0];
716 int bucket = (hash & mask) % 13;
718 while (table->children [bucket])
720 table = table->children [bucket];
722 mask = rx_hash_masks[maskc];
723 bucket = (hash & mask) % 13;
727 struct rx_hash_item * it = table->buckets[bucket];
729 if (eq (it->data, value))
732 it = it->next_same_hash;
740 RX_DECL struct rx_hash_item *
741 rx_hash_store (struct rx_hash * table,
744 struct rx_hash_rules * rules)
746 RX_DECL struct rx_hash_item *
747 rx_hash_store (table, hash, value, rules)
748 struct rx_hash * table;
751 struct rx_hash_rules * rules;
754 rx_hash_eq eq = rules->eq;
756 long mask = rx_hash_masks[0];
757 int bucket = (hash & mask) % 13;
760 while (table->children [bucket])
762 table = table->children [bucket];
764 mask = rx_hash_masks[maskc];
765 bucket = (hash & mask) % 13;
770 struct rx_hash_item * it = table->buckets[bucket];
772 if (eq (it->data, value))
775 it = it->next_same_hash;
780 && (table->bucket_size [bucket] >= 4))
782 struct rx_hash * newtab = ((struct rx_hash *)
783 rules->hash_alloc (rules));
786 bzero (newtab, sizeof (*newtab));
787 newtab->parent = table;
789 struct rx_hash_item * them = table->buckets[bucket];
790 unsigned long newmask = rx_hash_masks[maskc + 1];
793 struct rx_hash_item * save = them->next_same_hash;
794 int new_buck = (them->hash & newmask) % 13;
795 them->next_same_hash = newtab->buckets[new_buck];
796 newtab->buckets[new_buck] = them;
797 them->table = newtab;
799 ++newtab->bucket_size[new_buck];
802 table->refs = (table->refs - table->bucket_size[bucket] + 1);
803 table->bucket_size[bucket] = 0;
804 table->buckets[bucket] = 0;
805 table->children[bucket] = newtab;
807 bucket = (hash & newmask) % 13;
813 struct rx_hash_item * it = ((struct rx_hash_item *)
814 rules->hash_item_alloc (rules, value));
819 /* DATA and BINDING are to be set in hash_item_alloc */
820 it->next_same_hash = table->buckets [bucket];
821 table->buckets[bucket] = it;
822 ++table->bucket_size [bucket];
831 rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules)
834 rx_hash_free (it, rules)
835 struct rx_hash_item * it;
836 struct rx_hash_rules * rules;
841 struct rx_hash * table = it->table;
842 unsigned long hash = it->hash;
843 int depth = (table->parent
844 ? (table->parent->parent
845 ? (table->parent->parent->parent
850 int bucket = (hash & rx_hash_masks [depth]) % 13;
851 struct rx_hash_item ** pos = &table->buckets [bucket];
854 pos = &(*pos)->next_same_hash;
855 *pos = it->next_same_hash;
856 rules->free_hash_item (it, rules);
857 --table->bucket_size[bucket];
859 while (!table->refs && depth)
861 struct rx_hash * save = table;
862 table = table->parent;
864 bucket = (hash & rx_hash_masks [depth]) % 13;
866 table->children[bucket] = 0;
867 rules->free_hash (save, rules);
874 rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
875 struct rx_hash_rules * rules)
878 rx_free_hash_table (tab, freefn, rules)
879 struct rx_hash * tab;
880 rx_hash_freefn freefn;
881 struct rx_hash_rules * rules;
886 for (x = 0; x < 13; ++x)
887 if (tab->children[x])
889 rx_free_hash_table (tab->children[x], freefn, rules);
890 rules->free_hash (tab->children[x], rules);
894 struct rx_hash_item * them = tab->buckets[x];
897 struct rx_hash_item * that = them;
898 them = that->next_same_hash;
900 rules->free_hash_item (that, rules);
907 /* Utilities for manipulating bitset represntations of characters sets. */
911 rx_cset (struct rx *rx)
918 rx_Bitset b = (rx_Bitset) malloc (rx_sizeof_bitset (rx->local_cset_size));
920 rx_bitset_null (rx->local_cset_size, b);
927 rx_copy_cset (struct rx *rx, rx_Bitset a)
935 rx_Bitset cs = rx_cset (rx);
938 rx_bitset_union (rx->local_cset_size, cs, a);
946 rx_free_cset (struct rx * rx, rx_Bitset c)
959 /* Hash table memory allocation policy for the regexp compiler */
962 static struct rx_hash *
963 compiler_hash_alloc (struct rx_hash_rules * rules)
965 static struct rx_hash *
966 compiler_hash_alloc (rules)
967 struct rx_hash_rules * rules;
970 return (struct rx_hash *)malloc (sizeof (struct rx_hash));
975 static struct rx_hash_item *
976 compiler_hash_item_alloc (struct rx_hash_rules * rules, void * value)
978 static struct rx_hash_item *
979 compiler_hash_item_alloc (rules, value)
980 struct rx_hash_rules * rules;
984 struct rx_hash_item * it;
985 it = (struct rx_hash_item *)malloc (sizeof (*it));
997 compiler_free_hash (struct rx_hash * tab,
998 struct rx_hash_rules * rules)
1001 compiler_free_hash (tab, rules)
1002 struct rx_hash * tab;
1003 struct rx_hash_rules * rules;
1012 compiler_free_hash_item (struct rx_hash_item * item,
1013 struct rx_hash_rules * rules)
1016 compiler_free_hash_item (item, rules)
1017 struct rx_hash_item * item;
1018 struct rx_hash_rules * rules;
1021 free ((char *)item);
1025 /* This page: REXP_NODE (expression tree) structures. */
1028 RX_DECL struct rexp_node *
1029 rexp_node (struct rx *rx,
1030 enum rexp_node_type type)
1032 RX_DECL struct rexp_node *
1033 rexp_node (rx, type)
1035 enum rexp_node_type type;
1038 struct rexp_node *n;
1040 n = (struct rexp_node *)malloc (sizeof (*n));
1041 bzero (n, sizeof (*n));
1048 /* free_rexp_node assumes that the bitset passed to rx_mk_r_cset
1049 * can be freed using rx_free_cset.
1052 RX_DECL struct rexp_node *
1053 rx_mk_r_cset (struct rx * rx,
1056 RX_DECL struct rexp_node *
1057 rx_mk_r_cset (rx, b)
1062 struct rexp_node * n = rexp_node (rx, r_cset);
1070 RX_DECL struct rexp_node *
1071 rx_mk_r_concat (struct rx * rx,
1072 struct rexp_node * a,
1073 struct rexp_node * b)
1075 RX_DECL struct rexp_node *
1076 rx_mk_r_concat (rx, a, b)
1078 struct rexp_node * a;
1079 struct rexp_node * b;
1082 struct rexp_node * n = rexp_node (rx, r_concat);
1085 n->params.pair.left = a;
1086 n->params.pair.right = b;
1093 RX_DECL struct rexp_node *
1094 rx_mk_r_alternate (struct rx * rx,
1095 struct rexp_node * a,
1096 struct rexp_node * b)
1098 RX_DECL struct rexp_node *
1099 rx_mk_r_alternate (rx, a, b)
1101 struct rexp_node * a;
1102 struct rexp_node * b;
1105 struct rexp_node * n = rexp_node (rx, r_alternate);
1108 n->params.pair.left = a;
1109 n->params.pair.right = b;
1116 RX_DECL struct rexp_node *
1117 rx_mk_r_opt (struct rx * rx,
1118 struct rexp_node * a)
1120 RX_DECL struct rexp_node *
1123 struct rexp_node * a;
1126 struct rexp_node * n = rexp_node (rx, r_opt);
1129 n->params.pair.left = a;
1130 n->params.pair.right = 0;
1137 RX_DECL struct rexp_node *
1138 rx_mk_r_star (struct rx * rx,
1139 struct rexp_node * a)
1141 RX_DECL struct rexp_node *
1142 rx_mk_r_star (rx, a)
1144 struct rexp_node * a;
1147 struct rexp_node * n = rexp_node (rx, r_star);
1150 n->params.pair.left = a;
1151 n->params.pair.right = 0;
1158 RX_DECL struct rexp_node *
1159 rx_mk_r_2phase_star (struct rx * rx,
1160 struct rexp_node * a,
1161 struct rexp_node * b)
1163 RX_DECL struct rexp_node *
1164 rx_mk_r_2phase_star (rx, a, b)
1166 struct rexp_node * a;
1167 struct rexp_node * b;
1170 struct rexp_node * n = rexp_node (rx, r_2phase_star);
1173 n->params.pair.left = a;
1174 n->params.pair.right = b;
1181 RX_DECL struct rexp_node *
1182 rx_mk_r_side_effect (struct rx * rx,
1185 RX_DECL struct rexp_node *
1186 rx_mk_r_side_effect (rx, a)
1191 struct rexp_node * n = rexp_node (rx, r_side_effect);
1194 n->params.side_effect = a;
1195 n->params.pair.right = 0;
1202 RX_DECL struct rexp_node *
1203 rx_mk_r_data (struct rx * rx,
1206 RX_DECL struct rexp_node *
1207 rx_mk_r_data (rx, a)
1212 struct rexp_node * n = rexp_node (rx, r_data);
1215 n->params.pair.left = a;
1216 n->params.pair.right = 0;
1224 rx_free_rexp (struct rx * rx, struct rexp_node * node)
1227 rx_free_rexp (rx, node)
1229 struct rexp_node * node;
1237 if (node->params.cset)
1238 rx_free_cset (rx, node->params.cset);
1248 rx_free_rexp (rx, node->params.pair.left);
1249 rx_free_rexp (rx, node->params.pair.right);
1253 /* This shouldn't occur. */
1256 free ((char *)node);
1262 RX_DECL struct rexp_node *
1263 rx_copy_rexp (struct rx *rx,
1264 struct rexp_node *node)
1266 RX_DECL struct rexp_node *
1267 rx_copy_rexp (rx, node)
1269 struct rexp_node *node;
1276 struct rexp_node *n = rexp_node (rx, node->type);
1282 n->params.cset = rx_copy_cset (rx, node->params.cset);
1283 if (!n->params.cset)
1285 rx_free_rexp (rx, n);
1291 n->params.side_effect = node->params.side_effect;
1299 n->params.pair.left =
1300 rx_copy_rexp (rx, node->params.pair.left);
1301 n->params.pair.right =
1302 rx_copy_rexp (rx, node->params.pair.right);
1303 if ( (node->params.pair.left && !n->params.pair.left)
1304 || (node->params.pair.right && !n->params.pair.right))
1306 rx_free_rexp (rx, n);
1311 /* shouldn't happen */
1320 /* This page: functions to build and destroy graphs that describe nfa's */
1322 /* Constructs a new nfa node. */
1324 RX_DECL struct rx_nfa_state *
1325 rx_nfa_state (struct rx *rx)
1327 RX_DECL struct rx_nfa_state *
1332 struct rx_nfa_state * n = (struct rx_nfa_state *)malloc (sizeof (*n));
1335 bzero (n, sizeof (*n));
1336 n->next = rx->nfa_states;
1344 rx_free_nfa_state (struct rx_nfa_state * n)
1347 rx_free_nfa_state (n)
1348 struct rx_nfa_state * n;
1355 /* This looks up an nfa node, given a numeric id. Numeric id's are
1356 * assigned after the nfa has been built.
1359 RX_DECL struct rx_nfa_state *
1360 rx_id_to_nfa_state (struct rx * rx,
1363 RX_DECL struct rx_nfa_state *
1364 rx_id_to_nfa_state (rx, id)
1369 struct rx_nfa_state * n;
1370 for (n = rx->nfa_states; n; n = n->next)
1377 /* This adds an edge between two nodes, but doesn't initialize the
1382 RX_DECL struct rx_nfa_edge *
1383 rx_nfa_edge (struct rx *rx,
1384 enum rx_nfa_etype type,
1385 struct rx_nfa_state *start,
1386 struct rx_nfa_state *dest)
1388 RX_DECL struct rx_nfa_edge *
1389 rx_nfa_edge (rx, type, start, dest)
1391 enum rx_nfa_etype type;
1392 struct rx_nfa_state *start;
1393 struct rx_nfa_state *dest;
1396 struct rx_nfa_edge *e;
1397 e = (struct rx_nfa_edge *)malloc (sizeof (*e));
1400 e->next = start->edges;
1410 rx_free_nfa_edge (struct rx_nfa_edge * e)
1413 rx_free_nfa_edge (e)
1414 struct rx_nfa_edge * e;
1421 /* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
1422 * of an NFA. These are added to an nfa automaticly by eclose_nfa.
1426 static struct rx_possible_future *
1427 rx_possible_future (struct rx * rx,
1428 struct rx_se_list * effects)
1430 static struct rx_possible_future *
1431 rx_possible_future (rx, effects)
1433 struct rx_se_list * effects;
1436 struct rx_possible_future *ec;
1437 ec = (struct rx_possible_future *) malloc (sizeof (*ec));
1442 ec->effects = effects;
1449 rx_free_possible_future (struct rx_possible_future * pf)
1452 rx_free_possible_future (pf)
1453 struct rx_possible_future * pf;
1462 rx_free_nfa (struct rx *rx)
1469 while (rx->nfa_states)
1471 while (rx->nfa_states->edges)
1473 switch (rx->nfa_states->edges->type)
1476 rx_free_cset (rx, rx->nfa_states->edges->params.cset);
1482 struct rx_nfa_edge * e;
1483 e = rx->nfa_states->edges;
1484 rx->nfa_states->edges = rx->nfa_states->edges->next;
1485 rx_free_nfa_edge (e);
1487 } /* while (rx->nfa_states->edges) */
1489 /* Iterate over the partial epsilon closures of rx->nfa_states */
1490 struct rx_possible_future * pf = rx->nfa_states->futures;
1493 struct rx_possible_future * pft = pf;
1495 rx_free_possible_future (pft);
1499 struct rx_nfa_state *n;
1501 rx->nfa_states = rx->nfa_states->next;
1502 rx_free_nfa_state (n);
1509 /* This page: translating a pattern expression into an nfa and doing the
1510 * static part of the nfa->super-nfa translation.
1513 /* This is the thompson regexp->nfa algorithm.
1514 * It is modified to allow for `side-effect epsilons.' Those are
1515 * edges that are taken whenever a similar epsilon edge would be,
1516 * but which imply that some side effect occurs when the edge
1519 * Side effects are used to model parts of the pattern langauge
1520 * that are not regular (in the formal sense).
1525 rx_build_nfa (struct rx *rx,
1526 struct rexp_node *rexp,
1527 struct rx_nfa_state **start,
1528 struct rx_nfa_state **end)
1531 rx_build_nfa (rx, rexp, start, end)
1533 struct rexp_node *rexp;
1534 struct rx_nfa_state **start;
1535 struct rx_nfa_state **end;
1538 struct rx_nfa_edge *edge;
1540 /* Start & end nodes may have been allocated by the caller. */
1541 *start = *start ? *start : rx_nfa_state (rx);
1552 *end = *end ? *end : rx_nfa_state (rx);
1556 rx_free_nfa_state (*start);
1566 edge = rx_nfa_edge (rx, ne_cset, *start, *end);
1569 edge->params.cset = rx_copy_cset (rx, rexp->params.cset);
1570 if (!edge->params.cset)
1572 rx_free_nfa_edge (edge);
1578 return (rx_build_nfa (rx, rexp->params.pair.left, start, end)
1579 && rx_nfa_edge (rx, ne_epsilon, *start, *end));
1583 struct rx_nfa_state * star_start = 0;
1584 struct rx_nfa_state * star_end = 0;
1585 return (rx_build_nfa (rx, rexp->params.pair.left,
1586 &star_start, &star_end)
1589 && rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
1590 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1591 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1593 && rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
1598 struct rx_nfa_state * star_start = 0;
1599 struct rx_nfa_state * star_end = 0;
1600 struct rx_nfa_state * loop_exp_start = 0;
1601 struct rx_nfa_state * loop_exp_end = 0;
1603 return (rx_build_nfa (rx, rexp->params.pair.left,
1604 &star_start, &star_end)
1605 && rx_build_nfa (rx, rexp->params.pair.right,
1606 &loop_exp_start, &loop_exp_end)
1611 && rx_nfa_edge (rx, ne_epsilon, star_start, *end)
1612 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1613 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1615 && rx_nfa_edge (rx, ne_epsilon, star_end, loop_exp_start)
1616 && rx_nfa_edge (rx, ne_epsilon, loop_exp_end, star_start));
1622 struct rx_nfa_state *shared = 0;
1624 (rx_build_nfa (rx, rexp->params.pair.left, start, &shared)
1625 && rx_build_nfa (rx, rexp->params.pair.right, &shared, end));
1630 struct rx_nfa_state *ls = 0;
1631 struct rx_nfa_state *le = 0;
1632 struct rx_nfa_state *rs = 0;
1633 struct rx_nfa_state *re = 0;
1634 return (rx_build_nfa (rx, rexp->params.pair.left, &ls, &le)
1635 && rx_build_nfa (rx, rexp->params.pair.right, &rs, &re)
1636 && rx_nfa_edge (rx, ne_epsilon, *start, ls)
1637 && rx_nfa_edge (rx, ne_epsilon, *start, rs)
1638 && rx_nfa_edge (rx, ne_epsilon, le, *end)
1639 && rx_nfa_edge (rx, ne_epsilon, re, *end));
1643 edge = rx_nfa_edge (rx, ne_side_effect, *start, *end);
1646 edge->params.side_effect = rexp->params.side_effect;
1650 /* this should never happen */
1655 /* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon
1656 * transitions. Only these nodes can occur in super-states.
1657 * All nodes are given an integer id.
1658 * The id is non-negative if the node has non-epsilon out-transitions, negative
1659 * otherwise (this is because we want the non-negative ids to be used as
1660 * array indexes in a few places).
1665 rx_name_nfa_states (struct rx *rx)
1668 rx_name_nfa_states (rx)
1672 struct rx_nfa_state *n = rx->nfa_states;
1679 struct rx_nfa_edge *e = n->edges;
1682 n->eclosure_needed = 1;
1689 case ne_side_effect:
1693 n->id = rx->nodec++;
1695 struct rx_nfa_edge *from_n = n->edges;
1698 from_n->dest->eclosure_needed = 1;
1699 from_n = from_n->next;
1706 n->id = rx->epsnodec--;
1710 rx->epsnodec = -rx->epsnodec;
1714 /* This page: data structures for the static part of the nfa->supernfa
1717 * There are side effect lists -- lists of side effects occuring
1718 * along an uninterrupted, acyclic path of side-effect epsilon edges.
1719 * Such paths are collapsed to single edges in the course of computing
1720 * epsilon closures. Such single edges are labled with a list of all
1721 * the side effects entailed in crossing them. Like lists of side
1722 * effects are made == by the constructors below.
1724 * There are also nfa state sets. These are used to hold a list of all
1725 * states reachable from a starting state for a given type of transition
1726 * and side effect list. These are also hash-consed.
1729 /* The next several functions compare, construct, etc. lists of side
1730 * effects. See ECLOSE_NFA (below) for details.
1733 /* Ordering of rx_se_list
1734 * (-1, 0, 1 return value convention).
1739 se_list_cmp (void * va, void * vb)
1742 se_list_cmp (va, vb)
1747 struct rx_se_list * a = (struct rx_se_list *)va;
1748 struct rx_se_list * b = (struct rx_se_list *)vb;
1756 : ((long)a->car < (long)b->car
1758 : ((long)a->car > (long)b->car
1760 : se_list_cmp ((void *)a->cdr, (void *)b->cdr))))));
1766 se_list_equal (void * va, void * vb)
1769 se_list_equal (va, vb)
1774 return !(se_list_cmp (va, vb));
1777 static struct rx_hash_rules se_list_hash_rules =
1780 compiler_hash_alloc,
1782 compiler_hash_item_alloc,
1783 compiler_free_hash_item
1788 static struct rx_se_list *
1789 side_effect_cons (struct rx * rx,
1790 void * se, struct rx_se_list * list)
1792 static struct rx_se_list *
1793 side_effect_cons (rx, se, list)
1796 struct rx_se_list * list;
1799 struct rx_se_list * l;
1800 l = ((struct rx_se_list *) malloc (sizeof (*l)));
1810 static struct rx_se_list *
1811 hash_cons_se_prog (struct rx * rx,
1812 struct rx_hash * memo,
1813 void * car, struct rx_se_list * cdr)
1815 static struct rx_se_list *
1816 hash_cons_se_prog (rx, memo, car, cdr)
1818 struct rx_hash * memo;
1820 struct rx_se_list * cdr;
1823 long hash = (long)car ^ (long)cdr;
1824 struct rx_se_list template;
1829 struct rx_hash_item * it = rx_hash_store (memo, hash,
1831 &se_list_hash_rules);
1834 if (it->data == (void *)&template)
1836 struct rx_se_list * consed;
1837 consed = (struct rx_se_list *) malloc (sizeof (*consed));
1839 it->data = (void *)consed;
1841 return (struct rx_se_list *)it->data;
1847 static struct rx_se_list *
1848 hash_se_prog (struct rx * rx, struct rx_hash * memo, struct rx_se_list * prog)
1850 static struct rx_se_list *
1851 hash_se_prog (rx, memo, prog)
1853 struct rx_hash * memo;
1854 struct rx_se_list * prog;
1857 struct rx_se_list * answer = 0;
1860 answer = hash_cons_se_prog (rx, memo, prog->car, answer);
1870 nfa_set_cmp (void * va, void * vb)
1873 nfa_set_cmp (va, vb)
1878 struct rx_nfa_state_set * a = (struct rx_nfa_state_set *)va;
1879 struct rx_nfa_state_set * b = (struct rx_nfa_state_set *)vb;
1887 : (a->car->id < b->car->id
1889 : (a->car->id > b->car->id
1891 : nfa_set_cmp ((void *)a->cdr, (void *)b->cdr))))));
1896 nfa_set_equal (void * va, void * vb)
1899 nfa_set_equal (va, vb)
1904 return !nfa_set_cmp (va, vb);
1907 static struct rx_hash_rules nfa_set_hash_rules =
1910 compiler_hash_alloc,
1912 compiler_hash_item_alloc,
1913 compiler_free_hash_item
1918 static struct rx_nfa_state_set *
1919 nfa_set_cons (struct rx * rx,
1920 struct rx_hash * memo, struct rx_nfa_state * state,
1921 struct rx_nfa_state_set * set)
1923 static struct rx_nfa_state_set *
1924 nfa_set_cons (rx, memo, state, set)
1926 struct rx_hash * memo;
1927 struct rx_nfa_state * state;
1928 struct rx_nfa_state_set * set;
1931 struct rx_nfa_state_set template;
1932 struct rx_hash_item * node;
1933 template.car = state;
1935 node = rx_hash_store (memo,
1936 (((long)state) >> 8) ^ (long)set,
1937 &template, &nfa_set_hash_rules);
1940 if (node->data == &template)
1942 struct rx_nfa_state_set * l;
1943 l = (struct rx_nfa_state_set *) malloc (sizeof (*l));
1944 node->data = (void *) l;
1949 return (struct rx_nfa_state_set *)node->data;
1954 static struct rx_nfa_state_set *
1955 nfa_set_enjoin (struct rx * rx,
1956 struct rx_hash * memo, struct rx_nfa_state * state,
1957 struct rx_nfa_state_set * set)
1959 static struct rx_nfa_state_set *
1960 nfa_set_enjoin (rx, memo, state, set)
1962 struct rx_hash * memo;
1963 struct rx_nfa_state * state;
1964 struct rx_nfa_state_set * set;
1967 if (!set || state->id < set->car->id)
1968 return nfa_set_cons (rx, memo, state, set);
1969 if (state->id == set->car->id)
1973 struct rx_nfa_state_set * newcdr
1974 = nfa_set_enjoin (rx, memo, state, set->cdr);
1975 if (newcdr != set->cdr)
1976 set = nfa_set_cons (rx, memo, set->car, newcdr);
1983 /* This page: computing epsilon closures. The closures aren't total.
1984 * Each node's closures are partitioned according to the side effects entailed
1985 * along the epsilon edges. Return true on success.
1990 struct rx_se_list *prog_backwards;
1996 eclose_node (struct rx *rx, struct rx_nfa_state *outnode,
1997 struct rx_nfa_state *node, struct eclose_frame *frame)
2000 eclose_node (rx, outnode, node, frame)
2002 struct rx_nfa_state *outnode;
2003 struct rx_nfa_state *node;
2004 struct eclose_frame *frame;
2007 struct rx_nfa_edge *e = node->edges;
2009 /* For each node, we follow all epsilon paths to build the closure.
2010 * The closure omits nodes that have only epsilon edges.
2011 * The closure is split into partial closures -- all the states in
2012 * a partial closure are reached by crossing the same list of
2013 * of side effects (though not necessarily the same path).
2019 if (node->id >= 0 || node->is_final)
2021 struct rx_possible_future **ec;
2022 struct rx_se_list * prog_in_order
2023 = ((struct rx_se_list *)hash_se_prog (rx,
2025 frame->prog_backwards));
2028 ec = &outnode->futures;
2032 cmp = se_list_cmp ((void *)(*ec)->effects, (void *)prog_in_order);
2037 if (!*ec || (cmp < 0))
2039 struct rx_possible_future * saved = *ec;
2040 *ec = rx_possible_future (rx, prog_in_order);
2041 (*ec)->next = saved;
2047 (*ec)->destset = nfa_set_enjoin (rx, &rx->set_list_memo,
2048 node, (*ec)->destset);
2049 if (!(*ec)->destset)
2059 if (!eclose_node (rx, outnode, e->dest, frame))
2062 case ne_side_effect:
2064 frame->prog_backwards = side_effect_cons (rx,
2065 e->params.side_effect,
2066 frame->prog_backwards);
2067 if (!frame->prog_backwards)
2069 if (!eclose_node (rx, outnode, e->dest, frame))
2072 struct rx_se_list * dying = frame->prog_backwards;
2073 frame->prog_backwards = frame->prog_backwards->cdr;
2074 free ((char *)dying);
2090 rx_eclose_nfa (struct rx *rx)
2097 struct rx_nfa_state *n = rx->nfa_states;
2098 struct eclose_frame frame;
2099 static int rx_id = 0;
2101 frame.prog_backwards = 0;
2102 rx->rx_id = rx_id++;
2103 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2104 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2108 if (n->eclosure_needed && !eclose_node (rx, n, n, &frame))
2110 /* clear_marks (rx); */
2117 /* This deletes epsilon edges from an NFA. After running eclose_node,
2118 * we have no more need for these edges. They are removed to simplify
2119 * further operations on the NFA.
2124 rx_delete_epsilon_transitions (struct rx *rx)
2127 rx_delete_epsilon_transitions (rx)
2131 struct rx_nfa_state *n = rx->nfa_states;
2132 struct rx_nfa_edge **e;
2139 struct rx_nfa_edge *t;
2143 case ne_side_effect:
2146 rx_free_nfa_edge (t);
2159 /* This page: storing the nfa in a contiguous region of memory for
2160 * subsequent conversion to a super-nfa.
2163 /* This is for qsort on an array of nfa_states. The order
2164 * is based on state ids and goes
2165 * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0)
2166 * This way, positive ids double as array indices.
2171 nfacmp (void * va, void * vb)
2179 struct rx_nfa_state **a = (struct rx_nfa_state **)va;
2180 struct rx_nfa_state **b = (struct rx_nfa_state **)vb;
2181 return (*a == *b /* &&&& 3.18 */
2183 : (((*a)->id < 0) == ((*b)->id < 0)
2184 ? (((*a)->id < (*b)->id) ? -1 : 1)
2191 count_hash_nodes (struct rx_hash * st)
2194 count_hash_nodes (st)
2195 struct rx_hash * st;
2200 for (x = 0; x < 13; ++x)
2201 count += ((st->children[x])
2202 ? count_hash_nodes (st->children[x])
2203 : st->bucket_size[x]);
2211 se_memo_freer (struct rx_hash_item * node)
2214 se_memo_freer (node)
2215 struct rx_hash_item * node;
2218 free ((char *)node->data);
2224 nfa_set_freer (struct rx_hash_item * node)
2227 nfa_set_freer (node)
2228 struct rx_hash_item * node;
2231 free ((char *)node->data);
2235 /* This copies an entire NFA into a single malloced block of memory.
2236 * Mostly this is for compatability with regex.c, though it is convenient
2237 * to have the nfa nodes in an array.
2242 rx_compactify_nfa (struct rx *rx,
2243 void **mem, unsigned long *size)
2246 rx_compactify_nfa (rx, mem, size)
2249 unsigned long *size;
2253 struct rx_nfa_state *n;
2256 int se_list_consc = count_hash_nodes (&rx->se_list_memo);
2257 int nfa_setc = count_hash_nodes (&rx->set_list_memo);
2258 unsigned long total_size;
2260 /* This takes place in two stages. First, the total size of the
2261 * nfa is computed, then structures are copied.
2267 struct rx_nfa_edge *e = n->edges;
2268 struct rx_possible_future *ec = n->futures;
2283 total_size = (total_nodec * sizeof (struct rx_nfa_state)
2284 + edgec * rx_sizeof_bitset (rx->local_cset_size)
2285 + edgec * sizeof (struct rx_nfa_edge)
2286 + nfa_setc * sizeof (struct rx_nfa_state_set)
2287 + eclosec * sizeof (struct rx_possible_future)
2288 + se_list_consc * sizeof (struct rx_se_list)
2291 if (total_size > *size)
2293 *mem = remalloc (*mem, total_size);
2299 /* Now we've allocated the memory; this copies the NFA. */
2301 static struct rx_nfa_state **scratch = 0;
2302 static int scratch_alloc = 0;
2303 struct rx_nfa_state *state_base = (struct rx_nfa_state *) * mem;
2304 struct rx_nfa_state *new_state = state_base;
2305 struct rx_nfa_edge *new_edge =
2306 (struct rx_nfa_edge *)
2307 ((char *) state_base + total_nodec * sizeof (struct rx_nfa_state));
2308 struct rx_se_list * new_se_list =
2309 (struct rx_se_list *)
2310 ((char *)new_edge + edgec * sizeof (struct rx_nfa_edge));
2311 struct rx_possible_future *new_close =
2312 ((struct rx_possible_future *)
2313 ((char *) new_se_list
2314 + se_list_consc * sizeof (struct rx_se_list)));
2315 struct rx_nfa_state_set * new_nfa_set =
2316 ((struct rx_nfa_state_set *)
2317 ((char *)new_close + eclosec * sizeof (struct rx_possible_future)));
2319 ((char *) new_nfa_set + nfa_setc * sizeof (struct rx_nfa_state_set));
2321 struct rx_nfa_state *n;
2323 if (scratch_alloc < total_nodec)
2325 scratch = ((struct rx_nfa_state **)
2326 remalloc (scratch, total_nodec * sizeof (*scratch)));
2328 scratch_alloc = total_nodec;
2336 for (x = 0, n = rx->nfa_states; n; n = n->next)
2339 qsort (scratch, total_nodec,
2340 sizeof (struct rx_nfa_state *), (int (*)())nfacmp);
2342 for (x = 0; x < total_nodec; ++x)
2344 struct rx_possible_future *eclose = scratch[x]->futures;
2345 struct rx_nfa_edge *edge = scratch[x]->edges;
2346 struct rx_nfa_state *cn = new_state++;
2349 cn->next = (x == total_nodec - 1) ? 0 : (cn + 1);
2350 cn->id = scratch[x]->id;
2351 cn->is_final = scratch[x]->is_final;
2352 cn->is_start = scratch[x]->is_start;
2356 int indx = (edge->dest->id < 0
2357 ? (total_nodec + edge->dest->id)
2359 struct rx_nfa_edge *e = new_edge++;
2360 rx_Bitset cset = (rx_Bitset) new_bitset;
2361 new_bitset += rx_sizeof_bitset (rx->local_cset_size);
2362 rx_bitset_null (rx->local_cset_size, cset);
2363 rx_bitset_union (rx->local_cset_size, cset, edge->params.cset);
2364 e->next = cn->edges;
2366 e->type = edge->type;
2367 e->dest = state_base + indx;
2368 e->params.cset = cset;
2373 struct rx_possible_future *ec = new_close++;
2374 struct rx_hash_item * sp;
2375 struct rx_se_list ** sepos;
2376 struct rx_se_list * sesrc;
2377 struct rx_nfa_state_set * destlst;
2378 struct rx_nfa_state_set ** destpos;
2379 ec->next = cn->futures;
2381 for (sepos = &ec->effects, sesrc = eclose->effects;
2383 sesrc = sesrc->cdr, sepos = &(*sepos)->cdr)
2385 sp = rx_hash_find (&rx->se_list_memo,
2386 (long)sesrc->car ^ (long)sesrc->cdr,
2387 sesrc, &se_list_hash_rules);
2390 sesrc = (struct rx_se_list *)sp->binding;
2393 *new_se_list = *sesrc;
2394 sp->binding = (void *)new_se_list;
2395 *sepos = new_se_list;
2399 for (destpos = &ec->destset, destlst = eclose->destset;
2401 destpos = &(*destpos)->cdr, destlst = destlst->cdr)
2403 sp = rx_hash_find (&rx->set_list_memo,
2404 ((((long)destlst->car) >> 8)
2405 ^ (long)destlst->cdr),
2406 destlst, &nfa_set_hash_rules);
2409 destlst = (struct rx_nfa_state_set *)sp->binding;
2412 *new_nfa_set = *destlst;
2413 new_nfa_set->car = state_base + destlst->car->id;
2414 sp->binding = (void *)new_nfa_set;
2415 *destpos = new_nfa_set;
2419 eclose = eclose->next;
2423 rx_free_hash_table (&rx->se_list_memo, se_memo_freer, &se_list_hash_rules);
2424 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2425 rx_free_hash_table (&rx->set_list_memo, nfa_set_freer, &nfa_set_hash_rules);
2426 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2429 rx->nfa_states = (struct rx_nfa_state *)*mem;
2434 /* The functions in the next several pages define the lazy-NFA-conversion used
2435 * by matchers. The input to this construction is an NFA such as
2436 * is built by compactify_nfa (rx.c). The output is the superNFA.
2439 /* Match engines can use arbitrary values for opcodes. So, the parse tree
2440 * is built using instructions names (enum rx_opcode), but the superstate
2441 * nfa is populated with mystery opcodes (void *).
2443 * For convenience, here is an id table. The opcodes are == to their inxs
2445 * The lables in re_search_2 would make good values for instructions.
2448 void * rx_id_instruction_table[rx_num_instructions] =
2450 (void *) rx_backtrack_point,
2451 (void *) rx_do_side_effects,
2452 (void *) rx_cache_miss,
2453 (void *) rx_next_char,
2454 (void *) rx_backtrack,
2455 (void *) rx_error_inx
2460 /* Memory mgt. for superstate graphs. */
2464 rx_cache_malloc (struct rx_cache * cache, int bytes)
2467 rx_cache_malloc (cache, bytes)
2468 struct rx_cache * cache;
2472 while (cache->bytes_left < bytes)
2474 if (cache->memory_pos)
2475 cache->memory_pos = cache->memory_pos->next;
2476 if (!cache->memory_pos)
2478 cache->morecore (cache);
2479 if (!cache->memory_pos)
2482 cache->bytes_left = cache->memory_pos->bytes;
2483 cache->memory_addr = ((char *)cache->memory_pos
2484 + sizeof (struct rx_blocklist));
2486 cache->bytes_left -= bytes;
2488 char * addr = cache->memory_addr;
2489 cache->memory_addr += bytes;
2496 rx_cache_free (struct rx_cache * cache,
2497 struct rx_freelist ** freelist, char * mem)
2500 rx_cache_free (cache, freelist, mem)
2501 struct rx_cache * cache;
2502 struct rx_freelist ** freelist;
2506 struct rx_freelist * it = (struct rx_freelist *)mem;
2507 it->next = *freelist;
2512 /* The partially instantiated superstate graph has a transition
2513 * table at every node. There is one entry for every character.
2514 * This fills in the transition for a set.
2518 install_transition (struct rx_superstate *super,
2519 struct rx_inx *answer, rx_Bitset trcset)
2522 install_transition (super, answer, trcset)
2523 struct rx_superstate *super;
2524 struct rx_inx *answer;
2528 struct rx_inx * transitions = super->transitions;
2530 for (chr = 0; chr < 256; )
2538 RX_subset sub = *trcset;
2540 int bound = chr + 32;
2544 transitions [chr] = *answer;
2555 qlen (struct rx_superstate * q)
2559 struct rx_superstate * q;
2563 struct rx_superstate * it;
2566 for (it = q->next_recyclable; it != q; it = it->next_recyclable)
2573 check_cache (struct rx_cache * cache)
2577 struct rx_cache * cache;
2580 struct rx_cache * you_fucked_up = 0;
2581 int total = cache->superstates;
2582 int semi = cache->semifree_superstates;
2583 if (semi != qlen (cache->semifree_superstate))
2584 check_cache (you_fucked_up);
2585 if ((total - semi) != qlen (cache->lru_superstate))
2586 check_cache (you_fucked_up);
2589 /* When a superstate is old and neglected, it can enter a
2590 * semi-free state. A semi-free state is slated to die.
2591 * Incoming transitions to a semi-free state are re-written
2592 * to cause an (interpreted) fault when they are taken.
2593 * The fault handler revives the semi-free state, patches
2594 * incoming transitions back to normal, and continues.
2596 * The idea is basicly to free in two stages, aborting
2597 * between the two if the state turns out to be useful again.
2598 * When a free is aborted, the rescued superstate is placed
2599 * in the most-favored slot to maximize the time until it
2600 * is next semi-freed.
2605 semifree_superstate (struct rx_cache * cache)
2608 semifree_superstate (cache)
2609 struct rx_cache * cache;
2612 int disqualified = cache->semifree_superstates;
2613 if (disqualified == cache->superstates)
2615 while (cache->lru_superstate->locks)
2617 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2619 if (disqualified == cache->superstates)
2623 struct rx_superstate * it = cache->lru_superstate;
2624 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2625 it->prev_recyclable->next_recyclable = it->next_recyclable;
2626 cache->lru_superstate = (it == it->next_recyclable
2628 : it->next_recyclable);
2629 if (!cache->semifree_superstate)
2631 cache->semifree_superstate = it;
2632 it->next_recyclable = it;
2633 it->prev_recyclable = it;
2637 it->prev_recyclable = cache->semifree_superstate->prev_recyclable;
2638 it->next_recyclable = cache->semifree_superstate;
2639 it->prev_recyclable->next_recyclable = it;
2640 it->next_recyclable->prev_recyclable = it;
2643 struct rx_distinct_future *df;
2644 it->is_semifree = 1;
2645 ++cache->semifree_superstates;
2646 df = it->transition_refs;
2649 df->prev_same_dest->next_same_dest = 0;
2650 for (df = it->transition_refs; df; df = df->next_same_dest)
2652 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2653 df->future_frame.data = 0;
2654 df->future_frame.data_2 = (void *) df;
2655 /* If there are any NEXT-CHAR instruction frames that
2656 * refer to this state, we convert them to CACHE-MISS frames.
2659 && (df->edge->options->next_same_super_edge[0]
2660 == df->edge->options))
2661 install_transition (df->present, &df->future_frame,
2664 df = it->transition_refs;
2665 df->prev_same_dest->next_same_dest = df;
2674 refresh_semifree_superstate (struct rx_cache * cache,
2675 struct rx_superstate * super)
2678 refresh_semifree_superstate (cache, super)
2679 struct rx_cache * cache;
2680 struct rx_superstate * super;
2683 struct rx_distinct_future *df;
2685 if (super->transition_refs)
2687 super->transition_refs->prev_same_dest->next_same_dest = 0;
2688 for (df = super->transition_refs; df; df = df->next_same_dest)
2690 df->future_frame.inx = cache->instruction_table[rx_next_char];
2691 df->future_frame.data = (void *) super->transitions;
2692 /* CACHE-MISS instruction frames that refer to this state,
2693 * must be converted to NEXT-CHAR frames.
2696 && (df->edge->options->next_same_super_edge[0]
2697 == df->edge->options))
2698 install_transition (df->present, &df->future_frame,
2701 super->transition_refs->prev_same_dest->next_same_dest
2702 = super->transition_refs;
2704 if (cache->semifree_superstate == super)
2705 cache->semifree_superstate = (super->prev_recyclable == super
2707 : super->prev_recyclable);
2708 super->next_recyclable->prev_recyclable = super->prev_recyclable;
2709 super->prev_recyclable->next_recyclable = super->next_recyclable;
2711 if (!cache->lru_superstate)
2712 (cache->lru_superstate
2713 = super->next_recyclable
2714 = super->prev_recyclable
2718 super->next_recyclable = cache->lru_superstate;
2719 super->prev_recyclable = cache->lru_superstate->prev_recyclable;
2720 super->next_recyclable->prev_recyclable = super;
2721 super->prev_recyclable->next_recyclable = super;
2723 super->is_semifree = 0;
2724 --cache->semifree_superstates;
2729 rx_refresh_this_superstate (struct rx_cache * cache, struct rx_superstate * superstate)
2732 rx_refresh_this_superstate (cache, superstate)
2733 struct rx_cache * cache;
2734 struct rx_superstate * superstate;
2737 if (superstate->is_semifree)
2738 refresh_semifree_superstate (cache, superstate);
2739 else if (cache->lru_superstate == superstate)
2740 cache->lru_superstate = superstate->next_recyclable;
2741 else if (superstate != cache->lru_superstate->prev_recyclable)
2743 superstate->next_recyclable->prev_recyclable
2744 = superstate->prev_recyclable;
2745 superstate->prev_recyclable->next_recyclable
2746 = superstate->next_recyclable;
2747 superstate->next_recyclable = cache->lru_superstate;
2748 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
2749 superstate->next_recyclable->prev_recyclable = superstate;
2750 superstate->prev_recyclable->next_recyclable = superstate;
2756 release_superset_low (struct rx_cache * cache,
2757 struct rx_superset *set)
2760 release_superset_low (cache, set)
2761 struct rx_cache * cache;
2762 struct rx_superset *set;
2768 release_superset_low (cache, set->cdr);
2770 set->starts_for = 0;
2774 (&cache->superset_table,
2775 (unsigned long)set->car ^ set->id ^ (unsigned long)set->cdr,
2777 &cache->superset_hash_rules),
2778 &cache->superset_hash_rules);
2779 rx_cache_free (cache, &cache->free_supersets, (char *)set);
2785 rx_release_superset (struct rx *rx,
2786 struct rx_superset *set)
2789 rx_release_superset (rx, set)
2791 struct rx_superset *set;
2794 release_superset_low (rx->cache, set);
2797 /* This tries to add a new superstate to the superstate freelist.
2798 * It might, as a result, free some edge pieces or hash tables.
2799 * If nothing can be freed because too many locks are being held, fail.
2804 rx_really_free_superstate (struct rx_cache * cache)
2807 rx_really_free_superstate (cache)
2808 struct rx_cache * cache;
2811 int locked_superstates = 0;
2812 struct rx_superstate * it;
2814 if (!cache->superstates)
2818 /* This is a total guess. The idea is that we should expect as
2819 * many misses as we've recently experienced. I.e., cache->misses
2820 * should be the same as cache->semifree_superstates.
2822 while ((cache->hits + cache->misses) > cache->superstates_allowed)
2825 cache->misses >>= 1;
2827 if ( ((cache->hits + cache->misses) * cache->semifree_superstates)
2828 < (cache->superstates * cache->misses))
2830 semifree_superstate (cache);
2831 semifree_superstate (cache);
2835 while (cache->semifree_superstate && cache->semifree_superstate->locks)
2837 refresh_semifree_superstate (cache, cache->semifree_superstate);
2838 ++locked_superstates;
2839 if (locked_superstates == cache->superstates)
2843 if (cache->semifree_superstate)
2845 it = cache->semifree_superstate;
2846 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2847 it->prev_recyclable->next_recyclable = it->next_recyclable;
2848 cache->semifree_superstate = ((it == it->next_recyclable)
2850 : it->next_recyclable);
2851 --cache->semifree_superstates;
2855 while (cache->lru_superstate->locks)
2857 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2858 ++locked_superstates;
2859 if (locked_superstates == cache->superstates)
2862 it = cache->lru_superstate;
2863 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2864 it->prev_recyclable->next_recyclable = it->next_recyclable;
2865 cache->lru_superstate = ((it == it->next_recyclable)
2867 : it->next_recyclable);
2870 if (it->transition_refs)
2872 struct rx_distinct_future *df;
2873 for (df = it->transition_refs,
2874 df->prev_same_dest->next_same_dest = 0;
2876 df = df->next_same_dest)
2878 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2879 df->future_frame.data = 0;
2880 df->future_frame.data_2 = (void *) df;
2883 it->transition_refs->prev_same_dest->next_same_dest =
2884 it->transition_refs;
2887 struct rx_super_edge *tc = it->edges;
2890 struct rx_distinct_future * df;
2891 struct rx_super_edge *tct = tc->next;
2893 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
2896 struct rx_distinct_future *dft = df;
2897 df = df->next_same_super_edge[0];
2900 if (dft->future && dft->future->transition_refs == dft)
2902 dft->future->transition_refs = dft->next_same_dest;
2903 if (dft->future->transition_refs == dft)
2904 dft->future->transition_refs = 0;
2906 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
2907 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
2908 rx_cache_free (cache, &cache->free_discernable_futures,
2911 rx_cache_free (cache, &cache->free_transition_classes, (char *)tc);
2916 if (it->contents->superstate == it)
2917 it->contents->superstate = 0;
2918 release_superset_low (cache, it->contents);
2919 rx_cache_free (cache, &cache->free_superstates, (char *)it);
2920 --cache->superstates;
2926 rx_cache_get (struct rx_cache * cache,
2927 struct rx_freelist ** freelist)
2930 rx_cache_get (cache, freelist)
2931 struct rx_cache * cache;
2932 struct rx_freelist ** freelist;
2935 while (!*freelist && rx_really_free_superstate (cache))
2940 struct rx_freelist * it = *freelist;
2941 *freelist = it->next;
2948 rx_cache_malloc_or_get (struct rx_cache * cache,
2949 struct rx_freelist ** freelist, int bytes)
2952 rx_cache_malloc_or_get (cache, freelist, bytes)
2953 struct rx_cache * cache;
2954 struct rx_freelist ** freelist;
2960 char * answer = rx_cache_malloc (cache, bytes);
2965 return rx_cache_get (cache, freelist);
2970 rx_cache_get_superstate (struct rx_cache * cache)
2973 rx_cache_get_superstate (cache)
2974 struct rx_cache * cache;
2978 int bytes = ( sizeof (struct rx_superstate)
2979 + cache->local_cset_size * sizeof (struct rx_inx));
2980 if (!cache->free_superstates
2981 && (cache->superstates < cache->superstates_allowed))
2983 answer = rx_cache_malloc (cache, bytes);
2986 ++cache->superstates;
2990 answer = rx_cache_get (cache, &cache->free_superstates);
2993 answer = rx_cache_malloc (cache, bytes);
2995 ++cache->superstates_allowed;
2997 ++cache->superstates;
3005 supersetcmp (void * va, void * vb)
3008 supersetcmp (va, vb)
3013 struct rx_superset * a = (struct rx_superset *)va;
3014 struct rx_superset * b = (struct rx_superset *)vb;
3016 || (a && b && (a->car == b->car) && (a->cdr == b->cdr)));
3020 static struct rx_hash_item *
3021 superset_allocator (struct rx_hash_rules * rules, void * val)
3023 static struct rx_hash_item *
3024 superset_allocator (rules, val)
3025 struct rx_hash_rules * rules;
3029 struct rx_cache * cache
3030 = ((struct rx_cache *)
3032 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3033 struct rx_superset * template = (struct rx_superset *)val;
3034 struct rx_superset * newset
3035 = ((struct rx_superset *)
3036 rx_cache_malloc_or_get (cache,
3037 &cache->free_supersets,
3038 sizeof (*template)));
3042 newset->car = template->car;
3043 newset->id = template->car->id;
3044 newset->cdr = template->cdr;
3045 newset->superstate = 0;
3046 rx_protect_superset (rx, template->cdr);
3047 newset->hash_item.data = (void *)newset;
3048 newset->hash_item.binding = 0;
3049 return &newset->hash_item;
3053 static struct rx_hash *
3054 super_hash_allocator (struct rx_hash_rules * rules)
3056 static struct rx_hash *
3057 super_hash_allocator (rules)
3058 struct rx_hash_rules * rules;
3061 struct rx_cache * cache
3062 = ((struct rx_cache *)
3064 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3065 return ((struct rx_hash *)
3066 rx_cache_malloc_or_get (cache,
3067 &cache->free_hash, sizeof (struct rx_hash)));
3073 super_hash_liberator (struct rx_hash * hash, struct rx_hash_rules * rules)
3076 super_hash_liberator (hash, rules)
3077 struct rx_hash * hash;
3078 struct rx_hash_rules * rules;
3081 struct rx_cache * cache
3082 = ((struct rx_cache *)
3083 (char *)rules - (long)(&((struct rx_cache *)0)->superset_hash_rules));
3084 rx_cache_free (cache, &cache->free_hash, (char *)hash);
3089 superset_hash_item_liberator (struct rx_hash_item * it,
3090 struct rx_hash_rules * rules)
3093 superset_hash_item_liberator (it, rules) /* Well, it does ya know. */
3094 struct rx_hash_item * it;
3095 struct rx_hash_rules * rules;
3100 int rx_cache_bound = 128;
3101 static int rx_default_cache_got = 0;
3105 bytes_for_cache_size (int supers, int cset_size)
3108 bytes_for_cache_size (supers, cset_size)
3113 /* What the hell is this? !!!*/
3116 ( (1.03 * (float) ( rx_sizeof_bitset (cset_size)
3117 + sizeof (struct rx_super_edge)))
3118 + (1.80 * (float) sizeof (struct rx_possible_future))
3119 + (float) ( sizeof (struct rx_superstate)
3120 + cset_size * sizeof (struct rx_inx))));
3125 rx_morecore (struct rx_cache * cache)
3129 struct rx_cache * cache;
3132 if (rx_default_cache_got >= rx_cache_bound)
3135 rx_default_cache_got += 16;
3136 cache->superstates_allowed = rx_cache_bound;
3138 struct rx_blocklist ** pos = &cache->memory;
3139 int size = bytes_for_cache_size (16, cache->local_cset_size);
3141 pos = &(*pos)->next;
3142 *pos = ((struct rx_blocklist *)
3143 malloc (size + sizeof (struct rx_blocklist)));
3148 (*pos)->bytes = size;
3149 cache->memory_pos = *pos;
3150 cache->memory_addr = (char *)*pos + sizeof (**pos);
3151 cache->bytes_left = size;
3155 static struct rx_cache default_cache =
3159 super_hash_allocator,
3160 super_hash_liberator,
3162 superset_hash_item_liberator,
3188 rx_id_instruction_table,
3199 /* This adds an element to a superstate set. These sets are lists, such
3200 * that lists with == elements are ==. The empty set is returned by
3201 * superset_cons (rx, 0, 0) and is NOT equivelent to
3202 * (struct rx_superset)0.
3206 RX_DECL struct rx_superset *
3207 rx_superset_cons (struct rx * rx,
3208 struct rx_nfa_state *car, struct rx_superset *cdr)
3210 RX_DECL struct rx_superset *
3211 rx_superset_cons (rx, car, cdr)
3213 struct rx_nfa_state *car;
3214 struct rx_superset *cdr;
3217 struct rx_cache * cache = rx->cache;
3220 if (!cache->empty_superset)
3222 cache->empty_superset
3223 = ((struct rx_superset *)
3224 rx_cache_malloc_or_get (cache, &cache->free_supersets,
3225 sizeof (struct rx_superset)));
3226 if (!cache->empty_superset)
3228 bzero (cache->empty_superset, sizeof (struct rx_superset));
3229 cache->empty_superset->refs = 1000;
3231 return cache->empty_superset;
3234 struct rx_superset template;
3235 struct rx_hash_item * hit;
3238 template.id = car->id;
3239 hit = rx_hash_store (&cache->superset_table,
3240 (unsigned long)car ^ car->id ^ (unsigned long)cdr,
3242 &cache->superset_hash_rules);
3244 ? (struct rx_superset *)hit->data
3249 /* This computes a union of two NFA state sets. The sets do not have the
3250 * same representation though. One is a RX_SUPERSET structure (part
3251 * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA).
3255 RX_DECL struct rx_superset *
3256 rx_superstate_eclosure_union
3257 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl)
3259 RX_DECL struct rx_superset *
3260 rx_superstate_eclosure_union (rx, set, ecl)
3262 struct rx_superset *set;
3263 struct rx_nfa_state_set *ecl;
3270 return rx_superset_cons (rx, ecl->car,
3271 rx_superstate_eclosure_union (rx, set, ecl->cdr));
3272 if (set->car == ecl->car)
3273 return rx_superstate_eclosure_union (rx, set, ecl->cdr);
3276 struct rx_superset * tail;
3277 struct rx_nfa_state * first;
3279 if (set->car > ecl->car)
3281 tail = rx_superstate_eclosure_union (rx, set->cdr, ecl);
3286 tail = rx_superstate_eclosure_union (rx, set, ecl->cdr);
3293 struct rx_superset * answer;
3294 answer = rx_superset_cons (rx, first, tail);
3297 rx_protect_superset (rx, tail);
3298 rx_release_superset (rx, tail);
3311 * This makes sure that a list of rx_distinct_futures contains
3312 * a future for each possible set of side effects in the eclosure
3313 * of a given state. This is some of the work of filling in a
3314 * superstate transition.
3318 static struct rx_distinct_future *
3319 include_futures (struct rx *rx,
3320 struct rx_distinct_future *df, struct rx_nfa_state
3321 *state, struct rx_superstate *superstate)
3323 static struct rx_distinct_future *
3324 include_futures (rx, df, state, superstate)
3326 struct rx_distinct_future *df;
3327 struct rx_nfa_state *state;
3328 struct rx_superstate *superstate;
3331 struct rx_possible_future *future;
3332 struct rx_cache * cache = rx->cache;
3333 for (future = state->futures; future; future = future->next)
3335 struct rx_distinct_future *dfp;
3336 struct rx_distinct_future *insert_before = 0;
3338 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3339 for (dfp = df; dfp; dfp = dfp->next_same_super_edge[0])
3340 if (dfp->effects == future->effects)
3344 int order = rx->se_list_cmp (rx, dfp->effects, future->effects);
3347 insert_before = dfp;
3353 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3357 = ((struct rx_distinct_future *)
3358 rx_cache_malloc_or_get (cache, &cache->free_discernable_futures,
3359 sizeof (struct rx_distinct_future)));
3364 df = insert_before = dfp;
3365 df->next_same_super_edge[0] = df->next_same_super_edge[1] = df;
3367 else if (!insert_before)
3369 else if (insert_before == df)
3372 dfp->next_same_super_edge[0] = insert_before;
3373 dfp->next_same_super_edge[1]
3374 = insert_before->next_same_super_edge[1];
3375 dfp->next_same_super_edge[1]->next_same_super_edge[0] = dfp;
3376 dfp->next_same_super_edge[0]->next_same_super_edge[1] = dfp;
3377 dfp->next_same_dest = dfp->prev_same_dest = dfp;
3379 dfp->present = superstate;
3380 dfp->future_frame.inx = rx->instruction_table[rx_cache_miss];
3381 dfp->future_frame.data = 0;
3382 dfp->future_frame.data_2 = (void *) dfp;
3383 dfp->side_effects_frame.inx
3384 = rx->instruction_table[rx_do_side_effects];
3385 dfp->side_effects_frame.data = 0;
3386 dfp->side_effects_frame.data_2 = (void *) dfp;
3387 dfp->effects = future->effects;
3395 /* This constructs a new superstate from its state set. The only
3396 * complexity here is memory management.
3399 RX_DECL struct rx_superstate *
3400 rx_superstate (struct rx *rx,
3401 struct rx_superset *set)
3403 RX_DECL struct rx_superstate *
3404 rx_superstate (rx, set)
3406 struct rx_superset *set;
3409 struct rx_cache * cache = rx->cache;
3410 struct rx_superstate * superstate = 0;
3412 /* Does the superstate already exist in the cache? */
3413 if (set->superstate)
3415 if (set->superstate->rx_id != rx->rx_id)
3417 /* Aha. It is in the cache, but belongs to a superstate
3418 * that refers to an NFA that no longer exists.
3419 * (We know it no longer exists because it was evidently
3420 * stored in the same region of memory as the current nfa
3421 * yet it has a different id.)
3423 superstate = set->superstate;
3424 if (!superstate->is_semifree)
3426 if (cache->lru_superstate == superstate)
3428 cache->lru_superstate = superstate->next_recyclable;
3429 if (cache->lru_superstate == superstate)
3430 cache->lru_superstate = 0;
3433 superstate->next_recyclable->prev_recyclable
3434 = superstate->prev_recyclable;
3435 superstate->prev_recyclable->next_recyclable
3436 = superstate->next_recyclable;
3437 if (!cache->semifree_superstate)
3439 (cache->semifree_superstate
3440 = superstate->next_recyclable
3441 = superstate->prev_recyclable
3446 superstate->next_recyclable = cache->semifree_superstate;
3447 superstate->prev_recyclable
3448 = cache->semifree_superstate->prev_recyclable;
3449 superstate->next_recyclable->prev_recyclable
3451 superstate->prev_recyclable->next_recyclable
3453 cache->semifree_superstate = superstate;
3455 ++cache->semifree_superstates;
3458 set->superstate = 0;
3459 goto handle_cache_miss;
3462 superstate = set->superstate;
3464 rx_refresh_this_superstate (cache, superstate);
3470 /* This point reached only for cache misses. */
3473 if (rx_debug_trace > 1)
3475 struct rx_superset * setp = set;
3476 fprintf (stderr, "Building a superstet %d(%d): ", rx->rx_id, set);
3479 fprintf (stderr, "%d ", setp->id);
3482 fprintf (stderr, "(%d)\n", set);
3485 superstate = (struct rx_superstate *)rx_cache_get_superstate (cache);
3489 if (!cache->lru_superstate)
3490 (cache->lru_superstate
3491 = superstate->next_recyclable
3492 = superstate->prev_recyclable
3496 superstate->next_recyclable = cache->lru_superstate;
3497 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
3498 ( superstate->prev_recyclable->next_recyclable
3499 = superstate->next_recyclable->prev_recyclable
3502 superstate->rx_id = rx->rx_id;
3503 superstate->transition_refs = 0;
3504 superstate->locks = 0;
3505 superstate->is_semifree = 0;
3506 set->superstate = superstate;
3507 superstate->contents = set;
3508 rx_protect_superset (rx, set);
3509 superstate->edges = 0;
3512 /* None of the transitions from this superstate are known yet. */
3513 for (x = 0; x < rx->local_cset_size; ++x) /* &&&&& 3.8 % */
3515 struct rx_inx * ifr = &superstate->transitions[x];
3516 ifr->inx = rx->instruction_table [rx_cache_miss];
3517 ifr->data = ifr->data_2 = 0;
3524 /* This computes the destination set of one edge of the superstate NFA.
3525 * Note that a RX_DISTINCT_FUTURE is a superstate edge.
3526 * Returns 0 on an allocation failure.
3531 solve_destination (struct rx *rx, struct rx_distinct_future *df)
3534 solve_destination (rx, df)
3536 struct rx_distinct_future *df;
3539 struct rx_super_edge *tc = df->edge;
3540 struct rx_superset *nfa_state;
3541 struct rx_superset *nil_set = rx_superset_cons (rx, 0, 0);
3542 struct rx_superset *solution = nil_set;
3543 struct rx_superstate *dest;
3545 rx_protect_superset (rx, solution);
3546 /* Iterate over all NFA states in the state set of this superstate. */
3547 for (nfa_state = df->present->contents;
3549 nfa_state = nfa_state->cdr)
3551 struct rx_nfa_edge *e;
3552 /* Iterate over all edges of each NFA state. */
3553 for (e = nfa_state->car->edges; e; e = e->next)
3554 /* If we find an edge that is labeled with
3555 * the characters we are solving for.....
3557 if (rx_bitset_is_subset (rx->local_cset_size,
3558 tc->cset, e->params.cset))
3560 struct rx_nfa_state *n = e->dest;
3561 struct rx_possible_future *pf;
3562 /* ....search the partial epsilon closures of the destination
3563 * of that edge for a path that involves the same set of
3564 * side effects we are solving for.
3565 * If we find such a RX_POSSIBLE_FUTURE, we add members to the
3566 * stateset we are computing.
3568 for (pf = n->futures; pf; pf = pf->next)
3569 if (pf->effects == df->effects)
3571 struct rx_superset * old_sol;
3573 solution = rx_superstate_eclosure_union (rx, solution,
3577 rx_protect_superset (rx, solution);
3578 rx_release_superset (rx, old_sol);
3582 /* It is possible that the RX_DISTINCT_FUTURE we are working on has
3583 * the empty set of NFA states as its definition. In that case, this
3584 * is a failure point.
3586 if (solution == nil_set)
3588 df->future_frame.inx = (void *) rx_backtrack;
3589 df->future_frame.data = 0;
3590 df->future_frame.data_2 = 0;
3593 dest = rx_superstate (rx, solution);
3594 rx_release_superset (rx, solution);
3599 struct rx_distinct_future *dft;
3601 df->prev_same_dest->next_same_dest = 0;
3605 dft->future_frame.inx = rx->instruction_table[rx_next_char];
3606 dft->future_frame.data = (void *) dest->transitions;
3607 dft = dft->next_same_dest;
3609 df->prev_same_dest->next_same_dest = df;
3611 if (!dest->transition_refs)
3612 dest->transition_refs = df;
3615 struct rx_distinct_future *dft = dest->transition_refs->next_same_dest;
3616 dest->transition_refs->next_same_dest = df->next_same_dest;
3617 df->next_same_dest->prev_same_dest = dest->transition_refs;
3618 df->next_same_dest = dft;
3619 dft->prev_same_dest = df;
3625 /* This takes a superstate and a character, and computes some edges
3626 * from the superstate NFA. In particular, this computes all edges
3627 * that lead from SUPERSTATE given CHR. This function also
3628 * computes the set of characters that share this edge set.
3629 * This returns 0 on allocation error.
3630 * The character set and list of edges are returned through
3631 * the paramters CSETOUT and DFOUT.
3636 compute_super_edge (struct rx *rx, struct rx_distinct_future **dfout,
3637 rx_Bitset csetout, struct rx_superstate *superstate,
3641 compute_super_edge (rx, dfout, csetout, superstate, chr)
3643 struct rx_distinct_future **dfout;
3645 struct rx_superstate *superstate;
3649 struct rx_superset *stateset = superstate->contents;
3651 /* To compute the set of characters that share edges with CHR,
3652 * we start with the full character set, and subtract.
3654 rx_bitset_universe (rx->local_cset_size, csetout);
3657 /* Iterate over the NFA states in the superstate state-set. */
3658 while (stateset->car)
3660 struct rx_nfa_edge *e;
3661 for (e = stateset->car->edges; e; e = e->next)
3662 if (RX_bitset_member (e->params.cset, chr))
3664 /* If we find an NFA edge that applies, we make sure there
3665 * are corresponding edges in the superstate NFA.
3668 struct rx_distinct_future * saved;
3670 *dfout = include_futures (rx, *dfout, e->dest, superstate);
3673 struct rx_distinct_future * df;
3675 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3678 struct rx_distinct_future *dft;
3680 df = df->next_same_super_edge[0];
3682 if (dft->future && dft->future->transition_refs == dft)
3684 dft->future->transition_refs = dft->next_same_dest;
3685 if (dft->future->transition_refs == dft)
3686 dft->future->transition_refs = 0;
3688 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
3689 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
3690 rx_cache_free (rx->cache,
3691 &rx->cache->free_discernable_futures,
3697 /* We also trim the character set a bit. */
3698 rx_bitset_intersection (rx->local_cset_size,
3699 csetout, e->params.cset);
3702 /* An edge that doesn't apply at least tells us some characters
3703 * that don't share the same edge set as CHR.
3705 rx_bitset_difference (rx->local_cset_size, csetout, e->params.cset);
3706 stateset = stateset->cdr;
3712 /* This is a constructor for RX_SUPER_EDGE structures. These are
3713 * wrappers for lists of superstate NFA edges that share character sets labels.
3714 * If a transition class contains more than one rx_distinct_future (superstate
3715 * edge), then it represents a non-determinism in the superstate NFA.
3719 static struct rx_super_edge *
3720 rx_super_edge (struct rx *rx,
3721 struct rx_superstate *super, rx_Bitset cset,
3722 struct rx_distinct_future *df)
3724 static struct rx_super_edge *
3725 rx_super_edge (rx, super, cset, df)
3727 struct rx_superstate *super;
3729 struct rx_distinct_future *df;
3732 struct rx_super_edge *tc =
3733 (struct rx_super_edge *)rx_cache_malloc_or_get
3734 (rx->cache, &rx->cache->free_transition_classes,
3735 sizeof (struct rx_super_edge) + rx_sizeof_bitset (rx->local_cset_size));
3739 tc->next = super->edges;
3741 tc->rx_backtrack_frame.inx = rx->instruction_table[rx_backtrack_point];
3742 tc->rx_backtrack_frame.data = 0;
3743 tc->rx_backtrack_frame.data_2 = (void *) tc;
3745 tc->cset = (rx_Bitset) ((char *) tc + sizeof (*tc));
3746 rx_bitset_assign (rx->local_cset_size, tc->cset, cset);
3749 struct rx_distinct_future * dfp = df;
3750 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3754 dfp = dfp->next_same_super_edge[0];
3756 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3762 /* There are three kinds of cache miss. The first occurs when a
3763 * transition is taken that has never been computed during the
3764 * lifetime of the source superstate. That cache miss is handled by
3765 * calling COMPUTE_SUPER_EDGE. The second kind of cache miss
3766 * occurs when the destination superstate of a transition doesn't
3767 * exist. SOLVE_DESTINATION is used to construct the destination superstate.
3768 * Finally, the third kind of cache miss occurs when the destination
3769 * superstate of a transition is in a `semi-free state'. That case is
3770 * handled by UNFREE_SUPERSTATE.
3772 * The function of HANDLE_CACHE_MISS is to figure out which of these
3778 install_partial_transition (struct rx_superstate *super,
3779 struct rx_inx *answer,
3780 RX_subset set, int offset)
3783 install_partial_transition (super, answer, set, offset)
3784 struct rx_superstate *super;
3785 struct rx_inx *answer;
3791 int end = start + 32;
3793 struct rx_inx * transitions = super->transitions;
3798 transitions[start] = *answer;
3806 RX_DECL struct rx_inx *
3807 rx_handle_cache_miss
3808 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data)
3810 RX_DECL struct rx_inx *
3811 rx_handle_cache_miss (rx, super, chr, data)
3813 struct rx_superstate *super;
3818 int offset = chr / RX_subset_bits;
3819 struct rx_distinct_future *df = data;
3821 if (!df) /* must be the shared_cache_miss_frame */
3823 /* Perhaps this is just a transition waiting to be filled. */
3824 struct rx_super_edge *tc;
3825 RX_subset mask = rx_subset_singletons [chr % RX_subset_bits];
3827 for (tc = super->edges; tc; tc = tc->next)
3828 if (tc->cset[offset] & mask)
3830 struct rx_inx * answer;
3832 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3833 ? &tc->rx_backtrack_frame
3835 ? &df->side_effects_frame
3836 : &df->future_frame));
3837 install_partial_transition (super, answer,
3838 tc->cset [offset], offset * 32);
3841 /* Otherwise, it's a flushed or newly encountered edge. */
3843 char cset_space[1024]; /* this limit is far from unreasonable */
3845 struct rx_inx *answer;
3847 if (rx_sizeof_bitset (rx->local_cset_size) > sizeof (cset_space))
3848 return 0; /* If the arbitrary limit is hit, always fail */
3850 trcset = (rx_Bitset)cset_space;
3851 rx_lock_superstate (rx, super);
3852 if (!compute_super_edge (rx, &df, trcset, super, chr))
3854 rx_unlock_superstate (rx, super);
3857 if (!df) /* We just computed the fail transition. */
3859 static struct rx_inx
3860 shared_fail_frame = { 0, 0, (void *)rx_backtrack, 0 };
3861 answer = &shared_fail_frame;
3865 tc = rx_super_edge (rx, super, trcset, df);
3868 rx_unlock_superstate (rx, super);
3871 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3872 ? &tc->rx_backtrack_frame
3874 ? &df->side_effects_frame
3875 : &df->future_frame));
3877 install_partial_transition (super, answer,
3878 trcset[offset], offset * 32);
3879 rx_unlock_superstate (rx, super);
3883 else if (df->future) /* A cache miss on an edge with a future? Must be
3884 * a semi-free destination. */
3886 if (df->future->is_semifree)
3887 refresh_semifree_superstate (rx->cache, df->future);
3888 return &df->future_frame;
3891 /* no future superstate on an existing edge */
3893 rx_lock_superstate (rx, super);
3894 if (!solve_destination (rx, df))
3896 rx_unlock_superstate (rx, super);
3900 && (df->edge->options->next_same_super_edge[0] == df->edge->options))
3901 install_partial_transition (super, &df->future_frame,
3902 df->edge->cset[offset], offset * 32);
3903 rx_unlock_superstate (rx, super);
3904 return &df->future_frame;
3911 /* The rest of the code provides a regex.c compatable interface. */
3914 __const__ char *re_error_msg[] =
3917 "No match", /* REG_NOMATCH */
3918 "Invalid regular expression", /* REG_BADPAT */
3919 "Invalid collation character", /* REG_ECOLLATE */
3920 "Invalid character class name", /* REG_ECTYPE */
3921 "Trailing backslash", /* REG_EESCAPE */
3922 "Invalid back reference", /* REG_ESUBREG */
3923 "Unmatched [ or [^", /* REG_EBRACK */
3924 "Unmatched ( or \\(", /* REG_EPAREN */
3925 "Unmatched \\{", /* REG_EBRACE */
3926 "Invalid content of \\{\\}", /* REG_BADBR */
3927 "Invalid range end", /* REG_ERANGE */
3928 "Memory exhausted", /* REG_ESPACE */
3929 "Invalid preceding regular expression", /* REG_BADRPT */
3930 "Premature end of regular expression", /* REG_EEND */
3931 "Regular expression too big", /* REG_ESIZE */
3932 "Unmatched ) or \\)", /* REG_ERPAREN */
3938 * Macros used while compiling patterns.
3940 * By convention, PEND points just past the end of the uncompiled pattern,
3941 * P points to the read position in the pattern. `translate' is the name
3942 * of the translation table (`TRANSLATE' is the name of a macro that looks
3943 * things up in `translate').
3948 * Fetch the next character in the uncompiled pattern---translating it
3949 * if necessary. *Also cast from a signed character in the constant
3950 * string passed to us by the user to an unsigned char that we can use
3951 * as an array index (in, e.g., `translate').
3953 #define PATFETCH(c) \
3954 do {if (p == pend) return REG_EEND; \
3955 c = (unsigned char) *p++; \
3960 * Fetch the next character in the uncompiled pattern, with no
3963 #define PATFETCH_RAW(c) \
3964 do {if (p == pend) return REG_EEND; \
3965 c = (unsigned char) *p++; \
3968 /* Go backwards one character in the pattern. */
3969 #define PATUNFETCH p--
3972 #define TRANSLATE(d) translate[(unsigned char) (d)]
3974 typedef unsigned regnum_t;
3976 /* Since offsets can go either forwards or backwards, this type needs to
3977 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
3979 typedef int pattern_offset_t;
3983 struct rexp_node ** top_expression; /* was begalt */
3984 struct rexp_node ** last_expression; /* was laststart */
3985 pattern_offset_t inner_group_offset;
3987 } compile_stack_elt_t;
3991 compile_stack_elt_t *stack;
3993 unsigned avail; /* Offset of next open position. */
3994 } compile_stack_type;
3997 #define INIT_COMPILE_STACK_SIZE 32
3999 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
4000 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
4002 /* The next available element. */
4003 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
4006 /* Set the bit for character C in a list. */
4007 #define SET_LIST_BIT(c) \
4008 (b[((unsigned char) (c)) / CHARBITS] \
4009 |= 1 << (((unsigned char) c) % CHARBITS))
4011 /* Get the next unsigned number in the uncompiled pattern. */
4012 #define GET_UNSIGNED_NUMBER(num) \
4016 while (isdigit (c)) \
4020 num = num * 10 + c - '0'; \
4028 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
4030 #define IS_CHAR_CLASS(string) \
4031 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
4032 || !strcmp (string, "lower") || !strcmp (string, "digit") \
4033 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
4034 || !strcmp (string, "space") || !strcmp (string, "print") \
4035 || !strcmp (string, "punct") || !strcmp (string, "graph") \
4036 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
4039 /* These predicates are used in regex_compile. */
4041 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4042 * after an alternative or a begin-subexpression. We assume there is at
4043 * least one character before the ^.
4048 at_begline_loc_p (__const__ char *pattern, __const__ char * p, reg_syntax_t syntax)
4051 at_begline_loc_p (pattern, p, syntax)
4052 __const__ char *pattern;
4054 reg_syntax_t syntax;
4057 __const__ char *prev = p - 2;
4058 boolean prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
4062 (/* After a subexpression? */
4063 ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
4065 /* After an alternative? */
4066 ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
4070 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4071 * at least one character after the $, i.e., `P < PEND'.
4076 at_endline_loc_p (__const__ char *p, __const__ char *pend, int syntax)
4079 at_endline_loc_p (p, pend, syntax)
4081 __const__ char *pend;
4085 __const__ char *next = p;
4086 boolean next_backslash = (*next == '\\');
4087 __const__ char *next_next = (p + 1 < pend) ? (p + 1) : 0;
4091 /* Before a subexpression? */
4092 ((syntax & RE_NO_BK_PARENS)
4094 : (next_backslash && next_next && (*next_next == ')')))
4096 /* Before an alternative? */
4097 ((syntax & RE_NO_BK_VBAR)
4099 : (next_backslash && next_next && (*next_next == '|')))
4104 unsigned char rx_id_translation[256] =
4106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4107 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
4108 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
4109 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
4110 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
4111 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
4112 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
4113 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
4114 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
4115 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
4117 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
4118 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
4119 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
4120 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
4121 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
4122 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
4123 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
4124 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
4125 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
4126 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
4128 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
4129 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
4130 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
4131 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
4132 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
4133 250, 251, 252, 253, 254, 255
4136 /* The compiler keeps an inverted translation table.
4137 * This looks up/inititalize elements.
4138 * VALID is an array of booleans that validate CACHE.
4143 inverse_translation (struct re_pattern_buffer * rxb,
4144 char * valid, rx_Bitset cache,
4145 unsigned char * translate, int c)
4148 inverse_translation (rxb, valid, cache, translate, c)
4149 struct re_pattern_buffer * rxb;
4152 unsigned char * translate;
4157 = cache + c * rx_bitset_numb_subsets (rxb->rx.local_cset_size);
4162 int c_tr = TRANSLATE(c);
4163 rx_bitset_null (rxb->rx.local_cset_size, cs);
4164 for (x = 0; x < 256; ++x) /* &&&& 13.37 */
4165 if (TRANSLATE(x) == c_tr)
4166 RX_bitset_enjoin (cs, x);
4175 /* More subroutine declarations and macros for regex_compile. */
4177 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4178 false if it's not. */
4182 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4185 group_in_compile_stack (compile_stack, regnum)
4186 compile_stack_type compile_stack;
4192 for (this_element = compile_stack.avail - 1;
4195 if (compile_stack.stack[this_element].regnum == regnum)
4203 * Read the ending character of a range (in a bracket expression) from the
4204 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
4205 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4206 * Then we set the translation of all bits between the starting and
4207 * ending characters (inclusive) in the compiled pattern B.
4209 * Return an error code.
4211 * We use these short variable names so we can use the same macros as
4212 * `regex_compile' itself.
4216 static reg_errcode_t
4217 compile_range (struct re_pattern_buffer * rxb, rx_Bitset cs,
4218 __const__ char ** p_ptr, __const__ char * pend,
4219 unsigned char * translate, reg_syntax_t syntax,
4220 rx_Bitset inv_tr, char * valid_inv_tr)
4222 static reg_errcode_t
4223 compile_range (rxb, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
4224 struct re_pattern_buffer * rxb;
4226 __const__ char ** p_ptr;
4227 __const__ char * pend;
4228 unsigned char * translate;
4229 reg_syntax_t syntax;
4231 char * valid_inv_tr;
4236 __const__ char *p = *p_ptr;
4238 unsigned char range_end;
4239 unsigned char range_start = TRANSLATE(p[-2]);
4244 PATFETCH (range_end);
4248 if (range_start > range_end)
4249 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4251 for (this_char = range_start; this_char <= range_end; this_char++)
4254 inverse_translation (rxb, valid_inv_tr, inv_tr, translate, this_char);
4255 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
4262 /* This searches a regexp for backreference side effects.
4263 * It fills in the array OUT with 1 at the index of every register pair
4264 * referenced by a backreference.
4266 * This is used to help optimize patterns for searching. The information is
4267 * useful because, if the caller doesn't want register values, backreferenced
4268 * registers are the only registers for which we need rx_backtrack.
4273 find_backrefs (char * out, struct rexp_node * rexp,
4274 struct re_se_params * params)
4277 find_backrefs (out, rexp, params)
4279 struct rexp_node * rexp;
4280 struct re_se_params * params;
4294 find_backrefs (out, rexp->params.pair.left, params);
4295 find_backrefs (out, rexp->params.pair.right, params);
4298 if ( ((long)rexp->params.side_effect >= 0)
4299 && (params [(long)rexp->params.side_effect].se == re_se_backref))
4300 out[ params [(long)rexp->params.side_effect].op1] = 1;
4307 /* Returns 0 unless the pattern can match the empty string. */
4311 compute_fastset (struct re_pattern_buffer * rxb, struct rexp_node * rexp)
4314 compute_fastset (rxb, rexp)
4315 struct re_pattern_buffer * rxb;
4316 struct rexp_node * rexp;
4327 rx_bitset_union (rxb->rx.local_cset_size,
4328 rxb->fastset, rexp->params.cset);
4332 return (compute_fastset (rxb, rexp->params.pair.left)
4333 && compute_fastset (rxb, rexp->params.pair.right));
4335 compute_fastset (rxb, rexp->params.pair.left);
4336 /* compute_fastset (rxb, rexp->params.pair.right); nope... */
4339 return !!(compute_fastset (rxb, rexp->params.pair.left)
4340 + compute_fastset (rxb, rexp->params.pair.right));
4343 compute_fastset (rxb, rexp->params.pair.left);
4349 /* this should never happen */
4355 * 1 -- yes, definately anchored by the given side effect.
4356 * 2 -- maybe anchored, maybe the empty string.
4357 * 0 -- definately not anchored
4358 * There is simply no other possibility.
4363 is_anchored (struct rexp_node * rexp, rx_side_effect se)
4366 is_anchored (rexp, se)
4367 struct rexp_node * rexp;
4381 int l = is_anchored (rexp->params.pair.left, se);
4382 return (l == 2 ? is_anchored (rexp->params.pair.right, se) : l);
4386 int l = is_anchored (rexp->params.pair.left, se);
4387 int r = l ? is_anchored (rexp->params.pair.right, se) : 0;
4391 else if ((l == 0) || (r == 0))
4398 return is_anchored (rexp->params.pair.left, se) ? 2 : 0;
4401 return ((rexp->params.side_effect == se)
4405 /* this should never happen */
4410 /* This removes register assignments that aren't required by backreferencing.
4411 * This can speed up explore_future, especially if it eliminates
4412 * non-determinism in the superstate NFA.
4414 * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS.
4415 * The non-zero elements of the array indicate which register assignments
4416 * can NOT be removed from the expression.
4420 static struct rexp_node *
4421 remove_unecessary_side_effects (struct rx * rx, char * needed,
4422 struct rexp_node * rexp,
4423 struct re_se_params * params)
4425 static struct rexp_node *
4426 remove_unecessary_side_effects (rx, needed, rexp, params)
4429 struct rexp_node * rexp;
4430 struct re_se_params * params;
4433 struct rexp_node * l;
4434 struct rexp_node * r;
4446 l = remove_unecessary_side_effects (rx, needed,
4447 rexp->params.pair.left, params);
4448 r = remove_unecessary_side_effects (rx, needed,
4449 rexp->params.pair.right, params);
4450 if ((l && r) || (rexp->type != r_concat))
4452 rexp->params.pair.left = l;
4453 rexp->params.pair.right = r;
4458 rexp->params.pair.left = rexp->params.pair.right = 0;
4459 rx_free_rexp (rx, rexp);
4464 l = remove_unecessary_side_effects (rx, needed,
4465 rexp->params.pair.left, params);
4468 rexp->params.pair.left = l;
4473 rexp->params.pair.left = 0;
4474 rx_free_rexp (rx, rexp);
4479 int se = (long)rexp->params.side_effect;
4481 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4482 || ((enum re_side_effects)params[se].se == re_se_rparen))
4483 && (params [se].op1 > 0)
4484 && (!needed [params [se].op1]))
4486 rx_free_rexp (rx, rexp);
4494 /* this should never happen */
4502 pointless_if_repeated (struct rexp_node * node, struct re_se_params * params)
4505 pointless_if_repeated (node, params)
4506 struct rexp_node * node;
4507 struct re_se_params * params;
4519 return (pointless_if_repeated (node->params.pair.left, params)
4520 && pointless_if_repeated (node->params.pair.right, params));
4523 return pointless_if_repeated (node->params.pair.left, params);
4525 switch (((long)node->params.side_effect < 0)
4526 ? (enum re_side_effects)node->params.side_effect
4527 : (enum re_side_effects)params[(long)node->params.side_effect].se)
4534 case re_se_wordbound:
4535 case re_se_notwordbound:
4545 case re_se_end_iter:
4547 case re_se_not_syntax:
4561 registers_on_stack (struct re_pattern_buffer * rxb,
4562 struct rexp_node * rexp, int in_danger,
4563 struct re_se_params * params)
4566 registers_on_stack (rxb, rexp, in_danger, params)
4567 struct re_pattern_buffer * rxb;
4568 struct rexp_node * rexp;
4570 struct re_se_params * params;
4583 return ( registers_on_stack (rxb, rexp->params.pair.left,
4585 || (registers_on_stack
4586 (rxb, rexp->params.pair.right,
4587 in_danger, params)));
4589 return registers_on_stack (rxb, rexp->params.pair.left, 0, params);
4591 return registers_on_stack (rxb, rexp->params.pair.left, 1, params);
4594 ( registers_on_stack (rxb, rexp->params.pair.left, 1, params)
4595 || registers_on_stack (rxb, rexp->params.pair.right, 1, params));
4598 int se = (long)rexp->params.side_effect;
4601 && (params [se].op1 > 0)
4602 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4603 || ((enum re_side_effects)params[se].se == re_se_rparen)))
4610 /* this should never happen */
4616 static char idempotent_complex_se[] =
4618 #define RX_WANT_SE_DEFS 1
4620 #undef RX_DEF_CPLX_SE
4621 #define RX_DEF_SE(IDEM, NAME, VALUE)
4622 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM,
4625 #undef RX_DEF_CPLX_SE
4626 #undef RX_WANT_SE_DEFS
4630 static char idempotent_se[] =
4633 #define RX_WANT_SE_DEFS 1
4635 #undef RX_DEF_CPLX_SE
4636 #define RX_DEF_SE(IDEM, NAME, VALUE) IDEM,
4637 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)
4640 #undef RX_DEF_CPLX_SE
4641 #undef RX_WANT_SE_DEFS
4650 has_any_se (struct rx * rx,
4651 struct rexp_node * rexp)
4654 has_any_se (rx, rexp)
4656 struct rexp_node * rexp;
4675 ( has_any_se (rx, rexp->params.pair.left)
4676 || has_any_se (rx, rexp->params.pair.right));
4680 return has_any_se (rx, rexp->params.pair.left);
4683 /* this should never happen */
4689 /* This must be called AFTER `convert_hard_loops' for a given REXP. */
4692 has_non_idempotent_epsilon_path (struct rx * rx,
4693 struct rexp_node * rexp,
4694 struct re_se_params * params)
4697 has_non_idempotent_epsilon_path (rx, rexp, params)
4699 struct rexp_node * rexp;
4700 struct re_se_params * params;
4715 !((long)rexp->params.side_effect > 0
4716 ? idempotent_complex_se [ params [(long)rexp->params.side_effect].se ]
4717 : idempotent_se [-(long)rexp->params.side_effect]);
4721 ( has_non_idempotent_epsilon_path (rx,
4722 rexp->params.pair.left, params)
4723 || has_non_idempotent_epsilon_path (rx,
4724 rexp->params.pair.right, params));
4729 ( has_non_idempotent_epsilon_path (rx,
4730 rexp->params.pair.left, params)
4731 && has_non_idempotent_epsilon_path (rx,
4732 rexp->params.pair.right, params));
4735 return has_non_idempotent_epsilon_path (rx,
4736 rexp->params.pair.left, params);
4739 /* this should never happen */
4745 /* This computes rougly what it's name suggests. It can (and does) go wrong
4746 * in the direction of returning spurious 0 without causing disasters.
4750 begins_with_complex_se (struct rx * rx, struct rexp_node * rexp)
4753 begins_with_complex_se (rx, rexp)
4755 struct rexp_node * rexp;
4768 return ((long)rexp->params.side_effect >= 0);
4772 ( begins_with_complex_se (rx, rexp->params.pair.left)
4773 && begins_with_complex_se (rx, rexp->params.pair.right));
4777 return has_any_se (rx, rexp->params.pair.left);
4784 /* this should never happen */
4789 /* This destructively removes some of the re_se_tv side effects from
4790 * a rexp tree. In particular, during parsing re_se_tv was inserted on the
4791 * right half of every | to guarantee that posix path preference could be
4792 * honored. This function removes some which it can be determined aren't
4798 speed_up_alt (struct rx * rx,
4799 struct rexp_node * rexp,
4803 speed_up_alt (rx, rexp, unposix)
4805 struct rexp_node * rexp;
4821 speed_up_alt (rx, rexp->params.pair.left, unposix);
4826 speed_up_alt (rx, rexp->params.pair.left, unposix);
4827 speed_up_alt (rx, rexp->params.pair.right, unposix);
4831 /* the right child is guaranteed to be (concat re_se_tv <subexp>) */
4833 speed_up_alt (rx, rexp->params.pair.left, unposix);
4834 speed_up_alt (rx, rexp->params.pair.right->params.pair.right, unposix);
4837 || (begins_with_complex_se
4838 (rx, rexp->params.pair.right->params.pair.right))
4839 || !( has_any_se (rx, rexp->params.pair.right->params.pair.right)
4840 || has_any_se (rx, rexp->params.pair.left)))
4842 struct rexp_node * conc = rexp->params.pair.right;
4843 rexp->params.pair.right = conc->params.pair.right;
4844 conc->params.pair.right = 0;
4845 rx_free_rexp (rx, conc);
4854 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
4855 Returns one of error codes defined in `regex.h', or zero for success.
4857 Assumes the `allocated' (and perhaps `buffer') and `translate'
4858 fields are set in BUFP on entry.
4860 If it succeeds, results are put in BUFP (if it returns an error, the
4861 contents of BUFP are undefined):
4862 `buffer' is the compiled pattern;
4863 `syntax' is set to SYNTAX;
4864 `used' is set to the length of the compiled pattern;
4865 `fastmap_accurate' is set to zero;
4866 `re_nsub' is set to the number of groups in PATTERN;
4867 `not_bol' and `not_eol' are set to zero.
4869 The `fastmap' and `newline_anchor' fields are neither
4870 examined nor set. */
4875 RX_DECL reg_errcode_t
4876 rx_compile (__const__ char *pattern, int size,
4877 reg_syntax_t syntax,
4878 struct re_pattern_buffer * rxb)
4880 RX_DECL reg_errcode_t
4881 rx_compile (pattern, size, syntax, rxb)
4882 __const__ char *pattern;
4884 reg_syntax_t syntax;
4885 struct re_pattern_buffer * rxb;
4889 inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4891 validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4893 /* We fetch characters from PATTERN here. Even though PATTERN is
4894 `char *' (i.e., signed), we declare these variables as unsigned, so
4895 they can be reliably used as array indices. */
4896 register unsigned char c, c1;
4898 /* A random tempory spot in PATTERN. */
4901 /* Keeps track of unclosed groups. */
4902 compile_stack_type compile_stack;
4904 /* Points to the current (ending) position in the pattern. */
4905 __const__ char *p = pattern;
4906 __const__ char *pend = pattern + size;
4908 /* How to translate the characters in the pattern. */
4909 unsigned char *translate = (rxb->translate
4911 : rx_id_translation);
4913 /* When parsing is done, this will hold the expression tree. */
4914 struct rexp_node * rexp = 0;
4916 /* In the midst of compilation, this holds onto the regexp
4917 * first parst while rexp goes on to aquire additional constructs.
4919 struct rexp_node * orig_rexp = 0;
4920 struct rexp_node * fewer_side_effects = 0;
4922 /* This and top_expression are saved on the compile stack. */
4923 struct rexp_node ** top_expression = &rexp;
4924 struct rexp_node ** last_expression = top_expression;
4926 /* Parameter to `goto append_node' */
4927 struct rexp_node * append;
4929 /* Counts open-groups as they are encountered. This is the index of the
4930 * innermost group being compiled.
4932 regnum_t regnum = 0;
4934 /* Place in the uncompiled pattern (i.e., the {) to
4935 * which to go back if the interval is invalid.
4937 __const__ char *beg_interval;
4939 struct re_se_params * params = 0;
4940 int paramc = 0; /* How many complex side effects so far? */
4942 rx_side_effect side; /* param to `goto add_side_effect' */
4944 bzero (validate_inv_tr, sizeof (validate_inv_tr));
4946 rxb->rx.instruction_table = rx_id_instruction_table;
4949 /* Initialize the compile stack. */
4950 compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
4951 if (compile_stack.stack == 0)
4954 compile_stack.size = INIT_COMPILE_STACK_SIZE;
4955 compile_stack.avail = 0;
4957 /* Initialize the pattern buffer. */
4958 rxb->rx.cache = &default_cache;
4959 rxb->syntax = syntax;
4960 rxb->fastmap_accurate = 0;
4961 rxb->not_bol = rxb->not_eol = 0;
4962 rxb->least_subs = 0;
4964 /* Always count groups, whether or not rxb->no_sub is set.
4965 * The whole pattern is implicitly group 0, so counting begins
4970 #if !defined (emacs) && !defined (SYNTAX_TABLE)
4971 /* Initialize the syntax table. */
4972 init_syntax_once ();
4975 /* Loop through the uncompiled pattern until we're at the end. */
4984 if ( /* If at start of pattern, it's an operator. */
4986 /* If context independent, it's an operator. */
4987 || syntax & RE_CONTEXT_INDEP_ANCHORS
4988 /* Otherwise, depends on what's come before. */
4989 || at_begline_loc_p (pattern, p, syntax))
4991 struct rexp_node * n
4992 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_hat);
5006 if ( /* If at end of pattern, it's an operator. */
5008 /* If context independent, it's an operator. */
5009 || syntax & RE_CONTEXT_INDEP_ANCHORS
5010 /* Otherwise, depends on what's next. */
5011 || at_endline_loc_p (p, pend, syntax))
5013 struct rexp_node * n
5014 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_dollar);
5028 if ((syntax & RE_BK_PLUS_QM)
5029 || (syntax & RE_LIMITED_OPS))
5034 /* If there is no previous pattern... */
5035 if (pointless_if_repeated (*last_expression, params))
5037 if (syntax & RE_CONTEXT_INVALID_OPS)
5039 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5044 /* 1 means zero (many) matches is allowed. */
5045 char zero_times_ok = 0, many_times_ok = 0;
5047 /* If there is a sequence of repetition chars, collapse it
5048 down to just one (the right one). We can't combine
5049 interval operators with these because of, e.g., `a{2}*',
5050 which should only match an even number of `a's. */
5054 zero_times_ok |= c != '+';
5055 many_times_ok |= c != '?';
5063 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
5066 else if (syntax & RE_BK_PLUS_QM && c == '\\')
5068 if (p == pend) return REG_EESCAPE;
5071 if (!(c1 == '+' || c1 == '?'))
5086 /* If we get here, we found another repeat character. */
5089 /* Star, etc. applied to an empty pattern is equivalent
5090 to an empty pattern. */
5091 if (!last_expression)
5094 /* Now we know whether or not zero matches is allowed
5095 * and also whether or not two or more matches is allowed.
5099 struct rexp_node * inner_exp = *last_expression;
5103 && has_non_idempotent_epsilon_path (&rxb->rx,
5106 struct rexp_node * pusher
5107 = rx_mk_r_side_effect (&rxb->rx,
5108 (rx_side_effect)re_se_pushpos);
5109 struct rexp_node * checker
5110 = rx_mk_r_side_effect (&rxb->rx,
5111 (rx_side_effect)re_se_chkpos);
5112 struct rexp_node * pushback
5113 = rx_mk_r_side_effect (&rxb->rx,
5114 (rx_side_effect)re_se_pushback);
5115 rx_Bitset cs = rx_cset (&rxb->rx);
5116 struct rexp_node * lit_t = rx_mk_r_cset (&rxb->rx, cs);
5117 struct rexp_node * fake_state
5118 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5119 struct rexp_node * phase2
5120 = rx_mk_r_concat (&rxb->rx, checker, fake_state);
5121 struct rexp_node * popper
5122 = rx_mk_r_side_effect (&rxb->rx,
5123 (rx_side_effect)re_se_poppos);
5124 struct rexp_node * star
5125 = rx_mk_r_2phase_star (&rxb->rx, inner_exp, phase2);
5126 struct rexp_node * a
5127 = rx_mk_r_concat (&rxb->rx, pusher, star);
5128 struct rexp_node * whole_thing
5129 = rx_mk_r_concat (&rxb->rx, a, popper);
5130 if (!(pusher && star && pushback && lit_t && fake_state
5131 && lit_t && phase2 && checker && popper
5132 && a && whole_thing))
5134 RX_bitset_enjoin (cs, 't');
5135 *last_expression = whole_thing;
5139 struct rexp_node * star =
5140 (many_times_ok ? rx_mk_r_star : rx_mk_r_opt)
5141 (&rxb->rx, *last_expression);
5144 *last_expression = star;
5145 need_sync = has_any_se (&rxb->rx, *last_expression);
5149 struct rexp_node * concat
5150 = rx_mk_r_concat (&rxb->rx, inner_exp,
5151 rx_copy_rexp (&rxb->rx,
5155 *last_expression = concat;
5159 int sync_se = paramc;
5161 ? ((struct re_se_params *)
5163 sizeof (*params) * (1 + paramc)))
5164 : ((struct re_se_params *)
5165 malloc (sizeof (*params))));
5169 params [sync_se].se = re_se_tv;
5170 side = (rx_side_effect)sync_se;
5171 goto add_side_effect;
5174 /* The old regex.c used to optimize `.*\n'.
5175 * Maybe rx should too?
5183 rx_Bitset cs = rx_cset (&rxb->rx);
5184 struct rexp_node * n = rx_mk_r_cset (&rxb->rx, cs);
5188 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5189 if (!(rxb->syntax & RE_DOT_NEWLINE))
5190 RX_bitset_remove (cs, '\n');
5191 if (!(rxb->syntax & RE_DOT_NOT_NULL))
5192 RX_bitset_remove (cs, 0);
5201 if (p == pend) return REG_EBRACK;
5203 boolean had_char_class = false;
5204 rx_Bitset cs = rx_cset (&rxb->rx);
5205 struct rexp_node * node = rx_mk_r_cset (&rxb->rx, cs);
5206 int is_inverted = *p == '^';
5211 /* This branch of the switch is normally exited with
5219 /* Remember the first position in the bracket expression. */
5222 /* Read in characters and ranges, setting map bits. */
5225 if (p == pend) return REG_EBRACK;
5229 /* \ might escape characters inside [...] and [^...]. */
5230 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
5232 if (p == pend) return REG_EESCAPE;
5236 rx_Bitset it = inverse_translation (rxb,
5241 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5246 /* Could be the end of the bracket expression. If it's
5247 not (i.e., when the bracket expression is `[]' so
5248 far), the ']' character bit gets set way below. */
5249 if (c == ']' && p != p1 + 1)
5250 goto finalize_class_and_append;
5252 /* Look ahead to see if it's a range when the last thing
5253 was a character class. */
5254 if (had_char_class && c == '-' && *p != ']')
5257 /* Look ahead to see if it's a range when the last thing
5258 was a character: if this is a hyphen not at the
5259 beginning or the end of a list, then it's the range
5262 && !(p - 2 >= pattern && p[-2] == '[')
5263 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
5267 = compile_range (rxb, cs, &p, pend, translate, syntax,
5268 inverse_translate, validate_inv_tr);
5269 if (ret != REG_NOERROR) return ret;
5272 else if (p[0] == '-' && p[1] != ']')
5273 { /* This handles ranges made up of characters only. */
5276 /* Move past the `-'. */
5279 ret = compile_range (rxb, cs, &p, pend, translate, syntax,
5280 inverse_translate, validate_inv_tr);
5281 if (ret != REG_NOERROR) return ret;
5284 /* See if we're at the beginning of a possible character
5287 else if ((syntax & RE_CHAR_CLASSES)
5288 && (c == '[') && (*p == ':'))
5290 char str[CHAR_CLASS_MAX_LENGTH + 1];
5295 /* If pattern is `[[:'. */
5296 if (p == pend) return REG_EBRACK;
5301 if (c == ':' || c == ']' || p == pend
5302 || c1 == CHAR_CLASS_MAX_LENGTH)
5308 /* If isn't a word bracketed by `[:' and:`]':
5309 undo the ending character, the letters, and leave
5310 the leading `:' and `[' (but set bits for them). */
5311 if (c == ':' && *p == ']')
5314 boolean is_alnum = !strcmp (str, "alnum");
5315 boolean is_alpha = !strcmp (str, "alpha");
5316 boolean is_blank = !strcmp (str, "blank");
5317 boolean is_cntrl = !strcmp (str, "cntrl");
5318 boolean is_digit = !strcmp (str, "digit");
5319 boolean is_graph = !strcmp (str, "graph");
5320 boolean is_lower = !strcmp (str, "lower");
5321 boolean is_print = !strcmp (str, "print");
5322 boolean is_punct = !strcmp (str, "punct");
5323 boolean is_space = !strcmp (str, "space");
5324 boolean is_upper = !strcmp (str, "upper");
5325 boolean is_xdigit = !strcmp (str, "xdigit");
5327 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
5329 /* Throw away the ] at the end of the character
5333 if (p == pend) return REG_EBRACK;
5335 for (ch = 0; ch < 1 << CHARBITS; ch++)
5337 if ( (is_alnum && isalnum (ch))
5338 || (is_alpha && isalpha (ch))
5339 || (is_blank && isblank (ch))
5340 || (is_cntrl && iscntrl (ch))
5341 || (is_digit && isdigit (ch))
5342 || (is_graph && isgraph (ch))
5343 || (is_lower && islower (ch))
5344 || (is_print && isprint (ch))
5345 || (is_punct && ispunct (ch))
5346 || (is_space && isspace (ch))
5347 || (is_upper && isupper (ch))
5348 || (is_xdigit && isxdigit (ch)))
5351 inverse_translation (rxb,
5356 rx_bitset_union (rxb->rx.local_cset_size,
5360 had_char_class = true;
5369 inverse_translation (rxb,
5374 rx_bitset_union (rxb->rx.local_cset_size,
5379 inverse_translation (rxb,
5384 rx_bitset_union (rxb->rx.local_cset_size,
5387 had_char_class = false;
5392 had_char_class = false;
5394 rx_Bitset it = inverse_translation (rxb,
5399 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5404 finalize_class_and_append:
5407 rx_bitset_complement (rxb->rx.local_cset_size, cs);
5408 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5409 RX_bitset_remove (cs, '\n');
5417 if (syntax & RE_NO_BK_PARENS)
5424 if (syntax & RE_NO_BK_PARENS)
5431 if (syntax & RE_NEWLINE_ALT)
5438 if (syntax & RE_NO_BK_VBAR)
5445 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5446 goto handle_interval;
5452 if (p == pend) return REG_EESCAPE;
5454 /* Do not translate the character after the \, so that we can
5455 distinguish, e.g., \B from \b, even if we normally would
5456 translate, e.g., B to b. */
5462 if (syntax & RE_NO_BK_PARENS)
5463 goto normal_backslash;
5468 if (COMPILE_STACK_FULL)
5470 ((compile_stack.stack) =
5471 (compile_stack_elt_t *) realloc (compile_stack.stack, ( compile_stack.size << 1) * sizeof (
5472 compile_stack_elt_t)));
5473 if (compile_stack.stack == 0) return REG_ESPACE;
5475 compile_stack.size <<= 1;
5478 if (*last_expression)
5480 struct rexp_node * concat
5481 = rx_mk_r_concat (&rxb->rx, *last_expression, 0);
5484 *last_expression = concat;
5485 last_expression = &concat->params.pair.right;
5489 * These are the values to restore when we hit end of this
5492 COMPILE_STACK_TOP.top_expression = top_expression;
5493 COMPILE_STACK_TOP.last_expression = last_expression;
5494 COMPILE_STACK_TOP.regnum = regnum;
5496 compile_stack.avail++;
5498 top_expression = last_expression;
5503 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
5506 /* See similar code for backslashed left paren above. */
5507 if (COMPILE_STACK_EMPTY)
5508 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
5513 /* Since we just checked for an empty stack above, this
5514 ``can't happen''. */
5517 /* We don't just want to restore into `regnum', because
5518 later groups should continue to be numbered higher,
5519 as in `(ab)c(de)' -- the second group is #2. */
5520 regnum_t this_group_regnum;
5521 struct rexp_node ** inner = top_expression;
5523 compile_stack.avail--;
5524 top_expression = COMPILE_STACK_TOP.top_expression;
5525 last_expression = COMPILE_STACK_TOP.last_expression;
5526 this_group_regnum = COMPILE_STACK_TOP.regnum;
5528 int left_se = paramc;
5529 int right_se = paramc + 1;
5532 ? ((struct re_se_params *)
5534 (paramc + 2) * sizeof (params[0])))
5535 : ((struct re_se_params *)
5536 malloc (2 * sizeof (params[0]))));
5541 params[left_se].se = re_se_lparen;
5542 params[left_se].op1 = this_group_regnum;
5543 params[right_se].se = re_se_rparen;
5544 params[right_se].op1 = this_group_regnum;
5546 struct rexp_node * left
5547 = rx_mk_r_side_effect (&rxb->rx,
5548 (rx_side_effect)left_se);
5549 struct rexp_node * right
5550 = rx_mk_r_side_effect (&rxb->rx,
5551 (rx_side_effect)right_se);
5552 struct rexp_node * c1
5554 ? rx_mk_r_concat (&rxb->rx, left, *inner) : left);
5555 struct rexp_node * c2
5556 = rx_mk_r_concat (&rxb->rx, c1, right);
5557 if (!(left && right && c1 && c2))
5565 case '|': /* `\|'. */
5566 if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
5567 goto normal_backslash;
5569 if (syntax & RE_LIMITED_OPS)
5573 struct rexp_node * alt
5574 = rx_mk_r_alternate (&rxb->rx, *top_expression, 0);
5577 *top_expression = alt;
5578 last_expression = &alt->params.pair.right;
5580 int sync_se = paramc;
5583 ? ((struct re_se_params *)
5585 (paramc + 1) * sizeof (params[0])))
5586 : ((struct re_se_params *)
5587 malloc (sizeof (params[0]))));
5592 params[sync_se].se = re_se_tv;
5594 struct rexp_node * sync
5595 = rx_mk_r_side_effect (&rxb->rx,
5596 (rx_side_effect)sync_se);
5597 struct rexp_node * conc
5598 = rx_mk_r_concat (&rxb->rx, sync, 0);
5603 *last_expression = conc;
5604 last_expression = &conc->params.pair.right;
5612 /* If \{ is a literal. */
5613 if (!(syntax & RE_INTERVALS)
5614 /* If we're at `\{' and it's not the open-interval
5616 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5617 || (p - 2 == pattern && p == pend))
5618 goto normal_backslash;
5622 /* If got here, then the syntax allows intervals. */
5624 /* At least (most) this many matches must be made. */
5625 int lower_bound = -1, upper_bound = -1;
5627 beg_interval = p - 1;
5631 if (syntax & RE_NO_BK_BRACES)
5632 goto unfetch_interval;
5637 GET_UNSIGNED_NUMBER (lower_bound);
5641 GET_UNSIGNED_NUMBER (upper_bound);
5642 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
5645 /* Interval such as `{1}' => match exactly once. */
5646 upper_bound = lower_bound;
5648 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
5649 || lower_bound > upper_bound)
5651 if (syntax & RE_NO_BK_BRACES)
5652 goto unfetch_interval;
5657 if (!(syntax & RE_NO_BK_BRACES))
5659 if (c != '\\') return REG_EBRACE;
5665 if (syntax & RE_NO_BK_BRACES)
5666 goto unfetch_interval;
5671 /* We just parsed a valid interval. */
5673 /* If it's invalid to have no preceding re. */
5674 if (pointless_if_repeated (*last_expression, params))
5676 if (syntax & RE_CONTEXT_INVALID_OPS)
5678 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5679 goto unfetch_interval;
5680 /* was: else laststart = b; */
5683 /* If the upper bound is zero, don't want to iterate
5686 if (upper_bound == 0)
5688 if (*last_expression)
5690 rx_free_rexp (&rxb->rx, *last_expression);
5691 *last_expression = 0;
5695 /* Otherwise, we have a nontrivial interval. */
5697 int iter_se = paramc;
5698 int end_se = paramc + 1;
5700 ? ((struct re_se_params *)
5702 sizeof (*params) * (2 + paramc)))
5703 : ((struct re_se_params *)
5704 malloc (2 * sizeof (*params))));
5708 params [iter_se].se = re_se_iter;
5709 params [iter_se].op1 = lower_bound;
5710 params[iter_se].op2 = upper_bound;
5712 params[end_se].se = re_se_end_iter;
5713 params[end_se].op1 = lower_bound;
5714 params[end_se].op2 = upper_bound;
5716 struct rexp_node * push0
5717 = rx_mk_r_side_effect (&rxb->rx,
5718 (rx_side_effect)re_se_push0);
5719 struct rexp_node * start_one_iter
5720 = rx_mk_r_side_effect (&rxb->rx,
5721 (rx_side_effect)iter_se);
5722 struct rexp_node * phase1
5723 = rx_mk_r_concat (&rxb->rx, start_one_iter,
5725 struct rexp_node * pushback
5726 = rx_mk_r_side_effect (&rxb->rx,
5727 (rx_side_effect)re_se_pushback);
5728 rx_Bitset cs = rx_cset (&rxb->rx);
5729 struct rexp_node * lit_t
5730 = rx_mk_r_cset (&rxb->rx, cs);
5731 struct rexp_node * phase2
5732 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5733 struct rexp_node * loop
5734 = rx_mk_r_2phase_star (&rxb->rx, phase1, phase2);
5735 struct rexp_node * push_n_loop
5736 = rx_mk_r_concat (&rxb->rx, push0, loop);
5737 struct rexp_node * final_test
5738 = rx_mk_r_side_effect (&rxb->rx,
5739 (rx_side_effect)end_se);
5740 struct rexp_node * full_exp
5741 = rx_mk_r_concat (&rxb->rx, push_n_loop, final_test);
5743 if (!(push0 && start_one_iter && phase1
5744 && pushback && lit_t && phase2
5745 && loop && push_n_loop && final_test && full_exp))
5748 RX_bitset_enjoin(cs, 't');
5750 *last_expression = full_exp;
5758 /* If an invalid interval, match the characters as literals. */
5762 /* normal_char and normal_backslash need `c'. */
5765 if (!(syntax & RE_NO_BK_BRACES))
5767 if (p > pattern && p[-1] == '\\')
5768 goto normal_backslash;
5773 /* There is no way to specify the before_dot and after_dot
5774 operators. rms says this is ok. --karl */
5776 side = (rx_side_effect)rx_se_at_dot;
5777 goto add_side_effect;
5783 rx_Bitset cs = rx_cset (&rxb->rx);
5784 struct rexp_node * set = rx_mk_r_cset (&rxb->rx, cs);
5788 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5793 enum syntaxcode code = syntax_spec_code [c];
5794 for (x = 0; x < 256; ++x)
5797 if (SYNTAX (x) == code)
5800 inverse_translation (rxb, validate_inv_tr,
5803 rx_bitset_xor (rxb->rx.local_cset_size, cs, it);
5817 rx_Bitset cs = rx_cset (&rxb->rx);
5818 struct rexp_node * n = (cs ? rx_mk_r_cset (&rxb->rx, cs) : 0);
5822 rx_bitset_universe (rxb->rx.local_cset_size ,cs);
5825 for (x = rxb->rx.local_cset_size - 1; x > 0; --x)
5826 if (SYNTAX(x) & Sword)
5827 RX_bitset_toggle (cs, x);
5834 /* With a little extra work, some of these side effects could be optimized
5835 * away (basicly by looking at what we already know about the surrounding
5839 side = (rx_side_effect)re_se_wordbeg;
5840 goto add_side_effect;
5844 side = (rx_side_effect)re_se_wordend;
5845 goto add_side_effect;
5849 side = (rx_side_effect)re_se_wordbound;
5850 goto add_side_effect;
5854 side = (rx_side_effect)re_se_notwordbound;
5855 goto add_side_effect;
5859 side = (rx_side_effect)re_se_begbuf;
5860 goto add_side_effect;
5864 side = (rx_side_effect)re_se_endbuf;
5865 goto add_side_effect;
5870 struct rexp_node * se
5871 = rx_mk_r_side_effect (&rxb->rx, side);
5879 case '1': case '2': case '3': case '4': case '5':
5880 case '6': case '7': case '8': case '9':
5881 if (syntax & RE_NO_BK_REFS)
5889 /* Can't back reference to a subexpression if inside of it. */
5890 if (group_in_compile_stack (compile_stack, c1))
5894 int backref_se = paramc;
5896 ? ((struct re_se_params *)
5898 sizeof (*params) * (1 + paramc)))
5899 : ((struct re_se_params *)
5900 malloc (sizeof (*params))));
5904 params[backref_se].se = re_se_backref;
5905 params[backref_se].op1 = c1;
5906 side = (rx_side_effect)backref_se;
5907 goto add_side_effect;
5913 if (syntax & RE_BK_PLUS_QM)
5916 goto normal_backslash;
5920 /* You might think it would be useful for \ to mean
5921 not to translate; but if we don't translate it
5922 it will never match anything. */
5930 /* Expects the character in `c'. */
5933 rx_Bitset cs = rx_cset(&rxb->rx);
5934 struct rexp_node * match = rx_mk_r_cset (&rxb->rx, cs);
5938 it = inverse_translation (rxb, validate_inv_tr,
5939 inverse_translate, translate, c);
5940 rx_bitset_union (CHAR_SET_SIZE, cs, it);
5944 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
5945 * and then parses the next character normally.
5947 if (*last_expression)
5949 struct rexp_node * concat
5950 = rx_mk_r_concat (&rxb->rx, *last_expression, append);
5953 *last_expression = concat;
5954 last_expression = &concat->params.pair.right;
5957 *last_expression = append;
5960 } /* while p != pend */
5964 int win_se = paramc;
5966 ? ((struct re_se_params *)
5968 sizeof (*params) * (1 + paramc)))
5969 : ((struct re_se_params *)
5970 malloc (sizeof (*params))));
5974 params[win_se].se = re_se_win;
5976 struct rexp_node * se
5977 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)win_se);
5978 struct rexp_node * concat
5979 = rx_mk_r_concat (&rxb->rx, rexp, se);
5980 if (!(se && concat))
5987 /* Through the pattern now. */
5989 if (!COMPILE_STACK_EMPTY)
5992 free (compile_stack.stack);
5996 if (rx_debug_compile)
5999 fputs ("\n\nCompiling ", stdout);
6000 fwrite (pattern, 1, size, stdout);
6001 fputs (":\n", stdout);
6002 rxb->se_params = params;
6003 print_rexp (&rxb->rx, orig_rexp, 2, re_seprint, stdout);
6007 rx_Bitset cs = rx_cset(&rxb->rx);
6008 rx_Bitset cs2 = rx_cset(&rxb->rx);
6009 char * se_map = (char *) alloca (paramc);
6010 struct rexp_node * new_rexp = 0;
6013 bzero (se_map, paramc);
6014 find_backrefs (se_map, rexp, params);
6015 fewer_side_effects =
6016 remove_unecessary_side_effects (&rxb->rx, se_map,
6017 rx_copy_rexp (&rxb->rx, rexp), params);
6019 speed_up_alt (&rxb->rx, rexp, 0);
6020 speed_up_alt (&rxb->rx, fewer_side_effects, 1);
6023 char * syntax_parens = rxb->syntax_parens;
6024 if (syntax_parens == (char *)0x1)
6025 rexp = remove_unecessary_side_effects
6026 (&rxb->rx, se_map, rexp, params);
6027 else if (syntax_parens)
6030 for (x = 0; x < paramc; ++x)
6031 if (( (params[x].se == re_se_lparen)
6032 || (params[x].se == re_se_rparen))
6033 && (!syntax_parens [params[x].op1]))
6035 rexp = remove_unecessary_side_effects
6036 (&rxb->rx, se_map, rexp, params);
6040 /* At least one more optimization would be nice to have here but i ran out
6041 * of time. The idea would be to delay side effects.
6042 * For examle, `(abc)' is the same thing as `abc()' except that the
6043 * left paren is offset by 3 (which we know at compile time).
6044 * (In this comment, write that second pattern `abc(:3:)'
6045 * where `(:3:' is a syntactic unit.)
6047 * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))'
6048 * (The paren nesting may be hard to follow -- that's an alternation
6049 * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens
6050 * followed by the closing paren from the original expression.)
6052 * Neither the expression tree representation nor the the nfa make
6053 * this very easy to write. :(
6056 /* What we compile is different than what the parser returns.
6057 * Suppose the parser returns expression R.
6058 * Let R' be R with unnecessary register assignments removed
6059 * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above).
6061 * What we will compile is the expression:
6063 * m{try}R{win}\|s{try}R'{win}
6065 * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE).
6067 * When trying a match, we insert an `m' at the beginning of the
6068 * string if the user wants registers to be filled, `s' if not.
6073 rx_mk_r_concat (&rxb->rx, rx_mk_r_cset (&rxb->rx, cs2), rexp),
6074 rx_mk_r_concat (&rxb->rx,
6075 rx_mk_r_cset (&rxb->rx, cs), fewer_side_effects));
6077 if (!(new_rexp && cs && cs2))
6079 RX_bitset_enjoin (cs2, '\0'); /* prefixed to the rexp used for matching. */
6080 RX_bitset_enjoin (cs, '\1'); /* prefixed to the rexp used for searching. */
6085 if (rx_debug_compile)
6087 fputs ("\n...which is compiled as:\n", stdout);
6088 print_rexp (&rxb->rx, rexp, 2, re_seprint, stdout);
6092 struct rx_nfa_state *start = 0;
6093 struct rx_nfa_state *end = 0;
6095 if (!rx_build_nfa (&rxb->rx, rexp, &start, &end))
6096 return REG_ESPACE; /* */
6099 void * mem = (void *)rxb->buffer;
6100 unsigned long size = rxb->allocated;
6103 int iterator_size = paramc * sizeof (params[0]);
6106 start->is_start = 1;
6107 rx_name_nfa_states (&rxb->rx);
6108 start_id = start->id;
6110 if (rx_debug_compile)
6112 fputs ("...giving the NFA: \n", stdout);
6114 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6117 if (!rx_eclose_nfa (&rxb->rx))
6121 rx_delete_epsilon_transitions (&rxb->rx);
6123 /* For compatability reasons, we need to shove the
6124 * compiled nfa into one chunk of malloced memory.
6126 rxb->rx.reserved = ( sizeof (params[0]) * paramc
6127 + rx_sizeof_bitset (rxb->rx.local_cset_size));
6129 if (rx_debug_compile)
6132 fputs ("...which cooks down (uncompactified) to: \n", stdout);
6133 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6136 if (!rx_compactify_nfa (&rxb->rx, &mem, &size))
6139 rxb->allocated = size;
6140 rxb->rx.buffer = mem;
6141 rxb->rx.allocated = size;
6142 perm_mem = ((char *)rxb->rx.buffer
6143 + rxb->rx.allocated - rxb->rx.reserved);
6144 rxb->se_params = ((struct re_se_params *)perm_mem);
6145 bcopy (params, rxb->se_params, iterator_size);
6146 perm_mem += iterator_size;
6147 rxb->fastset = (rx_Bitset) perm_mem;
6148 rxb->start = rx_id_to_nfa_state (&rxb->rx, start_id);
6150 rx_bitset_null (rxb->rx.local_cset_size, rxb->fastset);
6151 rxb->can_match_empty = compute_fastset (rxb, orig_rexp);
6152 rxb->match_regs_on_stack =
6153 registers_on_stack (rxb, orig_rexp, 0, params);
6154 rxb->search_regs_on_stack =
6155 registers_on_stack (rxb, fewer_side_effects, 0, params);
6156 if (rxb->can_match_empty)
6157 rx_bitset_universe (rxb->rx.local_cset_size, rxb->fastset);
6158 rxb->is_anchored = is_anchored (orig_rexp, (rx_side_effect) re_se_hat);
6159 rxb->begbuf_only = is_anchored (orig_rexp,
6160 (rx_side_effect) re_se_begbuf);
6162 rx_free_rexp (&rxb->rx, rexp);
6166 if (rx_debug_compile)
6169 fputs ("...which cooks down to: \n", stdout);
6170 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6179 /* This table gives an error message for each of the error codes listed
6180 in regex.h. Obviously the order here has to be same as there. */
6182 __const__ char * rx_error_msg[] =
6183 { 0, /* REG_NOERROR */
6184 "No match", /* REG_NOMATCH */
6185 "Invalid regular expression", /* REG_BADPAT */
6186 "Invalid collation character", /* REG_ECOLLATE */
6187 "Invalid character class name", /* REG_ECTYPE */
6188 "Trailing backslash", /* REG_EESCAPE */
6189 "Invalid back reference", /* REG_ESUBREG */
6190 "Unmatched [ or [^", /* REG_EBRACK */
6191 "Unmatched ( or \\(", /* REG_EPAREN */
6192 "Unmatched \\{", /* REG_EBRACE */
6193 "Invalid content of \\{\\}", /* REG_BADBR */
6194 "Invalid range end", /* REG_ERANGE */
6195 "Memory exhausted", /* REG_ESPACE */
6196 "Invalid preceding regular expression", /* REG_BADRPT */
6197 "Premature end of regular expression", /* REG_EEND */
6198 "Regular expression too big", /* REG_ESIZE */
6199 "Unmatched ) or \\)", /* REG_ERPAREN */
6205 char rx_slowmap [256] =
6207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6219 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6220 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6222 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6227 rx_blow_up_fastmap (struct re_pattern_buffer * rxb)
6230 rx_blow_up_fastmap (rxb)
6231 struct re_pattern_buffer * rxb;
6235 for (x = 0; x < 256; ++x) /* &&&& 3.6 % */
6236 rxb->fastmap [x] = !!RX_bitset_member (rxb->fastset, x);
6237 rxb->fastmap_accurate = 1;
6243 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6244 #define RE_SEARCH_2_FN inner_re_search_2
6245 #define RE_S2_QUAL static
6247 #define RE_SEARCH_2_FN re_search_2
6251 struct re_search_2_closure
6253 __const__ char * string1;
6255 __const__ char * string2;
6260 static __inline__ enum rx_get_burst_return
6261 re_search_2_get_burst (pos, vclosure, stop)
6262 struct rx_string_position * pos;
6266 struct re_search_2_closure * closure;
6267 closure = (struct re_search_2_closure *)vclosure;
6268 if (!closure->string2)
6272 inset = pos->pos - pos->string;
6273 if ((inset < -1) || (inset > closure->size1))
6274 return rx_get_burst_no_more;
6277 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6278 pos->string = (__const__ unsigned char *) closure->string1;
6279 pos->size = closure->size1;
6280 pos->end = ((__const__ unsigned char *)
6281 MIN(closure->string1 + closure->size1,
6282 closure->string1 + stop));
6284 return ((pos->pos < pos->end)
6286 : rx_get_burst_no_more);
6289 else if (!closure->string1)
6293 inset = pos->pos - pos->string;
6294 pos->pos = (__const__ unsigned char *) closure->string2 + inset;
6295 pos->string = (__const__ unsigned char *) closure->string2;
6296 pos->size = closure->size2;
6297 pos->end = ((__const__ unsigned char *)
6298 MIN(closure->string2 + closure->size2,
6299 closure->string2 + stop));
6301 return ((pos->pos < pos->end)
6303 : rx_get_burst_no_more);
6309 inset = pos->pos - pos->string + pos->offset;
6310 if (inset < closure->size1)
6312 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6313 pos->string = (__const__ unsigned char *) closure->string1;
6314 pos->size = closure->size1;
6315 pos->end = ((__const__ unsigned char *)
6316 MIN(closure->string1 + closure->size1,
6317 closure->string1 + stop));
6319 return rx_get_burst_ok;
6323 pos->pos = ((__const__ unsigned char *)
6324 closure->string2 + inset - closure->size1);
6325 pos->string = (__const__ unsigned char *) closure->string2;
6326 pos->size = closure->size2;
6327 pos->end = ((__const__ unsigned char *)
6328 MIN(closure->string2 + closure->size2,
6329 closure->string2 + stop - closure->size1));
6330 pos->offset = closure->size1;
6331 return ((pos->pos < pos->end)
6333 : rx_get_burst_no_more);
6339 static __inline__ enum rx_back_check_return
6340 re_search_2_back_check (pos, lparen, rparen, translate, vclosure, stop)
6341 struct rx_string_position * pos;
6344 unsigned char * translate;
6348 struct rx_string_position there;
6349 struct rx_string_position past;
6352 there.pos = there.string + lparen - there.offset;
6353 re_search_2_get_burst (&there, vclosure, stop);
6356 past.pos = past.string + rparen - there.offset;
6357 re_search_2_get_burst (&past, vclosure, stop);
6360 re_search_2_get_burst (pos, vclosure, stop);
6362 while ( (there.pos != past.pos)
6363 && (pos->pos != pos->end))
6364 if (TRANSLATE(*there.pos) != TRANSLATE(*pos->pos))
6365 return rx_back_check_fail;
6370 if (there.pos == there.end)
6371 re_search_2_get_burst (&there, vclosure, stop);
6372 if (pos->pos == pos->end)
6373 re_search_2_get_burst (pos, vclosure, stop);
6376 if (there.pos != past.pos)
6377 return rx_back_check_fail;
6379 re_search_2_get_burst (pos, vclosure, stop);
6380 return rx_back_check_pass;
6383 static __inline__ int
6384 re_search_2_fetch_char (pos, offset, app_closure, stop)
6385 struct rx_string_position * pos;
6390 struct re_search_2_closure * closure;
6391 closure = (struct re_search_2_closure *)app_closure;
6394 if (pos->pos >= pos->string)
6398 if ( (pos->string == (__const__ unsigned char *) closure->string2)
6399 && (closure->string1)
6400 && (closure->size1))
6401 return closure->string1[closure->size1 - 1];
6403 return 0; /* sure, why not. */
6406 if (pos->pos == pos->end)
6407 return *closure->string2;
6415 RE_SEARCH_2_FN (struct re_pattern_buffer *rxb,
6416 __const__ char * string1, int size1,
6417 __const__ char * string2, int size2,
6418 int startpos, int range,
6419 struct re_registers *regs,
6423 RE_SEARCH_2_FN (rxb,
6424 string1, size1, string2, size2, startpos, range, regs, stop)
6425 struct re_pattern_buffer *rxb;
6426 __const__ char * string1;
6428 __const__ char * string2;
6432 struct re_registers *regs;
6437 struct re_search_2_closure closure;
6438 closure.string1 = string1;
6439 closure.size1 = size1;
6440 closure.string2 = string2;
6441 closure.size2 = size2;
6442 answer = rx_search (rxb, startpos, range, stop, size1 + size2,
6443 re_search_2_get_burst,
6444 re_search_2_back_check,
6445 re_search_2_fetch_char,
6452 case rx_search_continuation:
6454 case rx_search_error:
6456 case rx_search_soft_fail:
6457 case rx_search_fail:
6464 /* Export rx_search to callers outside this file. */
6467 re_rx_search (rxb, startpos, range, stop, total_size,
6468 get_burst, back_check, fetch_char,
6469 app_closure, regs, resume_state, save_state)
6470 struct re_pattern_buffer * rxb;
6475 rx_get_burst_fn get_burst;
6476 rx_back_check_fn back_check;
6477 rx_fetch_char_fn fetch_char;
6479 struct re_registers * regs;
6480 struct rx_search_state * resume_state;
6481 struct rx_search_state * save_state;
6483 return rx_search (rxb, startpos, range, stop, total_size,
6484 get_burst, back_check, fetch_char, app_closure,
6485 regs, resume_state, save_state);
6488 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6491 re_search_2 (struct re_pattern_buffer *rxb,
6492 __const__ char * string1, int size1,
6493 __const__ char * string2, int size2,
6494 int startpos, int range,
6495 struct re_registers *regs,
6499 re_search_2 (rxb, string1, size1, string2, size2, startpos, range, regs, stop)
6500 struct re_pattern_buffer *rxb;
6501 __const__ char * string1;
6503 __const__ char * string2;
6507 struct re_registers *regs;
6512 ret = inner_re_search_2 (rxb, string1, size1, string2, size2, startpos,
6520 /* Like re_search_2, above, but only one string is specified, and
6521 * doesn't let you say where to stop matching.
6526 re_search (struct re_pattern_buffer * rxb, __const__ char *string,
6527 int size, int startpos, int range,
6528 struct re_registers *regs)
6531 re_search (rxb, string, size, startpos, range, regs)
6532 struct re_pattern_buffer * rxb;
6533 __const__ char * string;
6537 struct re_registers *regs;
6540 return re_search_2 (rxb, 0, 0, string, size, startpos, range, regs, size);
6545 re_match_2 (struct re_pattern_buffer * rxb,
6546 __const__ char * string1, int size1,
6547 __const__ char * string2, int size2,
6548 int pos, struct re_registers *regs, int stop)
6551 re_match_2 (rxb, string1, size1, string2, size2, pos, regs, stop)
6552 struct re_pattern_buffer * rxb;
6553 __const__ char * string1;
6555 __const__ char * string2;
6558 struct re_registers *regs;
6562 struct re_registers some_regs;
6566 int save = rxb->regs_allocated;
6567 struct re_registers * regs_to_pass = regs;
6571 some_regs.start = &start;
6572 some_regs.end = &end;
6573 some_regs.num_regs = 1;
6574 regs_to_pass = &some_regs;
6575 rxb->regs_allocated = REGS_FIXED;
6578 srch = re_search_2 (rxb, string1, size1, string2, size2,
6579 pos, 1, regs_to_pass, stop);
6580 if (regs_to_pass != regs)
6581 rxb->regs_allocated = save;
6584 return regs_to_pass->end[0] - regs_to_pass->start[0];
6587 /* re_match is like re_match_2 except it takes only a single string. */
6591 re_match (struct re_pattern_buffer * rxb,
6592 __const__ char * string,
6594 struct re_registers *regs)
6597 re_match (rxb, string, size, pos, regs)
6598 struct re_pattern_buffer * rxb;
6599 __const__ char *string;
6602 struct re_registers *regs;
6605 return re_match_2 (rxb, string, size, 0, 0, pos, regs, size);
6610 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
6611 also be assigned to arbitrarily: each pattern buffer stores its own
6612 syntax, so it can be changed between regex compilations. */
6613 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
6616 /* Specify the precise syntax of regexps for compilation. This provides
6617 for compatibility for various utilities which historically have
6618 different, incompatible syntaxes.
6620 The argument SYNTAX is a bit mask comprised of the various bits
6621 defined in regex.h. We return the old syntax. */
6625 re_set_syntax (reg_syntax_t syntax)
6628 re_set_syntax (syntax)
6629 reg_syntax_t syntax;
6632 reg_syntax_t ret = re_syntax_options;
6634 re_syntax_options = syntax;
6639 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6640 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6641 this memory for recording register information. STARTS and ENDS
6642 must be allocated using the malloc library routine, and must each
6643 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6645 If NUM_REGS == 0, then subsequent matches should allocate their own
6648 Unless this function is called, the first search or match using
6649 PATTERN_BUFFER will allocate its own register data, without
6650 freeing the old data. */
6654 re_set_registers (struct re_pattern_buffer *bufp,
6655 struct re_registers *regs,
6657 regoff_t * starts, regoff_t * ends)
6660 re_set_registers (bufp, regs, num_regs, starts, ends)
6661 struct re_pattern_buffer *bufp;
6662 struct re_registers *regs;
6670 bufp->regs_allocated = REGS_REALLOCATE;
6671 regs->num_regs = num_regs;
6672 regs->start = starts;
6677 bufp->regs_allocated = REGS_UNALLOCATED;
6679 regs->start = regs->end = (regoff_t) 0;
6688 cplx_se_sublist_len (struct rx_se_list * list)
6691 cplx_se_sublist_len (list)
6692 struct rx_se_list * list;
6698 if ((long)list->car >= 0)
6706 /* For rx->se_list_cmp */
6710 posix_se_list_order (struct rx * rx,
6711 struct rx_se_list * a, struct rx_se_list * b)
6714 posix_se_list_order (rx, a, b)
6716 struct rx_se_list * a;
6717 struct rx_se_list * b;
6720 int al = cplx_se_sublist_len (a);
6721 int bl = cplx_se_sublist_len (b);
6726 : ((a < b) ? -1 : 1));
6736 rx_side_effect * av = ((rx_side_effect *)
6737 alloca (sizeof (rx_side_effect) * (al + 1)));
6738 rx_side_effect * bv = ((rx_side_effect *)
6739 alloca (sizeof (rx_side_effect) * (bl + 1)));
6740 struct rx_se_list * ap = a;
6741 struct rx_se_list * bp = b;
6744 for (ai = al - 1; ai >= 0; --ai)
6746 while ((long)ap->car < 0)
6751 av[al] = (rx_side_effect)-2;
6752 for (bi = bl - 1; bi >= 0; --bi)
6754 while ((long)bp->car < 0)
6759 bv[bl] = (rx_side_effect)-1;
6764 while (av[x] == bv[x])
6766 ret = (((unsigned *)(av[x]) < (unsigned *)(bv[x])) ? -1 : 1);
6775 /* re_compile_pattern is the GNU regular expression compiler: it
6776 compiles PATTERN (of length SIZE) and puts the result in RXB.
6777 Returns 0 if the pattern was valid, otherwise an error string.
6779 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6780 are set in RXB on entry.
6782 We call rx_compile to do the actual compilation. */
6786 re_compile_pattern (__const__ char *pattern,
6788 struct re_pattern_buffer * rxb)
6791 re_compile_pattern (pattern, length, rxb)
6792 __const__ char *pattern;
6794 struct re_pattern_buffer * rxb;
6799 /* GNU code is written to assume at least RE_NREGS registers will be set
6800 (and at least one extra will be -1). */
6801 rxb->regs_allocated = REGS_UNALLOCATED;
6803 /* And GNU code determines whether or not to get register information
6804 by passing null for the REGS argument to re_match, etc., not by
6808 rxb->rx.local_cset_size = 256;
6810 /* Match anchors at newline. */
6811 rxb->newline_anchor = 1;
6817 rxb->rx.epsnodec = 0;
6818 rxb->rx.instruction_table = 0;
6819 rxb->rx.nfa_states = 0;
6820 rxb->rx.se_list_cmp = posix_se_list_order;
6821 rxb->rx.start_set = 0;
6823 ret = rx_compile (pattern, length, re_syntax_options, rxb);
6825 return rx_error_msg[(int) ret];
6832 re_compile_fastmap (struct re_pattern_buffer * rxb)
6835 re_compile_fastmap (rxb)
6836 struct re_pattern_buffer * rxb;
6839 rx_blow_up_fastmap (rxb);
6846 /* Entry points compatible with 4.2 BSD regex library. We don't define
6847 them if this is an Emacs or POSIX compilation. */
6849 #if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX)
6851 /* BSD has one and only one pattern buffer. */
6852 static struct re_pattern_buffer rx_comp_buf;
6856 re_comp (__const__ char *s)
6865 if (!s || (*s == '\0'))
6867 if (!rx_comp_buf.buffer)
6868 return "No previous regular expression";
6872 if (!rx_comp_buf.fastmap)
6874 rx_comp_buf.fastmap = (char *) malloc (1 << CHARBITS);
6875 if (!rx_comp_buf.fastmap)
6876 return "Memory exhausted";
6879 /* Since `rx_exec' always passes NULL for the `regs' argument, we
6880 don't need to initialize the pattern buffer fields which affect it. */
6882 /* Match anchors at newlines. */
6883 rx_comp_buf.newline_anchor = 1;
6885 rx_comp_buf.re_nsub = 0;
6886 rx_comp_buf.start = 0;
6887 rx_comp_buf.se_params = 0;
6888 rx_comp_buf.rx.nodec = 0;
6889 rx_comp_buf.rx.epsnodec = 0;
6890 rx_comp_buf.rx.instruction_table = 0;
6891 rx_comp_buf.rx.nfa_states = 0;
6892 rx_comp_buf.rx.start = 0;
6893 rx_comp_buf.rx.se_list_cmp = posix_se_list_order;
6894 rx_comp_buf.rx.local_cset_size = 256;
6896 ret = rx_compile (s, strlen (s), re_syntax_options, &rx_comp_buf);
6899 /* Yes, we're discarding `__const__' here. */
6900 return (char *) rx_error_msg[(int) ret];
6906 re_exec (__const__ char *s)
6913 __const__ int len = strlen (s);
6915 0 <= re_search (&rx_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6917 #endif /* not emacs and not _POSIX_SOURCE */
6921 /* POSIX.2 functions. Don't define these for Emacs. */
6925 /* regcomp takes a regular expression as a string and compiles it.
6927 PREG is a regex_t *. We do not expect any fields to be initialized,
6928 since POSIX says we shouldn't. Thus, we set
6930 `buffer' to the compiled pattern;
6931 `used' to the length of the compiled pattern;
6932 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6933 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6934 RE_SYNTAX_POSIX_BASIC;
6935 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6936 `fastmap' and `fastmap_accurate' to zero;
6937 `re_nsub' to the number of subexpressions in PATTERN.
6939 PATTERN is the address of the pattern string.
6941 CFLAGS is a series of bits which affect compilation.
6943 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6944 use POSIX basic syntax.
6946 If REG_NEWLINE is set, then . and [^...] don't match newline.
6947 Also, regexec will try a match beginning after every newline.
6949 If REG_ICASE is set, then we considers upper- and lowercase
6950 versions of letters to be equivalent when matching.
6952 If REG_NOSUB is set, then when PREG is passed to regexec, that
6953 routine will report only success or failure, and nothing about the
6956 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6957 the return codes and their meanings.) */
6962 regcomp (regex_t * preg, __const__ char * pattern, int cflags)
6965 regcomp (preg, pattern, cflags)
6967 __const__ char * pattern;
6973 = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6975 /* regex_compile will allocate the space for the compiled pattern. */
6977 preg->allocated = 0;
6978 preg->fastmap = malloc (256);
6981 preg->fastmap_accurate = 0;
6983 if (cflags & REG_ICASE)
6987 preg->translate = (unsigned char *) malloc (256);
6988 if (!preg->translate)
6989 return (int) REG_ESPACE;
6991 /* Map uppercase characters to corresponding lowercase ones. */
6992 for (i = 0; i < CHAR_SET_SIZE; i++)
6993 preg->translate[i] = isupper (i) ? tolower (i) : i;
6996 preg->translate = 0;
6998 /* If REG_NEWLINE is set, newlines are treated differently. */
6999 if (cflags & REG_NEWLINE)
7000 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7001 syntax &= ~RE_DOT_NEWLINE;
7002 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7003 /* It also changes the matching behavior. */
7004 preg->newline_anchor = 1;
7007 preg->newline_anchor = 0;
7009 preg->no_sub = !!(cflags & REG_NOSUB);
7011 /* POSIX says a null character in the pattern terminates it, so we
7012 can use strlen here in compiling the pattern. */
7015 preg->se_params = 0;
7016 preg->syntax_parens = 0;
7018 preg->rx.epsnodec = 0;
7019 preg->rx.instruction_table = 0;
7020 preg->rx.nfa_states = 0;
7021 preg->rx.local_cset_size = 256;
7023 preg->rx.se_list_cmp = posix_se_list_order;
7024 preg->rx.start_set = 0;
7025 ret = rx_compile (pattern, strlen (pattern), syntax, preg);
7028 /* POSIX doesn't distinguish between an unmatched open-group and an
7029 unmatched close-group: both are REG_EPAREN. */
7030 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7036 /* regexec searches for a given pattern, specified by PREG, in the
7039 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7040 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7041 least NMATCH elements, and we set them to the offsets of the
7042 corresponding matched substrings.
7044 EFLAGS specifies `execution flags' which affect matching: if
7045 REG_NOTBOL is set, then ^ does not match at the beginning of the
7046 string; if REG_NOTEOL is set, then $ does not match at the end.
7048 We return 0 if we find a match and REG_NOMATCH if not. */
7052 regexec (__const__ regex_t *preg, __const__ char *string,
7053 size_t nmatch, regmatch_t pmatch[],
7057 regexec (preg, string, nmatch, pmatch, eflags)
7058 __const__ regex_t *preg;
7059 __const__ char *string;
7061 regmatch_t pmatch[];
7066 struct re_registers regs;
7067 regex_t private_preg;
7068 int len = strlen (string);
7069 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7071 private_preg = *preg;
7073 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7074 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7076 /* The user has told us exactly how many registers to return
7077 * information about, via `nmatch'. We have to pass that on to the
7078 * matching routines.
7080 private_preg.regs_allocated = REGS_FIXED;
7084 regs.num_regs = nmatch;
7085 regs.start = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7086 regs.end = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7087 if (regs.start == 0 || regs.end == 0)
7088 return (int) REG_NOMATCH;
7091 /* Perform the searching operation. */
7092 ret = re_search (&private_preg,
7096 want_reg_info ? ®s : (struct re_registers *) 0);
7098 /* Copy the register information to the POSIX structure. */
7105 for (r = 0; r < nmatch; r++)
7107 pmatch[r].rm_so = regs.start[r];
7108 pmatch[r].rm_eo = regs.end[r];
7112 /* If we needed the temporary register info, free the space now. */
7117 /* We want zero return to mean success, unlike `re_search'. */
7118 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7122 /* Returns a message corresponding to an error code, ERRCODE, returned
7123 from either regcomp or regexec. */
7127 regerror (int errcode, __const__ regex_t *preg,
7128 char *errbuf, size_t errbuf_size)
7131 regerror (errcode, preg, errbuf, errbuf_size)
7133 __const__ regex_t *preg;
7139 = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode];
7140 size_t msg_size = strlen (msg) + 1; /* Includes the 0. */
7142 if (errbuf_size != 0)
7144 if (msg_size > errbuf_size)
7146 strncpy (errbuf, msg, errbuf_size - 1);
7147 errbuf[errbuf_size - 1] = 0;
7150 strcpy (errbuf, msg);
7157 /* Free dynamically allocated space used by PREG. */
7161 regfree (regex_t *preg)
7168 if (preg->buffer != 0)
7169 free (preg->buffer);
7171 preg->allocated = 0;
7173 if (preg->fastmap != 0)
7174 free (preg->fastmap);
7176 preg->fastmap_accurate = 0;
7178 if (preg->translate != 0)
7179 free (preg->translate);
7180 preg->translate = 0;
7183 #endif /* not emacs */