1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 This file is part of the librx library.
5 Librx is free software; you can redistribute it and/or modify it under
6 the terms of the GNU Library General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 Librx is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 You should have received a copy of the GNU Library General Public
16 License along with this software; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
20 /* NOTE!!! AIX is so losing it requires this to be the first thing in the
22 * Do not put ANYTHING before it!
24 #if !defined (__GNUC__) && defined (_AIX)
28 /* To make linux happy? */
34 const char *rx_version_string = "GNU Rx version 0.07.2";
44 #define isgraph(c) (isprint (c) && !isspace (c))
47 #define isblank(c) ((c) == ' ' || (c) == '\t')
50 #include <sys/types.h>
54 #define MAX(a, b) ((a) > (b) ? (a) : (b))
55 #define MIN(a, b) ((a) < (b) ? (a) : (b))
66 /* Emacs already defines alloca, sometimes. */
69 /* Make alloca work the best possible way. */
71 #define alloca __builtin_alloca
72 #else /* not __GNUC__ */
75 #else /* not __GNUC__ or HAVE_ALLOCA_H */
76 #ifndef _AIX /* Already did AIX, up at the top. */
79 #endif /* not HAVE_ALLOCA_H */
80 #endif /* not __GNUC__ */
82 #endif /* not alloca */
84 /* Memory management and stuff for emacs. */
87 #define remalloc(M, S) (M ? realloc (M, S) : malloc (S))
90 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
91 * use `alloca' instead of `malloc' for the backtracking stack.
93 * Emacs will die miserably if we don't do this.
97 #define REGEX_ALLOCATE malloc
98 #else /* not REGEX_MALLOC */
99 #define REGEX_ALLOCATE alloca
100 #endif /* not REGEX_MALLOC */
103 #ifdef RX_WANT_RX_DEFS
104 #define RX_DECL extern
107 #define RX_WANT_RX_DEFS
108 #define RX_DECL static
109 #define RX_DEF_QUAL static
113 #define RX_DECL RX_DEF_QUAL
119 extern char *re_syntax_table;
120 #else /* not SYNTAX_TABLE */
122 RX_DECL char re_syntax_table[CHAR_SET_SIZE];
126 init_syntax_once (void)
138 bzero (re_syntax_table, sizeof re_syntax_table);
140 for (c = 'a'; c <= 'z'; c++)
141 re_syntax_table[c] = Sword;
143 for (c = 'A'; c <= 'Z'; c++)
144 re_syntax_table[c] = Sword;
146 for (c = '0'; c <= '9'; c++)
147 re_syntax_table[c] = Sword;
149 re_syntax_table['_'] = Sword;
153 #endif /* not SYNTAX_TABLE */
154 #endif /* not emacs */
156 /* Compile with `-DRX_DEBUG' and use the following flags.
159 * rx_debug - print information as a regexp is compiled
160 * rx_debug_trace - print information as a regexp is executed
165 int rx_debug_compile = 0;
166 int rx_debug_trace = 0;
167 static struct re_pattern_buffer * dbug_rxb = 0;
170 typedef void (*side_effect_printer) (struct rx *, void *, FILE *);
172 typedef void (*side_effect_printer) ();
176 static void print_cset (struct rx *rx, rx_Bitset cset, FILE * fp);
178 static void print_cset ();
183 print_rexp (struct rx *rx,
184 struct rexp_node *node, int depth,
185 side_effect_printer seprint, FILE * fp)
188 print_rexp (rx, node, depth, seprint, fp)
190 struct rexp_node *node;
192 side_effect_printer seprint;
204 fprintf (fp, "%*s", depth, "");
205 print_cset (rx, node->params.cset, fp);
212 fprintf (fp, "%*s%s\n", depth, "",
213 node->type == r_opt ? "opt" : "star");
214 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
218 fprintf (fp, "%*s2phase star\n", depth, "");
219 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
220 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
226 fprintf (fp, "%*s%s\n", depth, "",
227 node->type == r_alternate ? "alt" : "concat");
228 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
229 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
232 fprintf (fp, "%*sSide effect: ", depth, "");
233 seprint (rx, node->params.side_effect, fp);
241 print_nfa (struct rx * rx,
242 struct rx_nfa_state * n,
243 side_effect_printer seprint, FILE * fp)
246 print_nfa (rx, n, seprint, fp)
248 struct rx_nfa_state * n;
249 side_effect_printer seprint;
255 struct rx_nfa_edge *e = n->edges;
256 struct rx_possible_future *ec = n->futures;
257 fprintf (fp, "node %d %s\n", n->id,
258 n->is_final ? "final" : (n->is_start ? "start" : ""));
261 fprintf (fp, " edge to %d, ", e->dest->id);
265 fprintf (fp, "epsilon\n");
268 fprintf (fp, "side effect ");
269 seprint (rx, e->params.side_effect, fp);
273 fprintf (fp, "cset ");
274 print_cset (rx, e->params.cset, fp);
284 struct rx_nfa_state_set * s;
285 struct rx_se_list * l;
286 fprintf (fp, " eclosure to {");
287 for (s = ec->destset; s; s = s->cdr)
288 fprintf (fp, "%d ", s->car->id);
290 for (l = ec->effects; l; l = l->cdr)
292 seprint (rx, l->car, fp);
302 static char * efnames [] =
318 "re_se_notwordbound",
325 static char * efnames2[] =
336 static char * inx_names[] =
338 "rx_backtrack_point",
339 "rx_do_side_effects",
344 "rx_num_instructions"
350 re_seprint (struct rx * rx, void * effect, FILE * fp)
353 re_seprint (rx, effect, fp)
360 fputs (efnames[-(int)effect], fp);
363 struct re_se_params * p = &dbug_rxb->se_params[(int)effect];
364 fprintf (fp, "%s(%d,%d)", efnames2[p->se], p->op1, p->op2);
367 fprintf (fp, "[complex op # %d]", (int)effect);
371 /* These are so the regex.c regression tests will compile. */
373 print_compiled_pattern (rxb)
374 struct re_pattern_buffer * rxb;
384 #endif /* RX_DEBUG */
388 /* This page: Bitsets. Completely unintersting. */
392 rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b)
395 rx_bitset_is_equal (size, a, b)
405 for (x = rx_bitset_numb_subsets(size) - 1; a[x] == b[x]; --x)
409 return !x && s == a[0];
414 rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b)
417 rx_bitset_is_subset (size, a, b)
423 int x = rx_bitset_numb_subsets(size) - 1;
424 while (x-- && (a[x] & b[x]) == a[x]);
431 rx_bitset_empty (int size, rx_Bitset set)
434 rx_bitset_empty (size, set)
440 RX_subset s = set[0];
442 for (x = rx_bitset_numb_subsets(size) - 1; !set[x]; --x)
450 rx_bitset_null (int size, rx_Bitset b)
453 rx_bitset_null (size, b)
458 bzero (b, rx_sizeof_bitset(size));
464 rx_bitset_universe (int size, rx_Bitset b)
467 rx_bitset_universe (size, b)
472 int x = rx_bitset_numb_subsets (size);
474 *b++ = ~(RX_subset)0;
480 rx_bitset_complement (int size, rx_Bitset b)
483 rx_bitset_complement (size, b)
488 int x = rx_bitset_numb_subsets (size);
499 rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b)
502 rx_bitset_assign (size, a, b)
509 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
516 rx_bitset_union (int size, rx_Bitset a, rx_Bitset b)
519 rx_bitset_union (size, a, b)
526 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
533 rx_bitset_intersection (int size,
534 rx_Bitset a, rx_Bitset b)
537 rx_bitset_intersection (size, a, b)
544 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
551 rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b)
554 rx_bitset_difference (size, a, b)
561 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
568 rx_bitset_revdifference (int size,
569 rx_Bitset a, rx_Bitset b)
572 rx_bitset_revdifference (size, a, b)
579 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
585 rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b)
588 rx_bitset_xor (size, a, b)
595 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
601 RX_DECL unsigned long
602 rx_bitset_hash (int size, rx_Bitset b)
604 RX_DECL unsigned long
605 rx_bitset_hash (size, b)
611 unsigned long hash = (unsigned long)rx_bitset_hash;
613 for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x)
614 hash ^= rx_bitset_subset_val(b, x);
620 RX_DECL RX_subset rx_subset_singletons [RX_subset_bits] =
660 print_cset (struct rx *rx, rx_Bitset cset, FILE * fp)
663 print_cset (rx, cset, fp)
671 for (x = 0; x < rx->local_cset_size; ++x)
672 if (RX_bitset_member (cset, x))
677 fprintf (fp, "\\0%o ", x);
682 #endif /* RX_DEBUG */
686 static unsigned long rx_hash_masks[4] =
697 RX_DECL struct rx_hash_item *
698 rx_hash_find (struct rx_hash * table,
701 struct rx_hash_rules * rules)
703 RX_DECL struct rx_hash_item *
704 rx_hash_find (table, hash, value, rules)
705 struct rx_hash * table;
708 struct rx_hash_rules * rules;
711 rx_hash_eq eq = rules->eq;
713 long mask = rx_hash_masks [0];
714 int bucket = (hash & mask) % 13;
716 while (table->children [bucket])
718 table = table->children [bucket];
720 mask = rx_hash_masks[maskc];
721 bucket = (hash & mask) % 13;
725 struct rx_hash_item * it = table->buckets[bucket];
727 if (eq (it->data, value))
730 it = it->next_same_hash;
738 RX_DECL struct rx_hash_item *
739 rx_hash_store (struct rx_hash * table,
742 struct rx_hash_rules * rules)
744 RX_DECL struct rx_hash_item *
745 rx_hash_store (table, hash, value, rules)
746 struct rx_hash * table;
749 struct rx_hash_rules * rules;
752 rx_hash_eq eq = rules->eq;
754 long mask = rx_hash_masks[0];
755 int bucket = (hash & mask) % 13;
758 while (table->children [bucket])
760 table = table->children [bucket];
762 mask = rx_hash_masks[maskc];
763 bucket = (hash & mask) % 13;
768 struct rx_hash_item * it = table->buckets[bucket];
770 if (eq (it->data, value))
773 it = it->next_same_hash;
778 && (table->bucket_size [bucket] >= 4))
780 struct rx_hash * newtab = ((struct rx_hash *)
781 rules->hash_alloc (rules));
784 bzero (newtab, sizeof (*newtab));
785 newtab->parent = table;
787 struct rx_hash_item * them = table->buckets[bucket];
788 unsigned long newmask = rx_hash_masks[maskc + 1];
791 struct rx_hash_item * save = them->next_same_hash;
792 int new_buck = (them->hash & newmask) % 13;
793 them->next_same_hash = newtab->buckets[new_buck];
794 newtab->buckets[new_buck] = them;
795 them->table = newtab;
797 ++newtab->bucket_size[new_buck];
800 table->refs = (table->refs - table->bucket_size[bucket] + 1);
801 table->bucket_size[bucket] = 0;
802 table->buckets[bucket] = 0;
803 table->children[bucket] = newtab;
805 bucket = (hash & newmask) % 13;
811 struct rx_hash_item * it = ((struct rx_hash_item *)
812 rules->hash_item_alloc (rules, value));
817 /* DATA and BINDING are to be set in hash_item_alloc */
818 it->next_same_hash = table->buckets [bucket];
819 table->buckets[bucket] = it;
820 ++table->bucket_size [bucket];
829 rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules)
832 rx_hash_free (it, rules)
833 struct rx_hash_item * it;
834 struct rx_hash_rules * rules;
839 struct rx_hash * table = it->table;
840 unsigned long hash = it->hash;
841 int depth = (table->parent
842 ? (table->parent->parent
843 ? (table->parent->parent->parent
848 int bucket = (hash & rx_hash_masks [depth]) % 13;
849 struct rx_hash_item ** pos = &table->buckets [bucket];
852 pos = &(*pos)->next_same_hash;
853 *pos = it->next_same_hash;
854 rules->free_hash_item (it, rules);
855 --table->bucket_size[bucket];
857 while (!table->refs && depth)
859 struct rx_hash * save = table;
860 table = table->parent;
862 bucket = (hash & rx_hash_masks [depth]) % 13;
864 table->children[bucket] = 0;
865 rules->free_hash (save, rules);
872 rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
873 struct rx_hash_rules * rules)
876 rx_free_hash_table (tab, freefn, rules)
877 struct rx_hash * tab;
878 rx_hash_freefn freefn;
879 struct rx_hash_rules * rules;
884 for (x = 0; x < 13; ++x)
885 if (tab->children[x])
887 rx_free_hash_table (tab->children[x], freefn, rules);
888 rules->free_hash (tab->children[x], rules);
892 struct rx_hash_item * them = tab->buckets[x];
895 struct rx_hash_item * that = them;
896 them = that->next_same_hash;
898 rules->free_hash_item (that, rules);
905 /* Utilities for manipulating bitset represntations of characters sets. */
909 rx_cset (struct rx *rx)
916 rx_Bitset b = (rx_Bitset) malloc (rx_sizeof_bitset (rx->local_cset_size));
918 rx_bitset_null (rx->local_cset_size, b);
925 rx_copy_cset (struct rx *rx, rx_Bitset a)
933 rx_Bitset cs = rx_cset (rx);
936 rx_bitset_union (rx->local_cset_size, cs, a);
944 rx_free_cset (struct rx * rx, rx_Bitset c)
957 /* Hash table memory allocation policy for the regexp compiler */
960 static struct rx_hash *
961 compiler_hash_alloc (struct rx_hash_rules * rules)
963 static struct rx_hash *
964 compiler_hash_alloc (rules)
965 struct rx_hash_rules * rules;
968 return (struct rx_hash *)malloc (sizeof (struct rx_hash));
973 static struct rx_hash_item *
974 compiler_hash_item_alloc (struct rx_hash_rules * rules, void * value)
976 static struct rx_hash_item *
977 compiler_hash_item_alloc (rules, value)
978 struct rx_hash_rules * rules;
982 struct rx_hash_item * it;
983 it = (struct rx_hash_item *)malloc (sizeof (*it));
995 compiler_free_hash (struct rx_hash * tab,
996 struct rx_hash_rules * rules)
999 compiler_free_hash (tab, rules)
1000 struct rx_hash * tab;
1001 struct rx_hash_rules * rules;
1010 compiler_free_hash_item (struct rx_hash_item * item,
1011 struct rx_hash_rules * rules)
1014 compiler_free_hash_item (item, rules)
1015 struct rx_hash_item * item;
1016 struct rx_hash_rules * rules;
1019 free ((char *)item);
1023 /* This page: REXP_NODE (expression tree) structures. */
1026 RX_DECL struct rexp_node *
1027 rexp_node (struct rx *rx,
1028 enum rexp_node_type type)
1030 RX_DECL struct rexp_node *
1031 rexp_node (rx, type)
1033 enum rexp_node_type type;
1036 struct rexp_node *n;
1038 n = (struct rexp_node *)malloc (sizeof (*n));
1039 bzero (n, sizeof (*n));
1046 /* free_rexp_node assumes that the bitset passed to rx_mk_r_cset
1047 * can be freed using rx_free_cset.
1050 RX_DECL struct rexp_node *
1051 rx_mk_r_cset (struct rx * rx,
1054 RX_DECL struct rexp_node *
1055 rx_mk_r_cset (rx, b)
1060 struct rexp_node * n = rexp_node (rx, r_cset);
1068 RX_DECL struct rexp_node *
1069 rx_mk_r_concat (struct rx * rx,
1070 struct rexp_node * a,
1071 struct rexp_node * b)
1073 RX_DECL struct rexp_node *
1074 rx_mk_r_concat (rx, a, b)
1076 struct rexp_node * a;
1077 struct rexp_node * b;
1080 struct rexp_node * n = rexp_node (rx, r_concat);
1083 n->params.pair.left = a;
1084 n->params.pair.right = b;
1091 RX_DECL struct rexp_node *
1092 rx_mk_r_alternate (struct rx * rx,
1093 struct rexp_node * a,
1094 struct rexp_node * b)
1096 RX_DECL struct rexp_node *
1097 rx_mk_r_alternate (rx, a, b)
1099 struct rexp_node * a;
1100 struct rexp_node * b;
1103 struct rexp_node * n = rexp_node (rx, r_alternate);
1106 n->params.pair.left = a;
1107 n->params.pair.right = b;
1114 RX_DECL struct rexp_node *
1115 rx_mk_r_opt (struct rx * rx,
1116 struct rexp_node * a)
1118 RX_DECL struct rexp_node *
1121 struct rexp_node * a;
1124 struct rexp_node * n = rexp_node (rx, r_opt);
1127 n->params.pair.left = a;
1128 n->params.pair.right = 0;
1135 RX_DECL struct rexp_node *
1136 rx_mk_r_star (struct rx * rx,
1137 struct rexp_node * a)
1139 RX_DECL struct rexp_node *
1140 rx_mk_r_star (rx, a)
1142 struct rexp_node * a;
1145 struct rexp_node * n = rexp_node (rx, r_star);
1148 n->params.pair.left = a;
1149 n->params.pair.right = 0;
1156 RX_DECL struct rexp_node *
1157 rx_mk_r_2phase_star (struct rx * rx,
1158 struct rexp_node * a,
1159 struct rexp_node * b)
1161 RX_DECL struct rexp_node *
1162 rx_mk_r_2phase_star (rx, a, b)
1164 struct rexp_node * a;
1165 struct rexp_node * b;
1168 struct rexp_node * n = rexp_node (rx, r_2phase_star);
1171 n->params.pair.left = a;
1172 n->params.pair.right = b;
1179 RX_DECL struct rexp_node *
1180 rx_mk_r_side_effect (struct rx * rx,
1183 RX_DECL struct rexp_node *
1184 rx_mk_r_side_effect (rx, a)
1189 struct rexp_node * n = rexp_node (rx, r_side_effect);
1192 n->params.side_effect = a;
1193 n->params.pair.right = 0;
1200 RX_DECL struct rexp_node *
1201 rx_mk_r_data (struct rx * rx,
1204 RX_DECL struct rexp_node *
1205 rx_mk_r_data (rx, a)
1210 struct rexp_node * n = rexp_node (rx, r_data);
1213 n->params.pair.left = a;
1214 n->params.pair.right = 0;
1222 rx_free_rexp (struct rx * rx, struct rexp_node * node)
1225 rx_free_rexp (rx, node)
1227 struct rexp_node * node;
1235 if (node->params.cset)
1236 rx_free_cset (rx, node->params.cset);
1246 rx_free_rexp (rx, node->params.pair.left);
1247 rx_free_rexp (rx, node->params.pair.right);
1251 /* This shouldn't occur. */
1254 free ((char *)node);
1260 RX_DECL struct rexp_node *
1261 rx_copy_rexp (struct rx *rx,
1262 struct rexp_node *node)
1264 RX_DECL struct rexp_node *
1265 rx_copy_rexp (rx, node)
1267 struct rexp_node *node;
1274 struct rexp_node *n = rexp_node (rx, node->type);
1280 n->params.cset = rx_copy_cset (rx, node->params.cset);
1281 if (!n->params.cset)
1283 rx_free_rexp (rx, n);
1289 n->params.side_effect = node->params.side_effect;
1297 n->params.pair.left =
1298 rx_copy_rexp (rx, node->params.pair.left);
1299 n->params.pair.right =
1300 rx_copy_rexp (rx, node->params.pair.right);
1301 if ( (node->params.pair.left && !n->params.pair.left)
1302 || (node->params.pair.right && !n->params.pair.right))
1304 rx_free_rexp (rx, n);
1309 /* shouldn't happen */
1318 /* This page: functions to build and destroy graphs that describe nfa's */
1320 /* Constructs a new nfa node. */
1322 RX_DECL struct rx_nfa_state *
1323 rx_nfa_state (struct rx *rx)
1325 RX_DECL struct rx_nfa_state *
1330 struct rx_nfa_state * n = (struct rx_nfa_state *)malloc (sizeof (*n));
1333 bzero (n, sizeof (*n));
1334 n->next = rx->nfa_states;
1342 rx_free_nfa_state (struct rx_nfa_state * n)
1345 rx_free_nfa_state (n)
1346 struct rx_nfa_state * n;
1353 /* This looks up an nfa node, given a numeric id. Numeric id's are
1354 * assigned after the nfa has been built.
1357 RX_DECL struct rx_nfa_state *
1358 rx_id_to_nfa_state (struct rx * rx,
1361 RX_DECL struct rx_nfa_state *
1362 rx_id_to_nfa_state (rx, id)
1367 struct rx_nfa_state * n;
1368 for (n = rx->nfa_states; n; n = n->next)
1375 /* This adds an edge between two nodes, but doesn't initialize the
1380 RX_DECL struct rx_nfa_edge *
1381 rx_nfa_edge (struct rx *rx,
1382 enum rx_nfa_etype type,
1383 struct rx_nfa_state *start,
1384 struct rx_nfa_state *dest)
1386 RX_DECL struct rx_nfa_edge *
1387 rx_nfa_edge (rx, type, start, dest)
1389 enum rx_nfa_etype type;
1390 struct rx_nfa_state *start;
1391 struct rx_nfa_state *dest;
1394 struct rx_nfa_edge *e;
1395 e = (struct rx_nfa_edge *)malloc (sizeof (*e));
1398 e->next = start->edges;
1408 rx_free_nfa_edge (struct rx_nfa_edge * e)
1411 rx_free_nfa_edge (e)
1412 struct rx_nfa_edge * e;
1419 /* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
1420 * of an NFA. These are added to an nfa automaticly by eclose_nfa.
1424 static struct rx_possible_future *
1425 rx_possible_future (struct rx * rx,
1426 struct rx_se_list * effects)
1428 static struct rx_possible_future *
1429 rx_possible_future (rx, effects)
1431 struct rx_se_list * effects;
1434 struct rx_possible_future *ec;
1435 ec = (struct rx_possible_future *) malloc (sizeof (*ec));
1440 ec->effects = effects;
1447 rx_free_possible_future (struct rx_possible_future * pf)
1450 rx_free_possible_future (pf)
1451 struct rx_possible_future * pf;
1460 rx_free_nfa (struct rx *rx)
1467 while (rx->nfa_states)
1469 while (rx->nfa_states->edges)
1471 switch (rx->nfa_states->edges->type)
1474 rx_free_cset (rx, rx->nfa_states->edges->params.cset);
1480 struct rx_nfa_edge * e;
1481 e = rx->nfa_states->edges;
1482 rx->nfa_states->edges = rx->nfa_states->edges->next;
1483 rx_free_nfa_edge (e);
1485 } /* while (rx->nfa_states->edges) */
1487 /* Iterate over the partial epsilon closures of rx->nfa_states */
1488 struct rx_possible_future * pf = rx->nfa_states->futures;
1491 struct rx_possible_future * pft = pf;
1493 rx_free_possible_future (pft);
1497 struct rx_nfa_state *n;
1499 rx->nfa_states = rx->nfa_states->next;
1500 rx_free_nfa_state (n);
1507 /* This page: translating a pattern expression into an nfa and doing the
1508 * static part of the nfa->super-nfa translation.
1511 /* This is the thompson regexp->nfa algorithm.
1512 * It is modified to allow for `side-effect epsilons.' Those are
1513 * edges that are taken whenever a similar epsilon edge would be,
1514 * but which imply that some side effect occurs when the edge
1517 * Side effects are used to model parts of the pattern langauge
1518 * that are not regular (in the formal sense).
1523 rx_build_nfa (struct rx *rx,
1524 struct rexp_node *rexp,
1525 struct rx_nfa_state **start,
1526 struct rx_nfa_state **end)
1529 rx_build_nfa (rx, rexp, start, end)
1531 struct rexp_node *rexp;
1532 struct rx_nfa_state **start;
1533 struct rx_nfa_state **end;
1536 struct rx_nfa_edge *edge;
1538 /* Start & end nodes may have been allocated by the caller. */
1539 *start = *start ? *start : rx_nfa_state (rx);
1550 *end = *end ? *end : rx_nfa_state (rx);
1554 rx_free_nfa_state (*start);
1564 edge = rx_nfa_edge (rx, ne_cset, *start, *end);
1567 edge->params.cset = rx_copy_cset (rx, rexp->params.cset);
1568 if (!edge->params.cset)
1570 rx_free_nfa_edge (edge);
1576 return (rx_build_nfa (rx, rexp->params.pair.left, start, end)
1577 && rx_nfa_edge (rx, ne_epsilon, *start, *end));
1581 struct rx_nfa_state * star_start = 0;
1582 struct rx_nfa_state * star_end = 0;
1583 return (rx_build_nfa (rx, rexp->params.pair.left,
1584 &star_start, &star_end)
1587 && rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
1588 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1589 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1591 && rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
1596 struct rx_nfa_state * star_start = 0;
1597 struct rx_nfa_state * star_end = 0;
1598 struct rx_nfa_state * loop_exp_start = 0;
1599 struct rx_nfa_state * loop_exp_end = 0;
1601 return (rx_build_nfa (rx, rexp->params.pair.left,
1602 &star_start, &star_end)
1603 && rx_build_nfa (rx, rexp->params.pair.right,
1604 &loop_exp_start, &loop_exp_end)
1609 && rx_nfa_edge (rx, ne_epsilon, star_start, *end)
1610 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1611 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1613 && rx_nfa_edge (rx, ne_epsilon, star_end, loop_exp_start)
1614 && rx_nfa_edge (rx, ne_epsilon, loop_exp_end, star_start));
1620 struct rx_nfa_state *shared = 0;
1622 (rx_build_nfa (rx, rexp->params.pair.left, start, &shared)
1623 && rx_build_nfa (rx, rexp->params.pair.right, &shared, end));
1628 struct rx_nfa_state *ls = 0;
1629 struct rx_nfa_state *le = 0;
1630 struct rx_nfa_state *rs = 0;
1631 struct rx_nfa_state *re = 0;
1632 return (rx_build_nfa (rx, rexp->params.pair.left, &ls, &le)
1633 && rx_build_nfa (rx, rexp->params.pair.right, &rs, &re)
1634 && rx_nfa_edge (rx, ne_epsilon, *start, ls)
1635 && rx_nfa_edge (rx, ne_epsilon, *start, rs)
1636 && rx_nfa_edge (rx, ne_epsilon, le, *end)
1637 && rx_nfa_edge (rx, ne_epsilon, re, *end));
1641 edge = rx_nfa_edge (rx, ne_side_effect, *start, *end);
1644 edge->params.side_effect = rexp->params.side_effect;
1648 /* this should never happen */
1653 /* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon
1654 * transitions. Only these nodes can occur in super-states.
1655 * All nodes are given an integer id.
1656 * The id is non-negative if the node has non-epsilon out-transitions, negative
1657 * otherwise (this is because we want the non-negative ids to be used as
1658 * array indexes in a few places).
1663 rx_name_nfa_states (struct rx *rx)
1666 rx_name_nfa_states (rx)
1670 struct rx_nfa_state *n = rx->nfa_states;
1677 struct rx_nfa_edge *e = n->edges;
1680 n->eclosure_needed = 1;
1687 case ne_side_effect:
1691 n->id = rx->nodec++;
1693 struct rx_nfa_edge *from_n = n->edges;
1696 from_n->dest->eclosure_needed = 1;
1697 from_n = from_n->next;
1704 n->id = rx->epsnodec--;
1708 rx->epsnodec = -rx->epsnodec;
1712 /* This page: data structures for the static part of the nfa->supernfa
1715 * There are side effect lists -- lists of side effects occuring
1716 * along an uninterrupted, acyclic path of side-effect epsilon edges.
1717 * Such paths are collapsed to single edges in the course of computing
1718 * epsilon closures. Such single edges are labled with a list of all
1719 * the side effects entailed in crossing them. Like lists of side
1720 * effects are made == by the constructors below.
1722 * There are also nfa state sets. These are used to hold a list of all
1723 * states reachable from a starting state for a given type of transition
1724 * and side effect list. These are also hash-consed.
1727 /* The next several functions compare, construct, etc. lists of side
1728 * effects. See ECLOSE_NFA (below) for details.
1731 /* Ordering of rx_se_list
1732 * (-1, 0, 1 return value convention).
1737 se_list_cmp (void * va, void * vb)
1740 se_list_cmp (va, vb)
1745 struct rx_se_list * a = (struct rx_se_list *)va;
1746 struct rx_se_list * b = (struct rx_se_list *)vb;
1754 : ((long)a->car < (long)b->car
1756 : ((long)a->car > (long)b->car
1758 : se_list_cmp ((void *)a->cdr, (void *)b->cdr))))));
1764 se_list_equal (void * va, void * vb)
1767 se_list_equal (va, vb)
1772 return !(se_list_cmp (va, vb));
1775 static struct rx_hash_rules se_list_hash_rules =
1778 compiler_hash_alloc,
1780 compiler_hash_item_alloc,
1781 compiler_free_hash_item
1786 static struct rx_se_list *
1787 side_effect_cons (struct rx * rx,
1788 void * se, struct rx_se_list * list)
1790 static struct rx_se_list *
1791 side_effect_cons (rx, se, list)
1794 struct rx_se_list * list;
1797 struct rx_se_list * l;
1798 l = ((struct rx_se_list *) malloc (sizeof (*l)));
1808 static struct rx_se_list *
1809 hash_cons_se_prog (struct rx * rx,
1810 struct rx_hash * memo,
1811 void * car, struct rx_se_list * cdr)
1813 static struct rx_se_list *
1814 hash_cons_se_prog (rx, memo, car, cdr)
1816 struct rx_hash * memo;
1818 struct rx_se_list * cdr;
1821 long hash = (long)car ^ (long)cdr;
1822 struct rx_se_list template;
1827 struct rx_hash_item * it = rx_hash_store (memo, hash,
1829 &se_list_hash_rules);
1832 if (it->data == (void *)&template)
1834 struct rx_se_list * consed;
1835 consed = (struct rx_se_list *) malloc (sizeof (*consed));
1837 it->data = (void *)consed;
1839 return (struct rx_se_list *)it->data;
1845 static struct rx_se_list *
1846 hash_se_prog (struct rx * rx, struct rx_hash * memo, struct rx_se_list * prog)
1848 static struct rx_se_list *
1849 hash_se_prog (rx, memo, prog)
1851 struct rx_hash * memo;
1852 struct rx_se_list * prog;
1855 struct rx_se_list * answer = 0;
1858 answer = hash_cons_se_prog (rx, memo, prog->car, answer);
1868 nfa_set_cmp (void * va, void * vb)
1871 nfa_set_cmp (va, vb)
1876 struct rx_nfa_state_set * a = (struct rx_nfa_state_set *)va;
1877 struct rx_nfa_state_set * b = (struct rx_nfa_state_set *)vb;
1885 : (a->car->id < b->car->id
1887 : (a->car->id > b->car->id
1889 : nfa_set_cmp ((void *)a->cdr, (void *)b->cdr))))));
1894 nfa_set_equal (void * va, void * vb)
1897 nfa_set_equal (va, vb)
1902 return !nfa_set_cmp (va, vb);
1905 static struct rx_hash_rules nfa_set_hash_rules =
1908 compiler_hash_alloc,
1910 compiler_hash_item_alloc,
1911 compiler_free_hash_item
1916 static struct rx_nfa_state_set *
1917 nfa_set_cons (struct rx * rx,
1918 struct rx_hash * memo, struct rx_nfa_state * state,
1919 struct rx_nfa_state_set * set)
1921 static struct rx_nfa_state_set *
1922 nfa_set_cons (rx, memo, state, set)
1924 struct rx_hash * memo;
1925 struct rx_nfa_state * state;
1926 struct rx_nfa_state_set * set;
1929 struct rx_nfa_state_set template;
1930 struct rx_hash_item * node;
1931 template.car = state;
1933 node = rx_hash_store (memo,
1934 (((long)state) >> 8) ^ (long)set,
1935 &template, &nfa_set_hash_rules);
1938 if (node->data == &template)
1940 struct rx_nfa_state_set * l;
1941 l = (struct rx_nfa_state_set *) malloc (sizeof (*l));
1942 node->data = (void *) l;
1947 return (struct rx_nfa_state_set *)node->data;
1952 static struct rx_nfa_state_set *
1953 nfa_set_enjoin (struct rx * rx,
1954 struct rx_hash * memo, struct rx_nfa_state * state,
1955 struct rx_nfa_state_set * set)
1957 static struct rx_nfa_state_set *
1958 nfa_set_enjoin (rx, memo, state, set)
1960 struct rx_hash * memo;
1961 struct rx_nfa_state * state;
1962 struct rx_nfa_state_set * set;
1965 if (!set || state->id < set->car->id)
1966 return nfa_set_cons (rx, memo, state, set);
1967 if (state->id == set->car->id)
1971 struct rx_nfa_state_set * newcdr
1972 = nfa_set_enjoin (rx, memo, state, set->cdr);
1973 if (newcdr != set->cdr)
1974 set = nfa_set_cons (rx, memo, set->car, newcdr);
1981 /* This page: computing epsilon closures. The closures aren't total.
1982 * Each node's closures are partitioned according to the side effects entailed
1983 * along the epsilon edges. Return true on success.
1988 struct rx_se_list *prog_backwards;
1994 eclose_node (struct rx *rx, struct rx_nfa_state *outnode,
1995 struct rx_nfa_state *node, struct eclose_frame *frame)
1998 eclose_node (rx, outnode, node, frame)
2000 struct rx_nfa_state *outnode;
2001 struct rx_nfa_state *node;
2002 struct eclose_frame *frame;
2005 struct rx_nfa_edge *e = node->edges;
2007 /* For each node, we follow all epsilon paths to build the closure.
2008 * The closure omits nodes that have only epsilon edges.
2009 * The closure is split into partial closures -- all the states in
2010 * a partial closure are reached by crossing the same list of
2011 * of side effects (though not necessarily the same path).
2017 if (node->id >= 0 || node->is_final)
2019 struct rx_possible_future **ec;
2020 struct rx_se_list * prog_in_order
2021 = ((struct rx_se_list *)hash_se_prog (rx,
2023 frame->prog_backwards));
2026 ec = &outnode->futures;
2030 cmp = se_list_cmp ((void *)(*ec)->effects, (void *)prog_in_order);
2035 if (!*ec || (cmp < 0))
2037 struct rx_possible_future * saved = *ec;
2038 *ec = rx_possible_future (rx, prog_in_order);
2039 (*ec)->next = saved;
2045 (*ec)->destset = nfa_set_enjoin (rx, &rx->set_list_memo,
2046 node, (*ec)->destset);
2047 if (!(*ec)->destset)
2057 if (!eclose_node (rx, outnode, e->dest, frame))
2060 case ne_side_effect:
2062 frame->prog_backwards = side_effect_cons (rx,
2063 e->params.side_effect,
2064 frame->prog_backwards);
2065 if (!frame->prog_backwards)
2067 if (!eclose_node (rx, outnode, e->dest, frame))
2070 struct rx_se_list * dying = frame->prog_backwards;
2071 frame->prog_backwards = frame->prog_backwards->cdr;
2072 free ((char *)dying);
2088 rx_eclose_nfa (struct rx *rx)
2095 struct rx_nfa_state *n = rx->nfa_states;
2096 struct eclose_frame frame;
2097 static int rx_id = 0;
2099 frame.prog_backwards = 0;
2100 rx->rx_id = rx_id++;
2101 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2102 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2106 if (n->eclosure_needed && !eclose_node (rx, n, n, &frame))
2108 /* clear_marks (rx); */
2115 /* This deletes epsilon edges from an NFA. After running eclose_node,
2116 * we have no more need for these edges. They are removed to simplify
2117 * further operations on the NFA.
2122 rx_delete_epsilon_transitions (struct rx *rx)
2125 rx_delete_epsilon_transitions (rx)
2129 struct rx_nfa_state *n = rx->nfa_states;
2130 struct rx_nfa_edge **e;
2137 struct rx_nfa_edge *t;
2141 case ne_side_effect:
2144 rx_free_nfa_edge (t);
2157 /* This page: storing the nfa in a contiguous region of memory for
2158 * subsequent conversion to a super-nfa.
2161 /* This is for qsort on an array of nfa_states. The order
2162 * is based on state ids and goes
2163 * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0)
2164 * This way, positive ids double as array indices.
2169 nfacmp (void * va, void * vb)
2177 struct rx_nfa_state **a = (struct rx_nfa_state **)va;
2178 struct rx_nfa_state **b = (struct rx_nfa_state **)vb;
2179 return (*a == *b /* &&&& 3.18 */
2181 : (((*a)->id < 0) == ((*b)->id < 0)
2182 ? (((*a)->id < (*b)->id) ? -1 : 1)
2189 count_hash_nodes (struct rx_hash * st)
2192 count_hash_nodes (st)
2193 struct rx_hash * st;
2198 for (x = 0; x < 13; ++x)
2199 count += ((st->children[x])
2200 ? count_hash_nodes (st->children[x])
2201 : st->bucket_size[x]);
2209 se_memo_freer (struct rx_hash_item * node)
2212 se_memo_freer (node)
2213 struct rx_hash_item * node;
2216 free ((char *)node->data);
2222 nfa_set_freer (struct rx_hash_item * node)
2225 nfa_set_freer (node)
2226 struct rx_hash_item * node;
2229 free ((char *)node->data);
2233 /* This copies an entire NFA into a single malloced block of memory.
2234 * Mostly this is for compatability with regex.c, though it is convenient
2235 * to have the nfa nodes in an array.
2240 rx_compactify_nfa (struct rx *rx,
2241 void **mem, unsigned long *size)
2244 rx_compactify_nfa (rx, mem, size)
2247 unsigned long *size;
2251 struct rx_nfa_state *n;
2254 int se_list_consc = count_hash_nodes (&rx->se_list_memo);
2255 int nfa_setc = count_hash_nodes (&rx->set_list_memo);
2256 unsigned long total_size;
2258 /* This takes place in two stages. First, the total size of the
2259 * nfa is computed, then structures are copied.
2265 struct rx_nfa_edge *e = n->edges;
2266 struct rx_possible_future *ec = n->futures;
2281 total_size = (total_nodec * sizeof (struct rx_nfa_state)
2282 + edgec * rx_sizeof_bitset (rx->local_cset_size)
2283 + edgec * sizeof (struct rx_nfa_edge)
2284 + nfa_setc * sizeof (struct rx_nfa_state_set)
2285 + eclosec * sizeof (struct rx_possible_future)
2286 + se_list_consc * sizeof (struct rx_se_list)
2289 if (total_size > *size)
2291 *mem = remalloc (*mem, total_size);
2297 /* Now we've allocated the memory; this copies the NFA. */
2299 static struct rx_nfa_state **scratch = 0;
2300 static int scratch_alloc = 0;
2301 struct rx_nfa_state *state_base = (struct rx_nfa_state *) * mem;
2302 struct rx_nfa_state *new_state = state_base;
2303 struct rx_nfa_edge *new_edge =
2304 (struct rx_nfa_edge *)
2305 ((char *) state_base + total_nodec * sizeof (struct rx_nfa_state));
2306 struct rx_se_list * new_se_list =
2307 (struct rx_se_list *)
2308 ((char *)new_edge + edgec * sizeof (struct rx_nfa_edge));
2309 struct rx_possible_future *new_close =
2310 ((struct rx_possible_future *)
2311 ((char *) new_se_list
2312 + se_list_consc * sizeof (struct rx_se_list)));
2313 struct rx_nfa_state_set * new_nfa_set =
2314 ((struct rx_nfa_state_set *)
2315 ((char *)new_close + eclosec * sizeof (struct rx_possible_future)));
2317 ((char *) new_nfa_set + nfa_setc * sizeof (struct rx_nfa_state_set));
2319 struct rx_nfa_state *n;
2321 if (scratch_alloc < total_nodec)
2323 scratch = ((struct rx_nfa_state **)
2324 remalloc (scratch, total_nodec * sizeof (*scratch)));
2326 scratch_alloc = total_nodec;
2334 for (x = 0, n = rx->nfa_states; n; n = n->next)
2337 qsort (scratch, total_nodec,
2338 sizeof (struct rx_nfa_state *), (int (*)())nfacmp);
2340 for (x = 0; x < total_nodec; ++x)
2342 struct rx_possible_future *eclose = scratch[x]->futures;
2343 struct rx_nfa_edge *edge = scratch[x]->edges;
2344 struct rx_nfa_state *cn = new_state++;
2347 cn->next = (x == total_nodec - 1) ? 0 : (cn + 1);
2348 cn->id = scratch[x]->id;
2349 cn->is_final = scratch[x]->is_final;
2350 cn->is_start = scratch[x]->is_start;
2354 int indx = (edge->dest->id < 0
2355 ? (total_nodec + edge->dest->id)
2357 struct rx_nfa_edge *e = new_edge++;
2358 rx_Bitset cset = (rx_Bitset) new_bitset;
2359 new_bitset += rx_sizeof_bitset (rx->local_cset_size);
2360 rx_bitset_null (rx->local_cset_size, cset);
2361 rx_bitset_union (rx->local_cset_size, cset, edge->params.cset);
2362 e->next = cn->edges;
2364 e->type = edge->type;
2365 e->dest = state_base + indx;
2366 e->params.cset = cset;
2371 struct rx_possible_future *ec = new_close++;
2372 struct rx_hash_item * sp;
2373 struct rx_se_list ** sepos;
2374 struct rx_se_list * sesrc;
2375 struct rx_nfa_state_set * destlst;
2376 struct rx_nfa_state_set ** destpos;
2377 ec->next = cn->futures;
2379 for (sepos = &ec->effects, sesrc = eclose->effects;
2381 sesrc = sesrc->cdr, sepos = &(*sepos)->cdr)
2383 sp = rx_hash_find (&rx->se_list_memo,
2384 (long)sesrc->car ^ (long)sesrc->cdr,
2385 sesrc, &se_list_hash_rules);
2388 sesrc = (struct rx_se_list *)sp->binding;
2391 *new_se_list = *sesrc;
2392 sp->binding = (void *)new_se_list;
2393 *sepos = new_se_list;
2397 for (destpos = &ec->destset, destlst = eclose->destset;
2399 destpos = &(*destpos)->cdr, destlst = destlst->cdr)
2401 sp = rx_hash_find (&rx->set_list_memo,
2402 ((((long)destlst->car) >> 8)
2403 ^ (long)destlst->cdr),
2404 destlst, &nfa_set_hash_rules);
2407 destlst = (struct rx_nfa_state_set *)sp->binding;
2410 *new_nfa_set = *destlst;
2411 new_nfa_set->car = state_base + destlst->car->id;
2412 sp->binding = (void *)new_nfa_set;
2413 *destpos = new_nfa_set;
2417 eclose = eclose->next;
2421 rx_free_hash_table (&rx->se_list_memo, se_memo_freer, &se_list_hash_rules);
2422 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2423 rx_free_hash_table (&rx->set_list_memo, nfa_set_freer, &nfa_set_hash_rules);
2424 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2427 rx->nfa_states = (struct rx_nfa_state *)*mem;
2432 /* The functions in the next several pages define the lazy-NFA-conversion used
2433 * by matchers. The input to this construction is an NFA such as
2434 * is built by compactify_nfa (rx.c). The output is the superNFA.
2437 /* Match engines can use arbitrary values for opcodes. So, the parse tree
2438 * is built using instructions names (enum rx_opcode), but the superstate
2439 * nfa is populated with mystery opcodes (void *).
2441 * For convenience, here is an id table. The opcodes are == to their inxs
2443 * The lables in re_search_2 would make good values for instructions.
2446 void * rx_id_instruction_table[rx_num_instructions] =
2448 (void *) rx_backtrack_point,
2449 (void *) rx_do_side_effects,
2450 (void *) rx_cache_miss,
2451 (void *) rx_next_char,
2452 (void *) rx_backtrack,
2453 (void *) rx_error_inx
2458 /* Memory mgt. for superstate graphs. */
2462 rx_cache_malloc (struct rx_cache * cache, int bytes)
2465 rx_cache_malloc (cache, bytes)
2466 struct rx_cache * cache;
2470 while (cache->bytes_left < bytes)
2472 if (cache->memory_pos)
2473 cache->memory_pos = cache->memory_pos->next;
2474 if (!cache->memory_pos)
2476 cache->morecore (cache);
2477 if (!cache->memory_pos)
2480 cache->bytes_left = cache->memory_pos->bytes;
2481 cache->memory_addr = ((char *)cache->memory_pos
2482 + sizeof (struct rx_blocklist));
2484 cache->bytes_left -= bytes;
2486 char * addr = cache->memory_addr;
2487 cache->memory_addr += bytes;
2494 rx_cache_free (struct rx_cache * cache,
2495 struct rx_freelist ** freelist, char * mem)
2498 rx_cache_free (cache, freelist, mem)
2499 struct rx_cache * cache;
2500 struct rx_freelist ** freelist;
2504 struct rx_freelist * it = (struct rx_freelist *)mem;
2505 it->next = *freelist;
2510 /* The partially instantiated superstate graph has a transition
2511 * table at every node. There is one entry for every character.
2512 * This fills in the transition for a set.
2516 install_transition (struct rx_superstate *super,
2517 struct rx_inx *answer, rx_Bitset trcset)
2520 install_transition (super, answer, trcset)
2521 struct rx_superstate *super;
2522 struct rx_inx *answer;
2526 struct rx_inx * transitions = super->transitions;
2528 for (chr = 0; chr < 256; )
2536 RX_subset sub = *trcset;
2538 int bound = chr + 32;
2542 transitions [chr] = *answer;
2553 qlen (struct rx_superstate * q)
2557 struct rx_superstate * q;
2561 struct rx_superstate * it;
2564 for (it = q->next_recyclable; it != q; it = it->next_recyclable)
2571 check_cache (struct rx_cache * cache)
2575 struct rx_cache * cache;
2578 struct rx_cache * you_fucked_up = 0;
2579 int total = cache->superstates;
2580 int semi = cache->semifree_superstates;
2581 if (semi != qlen (cache->semifree_superstate))
2582 check_cache (you_fucked_up);
2583 if ((total - semi) != qlen (cache->lru_superstate))
2584 check_cache (you_fucked_up);
2587 /* When a superstate is old and neglected, it can enter a
2588 * semi-free state. A semi-free state is slated to die.
2589 * Incoming transitions to a semi-free state are re-written
2590 * to cause an (interpreted) fault when they are taken.
2591 * The fault handler revives the semi-free state, patches
2592 * incoming transitions back to normal, and continues.
2594 * The idea is basicly to free in two stages, aborting
2595 * between the two if the state turns out to be useful again.
2596 * When a free is aborted, the rescued superstate is placed
2597 * in the most-favored slot to maximize the time until it
2598 * is next semi-freed.
2603 semifree_superstate (struct rx_cache * cache)
2606 semifree_superstate (cache)
2607 struct rx_cache * cache;
2610 int disqualified = cache->semifree_superstates;
2611 if (disqualified == cache->superstates)
2613 while (cache->lru_superstate->locks)
2615 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2617 if (disqualified == cache->superstates)
2621 struct rx_superstate * it = cache->lru_superstate;
2622 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2623 it->prev_recyclable->next_recyclable = it->next_recyclable;
2624 cache->lru_superstate = (it == it->next_recyclable
2626 : it->next_recyclable);
2627 if (!cache->semifree_superstate)
2629 cache->semifree_superstate = it;
2630 it->next_recyclable = it;
2631 it->prev_recyclable = it;
2635 it->prev_recyclable = cache->semifree_superstate->prev_recyclable;
2636 it->next_recyclable = cache->semifree_superstate;
2637 it->prev_recyclable->next_recyclable = it;
2638 it->next_recyclable->prev_recyclable = it;
2641 struct rx_distinct_future *df;
2642 it->is_semifree = 1;
2643 ++cache->semifree_superstates;
2644 df = it->transition_refs;
2647 df->prev_same_dest->next_same_dest = 0;
2648 for (df = it->transition_refs; df; df = df->next_same_dest)
2650 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2651 df->future_frame.data = 0;
2652 df->future_frame.data_2 = (void *) df;
2653 /* If there are any NEXT-CHAR instruction frames that
2654 * refer to this state, we convert them to CACHE-MISS frames.
2657 && (df->edge->options->next_same_super_edge[0]
2658 == df->edge->options))
2659 install_transition (df->present, &df->future_frame,
2662 df = it->transition_refs;
2663 df->prev_same_dest->next_same_dest = df;
2672 refresh_semifree_superstate (struct rx_cache * cache,
2673 struct rx_superstate * super)
2676 refresh_semifree_superstate (cache, super)
2677 struct rx_cache * cache;
2678 struct rx_superstate * super;
2681 struct rx_distinct_future *df;
2683 if (super->transition_refs)
2685 super->transition_refs->prev_same_dest->next_same_dest = 0;
2686 for (df = super->transition_refs; df; df = df->next_same_dest)
2688 df->future_frame.inx = cache->instruction_table[rx_next_char];
2689 df->future_frame.data = (void *) super->transitions;
2690 /* CACHE-MISS instruction frames that refer to this state,
2691 * must be converted to NEXT-CHAR frames.
2694 && (df->edge->options->next_same_super_edge[0]
2695 == df->edge->options))
2696 install_transition (df->present, &df->future_frame,
2699 super->transition_refs->prev_same_dest->next_same_dest
2700 = super->transition_refs;
2702 if (cache->semifree_superstate == super)
2703 cache->semifree_superstate = (super->prev_recyclable == super
2705 : super->prev_recyclable);
2706 super->next_recyclable->prev_recyclable = super->prev_recyclable;
2707 super->prev_recyclable->next_recyclable = super->next_recyclable;
2709 if (!cache->lru_superstate)
2710 (cache->lru_superstate
2711 = super->next_recyclable
2712 = super->prev_recyclable
2716 super->next_recyclable = cache->lru_superstate;
2717 super->prev_recyclable = cache->lru_superstate->prev_recyclable;
2718 super->next_recyclable->prev_recyclable = super;
2719 super->prev_recyclable->next_recyclable = super;
2721 super->is_semifree = 0;
2722 --cache->semifree_superstates;
2727 rx_refresh_this_superstate (struct rx_cache * cache, struct rx_superstate * superstate)
2730 rx_refresh_this_superstate (cache, superstate)
2731 struct rx_cache * cache;
2732 struct rx_superstate * superstate;
2735 if (superstate->is_semifree)
2736 refresh_semifree_superstate (cache, superstate);
2737 else if (cache->lru_superstate == superstate)
2738 cache->lru_superstate = superstate->next_recyclable;
2739 else if (superstate != cache->lru_superstate->prev_recyclable)
2741 superstate->next_recyclable->prev_recyclable
2742 = superstate->prev_recyclable;
2743 superstate->prev_recyclable->next_recyclable
2744 = superstate->next_recyclable;
2745 superstate->next_recyclable = cache->lru_superstate;
2746 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
2747 superstate->next_recyclable->prev_recyclable = superstate;
2748 superstate->prev_recyclable->next_recyclable = superstate;
2754 release_superset_low (struct rx_cache * cache,
2755 struct rx_superset *set)
2758 release_superset_low (cache, set)
2759 struct rx_cache * cache;
2760 struct rx_superset *set;
2766 release_superset_low (cache, set->cdr);
2768 set->starts_for = 0;
2772 (&cache->superset_table,
2773 (unsigned long)set->car ^ set->id ^ (unsigned long)set->cdr,
2775 &cache->superset_hash_rules),
2776 &cache->superset_hash_rules);
2777 rx_cache_free (cache, &cache->free_supersets, (char *)set);
2783 rx_release_superset (struct rx *rx,
2784 struct rx_superset *set)
2787 rx_release_superset (rx, set)
2789 struct rx_superset *set;
2792 release_superset_low (rx->cache, set);
2795 /* This tries to add a new superstate to the superstate freelist.
2796 * It might, as a result, free some edge pieces or hash tables.
2797 * If nothing can be freed because too many locks are being held, fail.
2802 rx_really_free_superstate (struct rx_cache * cache)
2805 rx_really_free_superstate (cache)
2806 struct rx_cache * cache;
2809 int locked_superstates = 0;
2810 struct rx_superstate * it;
2812 if (!cache->superstates)
2816 /* This is a total guess. The idea is that we should expect as
2817 * many misses as we've recently experienced. I.e., cache->misses
2818 * should be the same as cache->semifree_superstates.
2820 while ((cache->hits + cache->misses) > cache->superstates_allowed)
2823 cache->misses >>= 1;
2825 if ( ((cache->hits + cache->misses) * cache->semifree_superstates)
2826 < (cache->superstates * cache->misses))
2828 semifree_superstate (cache);
2829 semifree_superstate (cache);
2833 while (cache->semifree_superstate && cache->semifree_superstate->locks)
2835 refresh_semifree_superstate (cache, cache->semifree_superstate);
2836 ++locked_superstates;
2837 if (locked_superstates == cache->superstates)
2841 if (cache->semifree_superstate)
2843 it = cache->semifree_superstate;
2844 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2845 it->prev_recyclable->next_recyclable = it->next_recyclable;
2846 cache->semifree_superstate = ((it == it->next_recyclable)
2848 : it->next_recyclable);
2849 --cache->semifree_superstates;
2853 while (cache->lru_superstate->locks)
2855 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2856 ++locked_superstates;
2857 if (locked_superstates == cache->superstates)
2860 it = cache->lru_superstate;
2861 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2862 it->prev_recyclable->next_recyclable = it->next_recyclable;
2863 cache->lru_superstate = ((it == it->next_recyclable)
2865 : it->next_recyclable);
2868 if (it->transition_refs)
2870 struct rx_distinct_future *df;
2871 for (df = it->transition_refs,
2872 df->prev_same_dest->next_same_dest = 0;
2874 df = df->next_same_dest)
2876 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2877 df->future_frame.data = 0;
2878 df->future_frame.data_2 = (void *) df;
2881 it->transition_refs->prev_same_dest->next_same_dest =
2882 it->transition_refs;
2885 struct rx_super_edge *tc = it->edges;
2888 struct rx_distinct_future * df;
2889 struct rx_super_edge *tct = tc->next;
2891 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
2894 struct rx_distinct_future *dft = df;
2895 df = df->next_same_super_edge[0];
2898 if (dft->future && dft->future->transition_refs == dft)
2900 dft->future->transition_refs = dft->next_same_dest;
2901 if (dft->future->transition_refs == dft)
2902 dft->future->transition_refs = 0;
2904 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
2905 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
2906 rx_cache_free (cache, &cache->free_discernable_futures,
2909 rx_cache_free (cache, &cache->free_transition_classes, (char *)tc);
2914 if (it->contents->superstate == it)
2915 it->contents->superstate = 0;
2916 release_superset_low (cache, it->contents);
2917 rx_cache_free (cache, &cache->free_superstates, (char *)it);
2918 --cache->superstates;
2924 rx_cache_get (struct rx_cache * cache,
2925 struct rx_freelist ** freelist)
2928 rx_cache_get (cache, freelist)
2929 struct rx_cache * cache;
2930 struct rx_freelist ** freelist;
2933 while (!*freelist && rx_really_free_superstate (cache))
2938 struct rx_freelist * it = *freelist;
2939 *freelist = it->next;
2946 rx_cache_malloc_or_get (struct rx_cache * cache,
2947 struct rx_freelist ** freelist, int bytes)
2950 rx_cache_malloc_or_get (cache, freelist, bytes)
2951 struct rx_cache * cache;
2952 struct rx_freelist ** freelist;
2958 char * answer = rx_cache_malloc (cache, bytes);
2963 return rx_cache_get (cache, freelist);
2968 rx_cache_get_superstate (struct rx_cache * cache)
2971 rx_cache_get_superstate (cache)
2972 struct rx_cache * cache;
2976 int bytes = ( sizeof (struct rx_superstate)
2977 + cache->local_cset_size * sizeof (struct rx_inx));
2978 if (!cache->free_superstates
2979 && (cache->superstates < cache->superstates_allowed))
2981 answer = rx_cache_malloc (cache, bytes);
2984 ++cache->superstates;
2988 answer = rx_cache_get (cache, &cache->free_superstates);
2991 answer = rx_cache_malloc (cache, bytes);
2993 ++cache->superstates_allowed;
2995 ++cache->superstates;
3003 supersetcmp (void * va, void * vb)
3006 supersetcmp (va, vb)
3011 struct rx_superset * a = (struct rx_superset *)va;
3012 struct rx_superset * b = (struct rx_superset *)vb;
3014 || (a && b && (a->car == b->car) && (a->cdr == b->cdr)));
3018 static struct rx_hash_item *
3019 superset_allocator (struct rx_hash_rules * rules, void * val)
3021 static struct rx_hash_item *
3022 superset_allocator (rules, val)
3023 struct rx_hash_rules * rules;
3027 struct rx_cache * cache
3028 = ((struct rx_cache *)
3030 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3031 struct rx_superset * template = (struct rx_superset *)val;
3032 struct rx_superset * newset
3033 = ((struct rx_superset *)
3034 rx_cache_malloc_or_get (cache,
3035 &cache->free_supersets,
3036 sizeof (*template)));
3040 newset->car = template->car;
3041 newset->id = template->car->id;
3042 newset->cdr = template->cdr;
3043 newset->superstate = 0;
3044 rx_protect_superset (rx, template->cdr);
3045 newset->hash_item.data = (void *)newset;
3046 newset->hash_item.binding = 0;
3047 return &newset->hash_item;
3051 static struct rx_hash *
3052 super_hash_allocator (struct rx_hash_rules * rules)
3054 static struct rx_hash *
3055 super_hash_allocator (rules)
3056 struct rx_hash_rules * rules;
3059 struct rx_cache * cache
3060 = ((struct rx_cache *)
3062 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3063 return ((struct rx_hash *)
3064 rx_cache_malloc_or_get (cache,
3065 &cache->free_hash, sizeof (struct rx_hash)));
3071 super_hash_liberator (struct rx_hash * hash, struct rx_hash_rules * rules)
3074 super_hash_liberator (hash, rules)
3075 struct rx_hash * hash;
3076 struct rx_hash_rules * rules;
3079 struct rx_cache * cache
3080 = ((struct rx_cache *)
3081 (char *)rules - (long)(&((struct rx_cache *)0)->superset_hash_rules));
3082 rx_cache_free (cache, &cache->free_hash, (char *)hash);
3087 superset_hash_item_liberator (struct rx_hash_item * it,
3088 struct rx_hash_rules * rules)
3091 superset_hash_item_liberator (it, rules) /* Well, it does ya know. */
3092 struct rx_hash_item * it;
3093 struct rx_hash_rules * rules;
3098 int rx_cache_bound = 128;
3099 static int rx_default_cache_got = 0;
3103 bytes_for_cache_size (int supers, int cset_size)
3106 bytes_for_cache_size (supers, cset_size)
3111 /* What the hell is this? !!!*/
3114 ( (1.03 * (float) ( rx_sizeof_bitset (cset_size)
3115 + sizeof (struct rx_super_edge)))
3116 + (1.80 * (float) sizeof (struct rx_possible_future))
3117 + (float) ( sizeof (struct rx_superstate)
3118 + cset_size * sizeof (struct rx_inx))));
3123 rx_morecore (struct rx_cache * cache)
3127 struct rx_cache * cache;
3130 if (rx_default_cache_got >= rx_cache_bound)
3133 rx_default_cache_got += 16;
3134 cache->superstates_allowed = rx_cache_bound;
3136 struct rx_blocklist ** pos = &cache->memory;
3137 int size = bytes_for_cache_size (16, cache->local_cset_size);
3139 pos = &(*pos)->next;
3140 *pos = ((struct rx_blocklist *)
3141 malloc (size + sizeof (struct rx_blocklist)));
3146 (*pos)->bytes = size;
3147 cache->memory_pos = *pos;
3148 cache->memory_addr = (char *)*pos + sizeof (**pos);
3149 cache->bytes_left = size;
3153 static struct rx_cache default_cache =
3157 super_hash_allocator,
3158 super_hash_liberator,
3160 superset_hash_item_liberator,
3186 rx_id_instruction_table,
3197 /* This adds an element to a superstate set. These sets are lists, such
3198 * that lists with == elements are ==. The empty set is returned by
3199 * superset_cons (rx, 0, 0) and is NOT equivelent to
3200 * (struct rx_superset)0.
3204 RX_DECL struct rx_superset *
3205 rx_superset_cons (struct rx * rx,
3206 struct rx_nfa_state *car, struct rx_superset *cdr)
3208 RX_DECL struct rx_superset *
3209 rx_superset_cons (rx, car, cdr)
3211 struct rx_nfa_state *car;
3212 struct rx_superset *cdr;
3215 struct rx_cache * cache = rx->cache;
3218 if (!cache->empty_superset)
3220 cache->empty_superset
3221 = ((struct rx_superset *)
3222 rx_cache_malloc_or_get (cache, &cache->free_supersets,
3223 sizeof (struct rx_superset)));
3224 if (!cache->empty_superset)
3226 bzero (cache->empty_superset, sizeof (struct rx_superset));
3227 cache->empty_superset->refs = 1000;
3229 return cache->empty_superset;
3232 struct rx_superset template;
3233 struct rx_hash_item * hit;
3236 template.id = car->id;
3237 hit = rx_hash_store (&cache->superset_table,
3238 (unsigned long)car ^ car->id ^ (unsigned long)cdr,
3240 &cache->superset_hash_rules);
3242 ? (struct rx_superset *)hit->data
3247 /* This computes a union of two NFA state sets. The sets do not have the
3248 * same representation though. One is a RX_SUPERSET structure (part
3249 * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA).
3253 RX_DECL struct rx_superset *
3254 rx_superstate_eclosure_union
3255 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl)
3257 RX_DECL struct rx_superset *
3258 rx_superstate_eclosure_union (rx, set, ecl)
3260 struct rx_superset *set;
3261 struct rx_nfa_state_set *ecl;
3268 return rx_superset_cons (rx, ecl->car,
3269 rx_superstate_eclosure_union (rx, set, ecl->cdr));
3270 if (set->car == ecl->car)
3271 return rx_superstate_eclosure_union (rx, set, ecl->cdr);
3274 struct rx_superset * tail;
3275 struct rx_nfa_state * first;
3277 if (set->car > ecl->car)
3279 tail = rx_superstate_eclosure_union (rx, set->cdr, ecl);
3284 tail = rx_superstate_eclosure_union (rx, set, ecl->cdr);
3291 struct rx_superset * answer;
3292 answer = rx_superset_cons (rx, first, tail);
3295 rx_protect_superset (rx, tail);
3296 rx_release_superset (rx, tail);
3309 * This makes sure that a list of rx_distinct_futures contains
3310 * a future for each possible set of side effects in the eclosure
3311 * of a given state. This is some of the work of filling in a
3312 * superstate transition.
3316 static struct rx_distinct_future *
3317 include_futures (struct rx *rx,
3318 struct rx_distinct_future *df, struct rx_nfa_state
3319 *state, struct rx_superstate *superstate)
3321 static struct rx_distinct_future *
3322 include_futures (rx, df, state, superstate)
3324 struct rx_distinct_future *df;
3325 struct rx_nfa_state *state;
3326 struct rx_superstate *superstate;
3329 struct rx_possible_future *future;
3330 struct rx_cache * cache = rx->cache;
3331 for (future = state->futures; future; future = future->next)
3333 struct rx_distinct_future *dfp;
3334 struct rx_distinct_future *insert_before = 0;
3336 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3337 for (dfp = df; dfp; dfp = dfp->next_same_super_edge[0])
3338 if (dfp->effects == future->effects)
3342 int order = rx->se_list_cmp (rx, dfp->effects, future->effects);
3345 insert_before = dfp;
3351 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3355 = ((struct rx_distinct_future *)
3356 rx_cache_malloc_or_get (cache, &cache->free_discernable_futures,
3357 sizeof (struct rx_distinct_future)));
3362 df = insert_before = dfp;
3363 df->next_same_super_edge[0] = df->next_same_super_edge[1] = df;
3365 else if (!insert_before)
3367 else if (insert_before == df)
3370 dfp->next_same_super_edge[0] = insert_before;
3371 dfp->next_same_super_edge[1]
3372 = insert_before->next_same_super_edge[1];
3373 dfp->next_same_super_edge[1]->next_same_super_edge[0] = dfp;
3374 dfp->next_same_super_edge[0]->next_same_super_edge[1] = dfp;
3375 dfp->next_same_dest = dfp->prev_same_dest = dfp;
3377 dfp->present = superstate;
3378 dfp->future_frame.inx = rx->instruction_table[rx_cache_miss];
3379 dfp->future_frame.data = 0;
3380 dfp->future_frame.data_2 = (void *) dfp;
3381 dfp->side_effects_frame.inx
3382 = rx->instruction_table[rx_do_side_effects];
3383 dfp->side_effects_frame.data = 0;
3384 dfp->side_effects_frame.data_2 = (void *) dfp;
3385 dfp->effects = future->effects;
3393 /* This constructs a new superstate from its state set. The only
3394 * complexity here is memory management.
3397 RX_DECL struct rx_superstate *
3398 rx_superstate (struct rx *rx,
3399 struct rx_superset *set)
3401 RX_DECL struct rx_superstate *
3402 rx_superstate (rx, set)
3404 struct rx_superset *set;
3407 struct rx_cache * cache = rx->cache;
3408 struct rx_superstate * superstate = 0;
3410 /* Does the superstate already exist in the cache? */
3411 if (set->superstate)
3413 if (set->superstate->rx_id != rx->rx_id)
3415 /* Aha. It is in the cache, but belongs to a superstate
3416 * that refers to an NFA that no longer exists.
3417 * (We know it no longer exists because it was evidently
3418 * stored in the same region of memory as the current nfa
3419 * yet it has a different id.)
3421 superstate = set->superstate;
3422 if (!superstate->is_semifree)
3424 if (cache->lru_superstate == superstate)
3426 cache->lru_superstate = superstate->next_recyclable;
3427 if (cache->lru_superstate == superstate)
3428 cache->lru_superstate = 0;
3431 superstate->next_recyclable->prev_recyclable
3432 = superstate->prev_recyclable;
3433 superstate->prev_recyclable->next_recyclable
3434 = superstate->next_recyclable;
3435 if (!cache->semifree_superstate)
3437 (cache->semifree_superstate
3438 = superstate->next_recyclable
3439 = superstate->prev_recyclable
3444 superstate->next_recyclable = cache->semifree_superstate;
3445 superstate->prev_recyclable
3446 = cache->semifree_superstate->prev_recyclable;
3447 superstate->next_recyclable->prev_recyclable
3449 superstate->prev_recyclable->next_recyclable
3451 cache->semifree_superstate = superstate;
3453 ++cache->semifree_superstates;
3456 set->superstate = 0;
3457 goto handle_cache_miss;
3460 superstate = set->superstate;
3462 rx_refresh_this_superstate (cache, superstate);
3468 /* This point reached only for cache misses. */
3471 if (rx_debug_trace > 1)
3473 struct rx_superset * setp = set;
3474 fprintf (stderr, "Building a superstet %d(%d): ", rx->rx_id, set);
3477 fprintf (stderr, "%d ", setp->id);
3480 fprintf (stderr, "(%d)\n", set);
3483 superstate = (struct rx_superstate *)rx_cache_get_superstate (cache);
3487 if (!cache->lru_superstate)
3488 (cache->lru_superstate
3489 = superstate->next_recyclable
3490 = superstate->prev_recyclable
3494 superstate->next_recyclable = cache->lru_superstate;
3495 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
3496 ( superstate->prev_recyclable->next_recyclable
3497 = superstate->next_recyclable->prev_recyclable
3500 superstate->rx_id = rx->rx_id;
3501 superstate->transition_refs = 0;
3502 superstate->locks = 0;
3503 superstate->is_semifree = 0;
3504 set->superstate = superstate;
3505 superstate->contents = set;
3506 rx_protect_superset (rx, set);
3507 superstate->edges = 0;
3510 /* None of the transitions from this superstate are known yet. */
3511 for (x = 0; x < rx->local_cset_size; ++x) /* &&&&& 3.8 % */
3513 struct rx_inx * ifr = &superstate->transitions[x];
3514 ifr->inx = rx->instruction_table [rx_cache_miss];
3515 ifr->data = ifr->data_2 = 0;
3522 /* This computes the destination set of one edge of the superstate NFA.
3523 * Note that a RX_DISTINCT_FUTURE is a superstate edge.
3524 * Returns 0 on an allocation failure.
3529 solve_destination (struct rx *rx, struct rx_distinct_future *df)
3532 solve_destination (rx, df)
3534 struct rx_distinct_future *df;
3537 struct rx_super_edge *tc = df->edge;
3538 struct rx_superset *nfa_state;
3539 struct rx_superset *nil_set = rx_superset_cons (rx, 0, 0);
3540 struct rx_superset *solution = nil_set;
3541 struct rx_superstate *dest;
3543 rx_protect_superset (rx, solution);
3544 /* Iterate over all NFA states in the state set of this superstate. */
3545 for (nfa_state = df->present->contents;
3547 nfa_state = nfa_state->cdr)
3549 struct rx_nfa_edge *e;
3550 /* Iterate over all edges of each NFA state. */
3551 for (e = nfa_state->car->edges; e; e = e->next)
3552 /* If we find an edge that is labeled with
3553 * the characters we are solving for.....
3555 if (rx_bitset_is_subset (rx->local_cset_size,
3556 tc->cset, e->params.cset))
3558 struct rx_nfa_state *n = e->dest;
3559 struct rx_possible_future *pf;
3560 /* ....search the partial epsilon closures of the destination
3561 * of that edge for a path that involves the same set of
3562 * side effects we are solving for.
3563 * If we find such a RX_POSSIBLE_FUTURE, we add members to the
3564 * stateset we are computing.
3566 for (pf = n->futures; pf; pf = pf->next)
3567 if (pf->effects == df->effects)
3569 struct rx_superset * old_sol;
3571 solution = rx_superstate_eclosure_union (rx, solution,
3575 rx_protect_superset (rx, solution);
3576 rx_release_superset (rx, old_sol);
3580 /* It is possible that the RX_DISTINCT_FUTURE we are working on has
3581 * the empty set of NFA states as its definition. In that case, this
3582 * is a failure point.
3584 if (solution == nil_set)
3586 df->future_frame.inx = (void *) rx_backtrack;
3587 df->future_frame.data = 0;
3588 df->future_frame.data_2 = 0;
3591 dest = rx_superstate (rx, solution);
3592 rx_release_superset (rx, solution);
3597 struct rx_distinct_future *dft;
3599 df->prev_same_dest->next_same_dest = 0;
3603 dft->future_frame.inx = rx->instruction_table[rx_next_char];
3604 dft->future_frame.data = (void *) dest->transitions;
3605 dft = dft->next_same_dest;
3607 df->prev_same_dest->next_same_dest = df;
3609 if (!dest->transition_refs)
3610 dest->transition_refs = df;
3613 struct rx_distinct_future *dft = dest->transition_refs->next_same_dest;
3614 dest->transition_refs->next_same_dest = df->next_same_dest;
3615 df->next_same_dest->prev_same_dest = dest->transition_refs;
3616 df->next_same_dest = dft;
3617 dft->prev_same_dest = df;
3623 /* This takes a superstate and a character, and computes some edges
3624 * from the superstate NFA. In particular, this computes all edges
3625 * that lead from SUPERSTATE given CHR. This function also
3626 * computes the set of characters that share this edge set.
3627 * This returns 0 on allocation error.
3628 * The character set and list of edges are returned through
3629 * the paramters CSETOUT and DFOUT.
3634 compute_super_edge (struct rx *rx, struct rx_distinct_future **dfout,
3635 rx_Bitset csetout, struct rx_superstate *superstate,
3639 compute_super_edge (rx, dfout, csetout, superstate, chr)
3641 struct rx_distinct_future **dfout;
3643 struct rx_superstate *superstate;
3647 struct rx_superset *stateset = superstate->contents;
3649 /* To compute the set of characters that share edges with CHR,
3650 * we start with the full character set, and subtract.
3652 rx_bitset_universe (rx->local_cset_size, csetout);
3655 /* Iterate over the NFA states in the superstate state-set. */
3656 while (stateset->car)
3658 struct rx_nfa_edge *e;
3659 for (e = stateset->car->edges; e; e = e->next)
3660 if (RX_bitset_member (e->params.cset, chr))
3662 /* If we find an NFA edge that applies, we make sure there
3663 * are corresponding edges in the superstate NFA.
3666 struct rx_distinct_future * saved;
3668 *dfout = include_futures (rx, *dfout, e->dest, superstate);
3671 struct rx_distinct_future * df;
3674 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3677 struct rx_distinct_future *dft;
3679 df = df->next_same_super_edge[0];
3681 if (dft->future && dft->future->transition_refs == dft)
3683 dft->future->transition_refs = dft->next_same_dest;
3684 if (dft->future->transition_refs == dft)
3685 dft->future->transition_refs = 0;
3687 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
3688 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
3689 rx_cache_free (rx->cache,
3690 &rx->cache->free_discernable_futures,
3696 /* We also trim the character set a bit. */
3697 rx_bitset_intersection (rx->local_cset_size,
3698 csetout, e->params.cset);
3701 /* An edge that doesn't apply at least tells us some characters
3702 * that don't share the same edge set as CHR.
3704 rx_bitset_difference (rx->local_cset_size, csetout, e->params.cset);
3705 stateset = stateset->cdr;
3711 /* This is a constructor for RX_SUPER_EDGE structures. These are
3712 * wrappers for lists of superstate NFA edges that share character sets labels.
3713 * If a transition class contains more than one rx_distinct_future (superstate
3714 * edge), then it represents a non-determinism in the superstate NFA.
3718 static struct rx_super_edge *
3719 rx_super_edge (struct rx *rx,
3720 struct rx_superstate *super, rx_Bitset cset,
3721 struct rx_distinct_future *df)
3723 static struct rx_super_edge *
3724 rx_super_edge (rx, super, cset, df)
3726 struct rx_superstate *super;
3728 struct rx_distinct_future *df;
3731 struct rx_super_edge *tc =
3732 (struct rx_super_edge *)rx_cache_malloc_or_get
3733 (rx->cache, &rx->cache->free_transition_classes,
3734 sizeof (struct rx_super_edge) + rx_sizeof_bitset (rx->local_cset_size));
3738 tc->next = super->edges;
3740 tc->rx_backtrack_frame.inx = rx->instruction_table[rx_backtrack_point];
3741 tc->rx_backtrack_frame.data = 0;
3742 tc->rx_backtrack_frame.data_2 = (void *) tc;
3744 tc->cset = (rx_Bitset) ((char *) tc + sizeof (*tc));
3745 rx_bitset_assign (rx->local_cset_size, tc->cset, cset);
3748 struct rx_distinct_future * dfp = df;
3749 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3753 dfp = dfp->next_same_super_edge[0];
3755 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3761 /* There are three kinds of cache miss. The first occurs when a
3762 * transition is taken that has never been computed during the
3763 * lifetime of the source superstate. That cache miss is handled by
3764 * calling COMPUTE_SUPER_EDGE. The second kind of cache miss
3765 * occurs when the destination superstate of a transition doesn't
3766 * exist. SOLVE_DESTINATION is used to construct the destination superstate.
3767 * Finally, the third kind of cache miss occurs when the destination
3768 * superstate of a transition is in a `semi-free state'. That case is
3769 * handled by UNFREE_SUPERSTATE.
3771 * The function of HANDLE_CACHE_MISS is to figure out which of these
3777 install_partial_transition (struct rx_superstate *super,
3778 struct rx_inx *answer,
3779 RX_subset set, int offset)
3782 install_partial_transition (super, answer, set, offset)
3783 struct rx_superstate *super;
3784 struct rx_inx *answer;
3790 int end = start + 32;
3792 struct rx_inx * transitions = super->transitions;
3797 transitions[start] = *answer;
3805 RX_DECL struct rx_inx *
3806 rx_handle_cache_miss
3807 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data)
3809 RX_DECL struct rx_inx *
3810 rx_handle_cache_miss (rx, super, chr, data)
3812 struct rx_superstate *super;
3817 int offset = chr / RX_subset_bits;
3818 struct rx_distinct_future *df = data;
3820 if (!df) /* must be the shared_cache_miss_frame */
3822 /* Perhaps this is just a transition waiting to be filled. */
3823 struct rx_super_edge *tc;
3824 RX_subset mask = rx_subset_singletons [chr % RX_subset_bits];
3826 for (tc = super->edges; tc; tc = tc->next)
3827 if (tc->cset[offset] & mask)
3829 struct rx_inx * answer;
3831 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3832 ? &tc->rx_backtrack_frame
3834 ? &df->side_effects_frame
3835 : &df->future_frame));
3836 install_partial_transition (super, answer,
3837 tc->cset [offset], offset * 32);
3840 /* Otherwise, it's a flushed or newly encountered edge. */
3842 char cset_space[1024]; /* this limit is far from unreasonable */
3844 struct rx_inx *answer;
3846 if (rx_sizeof_bitset (rx->local_cset_size) > sizeof (cset_space))
3847 return 0; /* If the arbitrary limit is hit, always fail */
3849 trcset = (rx_Bitset)cset_space;
3850 rx_lock_superstate (rx, super);
3851 if (!compute_super_edge (rx, &df, trcset, super, chr))
3853 rx_unlock_superstate (rx, super);
3856 if (!df) /* We just computed the fail transition. */
3858 static struct rx_inx
3859 shared_fail_frame = { 0, 0, (void *)rx_backtrack, 0 };
3860 answer = &shared_fail_frame;
3864 tc = rx_super_edge (rx, super, trcset, df);
3867 rx_unlock_superstate (rx, super);
3870 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3871 ? &tc->rx_backtrack_frame
3873 ? &df->side_effects_frame
3874 : &df->future_frame));
3876 install_partial_transition (super, answer,
3877 trcset[offset], offset * 32);
3878 rx_unlock_superstate (rx, super);
3882 else if (df->future) /* A cache miss on an edge with a future? Must be
3883 * a semi-free destination. */
3885 if (df->future->is_semifree)
3886 refresh_semifree_superstate (rx->cache, df->future);
3887 return &df->future_frame;
3890 /* no future superstate on an existing edge */
3892 rx_lock_superstate (rx, super);
3893 if (!solve_destination (rx, df))
3895 rx_unlock_superstate (rx, super);
3899 && (df->edge->options->next_same_super_edge[0] == df->edge->options))
3900 install_partial_transition (super, &df->future_frame,
3901 df->edge->cset[offset], offset * 32);
3902 rx_unlock_superstate (rx, super);
3903 return &df->future_frame;
3910 /* The rest of the code provides a regex.c compatable interface. */
3913 __const__ char *re_error_msg[] =
3916 "No match", /* REG_NOMATCH */
3917 "Invalid regular expression", /* REG_BADPAT */
3918 "Invalid collation character", /* REG_ECOLLATE */
3919 "Invalid character class name", /* REG_ECTYPE */
3920 "Trailing backslash", /* REG_EESCAPE */
3921 "Invalid back reference", /* REG_ESUBREG */
3922 "Unmatched [ or [^", /* REG_EBRACK */
3923 "Unmatched ( or \\(", /* REG_EPAREN */
3924 "Unmatched \\{", /* REG_EBRACE */
3925 "Invalid content of \\{\\}", /* REG_BADBR */
3926 "Invalid range end", /* REG_ERANGE */
3927 "Memory exhausted", /* REG_ESPACE */
3928 "Invalid preceding regular expression", /* REG_BADRPT */
3929 "Premature end of regular expression", /* REG_EEND */
3930 "Regular expression too big", /* REG_ESIZE */
3931 "Unmatched ) or \\)", /* REG_ERPAREN */
3937 * Macros used while compiling patterns.
3939 * By convention, PEND points just past the end of the uncompiled pattern,
3940 * P points to the read position in the pattern. `translate' is the name
3941 * of the translation table (`TRANSLATE' is the name of a macro that looks
3942 * things up in `translate').
3947 * Fetch the next character in the uncompiled pattern---translating it
3948 * if necessary. *Also cast from a signed character in the constant
3949 * string passed to us by the user to an unsigned char that we can use
3950 * as an array index (in, e.g., `translate').
3952 #define PATFETCH(c) \
3953 do {if (p == pend) return REG_EEND; \
3954 c = (unsigned char) *p++; \
3959 * Fetch the next character in the uncompiled pattern, with no
3962 #define PATFETCH_RAW(c) \
3963 do {if (p == pend) return REG_EEND; \
3964 c = (unsigned char) *p++; \
3967 /* Go backwards one character in the pattern. */
3968 #define PATUNFETCH p--
3971 #define TRANSLATE(d) translate[(unsigned char) (d)]
3973 typedef unsigned regnum_t;
3975 /* Since offsets can go either forwards or backwards, this type needs to
3976 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
3978 typedef int pattern_offset_t;
3982 struct rexp_node ** top_expression; /* was begalt */
3983 struct rexp_node ** last_expression; /* was laststart */
3984 pattern_offset_t inner_group_offset;
3986 } compile_stack_elt_t;
3990 compile_stack_elt_t *stack;
3992 unsigned avail; /* Offset of next open position. */
3993 } compile_stack_type;
3996 #define INIT_COMPILE_STACK_SIZE 32
3998 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
3999 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
4001 /* The next available element. */
4002 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
4005 /* Set the bit for character C in a list. */
4006 #define SET_LIST_BIT(c) \
4007 (b[((unsigned char) (c)) / CHARBITS] \
4008 |= 1 << (((unsigned char) c) % CHARBITS))
4010 /* Get the next unsigned number in the uncompiled pattern. */
4011 #define GET_UNSIGNED_NUMBER(num) \
4015 while (isdigit (c)) \
4019 num = num * 10 + c - '0'; \
4027 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
4029 #define IS_CHAR_CLASS(string) \
4030 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
4031 || !strcmp (string, "lower") || !strcmp (string, "digit") \
4032 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
4033 || !strcmp (string, "space") || !strcmp (string, "print") \
4034 || !strcmp (string, "punct") || !strcmp (string, "graph") \
4035 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
4038 /* These predicates are used in regex_compile. */
4040 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4041 * after an alternative or a begin-subexpression. We assume there is at
4042 * least one character before the ^.
4047 at_begline_loc_p (__const__ char *pattern, __const__ char * p, reg_syntax_t syntax)
4050 at_begline_loc_p (pattern, p, syntax)
4051 __const__ char *pattern;
4053 reg_syntax_t syntax;
4056 __const__ char *prev = p - 2;
4057 boolean prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
4061 (/* After a subexpression? */
4062 ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
4064 /* After an alternative? */
4065 ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
4069 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4070 * at least one character after the $, i.e., `P < PEND'.
4075 at_endline_loc_p (__const__ char *p, __const__ char *pend, int syntax)
4078 at_endline_loc_p (p, pend, syntax)
4080 __const__ char *pend;
4084 __const__ char *next = p;
4085 boolean next_backslash = (*next == '\\');
4086 __const__ char *next_next = (p + 1 < pend) ? (p + 1) : 0;
4090 /* Before a subexpression? */
4091 ((syntax & RE_NO_BK_PARENS)
4093 : (next_backslash && next_next && (*next_next == ')')))
4095 /* Before an alternative? */
4096 ((syntax & RE_NO_BK_VBAR)
4098 : (next_backslash && next_next && (*next_next == '|')))
4103 unsigned char rx_id_translation[256] =
4105 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4106 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
4107 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
4108 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
4109 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
4110 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
4111 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
4112 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
4113 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
4114 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
4116 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
4117 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
4118 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
4119 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
4120 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
4121 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
4122 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
4123 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
4124 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
4125 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
4127 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
4128 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
4129 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
4130 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
4131 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
4132 250, 251, 252, 253, 254, 255
4135 /* The compiler keeps an inverted translation table.
4136 * This looks up/inititalize elements.
4137 * VALID is an array of booleans that validate CACHE.
4142 inverse_translation (struct re_pattern_buffer * rxb,
4143 char * valid, rx_Bitset cache,
4144 unsigned char * translate, int c)
4147 inverse_translation (rxb, valid, cache, translate, c)
4148 struct re_pattern_buffer * rxb;
4151 unsigned char * translate;
4156 = cache + c * rx_bitset_numb_subsets (rxb->rx.local_cset_size);
4161 int c_tr = TRANSLATE(c);
4162 rx_bitset_null (rxb->rx.local_cset_size, cs);
4163 for (x = 0; x < 256; ++x) /* &&&& 13.37 */
4164 if (TRANSLATE(x) == c_tr)
4165 RX_bitset_enjoin (cs, x);
4174 /* More subroutine declarations and macros for regex_compile. */
4176 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4177 false if it's not. */
4181 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4184 group_in_compile_stack (compile_stack, regnum)
4185 compile_stack_type compile_stack;
4191 for (this_element = compile_stack.avail - 1;
4194 if (compile_stack.stack[this_element].regnum == regnum)
4202 * Read the ending character of a range (in a bracket expression) from the
4203 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
4204 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4205 * Then we set the translation of all bits between the starting and
4206 * ending characters (inclusive) in the compiled pattern B.
4208 * Return an error code.
4210 * We use these short variable names so we can use the same macros as
4211 * `regex_compile' itself.
4215 static reg_errcode_t
4216 compile_range (struct re_pattern_buffer * rxb, rx_Bitset cs,
4217 __const__ char ** p_ptr, __const__ char * pend,
4218 unsigned char * translate, reg_syntax_t syntax,
4219 rx_Bitset inv_tr, char * valid_inv_tr)
4221 static reg_errcode_t
4222 compile_range (rxb, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
4223 struct re_pattern_buffer * rxb;
4225 __const__ char ** p_ptr;
4226 __const__ char * pend;
4227 unsigned char * translate;
4228 reg_syntax_t syntax;
4230 char * valid_inv_tr;
4235 __const__ char *p = *p_ptr;
4237 unsigned char range_end;
4238 unsigned char range_start = TRANSLATE(p[-2]);
4243 PATFETCH (range_end);
4247 if (range_start > range_end)
4248 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4250 for (this_char = range_start; this_char <= range_end; this_char++)
4253 inverse_translation (rxb, valid_inv_tr, inv_tr, translate, this_char);
4254 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
4261 /* This searches a regexp for backreference side effects.
4262 * It fills in the array OUT with 1 at the index of every register pair
4263 * referenced by a backreference.
4265 * This is used to help optimize patterns for searching. The information is
4266 * useful because, if the caller doesn't want register values, backreferenced
4267 * registers are the only registers for which we need rx_backtrack.
4272 find_backrefs (char * out, struct rexp_node * rexp,
4273 struct re_se_params * params)
4276 find_backrefs (out, rexp, params)
4278 struct rexp_node * rexp;
4279 struct re_se_params * params;
4293 find_backrefs (out, rexp->params.pair.left, params);
4294 find_backrefs (out, rexp->params.pair.right, params);
4297 if ( ((long)rexp->params.side_effect >= 0)
4298 && (params [(long)rexp->params.side_effect].se == re_se_backref))
4299 out[ params [(long)rexp->params.side_effect].op1] = 1;
4306 /* Returns 0 unless the pattern can match the empty string. */
4310 compute_fastset (struct re_pattern_buffer * rxb, struct rexp_node * rexp)
4313 compute_fastset (rxb, rexp)
4314 struct re_pattern_buffer * rxb;
4315 struct rexp_node * rexp;
4326 rx_bitset_union (rxb->rx.local_cset_size,
4327 rxb->fastset, rexp->params.cset);
4331 return (compute_fastset (rxb, rexp->params.pair.left)
4332 && compute_fastset (rxb, rexp->params.pair.right));
4334 compute_fastset (rxb, rexp->params.pair.left);
4335 /* compute_fastset (rxb, rexp->params.pair.right); nope... */
4338 return !!(compute_fastset (rxb, rexp->params.pair.left)
4339 + compute_fastset (rxb, rexp->params.pair.right));
4342 compute_fastset (rxb, rexp->params.pair.left);
4348 /* this should never happen */
4354 * 1 -- yes, definately anchored by the given side effect.
4355 * 2 -- maybe anchored, maybe the empty string.
4356 * 0 -- definately not anchored
4357 * There is simply no other possibility.
4362 is_anchored (struct rexp_node * rexp, rx_side_effect se)
4365 is_anchored (rexp, se)
4366 struct rexp_node * rexp;
4380 int l = is_anchored (rexp->params.pair.left, se);
4381 return (l == 2 ? is_anchored (rexp->params.pair.right, se) : l);
4385 int l = is_anchored (rexp->params.pair.left, se);
4386 int r = l ? is_anchored (rexp->params.pair.right, se) : 0;
4390 else if ((l == 0) || (r == 0))
4397 return is_anchored (rexp->params.pair.left, se) ? 2 : 0;
4400 return ((rexp->params.side_effect == se)
4404 /* this should never happen */
4409 /* This removes register assignments that aren't required by backreferencing.
4410 * This can speed up explore_future, especially if it eliminates
4411 * non-determinism in the superstate NFA.
4413 * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS.
4414 * The non-zero elements of the array indicate which register assignments
4415 * can NOT be removed from the expression.
4419 static struct rexp_node *
4420 remove_unecessary_side_effects (struct rx * rx, char * needed,
4421 struct rexp_node * rexp,
4422 struct re_se_params * params)
4424 static struct rexp_node *
4425 remove_unecessary_side_effects (rx, needed, rexp, params)
4428 struct rexp_node * rexp;
4429 struct re_se_params * params;
4432 struct rexp_node * l;
4433 struct rexp_node * r;
4445 l = remove_unecessary_side_effects (rx, needed,
4446 rexp->params.pair.left, params);
4447 r = remove_unecessary_side_effects (rx, needed,
4448 rexp->params.pair.right, params);
4449 if ((l && r) || (rexp->type != r_concat))
4451 rexp->params.pair.left = l;
4452 rexp->params.pair.right = r;
4457 rexp->params.pair.left = rexp->params.pair.right = 0;
4458 rx_free_rexp (rx, rexp);
4463 l = remove_unecessary_side_effects (rx, needed,
4464 rexp->params.pair.left, params);
4467 rexp->params.pair.left = l;
4472 rexp->params.pair.left = 0;
4473 rx_free_rexp (rx, rexp);
4478 int se = (long)rexp->params.side_effect;
4480 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4481 || ((enum re_side_effects)params[se].se == re_se_rparen))
4482 && (params [se].op1 > 0)
4483 && (!needed [params [se].op1]))
4485 rx_free_rexp (rx, rexp);
4493 /* this should never happen */
4501 pointless_if_repeated (struct rexp_node * node, struct re_se_params * params)
4504 pointless_if_repeated (node, params)
4505 struct rexp_node * node;
4506 struct re_se_params * params;
4518 return (pointless_if_repeated (node->params.pair.left, params)
4519 && pointless_if_repeated (node->params.pair.right, params));
4522 return pointless_if_repeated (node->params.pair.left, params);
4524 switch (((long)node->params.side_effect < 0)
4525 ? (enum re_side_effects)node->params.side_effect
4526 : (enum re_side_effects)params[(long)node->params.side_effect].se)
4533 case re_se_wordbound:
4534 case re_se_notwordbound:
4544 case re_se_end_iter:
4546 case re_se_not_syntax:
4560 registers_on_stack (struct re_pattern_buffer * rxb,
4561 struct rexp_node * rexp, int in_danger,
4562 struct re_se_params * params)
4565 registers_on_stack (rxb, rexp, in_danger, params)
4566 struct re_pattern_buffer * rxb;
4567 struct rexp_node * rexp;
4569 struct re_se_params * params;
4582 return ( registers_on_stack (rxb, rexp->params.pair.left,
4584 || (registers_on_stack
4585 (rxb, rexp->params.pair.right,
4586 in_danger, params)));
4588 return registers_on_stack (rxb, rexp->params.pair.left, 0, params);
4590 return registers_on_stack (rxb, rexp->params.pair.left, 1, params);
4593 ( registers_on_stack (rxb, rexp->params.pair.left, 1, params)
4594 || registers_on_stack (rxb, rexp->params.pair.right, 1, params));
4597 int se = (long)rexp->params.side_effect;
4600 && (params [se].op1 > 0)
4601 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4602 || ((enum re_side_effects)params[se].se == re_se_rparen)))
4609 /* this should never happen */
4615 static char idempotent_complex_se[] =
4617 #define RX_WANT_SE_DEFS 1
4619 #undef RX_DEF_CPLX_SE
4620 #define RX_DEF_SE(IDEM, NAME, VALUE)
4621 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM,
4624 #undef RX_DEF_CPLX_SE
4625 #undef RX_WANT_SE_DEFS
4629 static char idempotent_se[] =
4632 #define RX_WANT_SE_DEFS 1
4634 #undef RX_DEF_CPLX_SE
4635 #define RX_DEF_SE(IDEM, NAME, VALUE) IDEM,
4636 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)
4639 #undef RX_DEF_CPLX_SE
4640 #undef RX_WANT_SE_DEFS
4649 has_any_se (struct rx * rx,
4650 struct rexp_node * rexp)
4653 has_any_se (rx, rexp)
4655 struct rexp_node * rexp;
4674 ( has_any_se (rx, rexp->params.pair.left)
4675 || has_any_se (rx, rexp->params.pair.right));
4679 return has_any_se (rx, rexp->params.pair.left);
4682 /* this should never happen */
4688 /* This must be called AFTER `convert_hard_loops' for a given REXP. */
4691 has_non_idempotent_epsilon_path (struct rx * rx,
4692 struct rexp_node * rexp,
4693 struct re_se_params * params)
4696 has_non_idempotent_epsilon_path (rx, rexp, params)
4698 struct rexp_node * rexp;
4699 struct re_se_params * params;
4714 !((long)rexp->params.side_effect > 0
4715 ? idempotent_complex_se [ params [(long)rexp->params.side_effect].se ]
4716 : idempotent_se [-(long)rexp->params.side_effect]);
4720 ( has_non_idempotent_epsilon_path (rx,
4721 rexp->params.pair.left, params)
4722 || has_non_idempotent_epsilon_path (rx,
4723 rexp->params.pair.right, params));
4728 ( has_non_idempotent_epsilon_path (rx,
4729 rexp->params.pair.left, params)
4730 && has_non_idempotent_epsilon_path (rx,
4731 rexp->params.pair.right, params));
4734 return has_non_idempotent_epsilon_path (rx,
4735 rexp->params.pair.left, params);
4738 /* this should never happen */
4744 /* This computes rougly what it's name suggests. It can (and does) go wrong
4745 * in the direction of returning spurious 0 without causing disasters.
4749 begins_with_complex_se (struct rx * rx, struct rexp_node * rexp)
4752 begins_with_complex_se (rx, rexp)
4754 struct rexp_node * rexp;
4767 return ((long)rexp->params.side_effect >= 0);
4771 ( begins_with_complex_se (rx, rexp->params.pair.left)
4772 && begins_with_complex_se (rx, rexp->params.pair.right));
4776 return has_any_se (rx, rexp->params.pair.left);
4783 /* this should never happen */
4788 /* This destructively removes some of the re_se_tv side effects from
4789 * a rexp tree. In particular, during parsing re_se_tv was inserted on the
4790 * right half of every | to guarantee that posix path preference could be
4791 * honored. This function removes some which it can be determined aren't
4797 speed_up_alt (struct rx * rx,
4798 struct rexp_node * rexp,
4802 speed_up_alt (rx, rexp, unposix)
4804 struct rexp_node * rexp;
4820 speed_up_alt (rx, rexp->params.pair.left, unposix);
4825 speed_up_alt (rx, rexp->params.pair.left, unposix);
4826 speed_up_alt (rx, rexp->params.pair.right, unposix);
4830 /* the right child is guaranteed to be (concat re_se_tv <subexp>) */
4832 speed_up_alt (rx, rexp->params.pair.left, unposix);
4833 speed_up_alt (rx, rexp->params.pair.right->params.pair.right, unposix);
4836 || (begins_with_complex_se
4837 (rx, rexp->params.pair.right->params.pair.right))
4838 || !( has_any_se (rx, rexp->params.pair.right->params.pair.right)
4839 || has_any_se (rx, rexp->params.pair.left)))
4841 struct rexp_node * conc = rexp->params.pair.right;
4842 rexp->params.pair.right = conc->params.pair.right;
4843 conc->params.pair.right = 0;
4844 rx_free_rexp (rx, conc);
4853 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
4854 Returns one of error codes defined in `regex.h', or zero for success.
4856 Assumes the `allocated' (and perhaps `buffer') and `translate'
4857 fields are set in BUFP on entry.
4859 If it succeeds, results are put in BUFP (if it returns an error, the
4860 contents of BUFP are undefined):
4861 `buffer' is the compiled pattern;
4862 `syntax' is set to SYNTAX;
4863 `used' is set to the length of the compiled pattern;
4864 `fastmap_accurate' is set to zero;
4865 `re_nsub' is set to the number of groups in PATTERN;
4866 `not_bol' and `not_eol' are set to zero.
4868 The `fastmap' and `newline_anchor' fields are neither
4869 examined nor set. */
4874 RX_DECL reg_errcode_t
4875 rx_compile (__const__ char *pattern, int size,
4876 reg_syntax_t syntax,
4877 struct re_pattern_buffer * rxb)
4879 RX_DECL reg_errcode_t
4880 rx_compile (pattern, size, syntax, rxb)
4881 __const__ char *pattern;
4883 reg_syntax_t syntax;
4884 struct re_pattern_buffer * rxb;
4888 inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4890 validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4892 /* We fetch characters from PATTERN here. Even though PATTERN is
4893 `char *' (i.e., signed), we declare these variables as unsigned, so
4894 they can be reliably used as array indices. */
4895 register unsigned char c, c1;
4897 /* A random tempory spot in PATTERN. */
4900 /* Keeps track of unclosed groups. */
4901 compile_stack_type compile_stack;
4903 /* Points to the current (ending) position in the pattern. */
4904 __const__ char *p = pattern;
4905 __const__ char *pend = pattern + size;
4907 /* How to translate the characters in the pattern. */
4908 unsigned char *translate = (rxb->translate
4910 : rx_id_translation);
4912 /* When parsing is done, this will hold the expression tree. */
4913 struct rexp_node * rexp = 0;
4915 /* In the midst of compilation, this holds onto the regexp
4916 * first parst while rexp goes on to aquire additional constructs.
4918 struct rexp_node * orig_rexp = 0;
4919 struct rexp_node * fewer_side_effects = 0;
4921 /* This and top_expression are saved on the compile stack. */
4922 struct rexp_node ** top_expression = &rexp;
4923 struct rexp_node ** last_expression = top_expression;
4925 /* Parameter to `goto append_node' */
4926 struct rexp_node * append;
4928 /* Counts open-groups as they are encountered. This is the index of the
4929 * innermost group being compiled.
4931 regnum_t regnum = 0;
4933 /* Place in the uncompiled pattern (i.e., the {) to
4934 * which to go back if the interval is invalid.
4936 __const__ char *beg_interval;
4938 struct re_se_params * params = 0;
4939 int paramc = 0; /* How many complex side effects so far? */
4941 rx_side_effect side; /* param to `goto add_side_effect' */
4943 bzero (validate_inv_tr, sizeof (validate_inv_tr));
4945 rxb->rx.instruction_table = rx_id_instruction_table;
4948 /* Initialize the compile stack. */
4949 compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
4950 if (compile_stack.stack == 0)
4953 compile_stack.size = INIT_COMPILE_STACK_SIZE;
4954 compile_stack.avail = 0;
4956 /* Initialize the pattern buffer. */
4957 rxb->rx.cache = &default_cache;
4958 rxb->syntax = syntax;
4959 rxb->fastmap_accurate = 0;
4960 rxb->not_bol = rxb->not_eol = 0;
4961 rxb->least_subs = 0;
4963 /* Always count groups, whether or not rxb->no_sub is set.
4964 * The whole pattern is implicitly group 0, so counting begins
4969 #if !defined (emacs) && !defined (SYNTAX_TABLE)
4970 /* Initialize the syntax table. */
4971 init_syntax_once ();
4974 /* Loop through the uncompiled pattern until we're at the end. */
4983 if ( /* If at start of pattern, it's an operator. */
4985 /* If context independent, it's an operator. */
4986 || syntax & RE_CONTEXT_INDEP_ANCHORS
4987 /* Otherwise, depends on what's come before. */
4988 || at_begline_loc_p (pattern, p, syntax))
4990 struct rexp_node * n
4991 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_hat);
5005 if ( /* If at end of pattern, it's an operator. */
5007 /* If context independent, it's an operator. */
5008 || syntax & RE_CONTEXT_INDEP_ANCHORS
5009 /* Otherwise, depends on what's next. */
5010 || at_endline_loc_p (p, pend, syntax))
5012 struct rexp_node * n
5013 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_dollar);
5027 if ((syntax & RE_BK_PLUS_QM)
5028 || (syntax & RE_LIMITED_OPS))
5033 /* If there is no previous pattern... */
5034 if (pointless_if_repeated (*last_expression, params))
5036 if (syntax & RE_CONTEXT_INVALID_OPS)
5038 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5043 /* 1 means zero (many) matches is allowed. */
5044 char zero_times_ok = 0, many_times_ok = 0;
5046 /* If there is a sequence of repetition chars, collapse it
5047 down to just one (the right one). We can't combine
5048 interval operators with these because of, e.g., `a{2}*',
5049 which should only match an even number of `a's. */
5053 zero_times_ok |= c != '+';
5054 many_times_ok |= c != '?';
5062 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
5065 else if (syntax & RE_BK_PLUS_QM && c == '\\')
5067 if (p == pend) return REG_EESCAPE;
5070 if (!(c1 == '+' || c1 == '?'))
5085 /* If we get here, we found another repeat character. */
5088 /* Star, etc. applied to an empty pattern is equivalent
5089 to an empty pattern. */
5090 if (!last_expression)
5093 /* Now we know whether or not zero matches is allowed
5094 * and also whether or not two or more matches is allowed.
5098 struct rexp_node * inner_exp = *last_expression;
5102 && has_non_idempotent_epsilon_path (&rxb->rx,
5105 struct rexp_node * pusher
5106 = rx_mk_r_side_effect (&rxb->rx,
5107 (rx_side_effect)re_se_pushpos);
5108 struct rexp_node * checker
5109 = rx_mk_r_side_effect (&rxb->rx,
5110 (rx_side_effect)re_se_chkpos);
5111 struct rexp_node * pushback
5112 = rx_mk_r_side_effect (&rxb->rx,
5113 (rx_side_effect)re_se_pushback);
5114 rx_Bitset cs = rx_cset (&rxb->rx);
5115 struct rexp_node * lit_t = rx_mk_r_cset (&rxb->rx, cs);
5116 struct rexp_node * fake_state
5117 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5118 struct rexp_node * phase2
5119 = rx_mk_r_concat (&rxb->rx, checker, fake_state);
5120 struct rexp_node * popper
5121 = rx_mk_r_side_effect (&rxb->rx,
5122 (rx_side_effect)re_se_poppos);
5123 struct rexp_node * star
5124 = rx_mk_r_2phase_star (&rxb->rx, inner_exp, phase2);
5125 struct rexp_node * a
5126 = rx_mk_r_concat (&rxb->rx, pusher, star);
5127 struct rexp_node * whole_thing
5128 = rx_mk_r_concat (&rxb->rx, a, popper);
5129 if (!(pusher && star && pushback && lit_t && fake_state
5130 && lit_t && phase2 && checker && popper
5131 && a && whole_thing))
5133 RX_bitset_enjoin (cs, 't');
5134 *last_expression = whole_thing;
5138 struct rexp_node * star =
5139 (many_times_ok ? rx_mk_r_star : rx_mk_r_opt)
5140 (&rxb->rx, *last_expression);
5143 *last_expression = star;
5144 need_sync = has_any_se (&rxb->rx, *last_expression);
5148 struct rexp_node * concat
5149 = rx_mk_r_concat (&rxb->rx, inner_exp,
5150 rx_copy_rexp (&rxb->rx,
5154 *last_expression = concat;
5158 int sync_se = paramc;
5160 ? ((struct re_se_params *)
5162 sizeof (*params) * (1 + paramc)))
5163 : ((struct re_se_params *)
5164 malloc (sizeof (*params))));
5168 params [sync_se].se = re_se_tv;
5169 side = (rx_side_effect)sync_se;
5170 goto add_side_effect;
5173 /* The old regex.c used to optimize `.*\n'.
5174 * Maybe rx should too?
5182 rx_Bitset cs = rx_cset (&rxb->rx);
5183 struct rexp_node * n = rx_mk_r_cset (&rxb->rx, cs);
5187 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5188 if (!(rxb->syntax & RE_DOT_NEWLINE))
5189 RX_bitset_remove (cs, '\n');
5190 if (!(rxb->syntax & RE_DOT_NOT_NULL))
5191 RX_bitset_remove (cs, 0);
5200 if (p == pend) return REG_EBRACK;
5202 boolean had_char_class = false;
5203 rx_Bitset cs = rx_cset (&rxb->rx);
5204 struct rexp_node * node = rx_mk_r_cset (&rxb->rx, cs);
5205 int is_inverted = *p == '^';
5210 /* This branch of the switch is normally exited with
5218 /* Remember the first position in the bracket expression. */
5221 /* Read in characters and ranges, setting map bits. */
5224 if (p == pend) return REG_EBRACK;
5228 /* \ might escape characters inside [...] and [^...]. */
5229 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
5231 if (p == pend) return REG_EESCAPE;
5235 rx_Bitset it = inverse_translation (rxb,
5240 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5245 /* Could be the end of the bracket expression. If it's
5246 not (i.e., when the bracket expression is `[]' so
5247 far), the ']' character bit gets set way below. */
5248 if (c == ']' && p != p1 + 1)
5249 goto finalize_class_and_append;
5251 /* Look ahead to see if it's a range when the last thing
5252 was a character class. */
5253 if (had_char_class && c == '-' && *p != ']')
5256 /* Look ahead to see if it's a range when the last thing
5257 was a character: if this is a hyphen not at the
5258 beginning or the end of a list, then it's the range
5261 && !(p - 2 >= pattern && p[-2] == '[')
5262 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
5266 = compile_range (rxb, cs, &p, pend, translate, syntax,
5267 inverse_translate, validate_inv_tr);
5268 if (ret != REG_NOERROR) return ret;
5271 else if (p[0] == '-' && p[1] != ']')
5272 { /* This handles ranges made up of characters only. */
5275 /* Move past the `-'. */
5278 ret = compile_range (rxb, cs, &p, pend, translate, syntax,
5279 inverse_translate, validate_inv_tr);
5280 if (ret != REG_NOERROR) return ret;
5283 /* See if we're at the beginning of a possible character
5286 else if ((syntax & RE_CHAR_CLASSES)
5287 && (c == '[') && (*p == ':'))
5289 char str[CHAR_CLASS_MAX_LENGTH + 1];
5294 /* If pattern is `[[:'. */
5295 if (p == pend) return REG_EBRACK;
5300 if (c == ':' || c == ']' || p == pend
5301 || c1 == CHAR_CLASS_MAX_LENGTH)
5307 /* If isn't a word bracketed by `[:' and:`]':
5308 undo the ending character, the letters, and leave
5309 the leading `:' and `[' (but set bits for them). */
5310 if (c == ':' && *p == ']')
5313 boolean is_alnum = !strcmp (str, "alnum");
5314 boolean is_alpha = !strcmp (str, "alpha");
5315 boolean is_blank = !strcmp (str, "blank");
5316 boolean is_cntrl = !strcmp (str, "cntrl");
5317 boolean is_digit = !strcmp (str, "digit");
5318 boolean is_graph = !strcmp (str, "graph");
5319 boolean is_lower = !strcmp (str, "lower");
5320 boolean is_print = !strcmp (str, "print");
5321 boolean is_punct = !strcmp (str, "punct");
5322 boolean is_space = !strcmp (str, "space");
5323 boolean is_upper = !strcmp (str, "upper");
5324 boolean is_xdigit = !strcmp (str, "xdigit");
5326 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
5328 /* Throw away the ] at the end of the character
5332 if (p == pend) return REG_EBRACK;
5334 for (ch = 0; ch < 1 << CHARBITS; ch++)
5336 if ( (is_alnum && isalnum (ch))
5337 || (is_alpha && isalpha (ch))
5338 || (is_blank && isblank (ch))
5339 || (is_cntrl && iscntrl (ch))
5340 || (is_digit && isdigit (ch))
5341 || (is_graph && isgraph (ch))
5342 || (is_lower && islower (ch))
5343 || (is_print && isprint (ch))
5344 || (is_punct && ispunct (ch))
5345 || (is_space && isspace (ch))
5346 || (is_upper && isupper (ch))
5347 || (is_xdigit && isxdigit (ch)))
5350 inverse_translation (rxb,
5355 rx_bitset_union (rxb->rx.local_cset_size,
5359 had_char_class = true;
5368 inverse_translation (rxb,
5373 rx_bitset_union (rxb->rx.local_cset_size,
5378 inverse_translation (rxb,
5383 rx_bitset_union (rxb->rx.local_cset_size,
5386 had_char_class = false;
5391 had_char_class = false;
5393 rx_Bitset it = inverse_translation (rxb,
5398 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5403 finalize_class_and_append:
5406 rx_bitset_complement (rxb->rx.local_cset_size, cs);
5407 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5408 RX_bitset_remove (cs, '\n');
5416 if (syntax & RE_NO_BK_PARENS)
5423 if (syntax & RE_NO_BK_PARENS)
5430 if (syntax & RE_NEWLINE_ALT)
5437 if (syntax & RE_NO_BK_VBAR)
5444 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5445 goto handle_interval;
5451 if (p == pend) return REG_EESCAPE;
5453 /* Do not translate the character after the \, so that we can
5454 distinguish, e.g., \B from \b, even if we normally would
5455 translate, e.g., B to b. */
5461 if (syntax & RE_NO_BK_PARENS)
5462 goto normal_backslash;
5467 if (COMPILE_STACK_FULL)
5469 ((compile_stack.stack) =
5470 (compile_stack_elt_t *) realloc (compile_stack.stack, ( compile_stack.size << 1) * sizeof (
5471 compile_stack_elt_t)));
5472 if (compile_stack.stack == 0) return REG_ESPACE;
5474 compile_stack.size <<= 1;
5477 if (*last_expression)
5479 struct rexp_node * concat
5480 = rx_mk_r_concat (&rxb->rx, *last_expression, 0);
5483 *last_expression = concat;
5484 last_expression = &concat->params.pair.right;
5488 * These are the values to restore when we hit end of this
5491 COMPILE_STACK_TOP.top_expression = top_expression;
5492 COMPILE_STACK_TOP.last_expression = last_expression;
5493 COMPILE_STACK_TOP.regnum = regnum;
5495 compile_stack.avail++;
5497 top_expression = last_expression;
5502 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
5505 /* See similar code for backslashed left paren above. */
5506 if (COMPILE_STACK_EMPTY)
5507 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
5512 /* Since we just checked for an empty stack above, this
5513 ``can't happen''. */
5516 /* We don't just want to restore into `regnum', because
5517 later groups should continue to be numbered higher,
5518 as in `(ab)c(de)' -- the second group is #2. */
5519 regnum_t this_group_regnum;
5520 struct rexp_node ** inner = top_expression;
5522 compile_stack.avail--;
5523 top_expression = COMPILE_STACK_TOP.top_expression;
5524 last_expression = COMPILE_STACK_TOP.last_expression;
5525 this_group_regnum = COMPILE_STACK_TOP.regnum;
5527 int left_se = paramc;
5528 int right_se = paramc + 1;
5531 ? ((struct re_se_params *)
5533 (paramc + 2) * sizeof (params[0])))
5534 : ((struct re_se_params *)
5535 malloc (2 * sizeof (params[0]))));
5540 params[left_se].se = re_se_lparen;
5541 params[left_se].op1 = this_group_regnum;
5542 params[right_se].se = re_se_rparen;
5543 params[right_se].op1 = this_group_regnum;
5545 struct rexp_node * left
5546 = rx_mk_r_side_effect (&rxb->rx,
5547 (rx_side_effect)left_se);
5548 struct rexp_node * right
5549 = rx_mk_r_side_effect (&rxb->rx,
5550 (rx_side_effect)right_se);
5551 struct rexp_node * c1
5553 ? rx_mk_r_concat (&rxb->rx, left, *inner) : left);
5554 struct rexp_node * c2
5555 = rx_mk_r_concat (&rxb->rx, c1, right);
5556 if (!(left && right && c1 && c2))
5564 case '|': /* `\|'. */
5565 if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
5566 goto normal_backslash;
5568 if (syntax & RE_LIMITED_OPS)
5572 struct rexp_node * alt
5573 = rx_mk_r_alternate (&rxb->rx, *top_expression, 0);
5576 *top_expression = alt;
5577 last_expression = &alt->params.pair.right;
5579 int sync_se = paramc;
5582 ? ((struct re_se_params *)
5584 (paramc + 1) * sizeof (params[0])))
5585 : ((struct re_se_params *)
5586 malloc (sizeof (params[0]))));
5591 params[sync_se].se = re_se_tv;
5593 struct rexp_node * sync
5594 = rx_mk_r_side_effect (&rxb->rx,
5595 (rx_side_effect)sync_se);
5596 struct rexp_node * conc
5597 = rx_mk_r_concat (&rxb->rx, sync, 0);
5602 *last_expression = conc;
5603 last_expression = &conc->params.pair.right;
5611 /* If \{ is a literal. */
5612 if (!(syntax & RE_INTERVALS)
5613 /* If we're at `\{' and it's not the open-interval
5615 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5616 || (p - 2 == pattern && p == pend))
5617 goto normal_backslash;
5621 /* If got here, then the syntax allows intervals. */
5623 /* At least (most) this many matches must be made. */
5624 int lower_bound = -1, upper_bound = -1;
5626 beg_interval = p - 1;
5630 if (syntax & RE_NO_BK_BRACES)
5631 goto unfetch_interval;
5636 GET_UNSIGNED_NUMBER (lower_bound);
5640 GET_UNSIGNED_NUMBER (upper_bound);
5641 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
5644 /* Interval such as `{1}' => match exactly once. */
5645 upper_bound = lower_bound;
5647 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
5648 || lower_bound > upper_bound)
5650 if (syntax & RE_NO_BK_BRACES)
5651 goto unfetch_interval;
5656 if (!(syntax & RE_NO_BK_BRACES))
5658 if (c != '\\') return REG_EBRACE;
5664 if (syntax & RE_NO_BK_BRACES)
5665 goto unfetch_interval;
5670 /* We just parsed a valid interval. */
5672 /* If it's invalid to have no preceding re. */
5673 if (pointless_if_repeated (*last_expression, params))
5675 if (syntax & RE_CONTEXT_INVALID_OPS)
5677 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5678 goto unfetch_interval;
5679 /* was: else laststart = b; */
5682 /* If the upper bound is zero, don't want to iterate
5685 if (upper_bound == 0)
5687 if (*last_expression)
5689 rx_free_rexp (&rxb->rx, *last_expression);
5690 *last_expression = 0;
5694 /* Otherwise, we have a nontrivial interval. */
5696 int iter_se = paramc;
5697 int end_se = paramc + 1;
5699 ? ((struct re_se_params *)
5701 sizeof (*params) * (2 + paramc)))
5702 : ((struct re_se_params *)
5703 malloc (2 * sizeof (*params))));
5707 params [iter_se].se = re_se_iter;
5708 params [iter_se].op1 = lower_bound;
5709 params[iter_se].op2 = upper_bound;
5711 params[end_se].se = re_se_end_iter;
5712 params[end_se].op1 = lower_bound;
5713 params[end_se].op2 = upper_bound;
5715 struct rexp_node * push0
5716 = rx_mk_r_side_effect (&rxb->rx,
5717 (rx_side_effect)re_se_push0);
5718 struct rexp_node * start_one_iter
5719 = rx_mk_r_side_effect (&rxb->rx,
5720 (rx_side_effect)iter_se);
5721 struct rexp_node * phase1
5722 = rx_mk_r_concat (&rxb->rx, start_one_iter,
5724 struct rexp_node * pushback
5725 = rx_mk_r_side_effect (&rxb->rx,
5726 (rx_side_effect)re_se_pushback);
5727 rx_Bitset cs = rx_cset (&rxb->rx);
5728 struct rexp_node * lit_t
5729 = rx_mk_r_cset (&rxb->rx, cs);
5730 struct rexp_node * phase2
5731 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5732 struct rexp_node * loop
5733 = rx_mk_r_2phase_star (&rxb->rx, phase1, phase2);
5734 struct rexp_node * push_n_loop
5735 = rx_mk_r_concat (&rxb->rx, push0, loop);
5736 struct rexp_node * final_test
5737 = rx_mk_r_side_effect (&rxb->rx,
5738 (rx_side_effect)end_se);
5739 struct rexp_node * full_exp
5740 = rx_mk_r_concat (&rxb->rx, push_n_loop, final_test);
5742 if (!(push0 && start_one_iter && phase1
5743 && pushback && lit_t && phase2
5744 && loop && push_n_loop && final_test && full_exp))
5747 RX_bitset_enjoin(cs, 't');
5749 *last_expression = full_exp;
5757 /* If an invalid interval, match the characters as literals. */
5761 /* normal_char and normal_backslash need `c'. */
5764 if (!(syntax & RE_NO_BK_BRACES))
5766 if (p > pattern && p[-1] == '\\')
5767 goto normal_backslash;
5772 /* There is no way to specify the before_dot and after_dot
5773 operators. rms says this is ok. --karl */
5775 side = (rx_side_effect)rx_se_at_dot;
5776 goto add_side_effect;
5782 rx_Bitset cs = rx_cset (&rxb->rx);
5783 struct rexp_node * set = rx_mk_r_cset (&rxb->rx, cs);
5787 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5792 enum syntaxcode code = syntax_spec_code [c];
5793 for (x = 0; x < 256; ++x)
5796 if (SYNTAX (x) == code)
5799 inverse_translation (rxb, validate_inv_tr,
5802 rx_bitset_xor (rxb->rx.local_cset_size, cs, it);
5816 rx_Bitset cs = rx_cset (&rxb->rx);
5817 struct rexp_node * n = (cs ? rx_mk_r_cset (&rxb->rx, cs) : 0);
5821 rx_bitset_universe (rxb->rx.local_cset_size ,cs);
5824 for (x = rxb->rx.local_cset_size - 1; x > 0; --x)
5825 if (SYNTAX(x) & Sword)
5826 RX_bitset_toggle (cs, x);
5833 /* With a little extra work, some of these side effects could be optimized
5834 * away (basicly by looking at what we already know about the surrounding
5838 side = (rx_side_effect)re_se_wordbeg;
5839 goto add_side_effect;
5843 side = (rx_side_effect)re_se_wordend;
5844 goto add_side_effect;
5848 side = (rx_side_effect)re_se_wordbound;
5849 goto add_side_effect;
5853 side = (rx_side_effect)re_se_notwordbound;
5854 goto add_side_effect;
5858 side = (rx_side_effect)re_se_begbuf;
5859 goto add_side_effect;
5863 side = (rx_side_effect)re_se_endbuf;
5864 goto add_side_effect;
5869 struct rexp_node * se
5870 = rx_mk_r_side_effect (&rxb->rx, side);
5878 case '1': case '2': case '3': case '4': case '5':
5879 case '6': case '7': case '8': case '9':
5880 if (syntax & RE_NO_BK_REFS)
5888 /* Can't back reference to a subexpression if inside of it. */
5889 if (group_in_compile_stack (compile_stack, c1))
5893 int backref_se = paramc;
5895 ? ((struct re_se_params *)
5897 sizeof (*params) * (1 + paramc)))
5898 : ((struct re_se_params *)
5899 malloc (sizeof (*params))));
5903 params[backref_se].se = re_se_backref;
5904 params[backref_se].op1 = c1;
5905 side = (rx_side_effect)backref_se;
5906 goto add_side_effect;
5912 if (syntax & RE_BK_PLUS_QM)
5915 goto normal_backslash;
5919 /* You might think it would be useful for \ to mean
5920 not to translate; but if we don't translate it
5921 it will never match anything. */
5929 /* Expects the character in `c'. */
5932 rx_Bitset cs = rx_cset(&rxb->rx);
5933 struct rexp_node * match = rx_mk_r_cset (&rxb->rx, cs);
5937 it = inverse_translation (rxb, validate_inv_tr,
5938 inverse_translate, translate, c);
5939 rx_bitset_union (CHAR_SET_SIZE, cs, it);
5943 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
5944 * and then parses the next character normally.
5946 if (*last_expression)
5948 struct rexp_node * concat
5949 = rx_mk_r_concat (&rxb->rx, *last_expression, append);
5952 *last_expression = concat;
5953 last_expression = &concat->params.pair.right;
5956 *last_expression = append;
5959 } /* while p != pend */
5963 int win_se = paramc;
5965 ? ((struct re_se_params *)
5967 sizeof (*params) * (1 + paramc)))
5968 : ((struct re_se_params *)
5969 malloc (sizeof (*params))));
5973 params[win_se].se = re_se_win;
5975 struct rexp_node * se
5976 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)win_se);
5977 struct rexp_node * concat
5978 = rx_mk_r_concat (&rxb->rx, rexp, se);
5979 if (!(se && concat))
5986 /* Through the pattern now. */
5988 if (!COMPILE_STACK_EMPTY)
5991 free (compile_stack.stack);
5995 if (rx_debug_compile)
5998 fputs ("\n\nCompiling ", stdout);
5999 fwrite (pattern, 1, size, stdout);
6000 fputs (":\n", stdout);
6001 rxb->se_params = params;
6002 print_rexp (&rxb->rx, orig_rexp, 2, re_seprint, stdout);
6006 rx_Bitset cs = rx_cset(&rxb->rx);
6007 rx_Bitset cs2 = rx_cset(&rxb->rx);
6008 char * se_map = (char *) alloca (paramc);
6009 struct rexp_node * new_rexp = 0;
6012 bzero (se_map, paramc);
6013 find_backrefs (se_map, rexp, params);
6014 fewer_side_effects =
6015 remove_unecessary_side_effects (&rxb->rx, se_map,
6016 rx_copy_rexp (&rxb->rx, rexp), params);
6018 speed_up_alt (&rxb->rx, rexp, 0);
6019 speed_up_alt (&rxb->rx, fewer_side_effects, 1);
6022 char * syntax_parens = rxb->syntax_parens;
6023 if (syntax_parens == (char *)0x1)
6024 rexp = remove_unecessary_side_effects
6025 (&rxb->rx, se_map, rexp, params);
6026 else if (syntax_parens)
6029 for (x = 0; x < paramc; ++x)
6030 if (( (params[x].se == re_se_lparen)
6031 || (params[x].se == re_se_rparen))
6032 && (!syntax_parens [params[x].op1]))
6034 rexp = remove_unecessary_side_effects
6035 (&rxb->rx, se_map, rexp, params);
6039 /* At least one more optimization would be nice to have here but i ran out
6040 * of time. The idea would be to delay side effects.
6041 * For examle, `(abc)' is the same thing as `abc()' except that the
6042 * left paren is offset by 3 (which we know at compile time).
6043 * (In this comment, write that second pattern `abc(:3:)'
6044 * where `(:3:' is a syntactic unit.)
6046 * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))'
6047 * (The paren nesting may be hard to follow -- that's an alternation
6048 * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens
6049 * followed by the closing paren from the original expression.)
6051 * Neither the expression tree representation nor the the nfa make
6052 * this very easy to write. :(
6055 /* What we compile is different than what the parser returns.
6056 * Suppose the parser returns expression R.
6057 * Let R' be R with unnecessary register assignments removed
6058 * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above).
6060 * What we will compile is the expression:
6062 * m{try}R{win}\|s{try}R'{win}
6064 * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE).
6066 * When trying a match, we insert an `m' at the beginning of the
6067 * string if the user wants registers to be filled, `s' if not.
6072 rx_mk_r_concat (&rxb->rx, rx_mk_r_cset (&rxb->rx, cs2), rexp),
6073 rx_mk_r_concat (&rxb->rx,
6074 rx_mk_r_cset (&rxb->rx, cs), fewer_side_effects));
6076 if (!(new_rexp && cs && cs2))
6078 RX_bitset_enjoin (cs2, '\0'); /* prefixed to the rexp used for matching. */
6079 RX_bitset_enjoin (cs, '\1'); /* prefixed to the rexp used for searching. */
6084 if (rx_debug_compile)
6086 fputs ("\n...which is compiled as:\n", stdout);
6087 print_rexp (&rxb->rx, rexp, 2, re_seprint, stdout);
6091 struct rx_nfa_state *start = 0;
6092 struct rx_nfa_state *end = 0;
6094 if (!rx_build_nfa (&rxb->rx, rexp, &start, &end))
6095 return REG_ESPACE; /* */
6098 void * mem = (void *)rxb->buffer;
6099 unsigned long size = rxb->allocated;
6102 int iterator_size = paramc * sizeof (params[0]);
6105 start->is_start = 1;
6106 rx_name_nfa_states (&rxb->rx);
6107 start_id = start->id;
6109 if (rx_debug_compile)
6111 fputs ("...giving the NFA: \n", stdout);
6113 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6116 if (!rx_eclose_nfa (&rxb->rx))
6120 rx_delete_epsilon_transitions (&rxb->rx);
6122 /* For compatability reasons, we need to shove the
6123 * compiled nfa into one chunk of malloced memory.
6125 rxb->rx.reserved = ( sizeof (params[0]) * paramc
6126 + rx_sizeof_bitset (rxb->rx.local_cset_size));
6128 if (rx_debug_compile)
6131 fputs ("...which cooks down (uncompactified) to: \n", stdout);
6132 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6135 if (!rx_compactify_nfa (&rxb->rx, &mem, &size))
6138 rxb->allocated = size;
6139 rxb->rx.buffer = mem;
6140 rxb->rx.allocated = size;
6141 perm_mem = ((char *)rxb->rx.buffer
6142 + rxb->rx.allocated - rxb->rx.reserved);
6143 rxb->se_params = ((struct re_se_params *)perm_mem);
6144 bcopy (params, rxb->se_params, iterator_size);
6145 perm_mem += iterator_size;
6146 rxb->fastset = (rx_Bitset) perm_mem;
6147 rxb->start = rx_id_to_nfa_state (&rxb->rx, start_id);
6149 rx_bitset_null (rxb->rx.local_cset_size, rxb->fastset);
6150 rxb->can_match_empty = compute_fastset (rxb, orig_rexp);
6151 rxb->match_regs_on_stack =
6152 registers_on_stack (rxb, orig_rexp, 0, params);
6153 rxb->search_regs_on_stack =
6154 registers_on_stack (rxb, fewer_side_effects, 0, params);
6155 if (rxb->can_match_empty)
6156 rx_bitset_universe (rxb->rx.local_cset_size, rxb->fastset);
6157 rxb->is_anchored = is_anchored (orig_rexp, (rx_side_effect) re_se_hat);
6158 rxb->begbuf_only = is_anchored (orig_rexp,
6159 (rx_side_effect) re_se_begbuf);
6161 rx_free_rexp (&rxb->rx, rexp);
6165 if (rx_debug_compile)
6168 fputs ("...which cooks down to: \n", stdout);
6169 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6178 /* This table gives an error message for each of the error codes listed
6179 in regex.h. Obviously the order here has to be same as there. */
6181 __const__ char * rx_error_msg[] =
6182 { 0, /* REG_NOERROR */
6183 "No match", /* REG_NOMATCH */
6184 "Invalid regular expression", /* REG_BADPAT */
6185 "Invalid collation character", /* REG_ECOLLATE */
6186 "Invalid character class name", /* REG_ECTYPE */
6187 "Trailing backslash", /* REG_EESCAPE */
6188 "Invalid back reference", /* REG_ESUBREG */
6189 "Unmatched [ or [^", /* REG_EBRACK */
6190 "Unmatched ( or \\(", /* REG_EPAREN */
6191 "Unmatched \\{", /* REG_EBRACE */
6192 "Invalid content of \\{\\}", /* REG_BADBR */
6193 "Invalid range end", /* REG_ERANGE */
6194 "Memory exhausted", /* REG_ESPACE */
6195 "Invalid preceding regular expression", /* REG_BADRPT */
6196 "Premature end of regular expression", /* REG_EEND */
6197 "Regular expression too big", /* REG_ESIZE */
6198 "Unmatched ) or \\)", /* REG_ERPAREN */
6204 char rx_slowmap [256] =
6206 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6219 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6220 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6226 rx_blow_up_fastmap (struct re_pattern_buffer * rxb)
6229 rx_blow_up_fastmap (rxb)
6230 struct re_pattern_buffer * rxb;
6234 for (x = 0; x < 256; ++x) /* &&&& 3.6 % */
6235 rxb->fastmap [x] = !!RX_bitset_member (rxb->fastset, x);
6236 rxb->fastmap_accurate = 1;
6242 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6243 #define RE_SEARCH_2_FN inner_re_search_2
6244 #define RE_S2_QUAL static
6246 #define RE_SEARCH_2_FN re_search_2
6250 struct re_search_2_closure
6252 __const__ unsigned char * string1;
6254 __const__ unsigned char * string2;
6259 static __inline__ enum rx_get_burst_return
6260 re_search_2_get_burst (pos, vclosure, stop)
6261 struct rx_string_position * pos;
6265 struct re_search_2_closure * closure;
6266 closure = (struct re_search_2_closure *)vclosure;
6267 if (!closure->string2)
6271 inset = pos->pos - pos->string;
6272 if ((inset < -1) || (inset > closure->size1))
6273 return rx_get_burst_no_more;
6276 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6277 pos->string = (__const__ unsigned char *) closure->string1;
6278 pos->size = closure->size1;
6279 pos->end = ((__const__ unsigned char *)
6280 MIN(closure->string1 + closure->size1,
6281 closure->string1 + stop));
6283 return ((pos->pos < pos->end)
6285 : rx_get_burst_no_more);
6288 else if (!closure->string1)
6292 inset = pos->pos - pos->string;
6293 pos->pos = (__const__ unsigned char *) closure->string2 + inset;
6294 pos->string = (__const__ unsigned char *) closure->string2;
6295 pos->size = closure->size2;
6296 pos->end = ((__const__ unsigned char *)
6297 MIN(closure->string2 + closure->size2,
6298 closure->string2 + stop));
6300 return ((pos->pos < pos->end)
6302 : rx_get_burst_no_more);
6308 inset = pos->pos - pos->string + pos->offset;
6309 if (inset < closure->size1)
6311 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6312 pos->string = (__const__ unsigned char *) closure->string1;
6313 pos->size = closure->size1;
6314 pos->end = ((__const__ unsigned char *)
6315 MIN(closure->string1 + closure->size1,
6316 closure->string1 + stop));
6318 return rx_get_burst_ok;
6322 pos->pos = ((__const__ unsigned char *)
6323 closure->string2 + inset - closure->size1);
6324 pos->string = (__const__ unsigned char *) closure->string2;
6325 pos->size = closure->size2;
6326 pos->end = ((__const__ unsigned char *)
6327 MIN(closure->string2 + closure->size2,
6328 closure->string2 + stop - closure->size1));
6329 pos->offset = closure->size1;
6330 return ((pos->pos < pos->end)
6332 : rx_get_burst_no_more);
6338 static __inline__ enum rx_back_check_return
6339 re_search_2_back_check (pos, lparen, rparen, translate, vclosure, stop)
6340 struct rx_string_position * pos;
6343 unsigned char * translate;
6347 struct rx_string_position there;
6348 struct rx_string_position past;
6351 there.pos = there.string + lparen - there.offset;
6352 re_search_2_get_burst (&there, vclosure, stop);
6355 past.pos = past.string + rparen - there.offset;
6356 re_search_2_get_burst (&past, vclosure, stop);
6359 re_search_2_get_burst (pos, vclosure, stop);
6361 while ( (there.pos != past.pos)
6362 && (pos->pos != pos->end))
6363 if (TRANSLATE(*there.pos) != TRANSLATE(*pos->pos))
6364 return rx_back_check_fail;
6369 if (there.pos == there.end)
6370 re_search_2_get_burst (&there, vclosure, stop);
6371 if (pos->pos == pos->end)
6372 re_search_2_get_burst (pos, vclosure, stop);
6375 if (there.pos != past.pos)
6376 return rx_back_check_fail;
6378 re_search_2_get_burst (pos, vclosure, stop);
6379 return rx_back_check_pass;
6382 static __inline__ int
6383 re_search_2_fetch_char (pos, offset, app_closure, stop)
6384 struct rx_string_position * pos;
6389 struct re_search_2_closure * closure;
6390 closure = (struct re_search_2_closure *)app_closure;
6393 if (pos->pos >= pos->string)
6397 if ( (pos->string == closure->string2)
6398 && (closure->string1)
6399 && (closure->size1))
6400 return closure->string1[closure->size1 - 1];
6402 return 0; /* sure, why not. */
6405 if (pos->pos == pos->end)
6406 return *closure->string2;
6414 RE_SEARCH_2_FN (struct re_pattern_buffer *rxb,
6415 __const__ char * string1, int size1,
6416 __const__ char * string2, int size2,
6417 int startpos, int range,
6418 struct re_registers *regs,
6422 RE_SEARCH_2_FN (rxb,
6423 string1, size1, string2, size2, startpos, range, regs, stop)
6424 struct re_pattern_buffer *rxb;
6425 __const__ char * string1;
6427 __const__ char * string2;
6431 struct re_registers *regs;
6436 struct re_search_2_closure closure;
6437 closure.string1 = string1;
6438 closure.size1 = size1;
6439 closure.string2 = string2;
6440 closure.size2 = size2;
6441 answer = rx_search (rxb, startpos, range, stop, size1 + size2,
6442 re_search_2_get_burst,
6443 re_search_2_back_check,
6444 re_search_2_fetch_char,
6451 case rx_search_continuation:
6453 case rx_search_error:
6455 case rx_search_soft_fail:
6456 case rx_search_fail:
6463 /* Export rx_search to callers outside this file. */
6466 re_rx_search (rxb, startpos, range, stop, total_size,
6467 get_burst, back_check, fetch_char,
6468 app_closure, regs, resume_state, save_state)
6469 struct re_pattern_buffer * rxb;
6474 rx_get_burst_fn get_burst;
6475 rx_back_check_fn back_check;
6476 rx_fetch_char_fn fetch_char;
6478 struct re_registers * regs;
6479 struct rx_search_state * resume_state;
6480 struct rx_search_state * save_state;
6482 return rx_search (rxb, startpos, range, stop, total_size,
6483 get_burst, back_check, fetch_char, app_closure,
6484 regs, resume_state, save_state);
6487 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6490 re_search_2 (struct re_pattern_buffer *rxb,
6491 __const__ char * string1, int size1,
6492 __const__ char * string2, int size2,
6493 int startpos, int range,
6494 struct re_registers *regs,
6498 re_search_2 (rxb, string1, size1, string2, size2, startpos, range, regs, stop)
6499 struct re_pattern_buffer *rxb;
6500 __const__ char * string1;
6502 __const__ char * string2;
6506 struct re_registers *regs;
6511 ret = inner_re_search_2 (rxb, string1, size1, string2, size2, startpos,
6519 /* Like re_search_2, above, but only one string is specified, and
6520 * doesn't let you say where to stop matching.
6525 re_search (struct re_pattern_buffer * rxb, __const__ char *string,
6526 int size, int startpos, int range,
6527 struct re_registers *regs)
6530 re_search (rxb, string, size, startpos, range, regs)
6531 struct re_pattern_buffer * rxb;
6532 __const__ char * string;
6536 struct re_registers *regs;
6539 return re_search_2 (rxb, 0, 0, string, size, startpos, range, regs, size);
6544 re_match_2 (struct re_pattern_buffer * rxb,
6545 __const__ char * string1, int size1,
6546 __const__ char * string2, int size2,
6547 int pos, struct re_registers *regs, int stop)
6550 re_match_2 (rxb, string1, size1, string2, size2, pos, regs, stop)
6551 struct re_pattern_buffer * rxb;
6552 __const__ char * string1;
6554 __const__ char * string2;
6557 struct re_registers *regs;
6561 struct re_registers some_regs;
6565 int save = rxb->regs_allocated;
6566 struct re_registers * regs_to_pass = regs;
6570 some_regs.start = &start;
6571 some_regs.end = &end;
6572 some_regs.num_regs = 1;
6573 regs_to_pass = &some_regs;
6574 rxb->regs_allocated = REGS_FIXED;
6577 srch = re_search_2 (rxb, string1, size1, string2, size2,
6578 pos, 1, regs_to_pass, stop);
6579 if (regs_to_pass != regs)
6580 rxb->regs_allocated = save;
6583 return regs_to_pass->end[0] - regs_to_pass->start[0];
6586 /* re_match is like re_match_2 except it takes only a single string. */
6590 re_match (struct re_pattern_buffer * rxb,
6591 __const__ char * string,
6593 struct re_registers *regs)
6596 re_match (rxb, string, size, pos, regs)
6597 struct re_pattern_buffer * rxb;
6598 __const__ char *string;
6601 struct re_registers *regs;
6604 return re_match_2 (rxb, string, size, 0, 0, pos, regs, size);
6609 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
6610 also be assigned to arbitrarily: each pattern buffer stores its own
6611 syntax, so it can be changed between regex compilations. */
6612 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
6615 /* Specify the precise syntax of regexps for compilation. This provides
6616 for compatibility for various utilities which historically have
6617 different, incompatible syntaxes.
6619 The argument SYNTAX is a bit mask comprised of the various bits
6620 defined in regex.h. We return the old syntax. */
6624 re_set_syntax (reg_syntax_t syntax)
6627 re_set_syntax (syntax)
6628 reg_syntax_t syntax;
6631 reg_syntax_t ret = re_syntax_options;
6633 re_syntax_options = syntax;
6638 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6639 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6640 this memory for recording register information. STARTS and ENDS
6641 must be allocated using the malloc library routine, and must each
6642 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6644 If NUM_REGS == 0, then subsequent matches should allocate their own
6647 Unless this function is called, the first search or match using
6648 PATTERN_BUFFER will allocate its own register data, without
6649 freeing the old data. */
6653 re_set_registers (struct re_pattern_buffer *bufp,
6654 struct re_registers *regs,
6656 regoff_t * starts, regoff_t * ends)
6659 re_set_registers (bufp, regs, num_regs, starts, ends)
6660 struct re_pattern_buffer *bufp;
6661 struct re_registers *regs;
6669 bufp->regs_allocated = REGS_REALLOCATE;
6670 regs->num_regs = num_regs;
6671 regs->start = starts;
6676 bufp->regs_allocated = REGS_UNALLOCATED;
6678 regs->start = regs->end = (regoff_t) 0;
6687 cplx_se_sublist_len (struct rx_se_list * list)
6690 cplx_se_sublist_len (list)
6691 struct rx_se_list * list;
6697 if ((long)list->car >= 0)
6705 /* For rx->se_list_cmp */
6709 posix_se_list_order (struct rx * rx,
6710 struct rx_se_list * a, struct rx_se_list * b)
6713 posix_se_list_order (rx, a, b)
6715 struct rx_se_list * a;
6716 struct rx_se_list * b;
6719 int al = cplx_se_sublist_len (a);
6720 int bl = cplx_se_sublist_len (b);
6725 : ((a < b) ? -1 : 1));
6735 rx_side_effect * av = ((rx_side_effect *)
6736 alloca (sizeof (rx_side_effect) * (al + 1)));
6737 rx_side_effect * bv = ((rx_side_effect *)
6738 alloca (sizeof (rx_side_effect) * (bl + 1)));
6739 struct rx_se_list * ap = a;
6740 struct rx_se_list * bp = b;
6743 for (ai = al - 1; ai >= 0; --ai)
6745 while ((long)ap->car < 0)
6750 av[al] = (rx_side_effect)-2;
6751 for (bi = bl - 1; bi >= 0; --bi)
6753 while ((long)bp->car < 0)
6758 bv[bl] = (rx_side_effect)-1;
6763 while (av[x] == bv[x])
6765 ret = (((unsigned *)(av[x]) < (unsigned *)(bv[x])) ? -1 : 1);
6774 /* re_compile_pattern is the GNU regular expression compiler: it
6775 compiles PATTERN (of length SIZE) and puts the result in RXB.
6776 Returns 0 if the pattern was valid, otherwise an error string.
6778 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6779 are set in RXB on entry.
6781 We call rx_compile to do the actual compilation. */
6785 re_compile_pattern (__const__ char *pattern,
6787 struct re_pattern_buffer * rxb)
6790 re_compile_pattern (pattern, length, rxb)
6791 __const__ char *pattern;
6793 struct re_pattern_buffer * rxb;
6798 /* GNU code is written to assume at least RE_NREGS registers will be set
6799 (and at least one extra will be -1). */
6800 rxb->regs_allocated = REGS_UNALLOCATED;
6802 /* And GNU code determines whether or not to get register information
6803 by passing null for the REGS argument to re_match, etc., not by
6807 rxb->rx.local_cset_size = 256;
6809 /* Match anchors at newline. */
6810 rxb->newline_anchor = 1;
6816 rxb->rx.epsnodec = 0;
6817 rxb->rx.instruction_table = 0;
6818 rxb->rx.nfa_states = 0;
6819 rxb->rx.se_list_cmp = posix_se_list_order;
6820 rxb->rx.start_set = 0;
6822 ret = rx_compile (pattern, length, re_syntax_options, rxb);
6824 return rx_error_msg[(int) ret];
6831 re_compile_fastmap (struct re_pattern_buffer * rxb)
6834 re_compile_fastmap (rxb)
6835 struct re_pattern_buffer * rxb;
6838 rx_blow_up_fastmap (rxb);
6845 /* Entry points compatible with 4.2 BSD regex library. We don't define
6846 them if this is an Emacs or POSIX compilation. */
6848 #if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX)
6850 /* BSD has one and only one pattern buffer. */
6851 static struct re_pattern_buffer rx_comp_buf;
6855 re_comp (__const__ char *s)
6864 if (!s || (*s == '\0'))
6866 if (!rx_comp_buf.buffer)
6867 return "No previous regular expression";
6871 if (!rx_comp_buf.fastmap)
6873 rx_comp_buf.fastmap = (char *) malloc (1 << CHARBITS);
6874 if (!rx_comp_buf.fastmap)
6875 return "Memory exhausted";
6878 /* Since `rx_exec' always passes NULL for the `regs' argument, we
6879 don't need to initialize the pattern buffer fields which affect it. */
6881 /* Match anchors at newlines. */
6882 rx_comp_buf.newline_anchor = 1;
6884 rx_comp_buf.fastmap_accurate = 0;
6885 rx_comp_buf.re_nsub = 0;
6886 rx_comp_buf.start = 0;
6887 rx_comp_buf.se_params = 0;
6888 rx_comp_buf.rx.nodec = 0;
6889 rx_comp_buf.rx.epsnodec = 0;
6890 rx_comp_buf.rx.instruction_table = 0;
6891 rx_comp_buf.rx.nfa_states = 0;
6892 rx_comp_buf.rx.start = 0;
6893 rx_comp_buf.rx.se_list_cmp = posix_se_list_order;
6894 rx_comp_buf.rx.start_set = 0;
6895 rx_comp_buf.rx.local_cset_size = 256;
6897 ret = rx_compile (s, strlen (s), re_syntax_options, &rx_comp_buf);
6900 /* Yes, we're discarding `__const__' here. */
6901 return (char *) rx_error_msg[(int) ret];
6907 re_exec (__const__ char *s)
6914 __const__ int len = strlen (s);
6916 0 <= re_search (&rx_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6918 #endif /* not emacs and not _POSIX_SOURCE */
6922 /* POSIX.2 functions. Don't define these for Emacs. */
6926 /* regcomp takes a regular expression as a string and compiles it.
6928 PREG is a regex_t *. We do not expect any fields to be initialized,
6929 since POSIX says we shouldn't. Thus, we set
6931 `buffer' to the compiled pattern;
6932 `used' to the length of the compiled pattern;
6933 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6934 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6935 RE_SYNTAX_POSIX_BASIC;
6936 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6937 `fastmap' and `fastmap_accurate' to zero;
6938 `re_nsub' to the number of subexpressions in PATTERN.
6940 PATTERN is the address of the pattern string.
6942 CFLAGS is a series of bits which affect compilation.
6944 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6945 use POSIX basic syntax.
6947 If REG_NEWLINE is set, then . and [^...] don't match newline.
6948 Also, regexec will try a match beginning after every newline.
6950 If REG_ICASE is set, then we considers upper- and lowercase
6951 versions of letters to be equivalent when matching.
6953 If REG_NOSUB is set, then when PREG is passed to regexec, that
6954 routine will report only success or failure, and nothing about the
6957 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6958 the return codes and their meanings.) */
6963 regcomp (regex_t * preg, __const__ char * pattern, int cflags)
6966 regcomp (preg, pattern, cflags)
6968 __const__ char * pattern;
6974 = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6976 /* regex_compile will allocate the space for the compiled pattern. */
6978 preg->allocated = 0;
6979 preg->fastmap = malloc (256);
6982 preg->fastmap_accurate = 0;
6984 if (cflags & REG_ICASE)
6988 preg->translate = (unsigned char *) malloc (256);
6989 if (!preg->translate)
6990 return (int) REG_ESPACE;
6992 /* Map uppercase characters to corresponding lowercase ones. */
6993 for (i = 0; i < CHAR_SET_SIZE; i++)
6994 preg->translate[i] = isupper (i) ? tolower (i) : i;
6997 preg->translate = 0;
6999 /* If REG_NEWLINE is set, newlines are treated differently. */
7000 if (cflags & REG_NEWLINE)
7001 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7002 syntax &= ~RE_DOT_NEWLINE;
7003 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7004 /* It also changes the matching behavior. */
7005 preg->newline_anchor = 1;
7008 preg->newline_anchor = 0;
7010 preg->no_sub = !!(cflags & REG_NOSUB);
7012 /* POSIX says a null character in the pattern terminates it, so we
7013 can use strlen here in compiling the pattern. */
7016 preg->se_params = 0;
7017 preg->syntax_parens = 0;
7019 preg->rx.epsnodec = 0;
7020 preg->rx.instruction_table = 0;
7021 preg->rx.nfa_states = 0;
7022 preg->rx.local_cset_size = 256;
7024 preg->rx.se_list_cmp = posix_se_list_order;
7025 preg->rx.start_set = 0;
7026 ret = rx_compile (pattern, strlen (pattern), syntax, preg);
7029 /* POSIX doesn't distinguish between an unmatched open-group and an
7030 unmatched close-group: both are REG_EPAREN. */
7031 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7037 /* regexec searches for a given pattern, specified by PREG, in the
7040 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7041 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7042 least NMATCH elements, and we set them to the offsets of the
7043 corresponding matched substrings.
7045 EFLAGS specifies `execution flags' which affect matching: if
7046 REG_NOTBOL is set, then ^ does not match at the beginning of the
7047 string; if REG_NOTEOL is set, then $ does not match at the end.
7049 We return 0 if we find a match and REG_NOMATCH if not. */
7053 regexec (__const__ regex_t *preg, __const__ char *string,
7054 size_t nmatch, regmatch_t pmatch[],
7058 regexec (preg, string, nmatch, pmatch, eflags)
7059 __const__ regex_t *preg;
7060 __const__ char *string;
7062 regmatch_t pmatch[];
7067 struct re_registers regs;
7068 regex_t private_preg;
7069 int len = strlen (string);
7070 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7072 private_preg = *preg;
7074 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7075 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7077 /* The user has told us exactly how many registers to return
7078 * information about, via `nmatch'. We have to pass that on to the
7079 * matching routines.
7081 private_preg.regs_allocated = REGS_FIXED;
7085 regs.num_regs = nmatch;
7086 regs.start = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7087 regs.end = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7088 if (regs.start == 0 || regs.end == 0)
7089 return (int) REG_NOMATCH;
7092 /* Perform the searching operation. */
7093 ret = re_search (&private_preg,
7097 want_reg_info ? ®s : (struct re_registers *) 0);
7099 /* Copy the register information to the POSIX structure. */
7106 for (r = 0; r < nmatch; r++)
7108 pmatch[r].rm_so = regs.start[r];
7109 pmatch[r].rm_eo = regs.end[r];
7113 /* If we needed the temporary register info, free the space now. */
7118 /* We want zero return to mean success, unlike `re_search'. */
7119 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7123 /* Returns a message corresponding to an error code, ERRCODE, returned
7124 from either regcomp or regexec. */
7128 regerror (int errcode, __const__ regex_t *preg,
7129 char *errbuf, size_t errbuf_size)
7132 regerror (errcode, preg, errbuf, errbuf_size)
7134 __const__ regex_t *preg;
7140 = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode];
7141 size_t msg_size = strlen (msg) + 1; /* Includes the 0. */
7143 if (errbuf_size != 0)
7145 if (msg_size > errbuf_size)
7147 strncpy (errbuf, msg, errbuf_size - 1);
7148 errbuf[errbuf_size - 1] = 0;
7151 strcpy (errbuf, msg);
7158 /* Free dynamically allocated space used by PREG. */
7162 regfree (regex_t *preg)
7169 if (preg->buffer != 0)
7170 free (preg->buffer);
7172 preg->allocated = 0;
7174 if (preg->fastmap != 0)
7175 free (preg->fastmap);
7177 preg->fastmap_accurate = 0;
7179 if (preg->translate != 0)
7180 free (preg->translate);
7181 preg->translate = 0;
7184 #endif /* not emacs */