lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
  94                                       re_bitset_ptr_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
 103                                       re_bitset_ptr_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        unsigned REG_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (const char *pattern, size_t length,
 214                     struct re_pattern_buffer *bufp)
 215 {
 216   reg_errcode_t ret;
 217
 218   /* And GNU code determines whether or not to get register information
 219      by passing null for the REGS argument to re_match, etc., not by
 220      setting re_no_sub, unless REG_NO_SUB is set.  */
 221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
 222
 223   /* Match anchors at newline.  */
 224   bufp->re_newline_anchor = 1;
 225
 226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 227
 228   if (!ret)
 229     return NULL;
 230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 231 }
 232 #ifdef _LIBC
 233 weak_alias (__re_compile_pattern, re_compile_pattern)
 234 #endif
 235
 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 237    also be assigned to arbitrarily: each pattern buffer stores its own
 238    syntax, so it can be changed between regex compilations.  */
 239 /* This has no initializer because initialized variables in Emacs
 240    become read-only after dumping.  */
 241 reg_syntax_t re_syntax_options;
 242
 243
 244 /* Specify the precise syntax of regexps for compilation.  This provides
 245    for compatibility for various utilities which historically have
 246    different, incompatible syntaxes.
 247
 248    The argument SYNTAX is a bit mask comprised of the various bits
 249    defined in regex.h.  We return the old syntax.  */
 250
 251 reg_syntax_t
 252 re_set_syntax (reg_syntax_t syntax)
 253 {
 254   reg_syntax_t ret = re_syntax_options;
 255
 256   re_syntax_options = syntax;
 257   return ret;
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_set_syntax, re_set_syntax)
 261 #endif
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 267   char *fastmap = bufp->re_fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->re_fastmap_accurate = 1;
 278   return 0;
 279 }
 280 #ifdef _LIBC
 281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 282 #endif
 283
 284 static inline void
 285 __attribute ((always_inline))
 286 re_set_fastmap (char *fastmap, int icase, int ch)
 287 {
 288   fastmap[ch] = 1;
 289   if (icase)
 290     fastmap[tolower (ch)] = 1;
 291 }
 292
 293 /* Helper function for re_compile_fastmap.
 294    Compile fastmap for the initial_state INIT_STATE.  */
 295
 296 static void
 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 298                          char *fastmap)
 299 {
 300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 301   int node_cnt;
 302   int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
 303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 304     {
 305       int node = init_state->nodes.elems[node_cnt];
 306       re_token_type_t type = dfa->nodes[node].type;
 307
 308       if (type == CHARACTER)
 309         {
 310           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 311 #ifdef RE_ENABLE_I18N
 312           if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 313             {
 314               unsigned char buf[MB_LEN_MAX];
 315               unsigned char *p;
 316               wchar_t wc;
 317               mbstate_t state;
 318
 319               p = buf;
 320               *p++ = dfa->nodes[node].opr.c;
 321               while (++node < dfa->nodes_len
 322                      && dfa->nodes[node].type == CHARACTER
 323                      && dfa->nodes[node].mb_partial)
 324                 *p++ = dfa->nodes[node].opr.c;
 325               memset (&state, 0, sizeof (state));
 326               if (mbrtowc (&wc, (const char *) buf, p - buf,
 327                            &state) == p - buf
 328                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 329                       != (size_t) -1))
 330                 re_set_fastmap (fastmap, 0, buf[0]);
 331             }
 332 #endif
 333         }
 334       else if (type == SIMPLE_BRACKET)
 335         {
 336           int i, j, ch;
 337           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 338             for (j = 0; j < UINT_BITS; ++j, ++ch)
 339               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 340                 re_set_fastmap (fastmap, icase, ch);
 341         }
 342 #ifdef RE_ENABLE_I18N
 343       else if (type == COMPLEX_BRACKET)
 344         {
 345           int i;
 346           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 347           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 348               || cset->nranges || cset->nchar_classes)
 349             {
 350 # ifdef _LIBC
 351               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 352                 {
 353                   /* In this case we want to catch the bytes which are
 354                      the first byte of any collation elements.
 355                      e.g. In da_DK, we want to catch 'a' since "aa"
 356                           is a valid collation element, and don't catch
 357                           'b' since 'b' is the only collation element
 358                           which starts from 'b'.  */
 359                   int j, ch;
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 363                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 364                       if (table[ch] < 0)
 365                         re_set_fastmap (fastmap, icase, ch);
 366                 }
 367 # else
 368               if (dfa->mb_cur_max > 1)
 369                 for (i = 0; i < SBC_MAX; ++i)
 370                   if (__btowc (i) == WEOF)
 371                     re_set_fastmap (fastmap, icase, i);
 372 # endif /* not _LIBC */
 373             }
 374           for (i = 0; i < cset->nmbchars; ++i)
 375             {
 376               char buf[256];
 377               mbstate_t state;
 378               memset (&state, '\0', sizeof (state));
 379               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 380                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 381               if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 382                 {
 383                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 384                       != (size_t) -1)
 385                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 386                 }
 387             }
 388         }
 389 #endif /* RE_ENABLE_I18N */
 390       else if (type == OP_PERIOD
 391 #ifdef RE_ENABLE_I18N
 392                || type == OP_UTF8_PERIOD
 393 #endif /* RE_ENABLE_I18N */
 394                || type == END_OF_RE)
 395         {
 396           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 397           if (type == END_OF_RE)
 398             bufp->re_can_be_null = 1;
 399           return;
 400         }
 401     }
 402 }
 403 \f
 404 /* Entry point for POSIX code.  */
 405 /* regcomp takes a regular expression as a string and compiles it.
 406
 407    PREG is a regex_t *.  We do not expect any fields to be initialized,
 408    since POSIX says we shouldn't.  Thus, we set
 409
 410      `re_buffer' to the compiled pattern;
 411      `re_used' to the length of the compiled pattern;
 412      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
 413        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 414        REG_SYNTAX_POSIX_BASIC;
 415      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
 416      `re_fastmap' to an allocated space for the fastmap;
 417      `re_fastmap_accurate' to zero;
 418      `re_nsub' to the number of subexpressions in PATTERN.
 419
 420    PATTERN is the address of the pattern string.
 421
 422    CFLAGS is a series of bits which affect compilation.
 423
 424      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 425      use POSIX basic syntax.
 426
 427      If REG_NEWLINE is set, then . and [^...] don't match newline.
 428      Also, regexec will try a match beginning after every newline.
 429
 430      If REG_ICASE is set, then we considers upper- and lowercase
 431      versions of letters to be equivalent when matching.
 432
 433      If REG_NOSUB is set, then when PREG is passed to regexec, that
 434      routine will report only success or failure, and nothing about the
 435      registers.
 436
 437    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 438    the return codes and their meanings.)  */
 439
 440 int
 441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 442 {
 443   reg_errcode_t ret;
 444   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
 445                         : REG_SYNTAX_POSIX_BASIC);
 446
 447   preg->re_buffer = NULL;
 448   preg->re_allocated = 0;
 449   preg->re_used = 0;
 450
 451   /* Try to allocate space for the fastmap.  */
 452   preg->re_fastmap = re_malloc (char, SBC_MAX);
 453   if (BE (preg->re_fastmap == NULL, 0))
 454     return REG_ESPACE;
 455
 456   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
 457
 458   /* If REG_NEWLINE is set, newlines are treated differently.  */
 459   if (cflags & REG_NEWLINE)
 460     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 461       syntax &= ~REG_DOT_NEWLINE;
 462       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
 463       /* It also changes the matching behavior.  */
 464       preg->re_newline_anchor = 1;
 465     }
 466   else
 467     preg->re_newline_anchor = 0;
 468   preg->re_no_sub = !!(cflags & REG_NOSUB);
 469   preg->re_translate = NULL;
 470
 471   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 472
 473   /* POSIX doesn't distinguish between an unmatched open-group and an
 474      unmatched close-group: both are REG_EPAREN.  */
 475   if (ret == REG_ERPAREN)
 476     ret = REG_EPAREN;
 477
 478   /* We have already checked preg->re_fastmap != NULL.  */
 479   if (BE (ret == REG_NOERROR, 1))
 480     /* Compute the fastmap now, since regexec cannot modify the pattern
 481        buffer.  This function never fails in this implementation.  */
 482     (void) re_compile_fastmap (preg);
 483   else
 484     {
 485       /* Some error occurred while compiling the expression.  */
 486       re_free (preg->re_fastmap);
 487       preg->re_fastmap = NULL;
 488     }
 489
 490   return (int) ret;
 491 }
 492 #ifdef _LIBC
 493 weak_alias (__regcomp, regcomp)
 494 #endif
 495
 496 /* Returns a message corresponding to an error code, ERRCODE, returned
 497    from either regcomp or regexec.   We don't use PREG here.  */
 498
 499 size_t
 500 regerror (int errcode, const regex_t *__restrict preg,
 501           char *__restrict errbuf, size_t errbuf_size)
 502 {
 503   const char *msg;
 504   size_t msg_size;
 505
 506   if (BE (errcode < 0
 507           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 508                                / sizeof (__re_error_msgid_idx[0])), 0))
 509     /* Only error codes returned by the rest of the code should be passed
 510        to this routine.  If we are given anything else, or if other regex
 511        code generates an invalid error code, then the program has a bug.
 512        Dump core so we can fix it.  */
 513     abort ();
 514
 515   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 516
 517   msg_size = strlen (msg) + 1; /* Includes the null.  */
 518
 519   if (BE (errbuf_size != 0, 1))
 520     {
 521       if (BE (msg_size > errbuf_size, 0))
 522         {
 523 #if defined HAVE_MEMPCPY || defined _LIBC
 524           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 525 #else
 526           memcpy (errbuf, msg, errbuf_size - 1);
 527           errbuf[errbuf_size - 1] = 0;
 528 #endif
 529         }
 530       else
 531         memcpy (errbuf, msg, msg_size);
 532     }
 533
 534   return msg_size;
 535 }
 536 #ifdef _LIBC
 537 weak_alias (__regerror, regerror)
 538 #endif
 539
 540
 541 #ifdef RE_ENABLE_I18N
 542 /* This static array is used for the map to single-byte characters when
 543    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 544    it the same all the time.  UTF-8 is the preferred encoding so this is
 545    a worthwhile optimization.  */
 546 static const bitset utf8_sb_map =
 547 {
 548   /* Set the first 128 bits.  */
 549 # if UINT_MAX == 0xffffffff
 550   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 551 # else
 552 #  error "Add case for new unsigned int size"
 553 # endif
 554 };
 555 #endif
 556
 557
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
 560 {
 561   int i, j;
 562
 563   if (dfa->nodes)
 564     for (i = 0; i < dfa->nodes_len; ++i)
 565       free_token (dfa->nodes + i);
 566   re_free (dfa->nexts);
 567   for (i = 0; i < dfa->nodes_len; ++i)
 568     {
 569       if (dfa->eclosures != NULL)
 570         re_node_set_free (dfa->eclosures + i);
 571       if (dfa->inveclosures != NULL)
 572         re_node_set_free (dfa->inveclosures + i);
 573       if (dfa->edests != NULL)
 574         re_node_set_free (dfa->edests + i);
 575     }
 576   re_free (dfa->edests);
 577   re_free (dfa->eclosures);
 578   re_free (dfa->inveclosures);
 579   re_free (dfa->nodes);
 580
 581   if (dfa->state_table)
 582     for (i = 0; i <= dfa->state_hash_mask; ++i)
 583       {
 584         struct re_state_table_entry *entry = dfa->state_table + i;
 585         for (j = 0; j < entry->num; ++j)
 586           {
 587             re_dfastate_t *state = entry->array[j];
 588             free_state (state);
 589           }
 590         re_free (entry->array);
 591       }
 592   re_free (dfa->state_table);
 593 #ifdef RE_ENABLE_I18N
 594   if (dfa->sb_char != utf8_sb_map)
 595     re_free (dfa->sb_char);
 596 #endif
 597   re_free (dfa->subexp_map);
 598 #ifdef DEBUG
 599   re_free (dfa->re_str);
 600 #endif
 601
 602   re_free (dfa);
 603 }
 604
 605
 606 /* Free dynamically allocated space used by PREG.  */
 607
 608 void
 609 regfree (regex_t *preg)
 610 {
 611   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 612   if (BE (dfa != NULL, 1))
 613     free_dfa_content (dfa);
 614   preg->re_buffer = NULL;
 615   preg->re_allocated = 0;
 616
 617   re_free (preg->re_fastmap);
 618   preg->re_fastmap = NULL;
 619
 620   re_free (preg->re_translate);
 621   preg->re_translate = NULL;
 622 }
 623 #ifdef _LIBC
 624 weak_alias (__regfree, regfree)
 625 #endif
 626 \f
 627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 628    them unless specifically requested.  */
 629
 630 #if defined _REGEX_RE_COMP || defined _LIBC
 631
 632 /* BSD has one and only one pattern buffer.  */
 633 static struct re_pattern_buffer re_comp_buf;
 634
 635 char *
 636 # ifdef _LIBC
 637 /* Make these definitions weak in libc, so POSIX programs can redefine
 638    these names if they don't use our functions, and still use
 639    regcomp/regexec above without link errors.  */
 640 weak_function
 641 # endif
 642 re_comp (s)
 643      const char *s;
 644 {
 645   reg_errcode_t ret;
 646   char *fastmap;
 647
 648   if (!s)
 649     {
 650       if (!re_comp_buf.re_buffer)
 651         return gettext ("No previous regular expression");
 652       return 0;
 653     }
 654
 655   if (re_comp_buf.re_buffer)
 656     {
 657       fastmap = re_comp_buf.re_fastmap;
 658       re_comp_buf.re_fastmap = NULL;
 659       __regfree (&re_comp_buf);
 660       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 661       re_comp_buf.re_fastmap = fastmap;
 662     }
 663
 664   if (re_comp_buf.re_fastmap == NULL)
 665     {
 666       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
 667       if (re_comp_buf.re_fastmap == NULL)
 668         return (char *) gettext (__re_error_msgid
 669                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 670     }
 671
 672   /* Since `re_exec' always passes NULL for the `regs' argument, we
 673      don't need to initialize the pattern buffer fields which affect it.  */
 674
 675   /* Match anchors at newlines.  */
 676   re_comp_buf.re_newline_anchor = 1;
 677
 678   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 679
 680   if (!ret)
 681     return NULL;
 682
 683   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 684   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 685 }
 686
 687 #ifdef _LIBC
 688 libc_freeres_fn (free_mem)
 689 {
 690   __regfree (&re_comp_buf);
 691 }
 692 #endif
 693
 694 #endif /* _REGEX_RE_COMP */
 695 \f
 696 /* Internal entry point.
 697    Compile the regular expression PATTERN, whose length is LENGTH.
 698    SYNTAX indicate regular expression's syntax.  */
 699
 700 static reg_errcode_t
 701 re_compile_internal (regex_t *preg, const char * pattern, int length,
 702                      reg_syntax_t syntax)
 703 {
 704   reg_errcode_t err = REG_NOERROR;
 705   re_dfa_t *dfa;
 706   re_string_t regexp;
 707
 708   /* Initialize the pattern buffer.  */
 709   preg->re_fastmap_accurate = 0;
 710   preg->re_syntax = syntax;
 711   preg->re_not_bol = preg->re_not_eol = 0;
 712   preg->re_used = 0;
 713   preg->re_nsub = 0;
 714   preg->re_can_be_null = 0;
 715   preg->re_regs_allocated = REG_UNALLOCATED;
 716
 717   /* Initialize the dfa.  */
 718   dfa = (re_dfa_t *) preg->re_buffer;
 719   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
 720     {
 721       /* If zero allocated, but buffer is non-null, try to realloc
 722          enough space.  This loses if buffer's address is bogus, but
 723          that is the user's responsibility.  If buffer is null this
 724          is a simple allocation.  */
 725       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
 726       if (dfa == NULL)
 727         return REG_ESPACE;
 728       preg->re_allocated = sizeof (re_dfa_t);
 729       preg->re_buffer = (unsigned char *) dfa;
 730     }
 731   preg->re_used = sizeof (re_dfa_t);
 732
 733   __libc_lock_init (dfa->lock);
 734
 735   err = init_dfa (dfa, length);
 736   if (BE (err != REG_NOERROR, 0))
 737     {
 738       free_dfa_content (dfa);
 739       preg->re_buffer = NULL;
 740       preg->re_allocated = 0;
 741       return err;
 742     }
 743 #ifdef DEBUG
 744   dfa->re_str = re_malloc (char, length + 1);
 745   strncpy (dfa->re_str, pattern, length + 1);
 746 #endif
 747
 748   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
 749                              syntax & REG_IGNORE_CASE, dfa);
 750   if (BE (err != REG_NOERROR, 0))
 751     {
 752     re_compile_internal_free_return:
 753       free_workarea_compile (preg);
 754       re_string_destruct (&regexp);
 755       free_dfa_content (dfa);
 756       preg->re_buffer = NULL;
 757       preg->re_allocated = 0;
 758       return err;
 759     }
 760
 761   /* Parse the regular expression, and build a structure tree.  */
 762   preg->re_nsub = 0;
 763   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 764   if (BE (dfa->str_tree == NULL, 0))
 765     goto re_compile_internal_free_return;
 766
 767   /* Analyze the tree and create the nfa.  */
 768   err = analyze (preg);
 769   if (BE (err != REG_NOERROR, 0))
 770     goto re_compile_internal_free_return;
 771
 772 #ifdef RE_ENABLE_I18N
 773   /* If possible, do searching in single byte encoding to speed things up.  */
 774   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
 775     optimize_utf8 (dfa);
 776 #endif
 777
 778   /* Then create the initial state of the dfa.  */
 779   err = create_initial_state (dfa);
 780
 781   /* Release work areas.  */
 782   free_workarea_compile (preg);
 783   re_string_destruct (&regexp);
 784
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787       free_dfa_content (dfa);
 788       preg->re_buffer = NULL;
 789       preg->re_allocated = 0;
 790     }
 791
 792   return err;
 793 }
 794
 795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 796    as the initial length of some arrays.  */
 797
 798 static reg_errcode_t
 799 init_dfa (re_dfa_t *dfa, int pat_len)
 800 {
 801   int table_size;
 802 #ifndef _LIBC
 803   char *codeset_name;
 804 #endif
 805
 806   memset (dfa, '\0', sizeof (re_dfa_t));
 807
 808   /* Force allocation of str_tree_storage the first time.  */
 809   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 810
 811   dfa->nodes_alloc = pat_len + 1;
 812   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 813
 814   dfa->states_alloc = pat_len + 1;
 815
 816   /*  table_size = 2 ^ ceil(log pat_len) */
 817   for (table_size = 1; table_size > 0; table_size <<= 1)
 818     if (table_size > pat_len)
 819       break;
 820
 821   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
 822   dfa->state_hash_mask = table_size - 1;
 823
 824   dfa->mb_cur_max = MB_CUR_MAX;
 825 #ifdef _LIBC
 826   if (dfa->mb_cur_max == 6
 827       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 828     dfa->is_utf8 = 1;
 829   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 830                        != 0);
 831 #else
 832 # ifdef HAVE_LANGINFO_CODESET
 833   codeset_name = nl_langinfo (CODESET);
 834 # else
 835   codeset_name = getenv ("LC_ALL");
 836   if (codeset_name == NULL || codeset_name[0] == '\0')
 837     codeset_name = getenv ("LC_CTYPE");
 838   if (codeset_name == NULL || codeset_name[0] == '\0')
 839     codeset_name = getenv ("LANG");
 840   if (codeset_name == NULL)
 841     codeset_name = "";
 842   else if (strchr (codeset_name, '.') !=  NULL)
 843     codeset_name = strchr (codeset_name, '.') + 1;
 844 # endif
 845
 846   if (strcasecmp (codeset_name, "UTF-8") == 0
 847       || strcasecmp (codeset_name, "UTF8") == 0)
 848     dfa->is_utf8 = 1;
 849
 850   /* We check exhaustively in the loop below if this charset is a
 851      superset of ASCII.  */
 852   dfa->map_notascii = 0;
 853 #endif
 854
 855 #ifdef RE_ENABLE_I18N
 856   if (dfa->mb_cur_max > 1)
 857     {
 858       if (dfa->is_utf8)
 859         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 860       else
 861         {
 862           int i, j, ch;
 863
 864           dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
 865           if (BE (dfa->sb_char == NULL, 0))
 866             return REG_ESPACE;
 867
 868           /* Clear all bits by, then set those corresponding to single
 869              byte chars.  */
 870           bitset_empty (dfa->sb_char);
 871
 872           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 873             for (j = 0; j < UINT_BITS; ++j, ++ch)
 874               {
 875                 wint_t wch = __btowc (ch);
 876                 if (wch != WEOF)
 877                   dfa->sb_char[i] |= 1 << j;
 878 # ifndef _LIBC
 879                 if (isascii (ch) && wch != ch)
 880                   dfa->map_notascii = 1;
 881 # endif
 882               }
 883         }
 884     }
 885 #endif
 886
 887   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 888     return REG_ESPACE;
 889   return REG_NOERROR;
 890 }
 891
 892 /* Initialize WORD_CHAR table, which indicate which character is
 893    "word".  In this case "word" means that it is the word construction
 894    character used by some operators like "\<", "\>", etc.  */
 895
 896 static void
 897 init_word_char (re_dfa_t *dfa)
 898 {
 899   int i, j, ch;
 900   dfa->word_ops_used = 1;
 901   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 902     for (j = 0; j < UINT_BITS; ++j, ++ch)
 903       if (isalnum (ch) || ch == '_')
 904         dfa->word_char[i] |= 1 << j;
 905 }
 906
 907 /* Free the work area which are only used while compiling.  */
 908
 909 static void
 910 free_workarea_compile (regex_t *preg)
 911 {
 912   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 913   bin_tree_storage_t *storage, *next;
 914   for (storage = dfa->str_tree_storage; storage; storage = next)
 915     {
 916       next = storage->next;
 917       re_free (storage);
 918     }
 919   dfa->str_tree_storage = NULL;
 920   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 921   dfa->str_tree = NULL;
 922   re_free (dfa->org_indices);
 923   dfa->org_indices = NULL;
 924 }
 925
 926 /* Create initial states for all contexts.  */
 927
 928 static reg_errcode_t
 929 create_initial_state (re_dfa_t *dfa)
 930 {
 931   int first, i;
 932   reg_errcode_t err;
 933   re_node_set init_nodes;
 934
 935   /* Initial states have the epsilon closure of the node which is
 936      the first node of the regular expression.  */
 937   first = dfa->str_tree->first->node_idx;
 938   dfa->init_node = first;
 939   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 940   if (BE (err != REG_NOERROR, 0))
 941     return err;
 942
 943   /* The back-references which are in initial states can epsilon transit,
 944      since in this case all of the subexpressions can be null.
 945      Then we add epsilon closures of the nodes which are the next nodes of
 946      the back-references.  */
 947   if (dfa->nbackref > 0)
 948     for (i = 0; i < init_nodes.nelem; ++i)
 949       {
 950         int node_idx = init_nodes.elems[i];
 951         re_token_type_t type = dfa->nodes[node_idx].type;
 952
 953         int clexp_idx;
 954         if (type != OP_BACK_REF)
 955           continue;
 956         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 957           {
 958             re_token_t *clexp_node;
 959             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 960             if (clexp_node->type == OP_CLOSE_SUBEXP
 961                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 962               break;
 963           }
 964         if (clexp_idx == init_nodes.nelem)
 965           continue;
 966
 967         if (type == OP_BACK_REF)
 968           {
 969             int dest_idx = dfa->edests[node_idx].elems[0];
 970             if (!re_node_set_contains (&init_nodes, dest_idx))
 971               {
 972                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 973                 i = 0;
 974               }
 975           }
 976       }
 977
 978   /* It must be the first time to invoke acquire_state.  */
 979   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 980   /* We don't check ERR here, since the initial state must not be NULL.  */
 981   if (BE (dfa->init_state == NULL, 0))
 982     return err;
 983   if (dfa->init_state->has_constraint)
 984     {
 985       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 986                                                        CONTEXT_WORD);
 987       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 988                                                      CONTEXT_NEWLINE);
 989       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 990                                                          &init_nodes,
 991                                                          CONTEXT_NEWLINE
 992                                                          | CONTEXT_BEGBUF);
 993       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 994               || dfa->init_state_begbuf == NULL, 0))
 995         return err;
 996     }
 997   else
 998     dfa->init_state_word = dfa->init_state_nl
 999       = dfa->init_state_begbuf = dfa->init_state;
1000
1001   re_node_set_free (&init_nodes);
1002   return REG_NOERROR;
1003 }
1004 \f
1005 #ifdef RE_ENABLE_I18N
1006 /* If it is possible to do searching in single byte encoding instead of UTF-8
1007    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1008    DFA nodes where needed.  */
1009
1010 static void
1011 optimize_utf8 (re_dfa_t *dfa)
1012 {
1013   int node, i, mb_chars = 0, has_period = 0;
1014
1015   for (node = 0; node < dfa->nodes_len; ++node)
1016     switch (dfa->nodes[node].type)
1017       {
1018       case CHARACTER:
1019         if (dfa->nodes[node].opr.c >= 0x80)
1020           mb_chars = 1;
1021         break;
1022       case ANCHOR:
1023         switch (dfa->nodes[node].opr.idx)
1024           {
1025           case LINE_FIRST:
1026           case LINE_LAST:
1027           case BUF_FIRST:
1028           case BUF_LAST:
1029             break;
1030           default:
1031             /* Word anchors etc. cannot be handled.  */
1032             return;
1033           }
1034         break;
1035       case OP_PERIOD:
1036         has_period = 1;
1037         break;
1038       case OP_BACK_REF:
1039       case OP_ALT:
1040       case END_OF_RE:
1041       case OP_DUP_ASTERISK:
1042       case OP_OPEN_SUBEXP:
1043       case OP_CLOSE_SUBEXP:
1044         break;
1045       case COMPLEX_BRACKET:
1046         return;
1047       case SIMPLE_BRACKET:
1048         /* Just double check.  */
1049         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1050           if (dfa->nodes[node].opr.sbcset[i])
1051             return;
1052         break;
1053       default:
1054         abort ();
1055       }
1056
1057   if (mb_chars || has_period)
1058     for (node = 0; node < dfa->nodes_len; ++node)
1059       {
1060         if (dfa->nodes[node].type == CHARACTER
1061             && dfa->nodes[node].opr.c >= 0x80)
1062           dfa->nodes[node].mb_partial = 0;
1063         else if (dfa->nodes[node].type == OP_PERIOD)
1064           dfa->nodes[node].type = OP_UTF8_PERIOD;
1065       }
1066
1067   /* The search can be in single byte locale.  */
1068   dfa->mb_cur_max = 1;
1069   dfa->is_utf8 = 0;
1070   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1071 }
1072 #endif
1073 \f
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075    "eclosure", and "inveclosure".  */
1076
1077 static reg_errcode_t
1078 analyze (regex_t *preg)
1079 {
1080   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1081   reg_errcode_t ret;
1082
1083   /* Allocate arrays.  */
1084   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1085   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1086   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089           || dfa->eclosures == NULL, 0))
1090     return REG_ESPACE;
1091
1092   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1093   if (dfa->subexp_map != NULL)
1094     {
1095       int i;
1096       for (i = 0; i < preg->re_nsub; i++)
1097         dfa->subexp_map[i] = i;
1098       preorder (dfa->str_tree, optimize_subexps, dfa);
1099       for (i = 0; i < preg->re_nsub; i++)
1100         if (dfa->subexp_map[i] != i)
1101           break;
1102       if (i == preg->re_nsub)
1103         {
1104           free (dfa->subexp_map);
1105           dfa->subexp_map = NULL;
1106         }
1107     }
1108
1109   ret = postorder (dfa->str_tree, lower_subexps, preg);
1110   if (BE (ret != REG_NOERROR, 0))
1111     return ret;
1112   ret = postorder (dfa->str_tree, calc_first, dfa);
1113   if (BE (ret != REG_NOERROR, 0))
1114     return ret;
1115   preorder (dfa->str_tree, calc_next, dfa);
1116   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117   if (BE (ret != REG_NOERROR, 0))
1118     return ret;
1119   ret = calc_eclosure (dfa);
1120   if (BE (ret != REG_NOERROR, 0))
1121     return ret;
1122
1123   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1125   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126       || dfa->nbackref)
1127     {
1128       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129       if (BE (dfa->inveclosures == NULL, 0))
1130         return REG_ESPACE;
1131       ret = calc_inveclosure (dfa);
1132     }
1133
1134   return ret;
1135 }
1136
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138    implement parse tree visits.  Instead, we use parent pointers and
1139    some hairy code in these two functions.  */
1140 static reg_errcode_t
1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142            void *extra)
1143 {
1144   bin_tree_t *node, *prev;
1145
1146   for (node = root; ; )
1147     {
1148       /* Descend down the tree, preferably to the left (or to the right
1149          if that's the only child).  */
1150       while (node->left || node->right)
1151         if (node->left)
1152           node = node->left;
1153         else
1154           node = node->right;
1155
1156       do
1157         {
1158           reg_errcode_t err = fn (extra, node);
1159           if (BE (err != REG_NOERROR, 0))
1160             return err;
1161           if (node->parent == NULL)
1162             return REG_NOERROR;
1163           prev = node;
1164           node = node->parent;
1165         }
1166       /* Go up while we have a node that is reached from the right.  */
1167       while (node->right == prev || node->right == NULL);
1168       node = node->right;
1169     }
1170 }
1171
1172 static reg_errcode_t
1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1174           void *extra)
1175 {
1176   bin_tree_t *node;
1177
1178   for (node = root; ; )
1179     {
1180       reg_errcode_t err = fn (extra, node);
1181       if (BE (err != REG_NOERROR, 0))
1182         return err;
1183
1184       /* Go to the left node, or up and to the right.  */
1185       if (node->left)
1186         node = node->left;
1187       else
1188         {
1189           bin_tree_t *prev = NULL;
1190           while (node->right == prev || node->right == NULL)
1191             {
1192               prev = node;
1193               node = node->parent;
1194               if (!node)
1195                 return REG_NOERROR;
1196             }
1197           node = node->right;
1198         }
1199     }
1200 }
1201
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1204    backreferences as well.  Requires a preorder visit.  */
1205 static reg_errcode_t
1206 optimize_subexps (void *extra, bin_tree_t *node)
1207 {
1208   re_dfa_t *dfa = (re_dfa_t *) extra;
1209
1210   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1211     {
1212       int idx = node->token.opr.idx;
1213       node->token.opr.idx = dfa->subexp_map[idx];
1214       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215     }
1216
1217   else if (node->token.type == SUBEXP
1218            && node->left && node->left->token.type == SUBEXP)
1219     {
1220       int other_idx = node->left->token.opr.idx;
1221
1222       node->left = node->left->left;
1223       if (node->left)
1224         node->left->parent = node;
1225
1226       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1228         dfa->used_bkref_map &= ~(1 << other_idx);
1229     }
1230
1231   return REG_NOERROR;
1232 }
1233
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1236 static reg_errcode_t
1237 lower_subexps (void *extra, bin_tree_t *node)
1238 {
1239   regex_t *preg = (regex_t *) extra;
1240   reg_errcode_t err = REG_NOERROR;
1241
1242   if (node->left && node->left->token.type == SUBEXP)
1243     {
1244       node->left = lower_subexp (&err, preg, node->left);
1245       if (node->left)
1246         node->left->parent = node;
1247     }
1248   if (node->right && node->right->token.type == SUBEXP)
1249     {
1250       node->right = lower_subexp (&err, preg, node->right);
1251       if (node->right)
1252         node->right->parent = node;
1253     }
1254
1255   return err;
1256 }
1257
1258 static bin_tree_t *
1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1260 {
1261   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1262   bin_tree_t *body = node->left;
1263   bin_tree_t *op, *cls, *tree1, *tree;
1264
1265   if (preg->re_no_sub
1266       /* We do not optimize empty subexpressions, because otherwise we may
1267          have bad CONCAT nodes with NULL children.  This is obviously not
1268          very common, so we do not lose much.  An example that triggers
1269          this case is the sed "script" /\(\)/x.  */
1270       && node->left != NULL
1271       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1272           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1273     return node->left;
1274
1275   /* Convert the SUBEXP node to the concatenation of an
1276      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1277   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1278   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1279   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1280   tree = create_tree (dfa, op, tree1, CONCAT);
1281   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1282     {
1283       *err = REG_ESPACE;
1284       return NULL;
1285     }
1286
1287   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1288   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1289   return tree;
1290 }
1291
1292 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1293    nodes.  Requires a postorder visit.  */
1294 static reg_errcode_t
1295 calc_first (void *extra, bin_tree_t *node)
1296 {
1297   re_dfa_t *dfa = (re_dfa_t *) extra;
1298   if (node->token.type == CONCAT)
1299     {
1300       node->first = node->left->first;
1301       node->node_idx = node->left->node_idx;
1302     }
1303   else
1304     {
1305       node->first = node;
1306       node->node_idx = re_dfa_add_node (dfa, node->token);
1307       if (BE (node->node_idx == -1, 0))
1308         return REG_ESPACE;
1309     }
1310   return REG_NOERROR;
1311 }
1312
1313 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1314 static reg_errcode_t
1315 calc_next (void *extra, bin_tree_t *node)
1316 {
1317   switch (node->token.type)
1318     {
1319     case OP_DUP_ASTERISK:
1320       node->left->next = node;
1321       break;
1322     case CONCAT:
1323       node->left->next = node->right->first;
1324       node->right->next = node->next;
1325       break;
1326     default:
1327       if (node->left)
1328         node->left->next = node->next;
1329       if (node->right)
1330         node->right->next = node->next;
1331       break;
1332     }
1333   return REG_NOERROR;
1334 }
1335
1336 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1337 static reg_errcode_t
1338 link_nfa_nodes (void *extra, bin_tree_t *node)
1339 {
1340   re_dfa_t *dfa = (re_dfa_t *) extra;
1341   int idx = node->node_idx;
1342   reg_errcode_t err = REG_NOERROR;
1343
1344   switch (node->token.type)
1345     {
1346     case CONCAT:
1347       break;
1348
1349     case END_OF_RE:
1350       assert (node->next == NULL);
1351       break;
1352
1353     case OP_DUP_ASTERISK:
1354     case OP_ALT:
1355       {
1356         int left, right;
1357         dfa->has_plural_match = 1;
1358         if (node->left != NULL)
1359           left = node->left->first->node_idx;
1360         else
1361           left = node->next->node_idx;
1362         if (node->right != NULL)
1363           right = node->right->first->node_idx;
1364         else
1365           right = node->next->node_idx;
1366         assert (left > -1);
1367         assert (right > -1);
1368         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1369       }
1370       break;
1371
1372     case ANCHOR:
1373     case OP_OPEN_SUBEXP:
1374     case OP_CLOSE_SUBEXP:
1375       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1376       break;
1377
1378     case OP_BACK_REF:
1379       dfa->nexts[idx] = node->next->node_idx;
1380       if (node->token.type == OP_BACK_REF)
1381         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1382       break;
1383
1384     default:
1385       assert (!IS_EPSILON_NODE (node->token.type));
1386       dfa->nexts[idx] = node->next->node_idx;
1387       break;
1388     }
1389
1390   return err;
1391 }
1392
1393 /* Duplicate the epsilon closure of the node ROOT_NODE.
1394    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1395    to their own constraint.  */
1396
1397 static reg_errcode_t
1398 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1399                         int root_node, unsigned int init_constraint)
1400 {
1401   int org_node, clone_node, ret;
1402   unsigned int constraint = init_constraint;
1403   for (org_node = top_org_node, clone_node = top_clone_node;;)
1404     {
1405       int org_dest, clone_dest;
1406       if (dfa->nodes[org_node].type == OP_BACK_REF)
1407         {
1408           /* If the back reference epsilon-transit, its destination must
1409              also have the constraint.  Then duplicate the epsilon closure
1410              of the destination of the back reference, and store it in
1411              edests of the back reference.  */
1412           org_dest = dfa->nexts[org_node];
1413           re_node_set_empty (dfa->edests + clone_node);
1414           clone_dest = duplicate_node (dfa, org_dest, constraint);
1415           if (BE (clone_dest == -1, 0))
1416             return REG_ESPACE;
1417           dfa->nexts[clone_node] = dfa->nexts[org_node];
1418           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1419           if (BE (ret < 0, 0))
1420             return REG_ESPACE;
1421         }
1422       else if (dfa->edests[org_node].nelem == 0)
1423         {
1424           /* In case of the node can't epsilon-transit, don't duplicate the
1425              destination and store the original destination as the
1426              destination of the node.  */
1427           dfa->nexts[clone_node] = dfa->nexts[org_node];
1428           break;
1429         }
1430       else if (dfa->edests[org_node].nelem == 1)
1431         {
1432           /* In case of the node can epsilon-transit, and it has only one
1433              destination.  */
1434           org_dest = dfa->edests[org_node].elems[0];
1435           re_node_set_empty (dfa->edests + clone_node);
1436           if (dfa->nodes[org_node].type == ANCHOR)
1437             {
1438               /* In case of the node has another constraint, append it.  */
1439               if (org_node == root_node && clone_node != org_node)
1440                 {
1441                   /* ...but if the node is root_node itself, it means the
1442                      epsilon closure have a loop, then tie it to the
1443                      destination of the root_node.  */
1444                   ret = re_node_set_insert (dfa->edests + clone_node,
1445                                             org_dest);
1446                   if (BE (ret < 0, 0))
1447                     return REG_ESPACE;
1448                   break;
1449                 }
1450               constraint |= dfa->nodes[org_node].opr.ctx_type;
1451             }
1452           clone_dest = duplicate_node (dfa, org_dest, constraint);
1453           if (BE (clone_dest == -1, 0))
1454             return REG_ESPACE;
1455           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1456           if (BE (ret < 0, 0))
1457             return REG_ESPACE;
1458         }
1459       else /* dfa->edests[org_node].nelem == 2 */
1460         {
1461           /* In case of the node can epsilon-transit, and it has two
1462              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1463           org_dest = dfa->edests[org_node].elems[0];
1464           re_node_set_empty (dfa->edests + clone_node);
1465           /* Search for a duplicated node which satisfies the constraint.  */
1466           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1467           if (clone_dest == -1)
1468             {
1469               /* There are no such a duplicated node, create a new one.  */
1470               reg_errcode_t err;
1471               clone_dest = duplicate_node (dfa, org_dest, constraint);
1472               if (BE (clone_dest == -1, 0))
1473                 return REG_ESPACE;
1474               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1475               if (BE (ret < 0, 0))
1476                 return REG_ESPACE;
1477               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1478                                             root_node, constraint);
1479               if (BE (err != REG_NOERROR, 0))
1480                 return err;
1481             }
1482           else
1483             {
1484               /* There are a duplicated node which satisfy the constraint,
1485                  use it to avoid infinite loop.  */
1486               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487               if (BE (ret < 0, 0))
1488                 return REG_ESPACE;
1489             }
1490
1491           org_dest = dfa->edests[org_node].elems[1];
1492           clone_dest = duplicate_node (dfa, org_dest, constraint);
1493           if (BE (clone_dest == -1, 0))
1494             return REG_ESPACE;
1495           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496           if (BE (ret < 0, 0))
1497             return REG_ESPACE;
1498         }
1499       org_node = org_dest;
1500       clone_node = clone_dest;
1501     }
1502   return REG_NOERROR;
1503 }
1504
1505 /* Search for a node which is duplicated from the node ORG_NODE, and
1506    satisfies the constraint CONSTRAINT.  */
1507
1508 static int
1509 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1510 {
1511   int idx;
1512   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1513     {
1514       if (org_node == dfa->org_indices[idx]
1515           && constraint == dfa->nodes[idx].constraint)
1516         return idx; /* Found.  */
1517     }
1518   return -1; /* Not found.  */
1519 }
1520
1521 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1522    Return the index of the new node, or -1 if insufficient storage is
1523    available.  */
1524
1525 static int
1526 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1527 {
1528   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1529   if (BE (dup_idx != -1, 1))
1530     {
1531       dfa->nodes[dup_idx].constraint = constraint;
1532       if (dfa->nodes[org_idx].type == ANCHOR)
1533         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1534       dfa->nodes[dup_idx].duplicated = 1;
1535
1536       /* Store the index of the original node.  */
1537       dfa->org_indices[dup_idx] = org_idx;
1538     }
1539   return dup_idx;
1540 }
1541
1542 static reg_errcode_t
1543 calc_inveclosure (re_dfa_t *dfa)
1544 {
1545   int src, idx, ret;
1546   for (idx = 0; idx < dfa->nodes_len; ++idx)
1547     re_node_set_init_empty (dfa->inveclosures + idx);
1548
1549   for (src = 0; src < dfa->nodes_len; ++src)
1550     {
1551       int *elems = dfa->eclosures[src].elems;
1552       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1553         {
1554           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1555           if (BE (ret == -1, 0))
1556             return REG_ESPACE;
1557         }
1558     }
1559
1560   return REG_NOERROR;
1561 }
1562
1563 /* Calculate "eclosure" for all the node in DFA.  */
1564
1565 static reg_errcode_t
1566 calc_eclosure (re_dfa_t *dfa)
1567 {
1568   int node_idx, incomplete;
1569 #ifdef DEBUG
1570   assert (dfa->nodes_len > 0);
1571 #endif
1572   incomplete = 0;
1573   /* For each nodes, calculate epsilon closure.  */
1574   for (node_idx = 0; ; ++node_idx)
1575     {
1576       reg_errcode_t err;
1577       re_node_set eclosure_elem;
1578       if (node_idx == dfa->nodes_len)
1579         {
1580           if (!incomplete)
1581             break;
1582           incomplete = 0;
1583           node_idx = 0;
1584         }
1585
1586 #ifdef DEBUG
1587       assert (dfa->eclosures[node_idx].nelem != -1);
1588 #endif
1589
1590       /* If we have already calculated, skip it.  */
1591       if (dfa->eclosures[node_idx].nelem != 0)
1592         continue;
1593       /* Calculate epsilon closure of `node_idx'.  */
1594       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1595       if (BE (err != REG_NOERROR, 0))
1596         return err;
1597
1598       if (dfa->eclosures[node_idx].nelem == 0)
1599         {
1600           incomplete = 1;
1601           re_node_set_free (&eclosure_elem);
1602         }
1603     }
1604   return REG_NOERROR;
1605 }
1606
1607 /* Calculate epsilon closure of NODE.  */
1608
1609 static reg_errcode_t
1610 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1611 {
1612   reg_errcode_t err;
1613   unsigned int constraint;
1614   int i, incomplete;
1615   re_node_set eclosure;
1616   incomplete = 0;
1617   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1618   if (BE (err != REG_NOERROR, 0))
1619     return err;
1620
1621   /* This indicates that we are calculating this node now.
1622      We reference this value to avoid infinite loop.  */
1623   dfa->eclosures[node].nelem = -1;
1624
1625   constraint = ((dfa->nodes[node].type == ANCHOR)
1626                 ? dfa->nodes[node].opr.ctx_type : 0);
1627   /* If the current node has constraints, duplicate all nodes.
1628      Since they must inherit the constraints.  */
1629   if (constraint
1630       && dfa->edests[node].nelem
1631       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1632     {
1633       int org_node, cur_node;
1634       org_node = cur_node = node;
1635       err = duplicate_node_closure (dfa, node, node, node, constraint);
1636       if (BE (err != REG_NOERROR, 0))
1637         return err;
1638     }
1639
1640   /* Expand each epsilon destination nodes.  */
1641   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1642     for (i = 0; i < dfa->edests[node].nelem; ++i)
1643       {
1644         re_node_set eclosure_elem;
1645         int edest = dfa->edests[node].elems[i];
1646         /* If calculating the epsilon closure of `edest' is in progress,
1647            return intermediate result.  */
1648         if (dfa->eclosures[edest].nelem == -1)
1649           {
1650             incomplete = 1;
1651             continue;
1652           }
1653         /* If we haven't calculated the epsilon closure of `edest' yet,
1654            calculate now. Otherwise use calculated epsilon closure.  */
1655         if (dfa->eclosures[edest].nelem == 0)
1656           {
1657             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1658             if (BE (err != REG_NOERROR, 0))
1659               return err;
1660           }
1661         else
1662           eclosure_elem = dfa->eclosures[edest];
1663         /* Merge the epsilon closure of `edest'.  */
1664         re_node_set_merge (&eclosure, &eclosure_elem);
1665         /* If the epsilon closure of `edest' is incomplete,
1666            the epsilon closure of this node is also incomplete.  */
1667         if (dfa->eclosures[edest].nelem == 0)
1668           {
1669             incomplete = 1;
1670             re_node_set_free (&eclosure_elem);
1671           }
1672       }
1673
1674   /* Epsilon closures include itself.  */
1675   re_node_set_insert (&eclosure, node);
1676   if (incomplete && !root)
1677     dfa->eclosures[node].nelem = 0;
1678   else
1679     dfa->eclosures[node] = eclosure;
1680   *new_set = eclosure;
1681   return REG_NOERROR;
1682 }
1683 \f
1684 /* Functions for token which are used in the parser.  */
1685
1686 /* Fetch a token from INPUT.
1687    We must not use this function inside bracket expressions.  */
1688
1689 static void
1690 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1691 {
1692   re_string_skip_bytes (input, peek_token (result, input, syntax));
1693 }
1694
1695 /* Peek a token from INPUT, and return the length of the token.
1696    We must not use this function inside bracket expressions.  */
1697
1698 static int
1699 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1700 {
1701   unsigned char c;
1702
1703   if (re_string_eoi (input))
1704     {
1705       token->type = END_OF_RE;
1706       return 0;
1707     }
1708
1709   c = re_string_peek_byte (input, 0);
1710   token->opr.c = c;
1711
1712   token->word_char = 0;
1713 #ifdef RE_ENABLE_I18N
1714   token->mb_partial = 0;
1715   if (input->mb_cur_max > 1 &&
1716       !re_string_first_byte (input, re_string_cur_idx (input)))
1717     {
1718       token->type = CHARACTER;
1719       token->mb_partial = 1;
1720       return 1;
1721     }
1722 #endif
1723   if (c == '\\')
1724     {
1725       unsigned char c2;
1726       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1727         {
1728           token->type = BACK_SLASH;
1729           return 1;
1730         }
1731
1732       c2 = re_string_peek_byte_case (input, 1);
1733       token->opr.c = c2;
1734       token->type = CHARACTER;
1735 #ifdef RE_ENABLE_I18N
1736       if (input->mb_cur_max > 1)
1737         {
1738           wint_t wc = re_string_wchar_at (input,
1739                                           re_string_cur_idx (input) + 1);
1740           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1741         }
1742       else
1743 #endif
1744         token->word_char = IS_WORD_CHAR (c2) != 0;
1745
1746       switch (c2)
1747         {
1748         case '|':
1749           if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1750             token->type = OP_ALT;
1751           break;
1752         case '1': case '2': case '3': case '4': case '5':
1753         case '6': case '7': case '8': case '9':
1754           if (!(syntax & REG_NO_BK_REFS))
1755             {
1756               token->type = OP_BACK_REF;
1757               token->opr.idx = c2 - '1';
1758             }
1759           break;
1760         case '<':
1761           if (!(syntax & REG_NO_GNU_OPS))
1762             {
1763               token->type = ANCHOR;
1764               token->opr.ctx_type = WORD_FIRST;
1765             }
1766           break;
1767         case '>':
1768           if (!(syntax & REG_NO_GNU_OPS))
1769             {
1770               token->type = ANCHOR;
1771               token->opr.ctx_type = WORD_LAST;
1772             }
1773           break;
1774         case 'b':
1775           if (!(syntax & REG_NO_GNU_OPS))
1776             {
1777               token->type = ANCHOR;
1778               token->opr.ctx_type = WORD_DELIM;
1779             }
1780           break;
1781         case 'B':
1782           if (!(syntax & REG_NO_GNU_OPS))
1783             {
1784               token->type = ANCHOR;
1785               token->opr.ctx_type = NOT_WORD_DELIM;
1786             }
1787           break;
1788         case 'w':
1789           if (!(syntax & REG_NO_GNU_OPS))
1790             token->type = OP_WORD;
1791           break;
1792         case 'W':
1793           if (!(syntax & REG_NO_GNU_OPS))
1794             token->type = OP_NOTWORD;
1795           break;
1796         case 's':
1797           if (!(syntax & REG_NO_GNU_OPS))
1798             token->type = OP_SPACE;
1799           break;
1800         case 'S':
1801           if (!(syntax & REG_NO_GNU_OPS))
1802             token->type = OP_NOTSPACE;
1803           break;
1804         case '`':
1805           if (!(syntax & REG_NO_GNU_OPS))
1806             {
1807               token->type = ANCHOR;
1808               token->opr.ctx_type = BUF_FIRST;
1809             }
1810           break;
1811         case '\'':
1812           if (!(syntax & REG_NO_GNU_OPS))
1813             {
1814               token->type = ANCHOR;
1815               token->opr.ctx_type = BUF_LAST;
1816             }
1817           break;
1818         case '(':
1819           if (!(syntax & REG_NO_BK_PARENS))
1820             token->type = OP_OPEN_SUBEXP;
1821           break;
1822         case ')':
1823           if (!(syntax & REG_NO_BK_PARENS))
1824             token->type = OP_CLOSE_SUBEXP;
1825           break;
1826         case '+':
1827           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1828             token->type = OP_DUP_PLUS;
1829           break;
1830         case '?':
1831           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1832             token->type = OP_DUP_QUESTION;
1833           break;
1834         case '{':
1835           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1836             token->type = OP_OPEN_DUP_NUM;
1837           break;
1838         case '}':
1839           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1840             token->type = OP_CLOSE_DUP_NUM;
1841           break;
1842         default:
1843           break;
1844         }
1845       return 2;
1846     }
1847
1848   token->type = CHARACTER;
1849 #ifdef RE_ENABLE_I18N
1850   if (input->mb_cur_max > 1)
1851     {
1852       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1853       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854     }
1855   else
1856 #endif
1857     token->word_char = IS_WORD_CHAR (token->opr.c);
1858
1859   switch (c)
1860     {
1861     case '\n':
1862       if (syntax & REG_NEWLINE_ALT)
1863         token->type = OP_ALT;
1864       break;
1865     case '|':
1866       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1867         token->type = OP_ALT;
1868       break;
1869     case '*':
1870       token->type = OP_DUP_ASTERISK;
1871       break;
1872     case '+':
1873       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1874         token->type = OP_DUP_PLUS;
1875       break;
1876     case '?':
1877       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1878         token->type = OP_DUP_QUESTION;
1879       break;
1880     case '{':
1881       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1882         token->type = OP_OPEN_DUP_NUM;
1883       break;
1884     case '}':
1885       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1886         token->type = OP_CLOSE_DUP_NUM;
1887       break;
1888     case '(':
1889       if (syntax & REG_NO_BK_PARENS)
1890         token->type = OP_OPEN_SUBEXP;
1891       break;
1892     case ')':
1893       if (syntax & REG_NO_BK_PARENS)
1894         token->type = OP_CLOSE_SUBEXP;
1895       break;
1896     case '[':
1897       token->type = OP_OPEN_BRACKET;
1898       break;
1899     case '.':
1900       token->type = OP_PERIOD;
1901       break;
1902     case '^':
1903       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1904           re_string_cur_idx (input) != 0)
1905         {
1906           char prev = re_string_peek_byte (input, -1);
1907           if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1908             break;
1909         }
1910       token->type = ANCHOR;
1911       token->opr.ctx_type = LINE_FIRST;
1912       break;
1913     case '$':
1914       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1915           re_string_cur_idx (input) + 1 != re_string_length (input))
1916         {
1917           re_token_t next;
1918           re_string_skip_bytes (input, 1);
1919           peek_token (&next, input, syntax);
1920           re_string_skip_bytes (input, -1);
1921           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1922             break;
1923         }
1924       token->type = ANCHOR;
1925       token->opr.ctx_type = LINE_LAST;
1926       break;
1927     default:
1928       break;
1929     }
1930   return 1;
1931 }
1932
1933 /* Peek a token from INPUT, and return the length of the token.
1934    We must not use this function out of bracket expressions.  */
1935
1936 static int
1937 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1938 {
1939   unsigned char c;
1940   if (re_string_eoi (input))
1941     {
1942       token->type = END_OF_RE;
1943       return 0;
1944     }
1945   c = re_string_peek_byte (input, 0);
1946   token->opr.c = c;
1947
1948 #ifdef RE_ENABLE_I18N
1949   if (input->mb_cur_max > 1 &&
1950       !re_string_first_byte (input, re_string_cur_idx (input)))
1951     {
1952       token->type = CHARACTER;
1953       return 1;
1954     }
1955 #endif /* RE_ENABLE_I18N */
1956
1957   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1958       && re_string_cur_idx (input) + 1 < re_string_length (input))
1959     {
1960       /* In this case, '\' escape a character.  */
1961       unsigned char c2;
1962       re_string_skip_bytes (input, 1);
1963       c2 = re_string_peek_byte (input, 0);
1964       token->opr.c = c2;
1965       token->type = CHARACTER;
1966       return 1;
1967     }
1968   if (c == '[') /* '[' is a special char in a bracket exps.  */
1969     {
1970       unsigned char c2;
1971       int token_len;
1972       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1973         c2 = re_string_peek_byte (input, 1);
1974       else
1975         c2 = 0;
1976       token->opr.c = c2;
1977       token_len = 2;
1978       switch (c2)
1979         {
1980         case '.':
1981           token->type = OP_OPEN_COLL_ELEM;
1982           break;
1983         case '=':
1984           token->type = OP_OPEN_EQUIV_CLASS;
1985           break;
1986         case ':':
1987           if (syntax & REG_CHAR_CLASSES)
1988             {
1989               token->type = OP_OPEN_CHAR_CLASS;
1990               break;
1991             }
1992           /* else fall through.  */
1993         default:
1994           token->type = CHARACTER;
1995           token->opr.c = c;
1996           token_len = 1;
1997           break;
1998         }
1999       return token_len;
2000     }
2001   switch (c)
2002     {
2003     case '-':
2004       token->type = OP_CHARSET_RANGE;
2005       break;
2006     case ']':
2007       token->type = OP_CLOSE_BRACKET;
2008       break;
2009     case '^':
2010       token->type = OP_NON_MATCH_LIST;
2011       break;
2012     default:
2013       token->type = CHARACTER;
2014     }
2015   return 1;
2016 }
2017 \f
2018 /* Functions for parser.  */
2019
2020 /* Entry point of the parser.
2021    Parse the regular expression REGEXP and return the structure tree.
2022    If an error is occured, ERR is set by error code, and return NULL.
2023    This function build the following tree, from regular expression <reg_exp>:
2024            CAT
2025            / \
2026           /   \
2027    <reg_exp>  EOR
2028
2029    CAT means concatenation.
2030    EOR means end of regular expression.  */
2031
2032 static bin_tree_t *
2033 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2034        reg_errcode_t *err)
2035 {
2036   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2037   bin_tree_t *tree, *eor, *root;
2038   re_token_t current_token;
2039   dfa->syntax = syntax;
2040   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2041   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2042   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2043     return NULL;
2044   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2045   if (tree != NULL)
2046     root = create_tree (dfa, tree, eor, CONCAT);
2047   else
2048     root = eor;
2049   if (BE (eor == NULL || root == NULL, 0))
2050     {
2051       *err = REG_ESPACE;
2052       return NULL;
2053     }
2054   return root;
2055 }
2056
2057 /* This function build the following tree, from regular expression
2058    <branch1>|<branch2>:
2059            ALT
2060            / \
2061           /   \
2062    <branch1> <branch2>
2063
2064    ALT means alternative, which represents the operator `|'.  */
2065
2066 static bin_tree_t *
2067 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2068                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2069 {
2070   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2071   bin_tree_t *tree, *branch = NULL;
2072   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2073   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074     return NULL;
2075
2076   while (token->type == OP_ALT)
2077     {
2078       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2079       if (token->type != OP_ALT && token->type != END_OF_RE
2080           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2081         {
2082           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2083           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2084             return NULL;
2085         }
2086       else
2087         branch = NULL;
2088       tree = create_tree (dfa, tree, branch, OP_ALT);
2089       if (BE (tree == NULL, 0))
2090         {
2091           *err = REG_ESPACE;
2092           return NULL;
2093         }
2094     }
2095   return tree;
2096 }
2097
2098 /* This function build the following tree, from regular expression
2099    <exp1><exp2>:
2100         CAT
2101         / \
2102        /   \
2103    <exp1> <exp2>
2104
2105    CAT means concatenation.  */
2106
2107 static bin_tree_t *
2108 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2109               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2110 {
2111   bin_tree_t *tree, *exp;
2112   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2113   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2114   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115     return NULL;
2116
2117   while (token->type != OP_ALT && token->type != END_OF_RE
2118          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2119     {
2120       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2121       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2122         {
2123           return NULL;
2124         }
2125       if (tree != NULL && exp != NULL)
2126         {
2127           tree = create_tree (dfa, tree, exp, CONCAT);
2128           if (tree == NULL)
2129             {
2130               *err = REG_ESPACE;
2131               return NULL;
2132             }
2133         }
2134       else if (tree == NULL)
2135         tree = exp;
2136       /* Otherwise exp == NULL, we don't need to create new tree.  */
2137     }
2138   return tree;
2139 }
2140
2141 /* This function build the following tree, from regular expression a*:
2142          *
2143          |
2144          a
2145 */
2146
2147 static bin_tree_t *
2148 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 {
2151   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2152   bin_tree_t *tree;
2153   switch (token->type)
2154     {
2155     case CHARACTER:
2156       tree = create_token_tree (dfa, NULL, NULL, token);
2157       if (BE (tree == NULL, 0))
2158         {
2159           *err = REG_ESPACE;
2160           return NULL;
2161         }
2162 #ifdef RE_ENABLE_I18N
2163       if (dfa->mb_cur_max > 1)
2164         {
2165           while (!re_string_eoi (regexp)
2166                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2167             {
2168               bin_tree_t *mbc_remain;
2169               fetch_token (token, regexp, syntax);
2170               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2171               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2172               if (BE (mbc_remain == NULL || tree == NULL, 0))
2173                 {
2174                   *err = REG_ESPACE;
2175                   return NULL;
2176                 }
2177             }
2178         }
2179 #endif
2180       break;
2181     case OP_OPEN_SUBEXP:
2182       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2183       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184         return NULL;
2185       break;
2186     case OP_OPEN_BRACKET:
2187       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2188       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189         return NULL;
2190       break;
2191     case OP_BACK_REF:
2192       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2193         {
2194           *err = REG_ESUBREG;
2195           return NULL;
2196         }
2197       dfa->used_bkref_map |= 1 << token->opr.idx;
2198       tree = create_token_tree (dfa, NULL, NULL, token);
2199       if (BE (tree == NULL, 0))
2200         {
2201           *err = REG_ESPACE;
2202           return NULL;
2203         }
2204       ++dfa->nbackref;
2205       dfa->has_mb_node = 1;
2206       break;
2207     case OP_OPEN_DUP_NUM:
2208       if (syntax & REG_CONTEXT_INVALID_DUP)
2209         {
2210           *err = REG_BADRPT;
2211           return NULL;
2212         }
2213       /* FALLTHROUGH */
2214     case OP_DUP_ASTERISK:
2215     case OP_DUP_PLUS:
2216     case OP_DUP_QUESTION:
2217       if (syntax & REG_CONTEXT_INVALID_OPS)
2218         {
2219           *err = REG_BADRPT;
2220           return NULL;
2221         }
2222       else if (syntax & REG_CONTEXT_INDEP_OPS)
2223         {
2224           fetch_token (token, regexp, syntax);
2225           return parse_expression (regexp, preg, token, syntax, nest, err);
2226         }
2227       /* else fall through  */
2228     case OP_CLOSE_SUBEXP:
2229       if ((token->type == OP_CLOSE_SUBEXP) &&
2230           !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2231         {
2232           *err = REG_ERPAREN;
2233           return NULL;
2234         }
2235       /* else fall through  */
2236     case OP_CLOSE_DUP_NUM:
2237       /* We treat it as a normal character.  */
2238
2239       /* Then we can these characters as normal characters.  */
2240       token->type = CHARACTER;
2241       /* mb_partial and word_char bits should be initialized already
2242          by peek_token.  */
2243       tree = create_token_tree (dfa, NULL, NULL, token);
2244       if (BE (tree == NULL, 0))
2245         {
2246           *err = REG_ESPACE;
2247           return NULL;
2248         }
2249       break;
2250     case ANCHOR:
2251       if ((token->opr.ctx_type
2252            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2253           && dfa->word_ops_used == 0)
2254         init_word_char (dfa);
2255       if (token->opr.ctx_type == WORD_DELIM
2256           || token->opr.ctx_type == NOT_WORD_DELIM)
2257         {
2258           bin_tree_t *tree_first, *tree_last;
2259           if (token->opr.ctx_type == WORD_DELIM)
2260             {
2261               token->opr.ctx_type = WORD_FIRST;
2262               tree_first = create_token_tree (dfa, NULL, NULL, token);
2263               token->opr.ctx_type = WORD_LAST;
2264             }
2265           else
2266             {
2267               token->opr.ctx_type = INSIDE_WORD;
2268               tree_first = create_token_tree (dfa, NULL, NULL, token);
2269               token->opr.ctx_type = INSIDE_NOTWORD;
2270             }
2271           tree_last = create_token_tree (dfa, NULL, NULL, token);
2272           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2273           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2274             {
2275               *err = REG_ESPACE;
2276               return NULL;
2277             }
2278         }
2279       else
2280         {
2281           tree = create_token_tree (dfa, NULL, NULL, token);
2282           if (BE (tree == NULL, 0))
2283             {
2284               *err = REG_ESPACE;
2285               return NULL;
2286             }
2287         }
2288       /* We must return here, since ANCHORs can't be followed
2289          by repetition operators.
2290          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2291              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2292       fetch_token (token, regexp, syntax);
2293       return tree;
2294     case OP_PERIOD:
2295       tree = create_token_tree (dfa, NULL, NULL, token);
2296       if (BE (tree == NULL, 0))
2297         {
2298           *err = REG_ESPACE;
2299           return NULL;
2300         }
2301       if (dfa->mb_cur_max > 1)
2302         dfa->has_mb_node = 1;
2303       break;
2304     case OP_WORD:
2305     case OP_NOTWORD:
2306       tree = build_charclass_op (dfa, regexp->trans,
2307                                  (const unsigned char *) "alnum",
2308                                  (const unsigned char *) "_",
2309                                  token->type == OP_NOTWORD, err);
2310       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2311         return NULL;
2312       break;
2313     case OP_SPACE:
2314     case OP_NOTSPACE:
2315       tree = build_charclass_op (dfa, regexp->trans,
2316                                  (const unsigned char *) "space",
2317                                  (const unsigned char *) "",
2318                                  token->type == OP_NOTSPACE, err);
2319       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2320         return NULL;
2321       break;
2322     case OP_ALT:
2323     case END_OF_RE:
2324       return NULL;
2325     case BACK_SLASH:
2326       *err = REG_EESCAPE;
2327       return NULL;
2328     default:
2329       /* Must not happen?  */
2330 #ifdef DEBUG
2331       assert (0);
2332 #endif
2333       return NULL;
2334     }
2335   fetch_token (token, regexp, syntax);
2336
2337   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2338          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2339     {
2340       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2341       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2342         return NULL;
2343       /* In BRE consecutive duplications are not allowed.  */
2344       if ((syntax & REG_CONTEXT_INVALID_DUP)
2345           && (token->type == OP_DUP_ASTERISK
2346               || token->type == OP_OPEN_DUP_NUM))
2347         {
2348           *err = REG_BADRPT;
2349           return NULL;
2350         }
2351     }
2352
2353   return tree;
2354 }
2355
2356 /* This function build the following tree, from regular expression
2357    (<reg_exp>):
2358          SUBEXP
2359             |
2360         <reg_exp>
2361 */
2362
2363 static bin_tree_t *
2364 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2365                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2366 {
2367   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2368   bin_tree_t *tree;
2369   size_t cur_nsub;
2370   cur_nsub = preg->re_nsub++;
2371
2372   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2373
2374   /* The subexpression may be a null string.  */
2375   if (token->type == OP_CLOSE_SUBEXP)
2376     tree = NULL;
2377   else
2378     {
2379       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2380       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2381         *err = REG_EPAREN;
2382       if (BE (*err != REG_NOERROR, 0))
2383         return NULL;
2384     }
2385   dfa->completed_bkref_map |= 1 << cur_nsub;
2386
2387   tree = create_tree (dfa, tree, NULL, SUBEXP);
2388   if (BE (tree == NULL, 0))
2389     {
2390       *err = REG_ESPACE;
2391       return NULL;
2392     }
2393   tree->token.opr.idx = cur_nsub;
2394   return tree;
2395 }
2396
2397 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2398
2399 static bin_tree_t *
2400 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2401               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2402 {
2403   bin_tree_t *tree = NULL, *old_tree = NULL;
2404   int i, start, end, start_idx = re_string_cur_idx (regexp);
2405   re_token_t start_token = *token;
2406
2407   if (token->type == OP_OPEN_DUP_NUM)
2408     {
2409       end = 0;
2410       start = fetch_number (regexp, token, syntax);
2411       if (start == -1)
2412         {
2413           if (token->type == CHARACTER && token->opr.c == ',')
2414             start = 0; /* We treat "{,m}" as "{0,m}".  */
2415           else
2416             {
2417               *err = REG_BADBR; /* <re>{} is invalid.  */
2418               return NULL;
2419             }
2420         }
2421       if (BE (start != -2, 1))
2422         {
2423           /* We treat "{n}" as "{n,n}".  */
2424           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2425                  : ((token->type == CHARACTER && token->opr.c == ',')
2426                     ? fetch_number (regexp, token, syntax) : -2));
2427         }
2428       if (BE (start == -2 || end == -2, 0))
2429         {
2430           /* Invalid sequence.  */
2431           if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2432             {
2433               if (token->type == END_OF_RE)
2434                 *err = REG_EBRACE;
2435               else
2436                 *err = REG_BADBR;
2437
2438               return NULL;
2439             }
2440
2441           /* If the syntax bit is set, rollback.  */
2442           re_string_set_index (regexp, start_idx);
2443           *token = start_token;
2444           token->type = CHARACTER;
2445           /* mb_partial and word_char bits should be already initialized by
2446              peek_token.  */
2447           return elem;
2448         }
2449
2450       if (BE (end != -1 && start > end, 0))
2451         {
2452           /* First number greater than second.  */
2453           *err = REG_BADBR;
2454           return NULL;
2455         }
2456     }
2457   else
2458     {
2459       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2460       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2461     }
2462
2463   fetch_token (token, regexp, syntax);
2464
2465   if (BE (elem == NULL, 0))
2466     return NULL;
2467   if (BE (start == 0 && end == 0, 0))
2468     {
2469       postorder (elem, free_tree, NULL);
2470       return NULL;
2471     }
2472
2473   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2474   if (BE (start > 0, 0))
2475     {
2476       tree = elem;
2477       for (i = 2; i <= start; ++i)
2478         {
2479           elem = duplicate_tree (elem, dfa);
2480           tree = create_tree (dfa, tree, elem, CONCAT);
2481           if (BE (elem == NULL || tree == NULL, 0))
2482             goto parse_dup_op_espace;
2483         }
2484
2485       if (start == end)
2486         return tree;
2487
2488       /* Duplicate ELEM before it is marked optional.  */
2489       elem = duplicate_tree (elem, dfa);
2490       old_tree = tree;
2491     }
2492   else
2493     old_tree = NULL;
2494
2495   if (elem->token.type == SUBEXP)
2496     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2497
2498   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2499   if (BE (tree == NULL, 0))
2500     goto parse_dup_op_espace;
2501
2502   /* This loop is actually executed only when end != -1,
2503      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2504      already created the start+1-th copy.  */
2505   for (i = start + 2; i <= end; ++i)
2506     {
2507       elem = duplicate_tree (elem, dfa);
2508       tree = create_tree (dfa, tree, elem, CONCAT);
2509       if (BE (elem == NULL || tree == NULL, 0))
2510         goto parse_dup_op_espace;
2511
2512       tree = create_tree (dfa, tree, NULL, OP_ALT);
2513       if (BE (tree == NULL, 0))
2514         goto parse_dup_op_espace;
2515     }
2516
2517   if (old_tree)
2518     tree = create_tree (dfa, old_tree, tree, CONCAT);
2519
2520   return tree;
2521
2522  parse_dup_op_espace:
2523   *err = REG_ESPACE;
2524   return NULL;
2525 }
2526
2527 /* Size of the names for collating symbol/equivalence_class/character_class.
2528    I'm not sure, but maybe enough.  */
2529 #define BRACKET_NAME_BUF_SIZE 32
2530
2531 #ifndef _LIBC
2532   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2533      Build the range expression which starts from START_ELEM, and ends
2534      at END_ELEM.  The result are written to MBCSET and SBCSET.
2535      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2536      mbcset->range_ends, is a pointer argument sinse we may
2537      update it.  */
2538
2539 static reg_errcode_t
2540 build_range_exp (re_bitset_ptr_t sbcset,
2541 # ifdef RE_ENABLE_I18N
2542                  re_charset_t *mbcset, int *range_alloc,
2543 # endif
2544                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2545 {
2546   unsigned int start_ch, end_ch;
2547   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2548   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2549           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2550           0))
2551     return REG_ERANGE;
2552
2553   /* We can handle no multi character collating elements without libc
2554      support.  */
2555   if (BE ((start_elem->type == COLL_SYM
2556            && strlen ((char *) start_elem->opr.name) > 1)
2557           || (end_elem->type == COLL_SYM
2558               && strlen ((char *) end_elem->opr.name) > 1), 0))
2559     return REG_ECOLLATE;
2560
2561 # ifdef RE_ENABLE_I18N
2562   {
2563     wchar_t wc;
2564     wint_t start_wc, end_wc;
2565     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2566
2567     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2568                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2569                    : 0));
2570     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2571               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2572                  : 0));
2573     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2574                 ? __btowc (start_ch) : start_elem->opr.wch);
2575     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2576               ? __btowc (end_ch) : end_elem->opr.wch);
2577     if (start_wc == WEOF || end_wc == WEOF)
2578       return REG_ECOLLATE;
2579     cmp_buf[0] = start_wc;
2580     cmp_buf[4] = end_wc;
2581     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2582       return REG_ERANGE;
2583
2584     /* Got valid collation sequence values, add them as a new entry.
2585        However, for !_LIBC we have no collation elements: if the
2586        character set is single byte, the single byte character set
2587        that we build below suffices.  parse_bracket_exp passes
2588        no MBCSET if dfa->mb_cur_max == 1.  */
2589     if (mbcset)
2590       {
2591         /* Check the space of the arrays.  */
2592         if (BE (*range_alloc == mbcset->nranges, 0))
2593           {
2594             /* There is not enough space, need realloc.  */
2595             wchar_t *new_array_start, *new_array_end;
2596             int new_nranges;
2597
2598             /* +1 in case of mbcset->nranges is 0.  */
2599             new_nranges = 2 * mbcset->nranges + 1;
2600             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2601                are NULL if *range_alloc == 0.  */
2602             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2603                                           new_nranges);
2604             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2605                                         new_nranges);
2606
2607             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2608               return REG_ESPACE;
2609
2610             mbcset->range_starts = new_array_start;
2611             mbcset->range_ends = new_array_end;
2612             *range_alloc = new_nranges;
2613           }
2614
2615         mbcset->range_starts[mbcset->nranges] = start_wc;
2616         mbcset->range_ends[mbcset->nranges++] = end_wc;
2617       }
2618
2619     /* Build the table for single byte characters.  */
2620     for (wc = 0; wc < SBC_MAX; ++wc)
2621       {
2622         cmp_buf[2] = wc;
2623         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2624             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2625           bitset_set (sbcset, wc);
2626       }
2627   }
2628 # else /* not RE_ENABLE_I18N */
2629   {
2630     unsigned int ch;
2631     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2632                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2633                    : 0));
2634     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2635               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2636                  : 0));
2637     if (start_ch > end_ch)
2638       return REG_ERANGE;
2639     /* Build the table for single byte characters.  */
2640     for (ch = 0; ch < SBC_MAX; ++ch)
2641       if (start_ch <= ch  && ch <= end_ch)
2642         bitset_set (sbcset, ch);
2643   }
2644 # endif /* not RE_ENABLE_I18N */
2645   return REG_NOERROR;
2646 }
2647 #endif /* not _LIBC */
2648
2649 #ifndef _LIBC
2650 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2651    Build the collating element which is represented by NAME.
2652    The result are written to MBCSET and SBCSET.
2653    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2654    pointer argument since we may update it.  */
2655
2656 static reg_errcode_t
2657 build_collating_symbol (re_bitset_ptr_t sbcset,
2658 # ifdef RE_ENABLE_I18N
2659                         re_charset_t *mbcset, int *coll_sym_alloc,
2660 # endif
2661                         const unsigned char *name)
2662 {
2663   size_t name_len = strlen ((const char *) name);
2664   if (BE (name_len != 1, 0))
2665     return REG_ECOLLATE;
2666   else
2667     {
2668       bitset_set (sbcset, name[0]);
2669       return REG_NOERROR;
2670     }
2671 }
2672 #endif /* not _LIBC */
2673
2674 /* This function parse bracket expression like "[abc]", "[a-c]",
2675    "[[.a-a.]]" etc.  */
2676
2677 static bin_tree_t *
2678 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2679                    reg_syntax_t syntax, reg_errcode_t *err)
2680 {
2681 #ifdef _LIBC
2682   const unsigned char *collseqmb;
2683   const char *collseqwc;
2684   uint32_t nrules;
2685   int32_t table_size;
2686   const int32_t *symb_table;
2687   const unsigned char *extra;
2688
2689   /* Local function for parse_bracket_exp used in _LIBC environement.
2690      Seek the collating symbol entry correspondings to NAME.
2691      Return the index of the symbol in the SYMB_TABLE.  */
2692
2693   auto inline int32_t
2694   __attribute ((always_inline))
2695   seek_collating_symbol_entry (name, name_len)
2696          const unsigned char *name;
2697          size_t name_len;
2698     {
2699       int32_t hash = elem_hash ((const char *) name, name_len);
2700       int32_t elem = hash % table_size;
2701       int32_t second = hash % (table_size - 2);
2702       while (symb_table[2 * elem] != 0)
2703         {
2704           /* First compare the hashing value.  */
2705           if (symb_table[2 * elem] == hash
2706               /* Compare the length of the name.  */
2707               && name_len == extra[symb_table[2 * elem + 1]]
2708               /* Compare the name.  */
2709               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2710                          name_len) == 0)
2711             {
2712               /* Yep, this is the entry.  */
2713               break;
2714             }
2715
2716           /* Next entry.  */
2717           elem += second;
2718         }
2719       return elem;
2720     }
2721
2722   /* Local function for parse_bracket_exp used in _LIBC environement.
2723      Look up the collation sequence value of BR_ELEM.
2724      Return the value if succeeded, UINT_MAX otherwise.  */
2725
2726   auto inline unsigned int
2727   __attribute ((always_inline))
2728   lookup_collation_sequence_value (br_elem)
2729          bracket_elem_t *br_elem;
2730     {
2731       if (br_elem->type == SB_CHAR)
2732         {
2733           /*
2734           if (MB_CUR_MAX == 1)
2735           */
2736           if (nrules == 0)
2737             return collseqmb[br_elem->opr.ch];
2738           else
2739             {
2740               wint_t wc = __btowc (br_elem->opr.ch);
2741               return __collseq_table_lookup (collseqwc, wc);
2742             }
2743         }
2744       else if (br_elem->type == MB_CHAR)
2745         {
2746           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2747         }
2748       else if (br_elem->type == COLL_SYM)
2749         {
2750           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2751           if (nrules != 0)
2752             {
2753               int32_t elem, idx;
2754               elem = seek_collating_symbol_entry (br_elem->opr.name,
2755                                                   sym_name_len);
2756               if (symb_table[2 * elem] != 0)
2757                 {
2758                   /* We found the entry.  */
2759                   idx = symb_table[2 * elem + 1];
2760                   /* Skip the name of collating element name.  */
2761                   idx += 1 + extra[idx];
2762                   /* Skip the byte sequence of the collating element.  */
2763                   idx += 1 + extra[idx];
2764                   /* Adjust for the alignment.  */
2765                   idx = (idx + 3) & ~3;
2766                   /* Skip the multibyte collation sequence value.  */
2767                   idx += sizeof (unsigned int);
2768                   /* Skip the wide char sequence of the collating element.  */
2769                   idx += sizeof (unsigned int) *
2770                     (1 + *(unsigned int *) (extra + idx));
2771                   /* Return the collation sequence value.  */
2772                   return *(unsigned int *) (extra + idx);
2773                 }
2774               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2775                 {
2776                   /* No valid character.  Match it as a single byte
2777                      character.  */
2778                   return collseqmb[br_elem->opr.name[0]];
2779                 }
2780             }
2781           else if (sym_name_len == 1)
2782             return collseqmb[br_elem->opr.name[0]];
2783         }
2784       return UINT_MAX;
2785     }
2786
2787   /* Local function for parse_bracket_exp used in _LIBC environement.
2788      Build the range expression which starts from START_ELEM, and ends
2789      at END_ELEM.  The result are written to MBCSET and SBCSET.
2790      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2791      mbcset->range_ends, is a pointer argument sinse we may
2792      update it.  */
2793
2794   auto inline reg_errcode_t
2795   __attribute ((always_inline))
2796   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2797          re_charset_t *mbcset;
2798          int *range_alloc;
2799          re_bitset_ptr_t sbcset;
2800          bracket_elem_t *start_elem, *end_elem;
2801     {
2802       unsigned int ch;
2803       uint32_t start_collseq;
2804       uint32_t end_collseq;
2805
2806       /* Equivalence Classes and Character Classes can't be a range
2807          start/end.  */
2808       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2809               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2810               0))
2811         return REG_ERANGE;
2812
2813       start_collseq = lookup_collation_sequence_value (start_elem);
2814       end_collseq = lookup_collation_sequence_value (end_elem);
2815       /* Check start/end collation sequence values.  */
2816       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2817         return REG_ECOLLATE;
2818       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2819         return REG_ERANGE;
2820
2821       /* Got valid collation sequence values, add them as a new entry.
2822          However, if we have no collation elements, and the character set
2823          is single byte, the single byte character set that we
2824          build below suffices. */
2825       if (nrules > 0 || dfa->mb_cur_max > 1)
2826         {
2827           /* Check the space of the arrays.  */
2828           if (BE (*range_alloc == mbcset->nranges, 0))
2829             {
2830               /* There is not enough space, need realloc.  */
2831               uint32_t *new_array_start;
2832               uint32_t *new_array_end;
2833               int new_nranges;
2834
2835               /* +1 in case of mbcset->nranges is 0.  */
2836               new_nranges = 2 * mbcset->nranges + 1;
2837               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2838                                             new_nranges);
2839               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2840                                           new_nranges);
2841
2842               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2843                 return REG_ESPACE;
2844
2845               mbcset->range_starts = new_array_start;
2846               mbcset->range_ends = new_array_end;
2847               *range_alloc = new_nranges;
2848             }
2849
2850           mbcset->range_starts[mbcset->nranges] = start_collseq;
2851           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2852         }
2853
2854       /* Build the table for single byte characters.  */
2855       for (ch = 0; ch < SBC_MAX; ch++)
2856         {
2857           uint32_t ch_collseq;
2858           /*
2859           if (MB_CUR_MAX == 1)
2860           */
2861           if (nrules == 0)
2862             ch_collseq = collseqmb[ch];
2863           else
2864             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2865           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2866             bitset_set (sbcset, ch);
2867         }
2868       return REG_NOERROR;
2869     }
2870
2871   /* Local function for parse_bracket_exp used in _LIBC environement.
2872      Build the collating element which is represented by NAME.
2873      The result are written to MBCSET and SBCSET.
2874      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2875      pointer argument sinse we may update it.  */
2876
2877   auto inline reg_errcode_t
2878   __attribute ((always_inline))
2879   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2880          re_charset_t *mbcset;
2881          int *coll_sym_alloc;
2882          re_bitset_ptr_t sbcset;
2883          const unsigned char *name;
2884     {
2885       int32_t elem, idx;
2886       size_t name_len = strlen ((const char *) name);
2887       if (nrules != 0)
2888         {
2889           elem = seek_collating_symbol_entry (name, name_len);
2890           if (symb_table[2 * elem] != 0)
2891             {
2892               /* We found the entry.  */
2893               idx = symb_table[2 * elem + 1];
2894               /* Skip the name of collating element name.  */
2895               idx += 1 + extra[idx];
2896             }
2897           else if (symb_table[2 * elem] == 0 && name_len == 1)
2898             {
2899               /* No valid character, treat it as a normal
2900                  character.  */
2901               bitset_set (sbcset, name[0]);
2902               return REG_NOERROR;
2903             }
2904           else
2905             return REG_ECOLLATE;
2906
2907           /* Got valid collation sequence, add it as a new entry.  */
2908           /* Check the space of the arrays.  */
2909           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2910             {
2911               /* Not enough, realloc it.  */
2912               /* +1 in case of mbcset->ncoll_syms is 0.  */
2913               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2914               /* Use realloc since mbcset->coll_syms is NULL
2915                  if *alloc == 0.  */
2916               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2917                                                    new_coll_sym_alloc);
2918               if (BE (new_coll_syms == NULL, 0))
2919                 return REG_ESPACE;
2920               mbcset->coll_syms = new_coll_syms;
2921               *coll_sym_alloc = new_coll_sym_alloc;
2922             }
2923           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2924           return REG_NOERROR;
2925         }
2926       else
2927         {
2928           if (BE (name_len != 1, 0))
2929             return REG_ECOLLATE;
2930           else
2931             {
2932               bitset_set (sbcset, name[0]);
2933               return REG_NOERROR;
2934             }
2935         }
2936     }
2937 #endif
2938
2939   re_token_t br_token;
2940   re_bitset_ptr_t sbcset;
2941 #ifdef RE_ENABLE_I18N
2942   re_charset_t *mbcset;
2943   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2944   int equiv_class_alloc = 0, char_class_alloc = 0;
2945 #endif /* not RE_ENABLE_I18N */
2946   int non_match = 0;
2947   bin_tree_t *work_tree;
2948   int token_len;
2949   int first_round = 1;
2950 #ifdef _LIBC
2951   collseqmb = (const unsigned char *)
2952     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2953   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2954   if (nrules)
2955     {
2956       /*
2957       if (MB_CUR_MAX > 1)
2958       */
2959         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2960       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2961       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2962                                                   _NL_COLLATE_SYMB_TABLEMB);
2963       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2964                                                    _NL_COLLATE_SYMB_EXTRAMB);
2965     }
2966 #endif
2967   sbcset = re_calloc (unsigned int, BITSET_UINTS);
2968 #ifdef RE_ENABLE_I18N
2969   mbcset = re_calloc (re_charset_t, 1);
2970 #endif /* RE_ENABLE_I18N */
2971 #ifdef RE_ENABLE_I18N
2972   if (BE (sbcset == NULL || mbcset == NULL, 0))
2973 #else
2974   if (BE (sbcset == NULL, 0))
2975 #endif /* RE_ENABLE_I18N */
2976     {
2977       *err = REG_ESPACE;
2978       return NULL;
2979     }
2980
2981   token_len = peek_token_bracket (token, regexp, syntax);
2982   if (BE (token->type == END_OF_RE, 0))
2983     {
2984       *err = REG_BADPAT;
2985       goto parse_bracket_exp_free_return;
2986     }
2987   if (token->type == OP_NON_MATCH_LIST)
2988     {
2989 #ifdef RE_ENABLE_I18N
2990       mbcset->non_match = 1;
2991 #endif /* not RE_ENABLE_I18N */
2992       non_match = 1;
2993       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2994         bitset_set (sbcset, '\0');
2995       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2996       token_len = peek_token_bracket (token, regexp, syntax);
2997       if (BE (token->type == END_OF_RE, 0))
2998         {
2999           *err = REG_BADPAT;
3000           goto parse_bracket_exp_free_return;
3001         }
3002     }
3003
3004   /* We treat the first ']' as a normal character.  */
3005   if (token->type == OP_CLOSE_BRACKET)
3006     token->type = CHARACTER;
3007
3008   while (1)
3009     {
3010       bracket_elem_t start_elem, end_elem;
3011       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3012       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3013       reg_errcode_t ret;
3014       int token_len2 = 0, is_range_exp = 0;
3015       re_token_t token2;
3016
3017       start_elem.opr.name = start_name_buf;
3018       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3019                                    syntax, first_round);
3020       if (BE (ret != REG_NOERROR, 0))
3021         {
3022           *err = ret;
3023           goto parse_bracket_exp_free_return;
3024         }
3025       first_round = 0;
3026
3027       /* Get information about the next token.  We need it in any case.  */
3028       token_len = peek_token_bracket (token, regexp, syntax);
3029
3030       /* Do not check for ranges if we know they are not allowed.  */
3031       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3032         {
3033           if (BE (token->type == END_OF_RE, 0))
3034             {
3035               *err = REG_EBRACK;
3036               goto parse_bracket_exp_free_return;
3037             }
3038           if (token->type == OP_CHARSET_RANGE)
3039             {
3040               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3041               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3042               if (BE (token2.type == END_OF_RE, 0))
3043                 {
3044                   *err = REG_EBRACK;
3045                   goto parse_bracket_exp_free_return;
3046                 }
3047               if (token2.type == OP_CLOSE_BRACKET)
3048                 {
3049                   /* We treat the last '-' as a normal character.  */
3050                   re_string_skip_bytes (regexp, -token_len);
3051                   token->type = CHARACTER;
3052                 }
3053               else
3054                 is_range_exp = 1;
3055             }
3056         }
3057
3058       if (is_range_exp == 1)
3059         {
3060           end_elem.opr.name = end_name_buf;
3061           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3062                                        dfa, syntax, 1);
3063           if (BE (ret != REG_NOERROR, 0))
3064             {
3065               *err = ret;
3066               goto parse_bracket_exp_free_return;
3067             }
3068
3069           token_len = peek_token_bracket (token, regexp, syntax);
3070
3071 #ifdef _LIBC
3072           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3073                                   &start_elem, &end_elem);
3074 #else
3075 # ifdef RE_ENABLE_I18N
3076           *err = build_range_exp (sbcset,
3077                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3078                                   &range_alloc, &start_elem, &end_elem);
3079 # else
3080           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3081 # endif
3082 #endif /* RE_ENABLE_I18N */
3083           if (BE (*err != REG_NOERROR, 0))
3084             goto parse_bracket_exp_free_return;
3085         }
3086       else
3087         {
3088           switch (start_elem.type)
3089             {
3090             case SB_CHAR:
3091               bitset_set (sbcset, start_elem.opr.ch);
3092               break;
3093 #ifdef RE_ENABLE_I18N
3094             case MB_CHAR:
3095               /* Check whether the array has enough space.  */
3096               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3097                 {
3098                   wchar_t *new_mbchars;
3099                   /* Not enough, realloc it.  */
3100                   /* +1 in case of mbcset->nmbchars is 0.  */
3101                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3102                   /* Use realloc since array is NULL if *alloc == 0.  */
3103                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3104                                             mbchar_alloc);
3105                   if (BE (new_mbchars == NULL, 0))
3106                     goto parse_bracket_exp_espace;
3107                   mbcset->mbchars = new_mbchars;
3108                 }
3109               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3110               break;
3111 #endif /* RE_ENABLE_I18N */
3112             case EQUIV_CLASS:
3113               *err = build_equiv_class (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115                                         mbcset, &equiv_class_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117                                         start_elem.opr.name);
3118               if (BE (*err != REG_NOERROR, 0))
3119                 goto parse_bracket_exp_free_return;
3120               break;
3121             case COLL_SYM:
3122               *err = build_collating_symbol (sbcset,
3123 #ifdef RE_ENABLE_I18N
3124                                              mbcset, &coll_sym_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126                                              start_elem.opr.name);
3127               if (BE (*err != REG_NOERROR, 0))
3128                 goto parse_bracket_exp_free_return;
3129               break;
3130             case CHAR_CLASS:
3131               *err = build_charclass (regexp->trans, sbcset,
3132 #ifdef RE_ENABLE_I18N
3133                                       mbcset, &char_class_alloc,
3134 #endif /* RE_ENABLE_I18N */
3135                                       start_elem.opr.name, syntax);
3136               if (BE (*err != REG_NOERROR, 0))
3137                goto parse_bracket_exp_free_return;
3138               break;
3139             default:
3140               assert (0);
3141               break;
3142             }
3143         }
3144       if (BE (token->type == END_OF_RE, 0))
3145         {
3146           *err = REG_EBRACK;
3147           goto parse_bracket_exp_free_return;
3148         }
3149       if (token->type == OP_CLOSE_BRACKET)
3150         break;
3151     }
3152
3153   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3154
3155   /* If it is non-matching list.  */
3156   if (non_match)
3157     bitset_not (sbcset);
3158
3159 #ifdef RE_ENABLE_I18N
3160   /* Ensure only single byte characters are set.  */
3161   if (dfa->mb_cur_max > 1)
3162     bitset_mask (sbcset, dfa->sb_char);
3163
3164   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166                                                      || mbcset->non_match)))
3167     {
3168       bin_tree_t *mbc_tree;
3169       int sbc_idx;
3170       /* Build a tree for complex bracket.  */
3171       dfa->has_mb_node = 1;
3172       br_token.type = COMPLEX_BRACKET;
3173       br_token.opr.mbcset = mbcset;
3174       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3175       if (BE (mbc_tree == NULL, 0))
3176         goto parse_bracket_exp_espace;
3177       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3178         if (sbcset[sbc_idx])
3179           break;
3180       /* If there are no bits set in sbcset, there is no point
3181          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3182       if (sbc_idx < BITSET_UINTS)
3183         {
3184           /* Build a tree for simple bracket.  */
3185           br_token.type = SIMPLE_BRACKET;
3186           br_token.opr.sbcset = sbcset;
3187           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3188           if (BE (work_tree == NULL, 0))
3189             goto parse_bracket_exp_espace;
3190
3191           /* Then join them by ALT node.  */
3192           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3193           if (BE (work_tree == NULL, 0))
3194             goto parse_bracket_exp_espace;
3195         }
3196       else
3197         {
3198           re_free (sbcset);
3199           work_tree = mbc_tree;
3200         }
3201     }
3202   else
3203 #endif /* not RE_ENABLE_I18N */
3204     {
3205 #ifdef RE_ENABLE_I18N
3206       free_charset (mbcset);
3207 #endif
3208       /* Build a tree for simple bracket.  */
3209       br_token.type = SIMPLE_BRACKET;
3210       br_token.opr.sbcset = sbcset;
3211       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212       if (BE (work_tree == NULL, 0))
3213         goto parse_bracket_exp_espace;
3214     }
3215   return work_tree;
3216
3217  parse_bracket_exp_espace:
3218   *err = REG_ESPACE;
3219  parse_bracket_exp_free_return:
3220   re_free (sbcset);
3221 #ifdef RE_ENABLE_I18N
3222   free_charset (mbcset);
3223 #endif /* RE_ENABLE_I18N */
3224   return NULL;
3225 }
3226
3227 /* Parse an element in the bracket expression.  */
3228
3229 static reg_errcode_t
3230 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3231                        re_token_t *token, int token_len, re_dfa_t *dfa,
3232                        reg_syntax_t syntax, int accept_hyphen)
3233 {
3234 #ifdef RE_ENABLE_I18N
3235   int cur_char_size;
3236   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3237   if (cur_char_size > 1)
3238     {
3239       elem->type = MB_CHAR;
3240       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3241       re_string_skip_bytes (regexp, cur_char_size);
3242       return REG_NOERROR;
3243     }
3244 #endif /* RE_ENABLE_I18N */
3245   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3246   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3247       || token->type == OP_OPEN_EQUIV_CLASS)
3248     return parse_bracket_symbol (elem, regexp, token);
3249   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3250     {
3251       /* A '-' must only appear as anything but a range indicator before
3252          the closing bracket.  Everything else is an error.  */
3253       re_token_t token2;
3254       (void) peek_token_bracket (&token2, regexp, syntax);
3255       if (token2.type != OP_CLOSE_BRACKET)
3256         /* The actual error value is not standardized since this whole
3257            case is undefined.  But ERANGE makes good sense.  */
3258         return REG_ERANGE;
3259     }
3260   elem->type = SB_CHAR;
3261   elem->opr.ch = token->opr.c;
3262   return REG_NOERROR;
3263 }
3264
3265 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3266    such as [:<character_class>:], [.<collating_element>.], and
3267    [=<equivalent_class>=].  */
3268
3269 static reg_errcode_t
3270 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3271                       re_token_t *token)
3272 {
3273   unsigned char ch, delim = token->opr.c;
3274   int i = 0;
3275   if (re_string_eoi(regexp))
3276     return REG_EBRACK;
3277   for (;; ++i)
3278     {
3279       if (i >= BRACKET_NAME_BUF_SIZE)
3280         return REG_EBRACK;
3281       if (token->type == OP_OPEN_CHAR_CLASS)
3282         ch = re_string_fetch_byte_case (regexp);
3283       else
3284         ch = re_string_fetch_byte (regexp);
3285       if (re_string_eoi(regexp))
3286         return REG_EBRACK;
3287       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3288         break;
3289       elem->opr.name[i] = ch;
3290     }
3291   re_string_skip_bytes (regexp, 1);
3292   elem->opr.name[i] = '\0';
3293   switch (token->type)
3294     {
3295     case OP_OPEN_COLL_ELEM:
3296       elem->type = COLL_SYM;
3297       break;
3298     case OP_OPEN_EQUIV_CLASS:
3299       elem->type = EQUIV_CLASS;
3300       break;
3301     case OP_OPEN_CHAR_CLASS:
3302       elem->type = CHAR_CLASS;
3303       break;
3304     default:
3305       break;
3306     }
3307   return REG_NOERROR;
3308 }
3309
3310   /* Helper function for parse_bracket_exp.
3311      Build the equivalence class which is represented by NAME.
3312      The result are written to MBCSET and SBCSET.
3313      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3314      is a pointer argument sinse we may update it.  */
3315
3316 static reg_errcode_t
3317 build_equiv_class (re_bitset_ptr_t sbcset,
3318 #ifdef RE_ENABLE_I18N
3319                    re_charset_t *mbcset, int *equiv_class_alloc,
3320 #endif
3321                    const unsigned char *name)
3322 {
3323 #if defined _LIBC
3324   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3325   if (nrules != 0)
3326     {
3327       const int32_t *table, *indirect;
3328       const unsigned char *weights, *extra, *cp;
3329       unsigned char char_buf[2];
3330       int32_t idx1, idx2;
3331       unsigned int ch;
3332       size_t len;
3333       /* This #include defines a local function!  */
3334 # include <locale/weight.h>
3335       /* Calculate the index for equivalence class.  */
3336       cp = name;
3337       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3338       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339                                                _NL_COLLATE_WEIGHTMB);
3340       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341                                                    _NL_COLLATE_EXTRAMB);
3342       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3343                                                 _NL_COLLATE_INDIRECTMB);
3344       idx1 = findidx (&cp);
3345       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3346         /* This isn't a valid character.  */
3347         return REG_ECOLLATE;
3348
3349       /* Build single byte matcing table for this equivalence class.  */
3350       char_buf[1] = (unsigned char) '\0';
3351       len = weights[idx1];
3352       for (ch = 0; ch < SBC_MAX; ++ch)
3353         {
3354           char_buf[0] = ch;
3355           cp = char_buf;
3356           idx2 = findidx (&cp);
3357 /*
3358           idx2 = table[ch];
3359 */
3360           if (idx2 == 0)
3361             /* This isn't a valid character.  */
3362             continue;
3363           if (len == weights[idx2])
3364             {
3365               int cnt = 0;
3366               while (cnt <= len &&
3367                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3368                 ++cnt;
3369
3370               if (cnt > len)
3371                 bitset_set (sbcset, ch);
3372             }
3373         }
3374       /* Check whether the array has enough space.  */
3375       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3376         {
3377           /* Not enough, realloc it.  */
3378           /* +1 in case of mbcset->nequiv_classes is 0.  */
3379           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3380           /* Use realloc since the array is NULL if *alloc == 0.  */
3381           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3382                                                    int32_t,
3383                                                    new_equiv_class_alloc);
3384           if (BE (new_equiv_classes == NULL, 0))
3385             return REG_ESPACE;
3386           mbcset->equiv_classes = new_equiv_classes;
3387           *equiv_class_alloc = new_equiv_class_alloc;
3388         }
3389       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3390     }
3391   else
3392 #endif /* _LIBC */
3393     {
3394       if (BE (strlen ((const char *) name) != 1, 0))
3395         return REG_ECOLLATE;
3396       bitset_set (sbcset, *name);
3397     }
3398   return REG_NOERROR;
3399 }
3400
3401   /* Helper function for parse_bracket_exp.
3402      Build the character class which is represented by NAME.
3403      The result are written to MBCSET and SBCSET.
3404      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3405      is a pointer argument sinse we may update it.  */
3406
3407 static reg_errcode_t
3408 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3409 #ifdef RE_ENABLE_I18N
3410                  re_charset_t *mbcset, int *char_class_alloc,
3411 #endif
3412                  const unsigned char *class_name, reg_syntax_t syntax)
3413 {
3414   int i;
3415   const char *name = (const char *) class_name;
3416
3417   /* In case of REG_ICASE "upper" and "lower" match the both of
3418      upper and lower cases.  */
3419   if ((syntax & REG_IGNORE_CASE)
3420       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3421     name = "alpha";
3422
3423 #ifdef RE_ENABLE_I18N
3424   /* Check the space of the arrays.  */
3425   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3426     {
3427       /* Not enough, realloc it.  */
3428       /* +1 in case of mbcset->nchar_classes is 0.  */
3429       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3430       /* Use realloc since array is NULL if *alloc == 0.  */
3431       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3432                                                new_char_class_alloc);
3433       if (BE (new_char_classes == NULL, 0))
3434         return REG_ESPACE;
3435       mbcset->char_classes = new_char_classes;
3436       *char_class_alloc = new_char_class_alloc;
3437     }
3438   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3440
3441 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3442     for (i = 0; i < SBC_MAX; ++i)               \
3443       {                                         \
3444         if (ctype_func (i))                     \
3445           {                                     \
3446             int ch = trans ? trans[i] : i;      \
3447             bitset_set (sbcset, ch);            \
3448           }                                     \
3449       }
3450
3451   if (strcmp (name, "alnum") == 0)
3452     BUILD_CHARCLASS_LOOP (isalnum)
3453   else if (strcmp (name, "cntrl") == 0)
3454     BUILD_CHARCLASS_LOOP (iscntrl)
3455   else if (strcmp (name, "lower") == 0)
3456     BUILD_CHARCLASS_LOOP (islower)
3457   else if (strcmp (name, "space") == 0)
3458     BUILD_CHARCLASS_LOOP (isspace)
3459   else if (strcmp (name, "alpha") == 0)
3460     BUILD_CHARCLASS_LOOP (isalpha)
3461   else if (strcmp (name, "digit") == 0)
3462     BUILD_CHARCLASS_LOOP (isdigit)
3463   else if (strcmp (name, "print") == 0)
3464     BUILD_CHARCLASS_LOOP (isprint)
3465   else if (strcmp (name, "upper") == 0)
3466     BUILD_CHARCLASS_LOOP (isupper)
3467   else if (strcmp (name, "blank") == 0)
3468     BUILD_CHARCLASS_LOOP (isblank)
3469   else if (strcmp (name, "graph") == 0)
3470     BUILD_CHARCLASS_LOOP (isgraph)
3471   else if (strcmp (name, "punct") == 0)
3472     BUILD_CHARCLASS_LOOP (ispunct)
3473   else if (strcmp (name, "xdigit") == 0)
3474     BUILD_CHARCLASS_LOOP (isxdigit)
3475   else
3476     return REG_ECTYPE;
3477
3478   return REG_NOERROR;
3479 }
3480
3481 static bin_tree_t *
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483                     const unsigned char *class_name,
3484                     const unsigned char *extra,
3485                     int non_match, reg_errcode_t *err)
3486 {
3487   re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489   re_charset_t *mbcset;
3490   int alloc = 0;
3491 #endif /* not RE_ENABLE_I18N */
3492   reg_errcode_t ret;
3493   re_token_t br_token;
3494   bin_tree_t *tree;
3495
3496   sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498   mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3500
3501 #ifdef RE_ENABLE_I18N
3502   if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504   if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3506     {
3507       *err = REG_ESPACE;
3508       return NULL;
3509     }
3510
3511   if (non_match)
3512     {
3513 #ifdef RE_ENABLE_I18N
3514       /*
3515       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516         bitset_set(cset->sbcset, '\0');
3517       */
3518       mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3520     }
3521
3522   /* We don't care the syntax in this case.  */
3523   ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3525                          mbcset, &alloc,
3526 #endif /* RE_ENABLE_I18N */
3527                          class_name, 0);
3528
3529   if (BE (ret != REG_NOERROR, 0))
3530     {
3531       re_free (sbcset);
3532 #ifdef RE_ENABLE_I18N
3533       free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3535       *err = ret;
3536       return NULL;
3537     }
3538   /* \w match '_' also.  */
3539   for (; *extra; extra++)
3540     bitset_set (sbcset, *extra);
3541
3542   /* If it is non-matching list.  */
3543   if (non_match)
3544     bitset_not (sbcset);
3545
3546 #ifdef RE_ENABLE_I18N
3547   /* Ensure only single byte characters are set.  */
3548   if (dfa->mb_cur_max > 1)
3549     bitset_mask (sbcset, dfa->sb_char);
3550 #endif
3551
3552   /* Build a tree for simple bracket.  */
3553   br_token.type = SIMPLE_BRACKET;
3554   br_token.opr.sbcset = sbcset;
3555   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556   if (BE (tree == NULL, 0))
3557     goto build_word_op_espace;
3558
3559 #ifdef RE_ENABLE_I18N
3560   if (dfa->mb_cur_max > 1)
3561     {
3562       bin_tree_t *mbc_tree;
3563       /* Build a tree for complex bracket.  */
3564       br_token.type = COMPLEX_BRACKET;
3565       br_token.opr.mbcset = mbcset;
3566       dfa->has_mb_node = 1;
3567       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568       if (BE (mbc_tree == NULL, 0))
3569         goto build_word_op_espace;
3570       /* Then join them by ALT node.  */
3571       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572       if (BE (mbc_tree != NULL, 1))
3573         return tree;
3574     }
3575   else
3576     {
3577       free_charset (mbcset);
3578       return tree;
3579     }
3580 #else /* not RE_ENABLE_I18N */
3581   return tree;
3582 #endif /* not RE_ENABLE_I18N */
3583
3584  build_word_op_espace:
3585   re_free (sbcset);
3586 #ifdef RE_ENABLE_I18N
3587   free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3589   *err = REG_ESPACE;
3590   return NULL;
3591 }
3592
3593 /* This is intended for the expressions like "a{1,3}".
3594    Fetch a number from `input', and return the number.
3595    Return -1, if the number field is empty like "{,1}".
3596    Return -2, If an error is occured.  */
3597
3598 static int
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3600 {
3601   int num = -1;
3602   unsigned char c;
3603   while (1)
3604     {
3605       fetch_token (token, input, syntax);
3606       c = token->opr.c;
3607       if (BE (token->type == END_OF_RE, 0))
3608         return -2;
3609       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3610         break;
3611       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3612              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3613       num = (num > REG_DUP_MAX) ? -2 : num;
3614     }
3615   return num;
3616 }
3617 \f
3618 #ifdef RE_ENABLE_I18N
3619 static void
3620 free_charset (re_charset_t *cset)
3621 {
3622   re_free (cset->mbchars);
3623 # ifdef _LIBC
3624   re_free (cset->coll_syms);
3625   re_free (cset->equiv_classes);
3626   re_free (cset->range_starts);
3627   re_free (cset->range_ends);
3628 # endif
3629   re_free (cset->char_classes);
3630   re_free (cset);
3631 }
3632 #endif /* RE_ENABLE_I18N */
3633 \f
3634 /* Functions for binary tree operation.  */
3635
3636 /* Create a tree node.  */
3637
3638 static bin_tree_t *
3639 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3640              re_token_type_t type)
3641 {
3642   re_token_t t;
3643   t.type = type;
3644   return create_token_tree (dfa, left, right, &t);
3645 }
3646
3647 static bin_tree_t *
3648 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3649                    const re_token_t *token)
3650 {
3651   bin_tree_t *tree;
3652   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3653     {
3654       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3655
3656       if (storage == NULL)
3657         return NULL;
3658       storage->next = dfa->str_tree_storage;
3659       dfa->str_tree_storage = storage;
3660       dfa->str_tree_storage_idx = 0;
3661     }
3662   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3663
3664   tree->parent = NULL;
3665   tree->left = left;
3666   tree->right = right;
3667   tree->token = *token;
3668   tree->token.duplicated = 0;
3669   tree->token.opt_subexp = 0;
3670   tree->first = NULL;
3671   tree->next = NULL;
3672   tree->node_idx = -1;
3673
3674   if (left != NULL)
3675     left->parent = tree;
3676   if (right != NULL)
3677     right->parent = tree;
3678   return tree;
3679 }
3680
3681 /* Mark the tree SRC as an optional subexpression.
3682    To be called from preorder or postorder.  */
3683
3684 static reg_errcode_t
3685 mark_opt_subexp (void *extra, bin_tree_t *node)
3686 {
3687   int idx = (int) (long) extra;
3688   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3689     node->token.opt_subexp = 1;
3690
3691   return REG_NOERROR;
3692 }
3693
3694 /* Free the allocated memory inside NODE. */
3695
3696 static void
3697 free_token (re_token_t *node)
3698 {
3699 #ifdef RE_ENABLE_I18N
3700   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3701     free_charset (node->opr.mbcset);
3702   else
3703 #endif /* RE_ENABLE_I18N */
3704     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3705       re_free (node->opr.sbcset);
3706 }
3707
3708 /* Worker function for tree walking.  Free the allocated memory inside NODE
3709    and its children. */
3710
3711 static reg_errcode_t
3712 free_tree (void *extra, bin_tree_t *node)
3713 {
3714   free_token (&node->token);
3715   return REG_NOERROR;
3716 }
3717
3718
3719 /* Duplicate the node SRC, and return new node.  This is a preorder
3720    visit similar to the one implemented by the generic visitor, but
3721    we need more infrastructure to maintain two parallel trees --- so,
3722    it's easier to duplicate.  */
3723
3724 static bin_tree_t *
3725 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3726 {
3727   const bin_tree_t *node;
3728   bin_tree_t *dup_root;
3729   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3730
3731   for (node = root; ; )
3732     {
3733       /* Create a new tree and link it back to the current parent.  */
3734       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3735       if (*p_new == NULL)
3736         return NULL;
3737       (*p_new)->parent = dup_node;
3738       (*p_new)->token.duplicated = 1;
3739       dup_node = *p_new;
3740
3741       /* Go to the left node, or up and to the right.  */
3742       if (node->left)
3743         {
3744           node = node->left;
3745           p_new = &dup_node->left;
3746         }
3747       else
3748         {
3749           const bin_tree_t *prev = NULL;
3750           while (node->right == prev || node->right == NULL)
3751             {
3752               prev = node;
3753               node = node->parent;
3754               dup_node = dup_node->parent;
3755               if (!node)
3756                 return dup_root;
3757             }
3758           node = node->right;
3759           p_new = &dup_node->right;
3760         }
3761     }
3762 }