lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
  94                                       re_bitset_ptr_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
 103                                       re_bitset_ptr_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        unsigned REG_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (const char *pattern, size_t length,
 214                     struct re_pattern_buffer *bufp)
 215 {
 216   reg_errcode_t ret;
 217
 218   /* And GNU code determines whether or not to get register information
 219      by passing null for the REGS argument to re_match, etc., not by
 220      setting re_no_sub, unless REG_NO_SUB is set.  */
 221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
 222
 223   /* Match anchors at newline.  */
 224   bufp->re_newline_anchor = 1;
 225
 226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 227
 228   if (!ret)
 229     return NULL;
 230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 231 }
 232 #ifdef _LIBC
 233 weak_alias (__re_compile_pattern, re_compile_pattern)
 234 #endif
 235
 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 237    also be assigned to arbitrarily: each pattern buffer stores its own
 238    syntax, so it can be changed between regex compilations.  */
 239 /* This has no initializer because initialized variables in Emacs
 240    become read-only after dumping.  */
 241 reg_syntax_t re_syntax_options;
 242
 243
 244 /* Specify the precise syntax of regexps for compilation.  This provides
 245    for compatibility for various utilities which historically have
 246    different, incompatible syntaxes.
 247
 248    The argument SYNTAX is a bit mask comprised of the various bits
 249    defined in regex.h.  We return the old syntax.  */
 250
 251 reg_syntax_t
 252 re_set_syntax (reg_syntax_t syntax)
 253 {
 254   reg_syntax_t ret = re_syntax_options;
 255
 256   re_syntax_options = syntax;
 257   return ret;
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_set_syntax, re_set_syntax)
 261 #endif
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 267   char *fastmap = bufp->re_fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->re_fastmap_accurate = 1;
 278   return 0;
 279 }
 280 #ifdef _LIBC
 281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 282 #endif
 283
 284 static inline void
 285 __attribute ((always_inline))
 286 re_set_fastmap (char *fastmap, int icase, int ch)
 287 {
 288   fastmap[ch] = 1;
 289   if (icase)
 290     fastmap[tolower (ch)] = 1;
 291 }
 292
 293 /* Helper function for re_compile_fastmap.
 294    Compile fastmap for the initial_state INIT_STATE.  */
 295
 296 static void
 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 298                          char *fastmap)
 299 {
 300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 301   int node_cnt;
 302   int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
 303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 304     {
 305       int node = init_state->nodes.elems[node_cnt];
 306       re_token_type_t type = dfa->nodes[node].type;
 307
 308       if (type == CHARACTER)
 309         {
 310           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 311 #ifdef RE_ENABLE_I18N
 312           if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 313             {
 314               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 315               wchar_t wc;
 316               mbstate_t state;
 317
 318               p = buf;
 319               *p++ = dfa->nodes[node].opr.c;
 320               while (++node < dfa->nodes_len
 321                      && dfa->nodes[node].type == CHARACTER
 322                      && dfa->nodes[node].mb_partial)
 323                 *p++ = dfa->nodes[node].opr.c;
 324               memset (&state, 0, sizeof (state));
 325               if (mbrtowc (&wc, (const char *) buf, p - buf,
 326                            &state) == p - buf
 327                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 328                       != (size_t) -1))
 329                 re_set_fastmap (fastmap, 0, buf[0]);
 330             }
 331 #endif
 332         }
 333       else if (type == SIMPLE_BRACKET)
 334         {
 335           int i, j, ch;
 336           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 337             for (j = 0; j < UINT_BITS; ++j, ++ch)
 338               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 339                 re_set_fastmap (fastmap, icase, ch);
 340         }
 341 #ifdef RE_ENABLE_I18N
 342       else if (type == COMPLEX_BRACKET)
 343         {
 344           int i;
 345           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 346           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 347               || cset->nranges || cset->nchar_classes)
 348             {
 349 # ifdef _LIBC
 350               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 351                 {
 352                   /* In this case we want to catch the bytes which are
 353                      the first byte of any collation elements.
 354                      e.g. In da_DK, we want to catch 'a' since "aa"
 355                           is a valid collation element, and don't catch
 356                           'b' since 'b' is the only collation element
 357                           which starts from 'b'.  */
 358                   int j, ch;
 359                   const int32_t *table = (const int32_t *)
 360                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 361                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 362                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 363                       if (table[ch] < 0)
 364                         re_set_fastmap (fastmap, icase, ch);
 365                 }
 366 # else
 367               if (dfa->mb_cur_max > 1)
 368                 for (i = 0; i < SBC_MAX; ++i)
 369                   if (__btowc (i) == WEOF)
 370                     re_set_fastmap (fastmap, icase, i);
 371 # endif /* not _LIBC */
 372             }
 373           for (i = 0; i < cset->nmbchars; ++i)
 374             {
 375               char buf[256];
 376               mbstate_t state;
 377               memset (&state, '\0', sizeof (state));
 378               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 379                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 380               if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 381                 {
 382                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 383                       != (size_t) -1)
 384                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 385                 }
 386             }
 387         }
 388 #endif /* RE_ENABLE_I18N */
 389       else if (type == OP_PERIOD
 390 #ifdef RE_ENABLE_I18N
 391                || type == OP_UTF8_PERIOD
 392 #endif /* RE_ENABLE_I18N */
 393                || type == END_OF_RE)
 394         {
 395           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 396           if (type == END_OF_RE)
 397             bufp->re_can_be_null = 1;
 398           return;
 399         }
 400     }
 401 }
 402 \f
 403 /* Entry point for POSIX code.  */
 404 /* regcomp takes a regular expression as a string and compiles it.
 405
 406    PREG is a regex_t *.  We do not expect any fields to be initialized,
 407    since POSIX says we shouldn't.  Thus, we set
 408
 409      `re_buffer' to the compiled pattern;
 410      `re_used' to the length of the compiled pattern;
 411      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
 412        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 413        REG_SYNTAX_POSIX_BASIC;
 414      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
 415      `re_fastmap' to an allocated space for the fastmap;
 416      `re_fastmap_accurate' to zero;
 417      `re_nsub' to the number of subexpressions in PATTERN.
 418
 419    PATTERN is the address of the pattern string.
 420
 421    CFLAGS is a series of bits which affect compilation.
 422
 423      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 424      use POSIX basic syntax.
 425
 426      If REG_NEWLINE is set, then . and [^...] don't match newline.
 427      Also, regexec will try a match beginning after every newline.
 428
 429      If REG_ICASE is set, then we considers upper- and lowercase
 430      versions of letters to be equivalent when matching.
 431
 432      If REG_NOSUB is set, then when PREG is passed to regexec, that
 433      routine will report only success or failure, and nothing about the
 434      registers.
 435
 436    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 437    the return codes and their meanings.)  */
 438
 439 int
 440 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 441 {
 442   reg_errcode_t ret;
 443   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
 444                         : REG_SYNTAX_POSIX_BASIC);
 445
 446   preg->re_buffer = NULL;
 447   preg->re_allocated = 0;
 448   preg->re_used = 0;
 449
 450   /* Try to allocate space for the fastmap.  */
 451   preg->re_fastmap = re_malloc (char, SBC_MAX);
 452   if (BE (preg->re_fastmap == NULL, 0))
 453     return REG_ESPACE;
 454
 455   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
 456
 457   /* If REG_NEWLINE is set, newlines are treated differently.  */
 458   if (cflags & REG_NEWLINE)
 459     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 460       syntax &= ~REG_DOT_NEWLINE;
 461       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
 462       /* It also changes the matching behavior.  */
 463       preg->re_newline_anchor = 1;
 464     }
 465   else
 466     preg->re_newline_anchor = 0;
 467   preg->re_no_sub = !!(cflags & REG_NOSUB);
 468   preg->re_translate = NULL;
 469
 470   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 471
 472   /* POSIX doesn't distinguish between an unmatched open-group and an
 473      unmatched close-group: both are REG_EPAREN.  */
 474   if (ret == REG_ERPAREN)
 475     ret = REG_EPAREN;
 476
 477   /* We have already checked preg->re_fastmap != NULL.  */
 478   if (BE (ret == REG_NOERROR, 1))
 479     /* Compute the fastmap now, since regexec cannot modify the pattern
 480        buffer.  This function never fails in this implementation.  */
 481     (void) re_compile_fastmap (preg);
 482   else
 483     {
 484       /* Some error occurred while compiling the expression.  */
 485       re_free (preg->re_fastmap);
 486       preg->re_fastmap = NULL;
 487     }
 488
 489   return (int) ret;
 490 }
 491 #ifdef _LIBC
 492 weak_alias (__regcomp, regcomp)
 493 #endif
 494
 495 /* Returns a message corresponding to an error code, ERRCODE, returned
 496    from either regcomp or regexec.   We don't use PREG here.  */
 497
 498 size_t
 499 regerror (int errcode, const regex_t *__restrict preg,
 500           char *__restrict errbuf, size_t errbuf_size)
 501 {
 502   const char *msg;
 503   size_t msg_size;
 504
 505   if (BE (errcode < 0
 506           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 507                                / sizeof (__re_error_msgid_idx[0])), 0))
 508     /* Only error codes returned by the rest of the code should be passed
 509        to this routine.  If we are given anything else, or if other regex
 510        code generates an invalid error code, then the program has a bug.
 511        Dump core so we can fix it.  */
 512     abort ();
 513
 514   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 515
 516   msg_size = strlen (msg) + 1; /* Includes the null.  */
 517
 518   if (BE (errbuf_size != 0, 1))
 519     {
 520       if (BE (msg_size > errbuf_size, 0))
 521         {
 522 #if defined HAVE_MEMPCPY || defined _LIBC
 523           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 524 #else
 525           memcpy (errbuf, msg, errbuf_size - 1);
 526           errbuf[errbuf_size - 1] = 0;
 527 #endif
 528         }
 529       else
 530         memcpy (errbuf, msg, msg_size);
 531     }
 532
 533   return msg_size;
 534 }
 535 #ifdef _LIBC
 536 weak_alias (__regerror, regerror)
 537 #endif
 538
 539
 540 #ifdef RE_ENABLE_I18N
 541 /* This static array is used for the map to single-byte characters when
 542    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 543    it the same all the time.  UTF-8 is the preferred encoding so this is
 544    a worthwhile optimization.  */
 545 static const bitset utf8_sb_map =
 546 {
 547   /* Set the first 128 bits.  */
 548 # if UINT_MAX == 0xffffffff
 549   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 550 # else
 551 #  error "Add case for new unsigned int size"
 552 # endif
 553 };
 554 #endif
 555
 556
 557 static void
 558 free_dfa_content (re_dfa_t *dfa)
 559 {
 560   int i, j;
 561
 562   if (dfa->nodes)
 563     for (i = 0; i < dfa->nodes_len; ++i)
 564       free_token (dfa->nodes + i);
 565   re_free (dfa->nexts);
 566   for (i = 0; i < dfa->nodes_len; ++i)
 567     {
 568       if (dfa->eclosures != NULL)
 569         re_node_set_free (dfa->eclosures + i);
 570       if (dfa->inveclosures != NULL)
 571         re_node_set_free (dfa->inveclosures + i);
 572       if (dfa->edests != NULL)
 573         re_node_set_free (dfa->edests + i);
 574     }
 575   re_free (dfa->edests);
 576   re_free (dfa->eclosures);
 577   re_free (dfa->inveclosures);
 578   re_free (dfa->nodes);
 579
 580   if (dfa->state_table)
 581     for (i = 0; i <= dfa->state_hash_mask; ++i)
 582       {
 583         struct re_state_table_entry *entry = dfa->state_table + i;
 584         for (j = 0; j < entry->num; ++j)
 585           {
 586             re_dfastate_t *state = entry->array[j];
 587             free_state (state);
 588           }
 589         re_free (entry->array);
 590       }
 591   re_free (dfa->state_table);
 592 #ifdef RE_ENABLE_I18N
 593   if (dfa->sb_char != utf8_sb_map)
 594     re_free (dfa->sb_char);
 595 #endif
 596   re_free (dfa->subexp_map);
 597 #ifdef DEBUG
 598   re_free (dfa->re_str);
 599 #endif
 600
 601   re_free (dfa);
 602 }
 603
 604
 605 /* Free dynamically allocated space used by PREG.  */
 606
 607 void
 608 regfree (regex_t *preg)
 609 {
 610   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 611   if (BE (dfa != NULL, 1))
 612     free_dfa_content (dfa);
 613   preg->re_buffer = NULL;
 614   preg->re_allocated = 0;
 615
 616   re_free (preg->re_fastmap);
 617   preg->re_fastmap = NULL;
 618
 619   re_free (preg->re_translate);
 620   preg->re_translate = NULL;
 621 }
 622 #ifdef _LIBC
 623 weak_alias (__regfree, regfree)
 624 #endif
 625 \f
 626 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 627    them unless specifically requested.  */
 628
 629 #if defined _REGEX_RE_COMP || defined _LIBC
 630
 631 /* BSD has one and only one pattern buffer.  */
 632 static struct re_pattern_buffer re_comp_buf;
 633
 634 char *
 635 # ifdef _LIBC
 636 /* Make these definitions weak in libc, so POSIX programs can redefine
 637    these names if they don't use our functions, and still use
 638    regcomp/regexec above without link errors.  */
 639 weak_function
 640 # endif
 641 re_comp (s)
 642      const char *s;
 643 {
 644   reg_errcode_t ret;
 645   char *fastmap;
 646
 647   if (!s)
 648     {
 649       if (!re_comp_buf.re_buffer)
 650         return gettext ("No previous regular expression");
 651       return 0;
 652     }
 653
 654   if (re_comp_buf.re_buffer)
 655     {
 656       fastmap = re_comp_buf.re_fastmap;
 657       re_comp_buf.re_fastmap = NULL;
 658       __regfree (&re_comp_buf);
 659       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 660       re_comp_buf.re_fastmap = fastmap;
 661     }
 662
 663   if (re_comp_buf.re_fastmap == NULL)
 664     {
 665       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
 666       if (re_comp_buf.re_fastmap == NULL)
 667         return (char *) gettext (__re_error_msgid
 668                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 669     }
 670
 671   /* Since `re_exec' always passes NULL for the `regs' argument, we
 672      don't need to initialize the pattern buffer fields which affect it.  */
 673
 674   /* Match anchors at newlines.  */
 675   re_comp_buf.re_newline_anchor = 1;
 676
 677   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 678
 679   if (!ret)
 680     return NULL;
 681
 682   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 683   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 684 }
 685
 686 #ifdef _LIBC
 687 libc_freeres_fn (free_mem)
 688 {
 689   __regfree (&re_comp_buf);
 690 }
 691 #endif
 692
 693 #endif /* _REGEX_RE_COMP */
 694 \f
 695 /* Internal entry point.
 696    Compile the regular expression PATTERN, whose length is LENGTH.
 697    SYNTAX indicate regular expression's syntax.  */
 698
 699 static reg_errcode_t
 700 re_compile_internal (regex_t *preg, const char * pattern, int length,
 701                      reg_syntax_t syntax)
 702 {
 703   reg_errcode_t err = REG_NOERROR;
 704   re_dfa_t *dfa;
 705   re_string_t regexp;
 706
 707   /* Initialize the pattern buffer.  */
 708   preg->re_fastmap_accurate = 0;
 709   preg->re_syntax = syntax;
 710   preg->re_not_bol = preg->re_not_eol = 0;
 711   preg->re_used = 0;
 712   preg->re_nsub = 0;
 713   preg->re_can_be_null = 0;
 714   preg->re_regs_allocated = REG_UNALLOCATED;
 715
 716   /* Initialize the dfa.  */
 717   dfa = (re_dfa_t *) preg->re_buffer;
 718   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
 719     {
 720       /* If zero allocated, but buffer is non-null, try to realloc
 721          enough space.  This loses if buffer's address is bogus, but
 722          that is the user's responsibility.  If buffer is null this
 723          is a simple allocation.  */
 724       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
 725       if (dfa == NULL)
 726         return REG_ESPACE;
 727       preg->re_allocated = sizeof (re_dfa_t);
 728       preg->re_buffer = (unsigned char *) dfa;
 729     }
 730   preg->re_used = sizeof (re_dfa_t);
 731
 732   __libc_lock_init (dfa->lock);
 733
 734   err = init_dfa (dfa, length);
 735   if (BE (err != REG_NOERROR, 0))
 736     {
 737       free_dfa_content (dfa);
 738       preg->re_buffer = NULL;
 739       preg->re_allocated = 0;
 740       return err;
 741     }
 742 #ifdef DEBUG
 743   dfa->re_str = re_malloc (char, length + 1);
 744   strncpy (dfa->re_str, pattern, length + 1);
 745 #endif
 746
 747   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
 748                              syntax & REG_IGNORE_CASE, dfa);
 749   if (BE (err != REG_NOERROR, 0))
 750     {
 751     re_compile_internal_free_return:
 752       free_workarea_compile (preg);
 753       re_string_destruct (&regexp);
 754       free_dfa_content (dfa);
 755       preg->re_buffer = NULL;
 756       preg->re_allocated = 0;
 757       return err;
 758     }
 759
 760   /* Parse the regular expression, and build a structure tree.  */
 761   preg->re_nsub = 0;
 762   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 763   if (BE (dfa->str_tree == NULL, 0))
 764     goto re_compile_internal_free_return;
 765
 766   /* Analyze the tree and create the nfa.  */
 767   err = analyze (preg);
 768   if (BE (err != REG_NOERROR, 0))
 769     goto re_compile_internal_free_return;
 770
 771 #ifdef RE_ENABLE_I18N
 772   /* If possible, do searching in single byte encoding to speed things up.  */
 773   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
 774     optimize_utf8 (dfa);
 775 #endif
 776
 777   /* Then create the initial state of the dfa.  */
 778   err = create_initial_state (dfa);
 779
 780   /* Release work areas.  */
 781   free_workarea_compile (preg);
 782   re_string_destruct (&regexp);
 783
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->re_buffer = NULL;
 788       preg->re_allocated = 0;
 789     }
 790
 791   return err;
 792 }
 793
 794 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 795    as the initial length of some arrays.  */
 796
 797 static reg_errcode_t
 798 init_dfa (re_dfa_t *dfa, int pat_len)
 799 {
 800   int table_size;
 801 #ifndef _LIBC
 802   char *codeset_name;
 803 #endif
 804
 805   memset (dfa, '\0', sizeof (re_dfa_t));
 806
 807   /* Force allocation of str_tree_storage the first time.  */
 808   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 809
 810   dfa->nodes_alloc = pat_len + 1;
 811   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 812
 813   dfa->states_alloc = pat_len + 1;
 814
 815   /*  table_size = 2 ^ ceil(log pat_len) */
 816   for (table_size = 1; table_size > 0; table_size <<= 1)
 817     if (table_size > pat_len)
 818       break;
 819
 820   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 821   dfa->state_hash_mask = table_size - 1;
 822
 823   dfa->mb_cur_max = MB_CUR_MAX;
 824 #ifdef _LIBC
 825   if (dfa->mb_cur_max == 6
 826       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 827     dfa->is_utf8 = 1;
 828   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 829                        != 0);
 830 #else
 831 # ifdef HAVE_LANGINFO_CODESET
 832   codeset_name = nl_langinfo (CODESET);
 833 # else
 834   codeset_name = getenv ("LC_ALL");
 835   if (codeset_name == NULL || codeset_name[0] == '\0')
 836     codeset_name = getenv ("LC_CTYPE");
 837   if (codeset_name == NULL || codeset_name[0] == '\0')
 838     codeset_name = getenv ("LANG");
 839   if (codeset_name == NULL)
 840     codeset_name = "";
 841   else if (strchr (codeset_name, '.') !=  NULL)
 842     codeset_name = strchr (codeset_name, '.') + 1;
 843 # endif
 844
 845   if (strcasecmp (codeset_name, "UTF-8") == 0
 846       || strcasecmp (codeset_name, "UTF8") == 0)
 847     dfa->is_utf8 = 1;
 848
 849   /* We check exhaustively in the loop below if this charset is a
 850      superset of ASCII.  */
 851   dfa->map_notascii = 0;
 852 #endif
 853
 854 #ifdef RE_ENABLE_I18N
 855   if (dfa->mb_cur_max > 1)
 856     {
 857       if (dfa->is_utf8)
 858         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 859       else
 860         {
 861           int i, j, ch;
 862
 863           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 864           if (BE (dfa->sb_char == NULL, 0))
 865             return REG_ESPACE;
 866
 867           /* Clear all bits by, then set those corresponding to single
 868              byte chars.  */
 869           bitset_empty (dfa->sb_char);
 870
 871           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 872             for (j = 0; j < UINT_BITS; ++j, ++ch)
 873               {
 874                 wint_t wch = __btowc (ch);
 875                 if (wch != WEOF)
 876                   dfa->sb_char[i] |= 1 << j;
 877 # ifndef _LIBC
 878                 if (isascii (ch) && wch != ch)
 879                   dfa->map_notascii = 1;
 880 # endif
 881               }
 882         }
 883     }
 884 #endif
 885
 886   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 887     return REG_ESPACE;
 888   return REG_NOERROR;
 889 }
 890
 891 /* Initialize WORD_CHAR table, which indicate which character is
 892    "word".  In this case "word" means that it is the word construction
 893    character used by some operators like "\<", "\>", etc.  */
 894
 895 static void
 896 init_word_char (re_dfa_t *dfa)
 897 {
 898   int i, j, ch;
 899   dfa->word_ops_used = 1;
 900   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 901     for (j = 0; j < UINT_BITS; ++j, ++ch)
 902       if (isalnum (ch) || ch == '_')
 903         dfa->word_char[i] |= 1 << j;
 904 }
 905
 906 /* Free the work area which are only used while compiling.  */
 907
 908 static void
 909 free_workarea_compile (regex_t *preg)
 910 {
 911   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 912   bin_tree_storage_t *storage, *next;
 913   for (storage = dfa->str_tree_storage; storage; storage = next)
 914     {
 915       next = storage->next;
 916       re_free (storage);
 917     }
 918   dfa->str_tree_storage = NULL;
 919   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 920   dfa->str_tree = NULL;
 921   re_free (dfa->org_indices);
 922   dfa->org_indices = NULL;
 923 }
 924
 925 /* Create initial states for all contexts.  */
 926
 927 static reg_errcode_t
 928 create_initial_state (re_dfa_t *dfa)
 929 {
 930   int first, i;
 931   reg_errcode_t err;
 932   re_node_set init_nodes;
 933
 934   /* Initial states have the epsilon closure of the node which is
 935      the first node of the regular expression.  */
 936   first = dfa->str_tree->first->node_idx;
 937   dfa->init_node = first;
 938   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 939   if (BE (err != REG_NOERROR, 0))
 940     return err;
 941
 942   /* The back-references which are in initial states can epsilon transit,
 943      since in this case all of the subexpressions can be null.
 944      Then we add epsilon closures of the nodes which are the next nodes of
 945      the back-references.  */
 946   if (dfa->nbackref > 0)
 947     for (i = 0; i < init_nodes.nelem; ++i)
 948       {
 949         int node_idx = init_nodes.elems[i];
 950         re_token_type_t type = dfa->nodes[node_idx].type;
 951
 952         int clexp_idx;
 953         if (type != OP_BACK_REF)
 954           continue;
 955         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 956           {
 957             re_token_t *clexp_node;
 958             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 959             if (clexp_node->type == OP_CLOSE_SUBEXP
 960                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 961               break;
 962           }
 963         if (clexp_idx == init_nodes.nelem)
 964           continue;
 965
 966         if (type == OP_BACK_REF)
 967           {
 968             int dest_idx = dfa->edests[node_idx].elems[0];
 969             if (!re_node_set_contains (&init_nodes, dest_idx))
 970               {
 971                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 972                 i = 0;
 973               }
 974           }
 975       }
 976
 977   /* It must be the first time to invoke acquire_state.  */
 978   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 979   /* We don't check ERR here, since the initial state must not be NULL.  */
 980   if (BE (dfa->init_state == NULL, 0))
 981     return err;
 982   if (dfa->init_state->has_constraint)
 983     {
 984       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 985                                                        CONTEXT_WORD);
 986       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 987                                                      CONTEXT_NEWLINE);
 988       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 989                                                          &init_nodes,
 990                                                          CONTEXT_NEWLINE
 991                                                          | CONTEXT_BEGBUF);
 992       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 993               || dfa->init_state_begbuf == NULL, 0))
 994         return err;
 995     }
 996   else
 997     dfa->init_state_word = dfa->init_state_nl
 998       = dfa->init_state_begbuf = dfa->init_state;
 999
1000   re_node_set_free (&init_nodes);
1001   return REG_NOERROR;
1002 }
1003 \f
1004 #ifdef RE_ENABLE_I18N
1005 /* If it is possible to do searching in single byte encoding instead of UTF-8
1006    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1007    DFA nodes where needed.  */
1008
1009 static void
1010 optimize_utf8 (re_dfa_t *dfa)
1011 {
1012   int node, i, mb_chars = 0, has_period = 0;
1013
1014   for (node = 0; node < dfa->nodes_len; ++node)
1015     switch (dfa->nodes[node].type)
1016       {
1017       case CHARACTER:
1018         if (dfa->nodes[node].opr.c >= 0x80)
1019           mb_chars = 1;
1020         break;
1021       case ANCHOR:
1022         switch (dfa->nodes[node].opr.idx)
1023           {
1024           case LINE_FIRST:
1025           case LINE_LAST:
1026           case BUF_FIRST:
1027           case BUF_LAST:
1028             break;
1029           default:
1030             /* Word anchors etc. cannot be handled.  */
1031             return;
1032           }
1033         break;
1034       case OP_PERIOD:
1035         has_period = 1;
1036         break;
1037       case OP_BACK_REF:
1038       case OP_ALT:
1039       case END_OF_RE:
1040       case OP_DUP_ASTERISK:
1041       case OP_OPEN_SUBEXP:
1042       case OP_CLOSE_SUBEXP:
1043         break;
1044       case COMPLEX_BRACKET:
1045         return;
1046       case SIMPLE_BRACKET:
1047         /* Just double check.  */
1048         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1049           if (dfa->nodes[node].opr.sbcset[i])
1050             return;
1051         break;
1052       default:
1053         abort ();
1054       }
1055
1056   if (mb_chars || has_period)
1057     for (node = 0; node < dfa->nodes_len; ++node)
1058       {
1059         if (dfa->nodes[node].type == CHARACTER
1060             && dfa->nodes[node].opr.c >= 0x80)
1061           dfa->nodes[node].mb_partial = 0;
1062         else if (dfa->nodes[node].type == OP_PERIOD)
1063           dfa->nodes[node].type = OP_UTF8_PERIOD;
1064       }
1065
1066   /* The search can be in single byte locale.  */
1067   dfa->mb_cur_max = 1;
1068   dfa->is_utf8 = 0;
1069   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1070 }
1071 #endif
1072 \f
1073 /* Analyze the structure tree, and calculate "first", "next", "edest",
1074    "eclosure", and "inveclosure".  */
1075
1076 static reg_errcode_t
1077 analyze (regex_t *preg)
1078 {
1079   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1080   reg_errcode_t ret;
1081
1082   /* Allocate arrays.  */
1083   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1084   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1085   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1086   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1087   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1088           || dfa->eclosures == NULL, 0))
1089     return REG_ESPACE;
1090
1091   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1092   if (dfa->subexp_map != NULL)
1093     {
1094       int i;
1095       for (i = 0; i < preg->re_nsub; i++)
1096         dfa->subexp_map[i] = i;
1097       preorder (dfa->str_tree, optimize_subexps, dfa);
1098       for (i = 0; i < preg->re_nsub; i++)
1099         if (dfa->subexp_map[i] != i)
1100           break;
1101       if (i == preg->re_nsub)
1102         {
1103           free (dfa->subexp_map);
1104           dfa->subexp_map = NULL;
1105         }
1106     }
1107
1108   ret = postorder (dfa->str_tree, lower_subexps, preg);
1109   if (BE (ret != REG_NOERROR, 0))
1110     return ret;
1111   ret = postorder (dfa->str_tree, calc_first, dfa);
1112   if (BE (ret != REG_NOERROR, 0))
1113     return ret;
1114   preorder (dfa->str_tree, calc_next, dfa);
1115   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1116   if (BE (ret != REG_NOERROR, 0))
1117     return ret;
1118   ret = calc_eclosure (dfa);
1119   if (BE (ret != REG_NOERROR, 0))
1120     return ret;
1121
1122   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1123      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1124   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1125       || dfa->nbackref)
1126     {
1127       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1128       if (BE (dfa->inveclosures == NULL, 0))
1129         return REG_ESPACE;
1130       ret = calc_inveclosure (dfa);
1131     }
1132
1133   return ret;
1134 }
1135
1136 /* Our parse trees are very unbalanced, so we cannot use a stack to
1137    implement parse tree visits.  Instead, we use parent pointers and
1138    some hairy code in these two functions.  */
1139 static reg_errcode_t
1140 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1141            void *extra)
1142 {
1143   bin_tree_t *node, *prev;
1144
1145   for (node = root; ; )
1146     {
1147       /* Descend down the tree, preferably to the left (or to the right
1148          if that's the only child).  */
1149       while (node->left || node->right)
1150         if (node->left)
1151           node = node->left;
1152         else
1153           node = node->right;
1154
1155       do
1156         {
1157           reg_errcode_t err = fn (extra, node);
1158           if (BE (err != REG_NOERROR, 0))
1159             return err;
1160           if (node->parent == NULL)
1161             return REG_NOERROR;
1162           prev = node;
1163           node = node->parent;
1164         }
1165       /* Go up while we have a node that is reached from the right.  */
1166       while (node->right == prev || node->right == NULL);
1167       node = node->right;
1168     }
1169 }
1170
1171 static reg_errcode_t
1172 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1173           void *extra)
1174 {
1175   bin_tree_t *node;
1176
1177   for (node = root; ; )
1178     {
1179       reg_errcode_t err = fn (extra, node);
1180       if (BE (err != REG_NOERROR, 0))
1181         return err;
1182
1183       /* Go to the left node, or up and to the right.  */
1184       if (node->left)
1185         node = node->left;
1186       else
1187         {
1188           bin_tree_t *prev = NULL;
1189           while (node->right == prev || node->right == NULL)
1190             {
1191               prev = node;
1192               node = node->parent;
1193               if (!node)
1194                 return REG_NOERROR;
1195             }
1196           node = node->right;
1197         }
1198     }
1199 }
1200
1201 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1202    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1203    backreferences as well.  Requires a preorder visit.  */
1204 static reg_errcode_t
1205 optimize_subexps (void *extra, bin_tree_t *node)
1206 {
1207   re_dfa_t *dfa = (re_dfa_t *) extra;
1208
1209   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1210     {
1211       int idx = node->token.opr.idx;
1212       node->token.opr.idx = dfa->subexp_map[idx];
1213       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1214     }
1215
1216   else if (node->token.type == SUBEXP
1217            && node->left && node->left->token.type == SUBEXP)
1218     {
1219       int other_idx = node->left->token.opr.idx;
1220
1221       node->left = node->left->left;
1222       if (node->left)
1223         node->left->parent = node;
1224
1225       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1226       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1227         dfa->used_bkref_map &= ~(1 << other_idx);
1228     }
1229
1230   return REG_NOERROR;
1231 }
1232
1233 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1234    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1235 static reg_errcode_t
1236 lower_subexps (void *extra, bin_tree_t *node)
1237 {
1238   regex_t *preg = (regex_t *) extra;
1239   reg_errcode_t err = REG_NOERROR;
1240
1241   if (node->left && node->left->token.type == SUBEXP)
1242     {
1243       node->left = lower_subexp (&err, preg, node->left);
1244       if (node->left)
1245         node->left->parent = node;
1246     }
1247   if (node->right && node->right->token.type == SUBEXP)
1248     {
1249       node->right = lower_subexp (&err, preg, node->right);
1250       if (node->right)
1251         node->right->parent = node;
1252     }
1253
1254   return err;
1255 }
1256
1257 static bin_tree_t *
1258 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1259 {
1260   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1261   bin_tree_t *body = node->left;
1262   bin_tree_t *op, *cls, *tree1, *tree;
1263
1264   if (preg->re_no_sub
1265       /* We do not optimize empty subexpressions, because otherwise we may
1266          have bad CONCAT nodes with NULL children.  This is obviously not
1267          very common, so we do not lose much.  An example that triggers
1268          this case is the sed "script" /\(\)/x.  */
1269       && node->left != NULL
1270       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1271           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1272     return node->left;
1273
1274   /* Convert the SUBEXP node to the concatenation of an
1275      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1276   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1277   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1278   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1279   tree = create_tree (dfa, op, tree1, CONCAT);
1280   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1281     {
1282       *err = REG_ESPACE;
1283       return NULL;
1284     }
1285
1286   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1287   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1288   return tree;
1289 }
1290
1291 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1292    nodes.  Requires a postorder visit.  */
1293 static reg_errcode_t
1294 calc_first (void *extra, bin_tree_t *node)
1295 {
1296   re_dfa_t *dfa = (re_dfa_t *) extra;
1297   if (node->token.type == CONCAT)
1298     {
1299       node->first = node->left->first;
1300       node->node_idx = node->left->node_idx;
1301     }
1302   else
1303     {
1304       node->first = node;
1305       node->node_idx = re_dfa_add_node (dfa, node->token);
1306       if (BE (node->node_idx == -1, 0))
1307         return REG_ESPACE;
1308     }
1309   return REG_NOERROR;
1310 }
1311
1312 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1313 static reg_errcode_t
1314 calc_next (void *extra, bin_tree_t *node)
1315 {
1316   switch (node->token.type)
1317     {
1318     case OP_DUP_ASTERISK:
1319       node->left->next = node;
1320       break;
1321     case CONCAT:
1322       node->left->next = node->right->first;
1323       node->right->next = node->next;
1324       break;
1325     default:
1326       if (node->left)
1327         node->left->next = node->next;
1328       if (node->right)
1329         node->right->next = node->next;
1330       break;
1331     }
1332   return REG_NOERROR;
1333 }
1334
1335 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1336 static reg_errcode_t
1337 link_nfa_nodes (void *extra, bin_tree_t *node)
1338 {
1339   re_dfa_t *dfa = (re_dfa_t *) extra;
1340   int idx = node->node_idx;
1341   reg_errcode_t err = REG_NOERROR;
1342
1343   switch (node->token.type)
1344     {
1345     case CONCAT:
1346       break;
1347
1348     case END_OF_RE:
1349       assert (node->next == NULL);
1350       break;
1351
1352     case OP_DUP_ASTERISK:
1353     case OP_ALT:
1354       {
1355         int left, right;
1356         dfa->has_plural_match = 1;
1357         if (node->left != NULL)
1358           left = node->left->first->node_idx;
1359         else
1360           left = node->next->node_idx;
1361         if (node->right != NULL)
1362           right = node->right->first->node_idx;
1363         else
1364           right = node->next->node_idx;
1365         assert (left > -1);
1366         assert (right > -1);
1367         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1368       }
1369       break;
1370
1371     case ANCHOR:
1372     case OP_OPEN_SUBEXP:
1373     case OP_CLOSE_SUBEXP:
1374       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1375       break;
1376
1377     case OP_BACK_REF:
1378       dfa->nexts[idx] = node->next->node_idx;
1379       if (node->token.type == OP_BACK_REF)
1380         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1381       break;
1382
1383     default:
1384       assert (!IS_EPSILON_NODE (node->token.type));
1385       dfa->nexts[idx] = node->next->node_idx;
1386       break;
1387     }
1388
1389   return err;
1390 }
1391
1392 /* Duplicate the epsilon closure of the node ROOT_NODE.
1393    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1394    to their own constraint.  */
1395
1396 static reg_errcode_t
1397 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1398                         int root_node, unsigned int init_constraint)
1399 {
1400   int org_node, clone_node, ret;
1401   unsigned int constraint = init_constraint;
1402   for (org_node = top_org_node, clone_node = top_clone_node;;)
1403     {
1404       int org_dest, clone_dest;
1405       if (dfa->nodes[org_node].type == OP_BACK_REF)
1406         {
1407           /* If the back reference epsilon-transit, its destination must
1408              also have the constraint.  Then duplicate the epsilon closure
1409              of the destination of the back reference, and store it in
1410              edests of the back reference.  */
1411           org_dest = dfa->nexts[org_node];
1412           re_node_set_empty (dfa->edests + clone_node);
1413           clone_dest = duplicate_node (dfa, org_dest, constraint);
1414           if (BE (clone_dest == -1, 0))
1415             return REG_ESPACE;
1416           dfa->nexts[clone_node] = dfa->nexts[org_node];
1417           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1418           if (BE (ret < 0, 0))
1419             return REG_ESPACE;
1420         }
1421       else if (dfa->edests[org_node].nelem == 0)
1422         {
1423           /* In case of the node can't epsilon-transit, don't duplicate the
1424              destination and store the original destination as the
1425              destination of the node.  */
1426           dfa->nexts[clone_node] = dfa->nexts[org_node];
1427           break;
1428         }
1429       else if (dfa->edests[org_node].nelem == 1)
1430         {
1431           /* In case of the node can epsilon-transit, and it has only one
1432              destination.  */
1433           org_dest = dfa->edests[org_node].elems[0];
1434           re_node_set_empty (dfa->edests + clone_node);
1435           if (dfa->nodes[org_node].type == ANCHOR)
1436             {
1437               /* In case of the node has another constraint, append it.  */
1438               if (org_node == root_node && clone_node != org_node)
1439                 {
1440                   /* ...but if the node is root_node itself, it means the
1441                      epsilon closure have a loop, then tie it to the
1442                      destination of the root_node.  */
1443                   ret = re_node_set_insert (dfa->edests + clone_node,
1444                                             org_dest);
1445                   if (BE (ret < 0, 0))
1446                     return REG_ESPACE;
1447                   break;
1448                 }
1449               constraint |= dfa->nodes[org_node].opr.ctx_type;
1450             }
1451           clone_dest = duplicate_node (dfa, org_dest, constraint);
1452           if (BE (clone_dest == -1, 0))
1453             return REG_ESPACE;
1454           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1455           if (BE (ret < 0, 0))
1456             return REG_ESPACE;
1457         }
1458       else /* dfa->edests[org_node].nelem == 2 */
1459         {
1460           /* In case of the node can epsilon-transit, and it has two
1461              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1462           org_dest = dfa->edests[org_node].elems[0];
1463           re_node_set_empty (dfa->edests + clone_node);
1464           /* Search for a duplicated node which satisfies the constraint.  */
1465           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1466           if (clone_dest == -1)
1467             {
1468               /* There are no such a duplicated node, create a new one.  */
1469               reg_errcode_t err;
1470               clone_dest = duplicate_node (dfa, org_dest, constraint);
1471               if (BE (clone_dest == -1, 0))
1472                 return REG_ESPACE;
1473               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1474               if (BE (ret < 0, 0))
1475                 return REG_ESPACE;
1476               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1477                                             root_node, constraint);
1478               if (BE (err != REG_NOERROR, 0))
1479                 return err;
1480             }
1481           else
1482             {
1483               /* There are a duplicated node which satisfy the constraint,
1484                  use it to avoid infinite loop.  */
1485               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1486               if (BE (ret < 0, 0))
1487                 return REG_ESPACE;
1488             }
1489
1490           org_dest = dfa->edests[org_node].elems[1];
1491           clone_dest = duplicate_node (dfa, org_dest, constraint);
1492           if (BE (clone_dest == -1, 0))
1493             return REG_ESPACE;
1494           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495           if (BE (ret < 0, 0))
1496             return REG_ESPACE;
1497         }
1498       org_node = org_dest;
1499       clone_node = clone_dest;
1500     }
1501   return REG_NOERROR;
1502 }
1503
1504 /* Search for a node which is duplicated from the node ORG_NODE, and
1505    satisfies the constraint CONSTRAINT.  */
1506
1507 static int
1508 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1509 {
1510   int idx;
1511   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1512     {
1513       if (org_node == dfa->org_indices[idx]
1514           && constraint == dfa->nodes[idx].constraint)
1515         return idx; /* Found.  */
1516     }
1517   return -1; /* Not found.  */
1518 }
1519
1520 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1521    Return the index of the new node, or -1 if insufficient storage is
1522    available.  */
1523
1524 static int
1525 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1526 {
1527   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1528   if (BE (dup_idx != -1, 1))
1529     {
1530       dfa->nodes[dup_idx].constraint = constraint;
1531       if (dfa->nodes[org_idx].type == ANCHOR)
1532         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1533       dfa->nodes[dup_idx].duplicated = 1;
1534
1535       /* Store the index of the original node.  */
1536       dfa->org_indices[dup_idx] = org_idx;
1537     }
1538   return dup_idx;
1539 }
1540
1541 static reg_errcode_t
1542 calc_inveclosure (re_dfa_t *dfa)
1543 {
1544   int src, idx, ret;
1545   for (idx = 0; idx < dfa->nodes_len; ++idx)
1546     re_node_set_init_empty (dfa->inveclosures + idx);
1547
1548   for (src = 0; src < dfa->nodes_len; ++src)
1549     {
1550       int *elems = dfa->eclosures[src].elems;
1551       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1552         {
1553           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1554           if (BE (ret == -1, 0))
1555             return REG_ESPACE;
1556         }
1557     }
1558
1559   return REG_NOERROR;
1560 }
1561
1562 /* Calculate "eclosure" for all the node in DFA.  */
1563
1564 static reg_errcode_t
1565 calc_eclosure (re_dfa_t *dfa)
1566 {
1567   int node_idx, incomplete;
1568 #ifdef DEBUG
1569   assert (dfa->nodes_len > 0);
1570 #endif
1571   incomplete = 0;
1572   /* For each nodes, calculate epsilon closure.  */
1573   for (node_idx = 0; ; ++node_idx)
1574     {
1575       reg_errcode_t err;
1576       re_node_set eclosure_elem;
1577       if (node_idx == dfa->nodes_len)
1578         {
1579           if (!incomplete)
1580             break;
1581           incomplete = 0;
1582           node_idx = 0;
1583         }
1584
1585 #ifdef DEBUG
1586       assert (dfa->eclosures[node_idx].nelem != -1);
1587 #endif
1588
1589       /* If we have already calculated, skip it.  */
1590       if (dfa->eclosures[node_idx].nelem != 0)
1591         continue;
1592       /* Calculate epsilon closure of `node_idx'.  */
1593       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1594       if (BE (err != REG_NOERROR, 0))
1595         return err;
1596
1597       if (dfa->eclosures[node_idx].nelem == 0)
1598         {
1599           incomplete = 1;
1600           re_node_set_free (&eclosure_elem);
1601         }
1602     }
1603   return REG_NOERROR;
1604 }
1605
1606 /* Calculate epsilon closure of NODE.  */
1607
1608 static reg_errcode_t
1609 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1610 {
1611   reg_errcode_t err;
1612   unsigned int constraint;
1613   int i, incomplete;
1614   re_node_set eclosure;
1615   incomplete = 0;
1616   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1617   if (BE (err != REG_NOERROR, 0))
1618     return err;
1619
1620   /* This indicates that we are calculating this node now.
1621      We reference this value to avoid infinite loop.  */
1622   dfa->eclosures[node].nelem = -1;
1623
1624   constraint = ((dfa->nodes[node].type == ANCHOR)
1625                 ? dfa->nodes[node].opr.ctx_type : 0);
1626   /* If the current node has constraints, duplicate all nodes.
1627      Since they must inherit the constraints.  */
1628   if (constraint
1629       && dfa->edests[node].nelem
1630       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1631     {
1632       int org_node, cur_node;
1633       org_node = cur_node = node;
1634       err = duplicate_node_closure (dfa, node, node, node, constraint);
1635       if (BE (err != REG_NOERROR, 0))
1636         return err;
1637     }
1638
1639   /* Expand each epsilon destination nodes.  */
1640   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1641     for (i = 0; i < dfa->edests[node].nelem; ++i)
1642       {
1643         re_node_set eclosure_elem;
1644         int edest = dfa->edests[node].elems[i];
1645         /* If calculating the epsilon closure of `edest' is in progress,
1646            return intermediate result.  */
1647         if (dfa->eclosures[edest].nelem == -1)
1648           {
1649             incomplete = 1;
1650             continue;
1651           }
1652         /* If we haven't calculated the epsilon closure of `edest' yet,
1653            calculate now. Otherwise use calculated epsilon closure.  */
1654         if (dfa->eclosures[edest].nelem == 0)
1655           {
1656             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1657             if (BE (err != REG_NOERROR, 0))
1658               return err;
1659           }
1660         else
1661           eclosure_elem = dfa->eclosures[edest];
1662         /* Merge the epsilon closure of `edest'.  */
1663         re_node_set_merge (&eclosure, &eclosure_elem);
1664         /* If the epsilon closure of `edest' is incomplete,
1665            the epsilon closure of this node is also incomplete.  */
1666         if (dfa->eclosures[edest].nelem == 0)
1667           {
1668             incomplete = 1;
1669             re_node_set_free (&eclosure_elem);
1670           }
1671       }
1672
1673   /* Epsilon closures include itself.  */
1674   re_node_set_insert (&eclosure, node);
1675   if (incomplete && !root)
1676     dfa->eclosures[node].nelem = 0;
1677   else
1678     dfa->eclosures[node] = eclosure;
1679   *new_set = eclosure;
1680   return REG_NOERROR;
1681 }
1682 \f
1683 /* Functions for token which are used in the parser.  */
1684
1685 /* Fetch a token from INPUT.
1686    We must not use this function inside bracket expressions.  */
1687
1688 static void
1689 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1690 {
1691   re_string_skip_bytes (input, peek_token (result, input, syntax));
1692 }
1693
1694 /* Peek a token from INPUT, and return the length of the token.
1695    We must not use this function inside bracket expressions.  */
1696
1697 static int
1698 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1699 {
1700   unsigned char c;
1701
1702   if (re_string_eoi (input))
1703     {
1704       token->type = END_OF_RE;
1705       return 0;
1706     }
1707
1708   c = re_string_peek_byte (input, 0);
1709   token->opr.c = c;
1710
1711   token->word_char = 0;
1712 #ifdef RE_ENABLE_I18N
1713   token->mb_partial = 0;
1714   if (input->mb_cur_max > 1 &&
1715       !re_string_first_byte (input, re_string_cur_idx (input)))
1716     {
1717       token->type = CHARACTER;
1718       token->mb_partial = 1;
1719       return 1;
1720     }
1721 #endif
1722   if (c == '\\')
1723     {
1724       unsigned char c2;
1725       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1726         {
1727           token->type = BACK_SLASH;
1728           return 1;
1729         }
1730
1731       c2 = re_string_peek_byte_case (input, 1);
1732       token->opr.c = c2;
1733       token->type = CHARACTER;
1734 #ifdef RE_ENABLE_I18N
1735       if (input->mb_cur_max > 1)
1736         {
1737           wint_t wc = re_string_wchar_at (input,
1738                                           re_string_cur_idx (input) + 1);
1739           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1740         }
1741       else
1742 #endif
1743         token->word_char = IS_WORD_CHAR (c2) != 0;
1744
1745       switch (c2)
1746         {
1747         case '|':
1748           if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1749             token->type = OP_ALT;
1750           break;
1751         case '1': case '2': case '3': case '4': case '5':
1752         case '6': case '7': case '8': case '9':
1753           if (!(syntax & REG_NO_BK_REFS))
1754             {
1755               token->type = OP_BACK_REF;
1756               token->opr.idx = c2 - '1';
1757             }
1758           break;
1759         case '<':
1760           if (!(syntax & REG_NO_GNU_OPS))
1761             {
1762               token->type = ANCHOR;
1763               token->opr.ctx_type = WORD_FIRST;
1764             }
1765           break;
1766         case '>':
1767           if (!(syntax & REG_NO_GNU_OPS))
1768             {
1769               token->type = ANCHOR;
1770               token->opr.ctx_type = WORD_LAST;
1771             }
1772           break;
1773         case 'b':
1774           if (!(syntax & REG_NO_GNU_OPS))
1775             {
1776               token->type = ANCHOR;
1777               token->opr.ctx_type = WORD_DELIM;
1778             }
1779           break;
1780         case 'B':
1781           if (!(syntax & REG_NO_GNU_OPS))
1782             {
1783               token->type = ANCHOR;
1784               token->opr.ctx_type = NOT_WORD_DELIM;
1785             }
1786           break;
1787         case 'w':
1788           if (!(syntax & REG_NO_GNU_OPS))
1789             token->type = OP_WORD;
1790           break;
1791         case 'W':
1792           if (!(syntax & REG_NO_GNU_OPS))
1793             token->type = OP_NOTWORD;
1794           break;
1795         case 's':
1796           if (!(syntax & REG_NO_GNU_OPS))
1797             token->type = OP_SPACE;
1798           break;
1799         case 'S':
1800           if (!(syntax & REG_NO_GNU_OPS))
1801             token->type = OP_NOTSPACE;
1802           break;
1803         case '`':
1804           if (!(syntax & REG_NO_GNU_OPS))
1805             {
1806               token->type = ANCHOR;
1807               token->opr.ctx_type = BUF_FIRST;
1808             }
1809           break;
1810         case '\'':
1811           if (!(syntax & REG_NO_GNU_OPS))
1812             {
1813               token->type = ANCHOR;
1814               token->opr.ctx_type = BUF_LAST;
1815             }
1816           break;
1817         case '(':
1818           if (!(syntax & REG_NO_BK_PARENS))
1819             token->type = OP_OPEN_SUBEXP;
1820           break;
1821         case ')':
1822           if (!(syntax & REG_NO_BK_PARENS))
1823             token->type = OP_CLOSE_SUBEXP;
1824           break;
1825         case '+':
1826           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1827             token->type = OP_DUP_PLUS;
1828           break;
1829         case '?':
1830           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1831             token->type = OP_DUP_QUESTION;
1832           break;
1833         case '{':
1834           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1835             token->type = OP_OPEN_DUP_NUM;
1836           break;
1837         case '}':
1838           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1839             token->type = OP_CLOSE_DUP_NUM;
1840           break;
1841         default:
1842           break;
1843         }
1844       return 2;
1845     }
1846
1847   token->type = CHARACTER;
1848 #ifdef RE_ENABLE_I18N
1849   if (input->mb_cur_max > 1)
1850     {
1851       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1852       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1853     }
1854   else
1855 #endif
1856     token->word_char = IS_WORD_CHAR (token->opr.c);
1857
1858   switch (c)
1859     {
1860     case '\n':
1861       if (syntax & REG_NEWLINE_ALT)
1862         token->type = OP_ALT;
1863       break;
1864     case '|':
1865       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1866         token->type = OP_ALT;
1867       break;
1868     case '*':
1869       token->type = OP_DUP_ASTERISK;
1870       break;
1871     case '+':
1872       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1873         token->type = OP_DUP_PLUS;
1874       break;
1875     case '?':
1876       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1877         token->type = OP_DUP_QUESTION;
1878       break;
1879     case '{':
1880       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1881         token->type = OP_OPEN_DUP_NUM;
1882       break;
1883     case '}':
1884       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1885         token->type = OP_CLOSE_DUP_NUM;
1886       break;
1887     case '(':
1888       if (syntax & REG_NO_BK_PARENS)
1889         token->type = OP_OPEN_SUBEXP;
1890       break;
1891     case ')':
1892       if (syntax & REG_NO_BK_PARENS)
1893         token->type = OP_CLOSE_SUBEXP;
1894       break;
1895     case '[':
1896       token->type = OP_OPEN_BRACKET;
1897       break;
1898     case '.':
1899       token->type = OP_PERIOD;
1900       break;
1901     case '^':
1902       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1903           re_string_cur_idx (input) != 0)
1904         {
1905           char prev = re_string_peek_byte (input, -1);
1906           if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1907             break;
1908         }
1909       token->type = ANCHOR;
1910       token->opr.ctx_type = LINE_FIRST;
1911       break;
1912     case '$':
1913       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1914           re_string_cur_idx (input) + 1 != re_string_length (input))
1915         {
1916           re_token_t next;
1917           re_string_skip_bytes (input, 1);
1918           peek_token (&next, input, syntax);
1919           re_string_skip_bytes (input, -1);
1920           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1921             break;
1922         }
1923       token->type = ANCHOR;
1924       token->opr.ctx_type = LINE_LAST;
1925       break;
1926     default:
1927       break;
1928     }
1929   return 1;
1930 }
1931
1932 /* Peek a token from INPUT, and return the length of the token.
1933    We must not use this function out of bracket expressions.  */
1934
1935 static int
1936 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1937 {
1938   unsigned char c;
1939   if (re_string_eoi (input))
1940     {
1941       token->type = END_OF_RE;
1942       return 0;
1943     }
1944   c = re_string_peek_byte (input, 0);
1945   token->opr.c = c;
1946
1947 #ifdef RE_ENABLE_I18N
1948   if (input->mb_cur_max > 1 &&
1949       !re_string_first_byte (input, re_string_cur_idx (input)))
1950     {
1951       token->type = CHARACTER;
1952       return 1;
1953     }
1954 #endif /* RE_ENABLE_I18N */
1955
1956   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1957       && re_string_cur_idx (input) + 1 < re_string_length (input))
1958     {
1959       /* In this case, '\' escape a character.  */
1960       unsigned char c2;
1961       re_string_skip_bytes (input, 1);
1962       c2 = re_string_peek_byte (input, 0);
1963       token->opr.c = c2;
1964       token->type = CHARACTER;
1965       return 1;
1966     }
1967   if (c == '[') /* '[' is a special char in a bracket exps.  */
1968     {
1969       unsigned char c2;
1970       int token_len;
1971       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1972         c2 = re_string_peek_byte (input, 1);
1973       else
1974         c2 = 0;
1975       token->opr.c = c2;
1976       token_len = 2;
1977       switch (c2)
1978         {
1979         case '.':
1980           token->type = OP_OPEN_COLL_ELEM;
1981           break;
1982         case '=':
1983           token->type = OP_OPEN_EQUIV_CLASS;
1984           break;
1985         case ':':
1986           if (syntax & REG_CHAR_CLASSES)
1987             {
1988               token->type = OP_OPEN_CHAR_CLASS;
1989               break;
1990             }
1991           /* else fall through.  */
1992         default:
1993           token->type = CHARACTER;
1994           token->opr.c = c;
1995           token_len = 1;
1996           break;
1997         }
1998       return token_len;
1999     }
2000   switch (c)
2001     {
2002     case '-':
2003       token->type = OP_CHARSET_RANGE;
2004       break;
2005     case ']':
2006       token->type = OP_CLOSE_BRACKET;
2007       break;
2008     case '^':
2009       token->type = OP_NON_MATCH_LIST;
2010       break;
2011     default:
2012       token->type = CHARACTER;
2013     }
2014   return 1;
2015 }
2016 \f
2017 /* Functions for parser.  */
2018
2019 /* Entry point of the parser.
2020    Parse the regular expression REGEXP and return the structure tree.
2021    If an error is occured, ERR is set by error code, and return NULL.
2022    This function build the following tree, from regular expression <reg_exp>:
2023            CAT
2024            / \
2025           /   \
2026    <reg_exp>  EOR
2027
2028    CAT means concatenation.
2029    EOR means end of regular expression.  */
2030
2031 static bin_tree_t *
2032 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2033        reg_errcode_t *err)
2034 {
2035   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2036   bin_tree_t *tree, *eor, *root;
2037   re_token_t current_token;
2038   dfa->syntax = syntax;
2039   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2040   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2041   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2042     return NULL;
2043   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2044   if (tree != NULL)
2045     root = create_tree (dfa, tree, eor, CONCAT);
2046   else
2047     root = eor;
2048   if (BE (eor == NULL || root == NULL, 0))
2049     {
2050       *err = REG_ESPACE;
2051       return NULL;
2052     }
2053   return root;
2054 }
2055
2056 /* This function build the following tree, from regular expression
2057    <branch1>|<branch2>:
2058            ALT
2059            / \
2060           /   \
2061    <branch1> <branch2>
2062
2063    ALT means alternative, which represents the operator `|'.  */
2064
2065 static bin_tree_t *
2066 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2067                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2068 {
2069   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2070   bin_tree_t *tree, *branch = NULL;
2071   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2072   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2073     return NULL;
2074
2075   while (token->type == OP_ALT)
2076     {
2077       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2078       if (token->type != OP_ALT && token->type != END_OF_RE
2079           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2080         {
2081           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2082           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2083             return NULL;
2084         }
2085       else
2086         branch = NULL;
2087       tree = create_tree (dfa, tree, branch, OP_ALT);
2088       if (BE (tree == NULL, 0))
2089         {
2090           *err = REG_ESPACE;
2091           return NULL;
2092         }
2093     }
2094   return tree;
2095 }
2096
2097 /* This function build the following tree, from regular expression
2098    <exp1><exp2>:
2099         CAT
2100         / \
2101        /   \
2102    <exp1> <exp2>
2103
2104    CAT means concatenation.  */
2105
2106 static bin_tree_t *
2107 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2108               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2109 {
2110   bin_tree_t *tree, *exp;
2111   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2112   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2113   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114     return NULL;
2115
2116   while (token->type != OP_ALT && token->type != END_OF_RE
2117          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2118     {
2119       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2120       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2121         {
2122           return NULL;
2123         }
2124       if (tree != NULL && exp != NULL)
2125         {
2126           tree = create_tree (dfa, tree, exp, CONCAT);
2127           if (tree == NULL)
2128             {
2129               *err = REG_ESPACE;
2130               return NULL;
2131             }
2132         }
2133       else if (tree == NULL)
2134         tree = exp;
2135       /* Otherwise exp == NULL, we don't need to create new tree.  */
2136     }
2137   return tree;
2138 }
2139
2140 /* This function build the following tree, from regular expression a*:
2141          *
2142          |
2143          a
2144 */
2145
2146 static bin_tree_t *
2147 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2148                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2149 {
2150   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2151   bin_tree_t *tree;
2152   switch (token->type)
2153     {
2154     case CHARACTER:
2155       tree = create_token_tree (dfa, NULL, NULL, token);
2156       if (BE (tree == NULL, 0))
2157         {
2158           *err = REG_ESPACE;
2159           return NULL;
2160         }
2161 #ifdef RE_ENABLE_I18N
2162       if (dfa->mb_cur_max > 1)
2163         {
2164           while (!re_string_eoi (regexp)
2165                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2166             {
2167               bin_tree_t *mbc_remain;
2168               fetch_token (token, regexp, syntax);
2169               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2170               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2171               if (BE (mbc_remain == NULL || tree == NULL, 0))
2172                 {
2173                   *err = REG_ESPACE;
2174                   return NULL;
2175                 }
2176             }
2177         }
2178 #endif
2179       break;
2180     case OP_OPEN_SUBEXP:
2181       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2182       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183         return NULL;
2184       break;
2185     case OP_OPEN_BRACKET:
2186       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2187       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188         return NULL;
2189       break;
2190     case OP_BACK_REF:
2191       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2192         {
2193           *err = REG_ESUBREG;
2194           return NULL;
2195         }
2196       dfa->used_bkref_map |= 1 << token->opr.idx;
2197       tree = create_token_tree (dfa, NULL, NULL, token);
2198       if (BE (tree == NULL, 0))
2199         {
2200           *err = REG_ESPACE;
2201           return NULL;
2202         }
2203       ++dfa->nbackref;
2204       dfa->has_mb_node = 1;
2205       break;
2206     case OP_OPEN_DUP_NUM:
2207       if (syntax & REG_CONTEXT_INVALID_DUP)
2208         {
2209           *err = REG_BADRPT;
2210           return NULL;
2211         }
2212       /* FALLTHROUGH */
2213     case OP_DUP_ASTERISK:
2214     case OP_DUP_PLUS:
2215     case OP_DUP_QUESTION:
2216       if (syntax & REG_CONTEXT_INVALID_OPS)
2217         {
2218           *err = REG_BADRPT;
2219           return NULL;
2220         }
2221       else if (syntax & REG_CONTEXT_INDEP_OPS)
2222         {
2223           fetch_token (token, regexp, syntax);
2224           return parse_expression (regexp, preg, token, syntax, nest, err);
2225         }
2226       /* else fall through  */
2227     case OP_CLOSE_SUBEXP:
2228       if ((token->type == OP_CLOSE_SUBEXP) &&
2229           !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2230         {
2231           *err = REG_ERPAREN;
2232           return NULL;
2233         }
2234       /* else fall through  */
2235     case OP_CLOSE_DUP_NUM:
2236       /* We treat it as a normal character.  */
2237
2238       /* Then we can these characters as normal characters.  */
2239       token->type = CHARACTER;
2240       /* mb_partial and word_char bits should be initialized already
2241          by peek_token.  */
2242       tree = create_token_tree (dfa, NULL, NULL, token);
2243       if (BE (tree == NULL, 0))
2244         {
2245           *err = REG_ESPACE;
2246           return NULL;
2247         }
2248       break;
2249     case ANCHOR:
2250       if ((token->opr.ctx_type
2251            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2252           && dfa->word_ops_used == 0)
2253         init_word_char (dfa);
2254       if (token->opr.ctx_type == WORD_DELIM
2255           || token->opr.ctx_type == NOT_WORD_DELIM)
2256         {
2257           bin_tree_t *tree_first, *tree_last;
2258           if (token->opr.ctx_type == WORD_DELIM)
2259             {
2260               token->opr.ctx_type = WORD_FIRST;
2261               tree_first = create_token_tree (dfa, NULL, NULL, token);
2262               token->opr.ctx_type = WORD_LAST;
2263             }
2264           else
2265             {
2266               token->opr.ctx_type = INSIDE_WORD;
2267               tree_first = create_token_tree (dfa, NULL, NULL, token);
2268               token->opr.ctx_type = INSIDE_NOTWORD;
2269             }
2270           tree_last = create_token_tree (dfa, NULL, NULL, token);
2271           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2272           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2273             {
2274               *err = REG_ESPACE;
2275               return NULL;
2276             }
2277         }
2278       else
2279         {
2280           tree = create_token_tree (dfa, NULL, NULL, token);
2281           if (BE (tree == NULL, 0))
2282             {
2283               *err = REG_ESPACE;
2284               return NULL;
2285             }
2286         }
2287       /* We must return here, since ANCHORs can't be followed
2288          by repetition operators.
2289          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2290              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2291       fetch_token (token, regexp, syntax);
2292       return tree;
2293     case OP_PERIOD:
2294       tree = create_token_tree (dfa, NULL, NULL, token);
2295       if (BE (tree == NULL, 0))
2296         {
2297           *err = REG_ESPACE;
2298           return NULL;
2299         }
2300       if (dfa->mb_cur_max > 1)
2301         dfa->has_mb_node = 1;
2302       break;
2303     case OP_WORD:
2304     case OP_NOTWORD:
2305       tree = build_charclass_op (dfa, regexp->trans,
2306                                  (const unsigned char *) "alnum",
2307                                  (const unsigned char *) "_",
2308                                  token->type == OP_NOTWORD, err);
2309       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2310         return NULL;
2311       break;
2312     case OP_SPACE:
2313     case OP_NOTSPACE:
2314       tree = build_charclass_op (dfa, regexp->trans,
2315                                  (const unsigned char *) "space",
2316                                  (const unsigned char *) "",
2317                                  token->type == OP_NOTSPACE, err);
2318       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2319         return NULL;
2320       break;
2321     case OP_ALT:
2322     case END_OF_RE:
2323       return NULL;
2324     case BACK_SLASH:
2325       *err = REG_EESCAPE;
2326       return NULL;
2327     default:
2328       /* Must not happen?  */
2329 #ifdef DEBUG
2330       assert (0);
2331 #endif
2332       return NULL;
2333     }
2334   fetch_token (token, regexp, syntax);
2335
2336   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2337          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2338     {
2339       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2340       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2341         return NULL;
2342       /* In BRE consecutive duplications are not allowed.  */
2343       if ((syntax & REG_CONTEXT_INVALID_DUP)
2344           && (token->type == OP_DUP_ASTERISK
2345               || token->type == OP_OPEN_DUP_NUM))
2346         {
2347           *err = REG_BADRPT;
2348           return NULL;
2349         }
2350     }
2351
2352   return tree;
2353 }
2354
2355 /* This function build the following tree, from regular expression
2356    (<reg_exp>):
2357          SUBEXP
2358             |
2359         <reg_exp>
2360 */
2361
2362 static bin_tree_t *
2363 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2364                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2365 {
2366   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2367   bin_tree_t *tree;
2368   size_t cur_nsub;
2369   cur_nsub = preg->re_nsub++;
2370
2371   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2372
2373   /* The subexpression may be a null string.  */
2374   if (token->type == OP_CLOSE_SUBEXP)
2375     tree = NULL;
2376   else
2377     {
2378       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2379       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2380         *err = REG_EPAREN;
2381       if (BE (*err != REG_NOERROR, 0))
2382         return NULL;
2383     }
2384   dfa->completed_bkref_map |= 1 << cur_nsub;
2385
2386   tree = create_tree (dfa, tree, NULL, SUBEXP);
2387   if (BE (tree == NULL, 0))
2388     {
2389       *err = REG_ESPACE;
2390       return NULL;
2391     }
2392   tree->token.opr.idx = cur_nsub;
2393   return tree;
2394 }
2395
2396 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2397
2398 static bin_tree_t *
2399 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2400               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2401 {
2402   bin_tree_t *tree = NULL, *old_tree = NULL;
2403   int i, start, end, start_idx = re_string_cur_idx (regexp);
2404   re_token_t start_token = *token;
2405
2406   if (token->type == OP_OPEN_DUP_NUM)
2407     {
2408       end = 0;
2409       start = fetch_number (regexp, token, syntax);
2410       if (start == -1)
2411         {
2412           if (token->type == CHARACTER && token->opr.c == ',')
2413             start = 0; /* We treat "{,m}" as "{0,m}".  */
2414           else
2415             {
2416               *err = REG_BADBR; /* <re>{} is invalid.  */
2417               return NULL;
2418             }
2419         }
2420       if (BE (start != -2, 1))
2421         {
2422           /* We treat "{n}" as "{n,n}".  */
2423           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2424                  : ((token->type == CHARACTER && token->opr.c == ',')
2425                     ? fetch_number (regexp, token, syntax) : -2));
2426         }
2427       if (BE (start == -2 || end == -2, 0))
2428         {
2429           /* Invalid sequence.  */
2430           if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2431             {
2432               if (token->type == END_OF_RE)
2433                 *err = REG_EBRACE;
2434               else
2435                 *err = REG_BADBR;
2436
2437               return NULL;
2438             }
2439
2440           /* If the syntax bit is set, rollback.  */
2441           re_string_set_index (regexp, start_idx);
2442           *token = start_token;
2443           token->type = CHARACTER;
2444           /* mb_partial and word_char bits should be already initialized by
2445              peek_token.  */
2446           return elem;
2447         }
2448
2449       if (BE (end != -1 && start > end, 0))
2450         {
2451           /* First number greater than second.  */
2452           *err = REG_BADBR;
2453           return NULL;
2454         }
2455     }
2456   else
2457     {
2458       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2459       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2460     }
2461
2462   fetch_token (token, regexp, syntax);
2463
2464   if (BE (elem == NULL, 0))
2465     return NULL;
2466   if (BE (start == 0 && end == 0, 0))
2467     {
2468       postorder (elem, free_tree, NULL);
2469       return NULL;
2470     }
2471
2472   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2473   if (BE (start > 0, 0))
2474     {
2475       tree = elem;
2476       for (i = 2; i <= start; ++i)
2477         {
2478           elem = duplicate_tree (elem, dfa);
2479           tree = create_tree (dfa, tree, elem, CONCAT);
2480           if (BE (elem == NULL || tree == NULL, 0))
2481             goto parse_dup_op_espace;
2482         }
2483
2484       if (start == end)
2485         return tree;
2486
2487       /* Duplicate ELEM before it is marked optional.  */
2488       elem = duplicate_tree (elem, dfa);
2489       old_tree = tree;
2490     }
2491   else
2492     old_tree = NULL;
2493
2494   if (elem->token.type == SUBEXP)
2495     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2496
2497   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2498   if (BE (tree == NULL, 0))
2499     goto parse_dup_op_espace;
2500
2501   /* This loop is actually executed only when end != -1,
2502      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2503      already created the start+1-th copy.  */
2504   for (i = start + 2; i <= end; ++i)
2505     {
2506       elem = duplicate_tree (elem, dfa);
2507       tree = create_tree (dfa, tree, elem, CONCAT);
2508       if (BE (elem == NULL || tree == NULL, 0))
2509         goto parse_dup_op_espace;
2510
2511       tree = create_tree (dfa, tree, NULL, OP_ALT);
2512       if (BE (tree == NULL, 0))
2513         goto parse_dup_op_espace;
2514     }
2515
2516   if (old_tree)
2517     tree = create_tree (dfa, old_tree, tree, CONCAT);
2518
2519   return tree;
2520
2521  parse_dup_op_espace:
2522   *err = REG_ESPACE;
2523   return NULL;
2524 }
2525
2526 /* Size of the names for collating symbol/equivalence_class/character_class.
2527    I'm not sure, but maybe enough.  */
2528 #define BRACKET_NAME_BUF_SIZE 32
2529
2530 #ifndef _LIBC
2531   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2532      Build the range expression which starts from START_ELEM, and ends
2533      at END_ELEM.  The result are written to MBCSET and SBCSET.
2534      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2535      mbcset->range_ends, is a pointer argument sinse we may
2536      update it.  */
2537
2538 static reg_errcode_t
2539 build_range_exp (re_bitset_ptr_t sbcset,
2540 # ifdef RE_ENABLE_I18N
2541                  re_charset_t *mbcset, int *range_alloc,
2542 # endif
2543                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2544 {
2545   unsigned int start_ch, end_ch;
2546   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2547   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2548           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2549           0))
2550     return REG_ERANGE;
2551
2552   /* We can handle no multi character collating elements without libc
2553      support.  */
2554   if (BE ((start_elem->type == COLL_SYM
2555            && strlen ((char *) start_elem->opr.name) > 1)
2556           || (end_elem->type == COLL_SYM
2557               && strlen ((char *) end_elem->opr.name) > 1), 0))
2558     return REG_ECOLLATE;
2559
2560 # ifdef RE_ENABLE_I18N
2561   {
2562     wchar_t wc;
2563     wint_t start_wc, end_wc;
2564     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2565
2566     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2567                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2568                    : 0));
2569     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2570               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2571                  : 0));
2572     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2573                 ? __btowc (start_ch) : start_elem->opr.wch);
2574     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2575               ? __btowc (end_ch) : end_elem->opr.wch);
2576     if (start_wc == WEOF || end_wc == WEOF)
2577       return REG_ECOLLATE;
2578     cmp_buf[0] = start_wc;
2579     cmp_buf[4] = end_wc;
2580     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2581       return REG_ERANGE;
2582
2583     /* Got valid collation sequence values, add them as a new entry.
2584        However, for !_LIBC we have no collation elements: if the
2585        character set is single byte, the single byte character set
2586        that we build below suffices.  parse_bracket_exp passes
2587        no MBCSET if dfa->mb_cur_max == 1.  */
2588     if (mbcset)
2589       {
2590         /* Check the space of the arrays.  */
2591         if (BE (*range_alloc == mbcset->nranges, 0))
2592           {
2593             /* There is not enough space, need realloc.  */
2594             wchar_t *new_array_start, *new_array_end;
2595             int new_nranges;
2596
2597             /* +1 in case of mbcset->nranges is 0.  */
2598             new_nranges = 2 * mbcset->nranges + 1;
2599             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2600                are NULL if *range_alloc == 0.  */
2601             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2602                                           new_nranges);
2603             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2604                                         new_nranges);
2605
2606             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2607               return REG_ESPACE;
2608
2609             mbcset->range_starts = new_array_start;
2610             mbcset->range_ends = new_array_end;
2611             *range_alloc = new_nranges;
2612           }
2613
2614         mbcset->range_starts[mbcset->nranges] = start_wc;
2615         mbcset->range_ends[mbcset->nranges++] = end_wc;
2616       }
2617
2618     /* Build the table for single byte characters.  */
2619     for (wc = 0; wc < SBC_MAX; ++wc)
2620       {
2621         cmp_buf[2] = wc;
2622         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2623             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2624           bitset_set (sbcset, wc);
2625       }
2626   }
2627 # else /* not RE_ENABLE_I18N */
2628   {
2629     unsigned int ch;
2630     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2631                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2632                    : 0));
2633     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2634               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2635                  : 0));
2636     if (start_ch > end_ch)
2637       return REG_ERANGE;
2638     /* Build the table for single byte characters.  */
2639     for (ch = 0; ch < SBC_MAX; ++ch)
2640       if (start_ch <= ch  && ch <= end_ch)
2641         bitset_set (sbcset, ch);
2642   }
2643 # endif /* not RE_ENABLE_I18N */
2644   return REG_NOERROR;
2645 }
2646 #endif /* not _LIBC */
2647
2648 #ifndef _LIBC
2649 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2650    Build the collating element which is represented by NAME.
2651    The result are written to MBCSET and SBCSET.
2652    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2653    pointer argument since we may update it.  */
2654
2655 static reg_errcode_t
2656 build_collating_symbol (re_bitset_ptr_t sbcset,
2657 # ifdef RE_ENABLE_I18N
2658                         re_charset_t *mbcset, int *coll_sym_alloc,
2659 # endif
2660                         const unsigned char *name)
2661 {
2662   size_t name_len = strlen ((const char *) name);
2663   if (BE (name_len != 1, 0))
2664     return REG_ECOLLATE;
2665   else
2666     {
2667       bitset_set (sbcset, name[0]);
2668       return REG_NOERROR;
2669     }
2670 }
2671 #endif /* not _LIBC */
2672
2673 /* This function parse bracket expression like "[abc]", "[a-c]",
2674    "[[.a-a.]]" etc.  */
2675
2676 static bin_tree_t *
2677 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2678                    reg_syntax_t syntax, reg_errcode_t *err)
2679 {
2680 #ifdef _LIBC
2681   const unsigned char *collseqmb;
2682   const char *collseqwc;
2683   uint32_t nrules;
2684   int32_t table_size;
2685   const int32_t *symb_table;
2686   const unsigned char *extra;
2687
2688   /* Local function for parse_bracket_exp used in _LIBC environement.
2689      Seek the collating symbol entry correspondings to NAME.
2690      Return the index of the symbol in the SYMB_TABLE.  */
2691
2692   auto inline int32_t
2693   __attribute ((always_inline))
2694   seek_collating_symbol_entry (name, name_len)
2695          const unsigned char *name;
2696          size_t name_len;
2697     {
2698       int32_t hash = elem_hash ((const char *) name, name_len);
2699       int32_t elem = hash % table_size;
2700       int32_t second = hash % (table_size - 2);
2701       while (symb_table[2 * elem] != 0)
2702         {
2703           /* First compare the hashing value.  */
2704           if (symb_table[2 * elem] == hash
2705               /* Compare the length of the name.  */
2706               && name_len == extra[symb_table[2 * elem + 1]]
2707               /* Compare the name.  */
2708               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2709                          name_len) == 0)
2710             {
2711               /* Yep, this is the entry.  */
2712               break;
2713             }
2714
2715           /* Next entry.  */
2716           elem += second;
2717         }
2718       return elem;
2719     }
2720
2721   /* Local function for parse_bracket_exp used in _LIBC environement.
2722      Look up the collation sequence value of BR_ELEM.
2723      Return the value if succeeded, UINT_MAX otherwise.  */
2724
2725   auto inline unsigned int
2726   __attribute ((always_inline))
2727   lookup_collation_sequence_value (br_elem)
2728          bracket_elem_t *br_elem;
2729     {
2730       if (br_elem->type == SB_CHAR)
2731         {
2732           /*
2733           if (MB_CUR_MAX == 1)
2734           */
2735           if (nrules == 0)
2736             return collseqmb[br_elem->opr.ch];
2737           else
2738             {
2739               wint_t wc = __btowc (br_elem->opr.ch);
2740               return __collseq_table_lookup (collseqwc, wc);
2741             }
2742         }
2743       else if (br_elem->type == MB_CHAR)
2744         {
2745           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2746         }
2747       else if (br_elem->type == COLL_SYM)
2748         {
2749           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2750           if (nrules != 0)
2751             {
2752               int32_t elem, idx;
2753               elem = seek_collating_symbol_entry (br_elem->opr.name,
2754                                                   sym_name_len);
2755               if (symb_table[2 * elem] != 0)
2756                 {
2757                   /* We found the entry.  */
2758                   idx = symb_table[2 * elem + 1];
2759                   /* Skip the name of collating element name.  */
2760                   idx += 1 + extra[idx];
2761                   /* Skip the byte sequence of the collating element.  */
2762                   idx += 1 + extra[idx];
2763                   /* Adjust for the alignment.  */
2764                   idx = (idx + 3) & ~3;
2765                   /* Skip the multibyte collation sequence value.  */
2766                   idx += sizeof (unsigned int);
2767                   /* Skip the wide char sequence of the collating element.  */
2768                   idx += sizeof (unsigned int) *
2769                     (1 + *(unsigned int *) (extra + idx));
2770                   /* Return the collation sequence value.  */
2771                   return *(unsigned int *) (extra + idx);
2772                 }
2773               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2774                 {
2775                   /* No valid character.  Match it as a single byte
2776                      character.  */
2777                   return collseqmb[br_elem->opr.name[0]];
2778                 }
2779             }
2780           else if (sym_name_len == 1)
2781             return collseqmb[br_elem->opr.name[0]];
2782         }
2783       return UINT_MAX;
2784     }
2785
2786   /* Local function for parse_bracket_exp used in _LIBC environement.
2787      Build the range expression which starts from START_ELEM, and ends
2788      at END_ELEM.  The result are written to MBCSET and SBCSET.
2789      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2790      mbcset->range_ends, is a pointer argument sinse we may
2791      update it.  */
2792
2793   auto inline reg_errcode_t
2794   __attribute ((always_inline))
2795   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2796          re_charset_t *mbcset;
2797          int *range_alloc;
2798          re_bitset_ptr_t sbcset;
2799          bracket_elem_t *start_elem, *end_elem;
2800     {
2801       unsigned int ch;
2802       uint32_t start_collseq;
2803       uint32_t end_collseq;
2804
2805       /* Equivalence Classes and Character Classes can't be a range
2806          start/end.  */
2807       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2808               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2809               0))
2810         return REG_ERANGE;
2811
2812       start_collseq = lookup_collation_sequence_value (start_elem);
2813       end_collseq = lookup_collation_sequence_value (end_elem);
2814       /* Check start/end collation sequence values.  */
2815       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2816         return REG_ECOLLATE;
2817       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2818         return REG_ERANGE;
2819
2820       /* Got valid collation sequence values, add them as a new entry.
2821          However, if we have no collation elements, and the character set
2822          is single byte, the single byte character set that we
2823          build below suffices. */
2824       if (nrules > 0 || dfa->mb_cur_max > 1)
2825         {
2826           /* Check the space of the arrays.  */
2827           if (BE (*range_alloc == mbcset->nranges, 0))
2828             {
2829               /* There is not enough space, need realloc.  */
2830               uint32_t *new_array_start;
2831               uint32_t *new_array_end;
2832               int new_nranges;
2833
2834               /* +1 in case of mbcset->nranges is 0.  */
2835               new_nranges = 2 * mbcset->nranges + 1;
2836               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2837                                             new_nranges);
2838               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2839                                           new_nranges);
2840
2841               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2842                 return REG_ESPACE;
2843
2844               mbcset->range_starts = new_array_start;
2845               mbcset->range_ends = new_array_end;
2846               *range_alloc = new_nranges;
2847             }
2848
2849           mbcset->range_starts[mbcset->nranges] = start_collseq;
2850           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2851         }
2852
2853       /* Build the table for single byte characters.  */
2854       for (ch = 0; ch < SBC_MAX; ch++)
2855         {
2856           uint32_t ch_collseq;
2857           /*
2858           if (MB_CUR_MAX == 1)
2859           */
2860           if (nrules == 0)
2861             ch_collseq = collseqmb[ch];
2862           else
2863             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2864           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2865             bitset_set (sbcset, ch);
2866         }
2867       return REG_NOERROR;
2868     }
2869
2870   /* Local function for parse_bracket_exp used in _LIBC environement.
2871      Build the collating element which is represented by NAME.
2872      The result are written to MBCSET and SBCSET.
2873      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2874      pointer argument sinse we may update it.  */
2875
2876   auto inline reg_errcode_t
2877   __attribute ((always_inline))
2878   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2879          re_charset_t *mbcset;
2880          int *coll_sym_alloc;
2881          re_bitset_ptr_t sbcset;
2882          const unsigned char *name;
2883     {
2884       int32_t elem, idx;
2885       size_t name_len = strlen ((const char *) name);
2886       if (nrules != 0)
2887         {
2888           elem = seek_collating_symbol_entry (name, name_len);
2889           if (symb_table[2 * elem] != 0)
2890             {
2891               /* We found the entry.  */
2892               idx = symb_table[2 * elem + 1];
2893               /* Skip the name of collating element name.  */
2894               idx += 1 + extra[idx];
2895             }
2896           else if (symb_table[2 * elem] == 0 && name_len == 1)
2897             {
2898               /* No valid character, treat it as a normal
2899                  character.  */
2900               bitset_set (sbcset, name[0]);
2901               return REG_NOERROR;
2902             }
2903           else
2904             return REG_ECOLLATE;
2905
2906           /* Got valid collation sequence, add it as a new entry.  */
2907           /* Check the space of the arrays.  */
2908           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2909             {
2910               /* Not enough, realloc it.  */
2911               /* +1 in case of mbcset->ncoll_syms is 0.  */
2912               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2913               /* Use realloc since mbcset->coll_syms is NULL
2914                  if *alloc == 0.  */
2915               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2916                                                    new_coll_sym_alloc);
2917               if (BE (new_coll_syms == NULL, 0))
2918                 return REG_ESPACE;
2919               mbcset->coll_syms = new_coll_syms;
2920               *coll_sym_alloc = new_coll_sym_alloc;
2921             }
2922           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2923           return REG_NOERROR;
2924         }
2925       else
2926         {
2927           if (BE (name_len != 1, 0))
2928             return REG_ECOLLATE;
2929           else
2930             {
2931               bitset_set (sbcset, name[0]);
2932               return REG_NOERROR;
2933             }
2934         }
2935     }
2936 #endif
2937
2938   re_token_t br_token;
2939   re_bitset_ptr_t sbcset;
2940 #ifdef RE_ENABLE_I18N
2941   re_charset_t *mbcset;
2942   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2943   int equiv_class_alloc = 0, char_class_alloc = 0;
2944 #endif /* not RE_ENABLE_I18N */
2945   int non_match = 0;
2946   bin_tree_t *work_tree;
2947   int token_len;
2948   int first_round = 1;
2949 #ifdef _LIBC
2950   collseqmb = (const unsigned char *)
2951     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2952   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2953   if (nrules)
2954     {
2955       /*
2956       if (MB_CUR_MAX > 1)
2957       */
2958         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2959       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2960       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2961                                                   _NL_COLLATE_SYMB_TABLEMB);
2962       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2963                                                    _NL_COLLATE_SYMB_EXTRAMB);
2964     }
2965 #endif
2966   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2967 #ifdef RE_ENABLE_I18N
2968   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2969 #endif /* RE_ENABLE_I18N */
2970 #ifdef RE_ENABLE_I18N
2971   if (BE (sbcset == NULL || mbcset == NULL, 0))
2972 #else
2973   if (BE (sbcset == NULL, 0))
2974 #endif /* RE_ENABLE_I18N */
2975     {
2976       *err = REG_ESPACE;
2977       return NULL;
2978     }
2979
2980   token_len = peek_token_bracket (token, regexp, syntax);
2981   if (BE (token->type == END_OF_RE, 0))
2982     {
2983       *err = REG_BADPAT;
2984       goto parse_bracket_exp_free_return;
2985     }
2986   if (token->type == OP_NON_MATCH_LIST)
2987     {
2988 #ifdef RE_ENABLE_I18N
2989       mbcset->non_match = 1;
2990 #endif /* not RE_ENABLE_I18N */
2991       non_match = 1;
2992       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2993         bitset_set (sbcset, '\0');
2994       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2995       token_len = peek_token_bracket (token, regexp, syntax);
2996       if (BE (token->type == END_OF_RE, 0))
2997         {
2998           *err = REG_BADPAT;
2999           goto parse_bracket_exp_free_return;
3000         }
3001     }
3002
3003   /* We treat the first ']' as a normal character.  */
3004   if (token->type == OP_CLOSE_BRACKET)
3005     token->type = CHARACTER;
3006
3007   while (1)
3008     {
3009       bracket_elem_t start_elem, end_elem;
3010       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3011       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3012       reg_errcode_t ret;
3013       int token_len2 = 0, is_range_exp = 0;
3014       re_token_t token2;
3015
3016       start_elem.opr.name = start_name_buf;
3017       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3018                                    syntax, first_round);
3019       if (BE (ret != REG_NOERROR, 0))
3020         {
3021           *err = ret;
3022           goto parse_bracket_exp_free_return;
3023         }
3024       first_round = 0;
3025
3026       /* Get information about the next token.  We need it in any case.  */
3027       token_len = peek_token_bracket (token, regexp, syntax);
3028
3029       /* Do not check for ranges if we know they are not allowed.  */
3030       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3031         {
3032           if (BE (token->type == END_OF_RE, 0))
3033             {
3034               *err = REG_EBRACK;
3035               goto parse_bracket_exp_free_return;
3036             }
3037           if (token->type == OP_CHARSET_RANGE)
3038             {
3039               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3040               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3041               if (BE (token2.type == END_OF_RE, 0))
3042                 {
3043                   *err = REG_EBRACK;
3044                   goto parse_bracket_exp_free_return;
3045                 }
3046               if (token2.type == OP_CLOSE_BRACKET)
3047                 {
3048                   /* We treat the last '-' as a normal character.  */
3049                   re_string_skip_bytes (regexp, -token_len);
3050                   token->type = CHARACTER;
3051                 }
3052               else
3053                 is_range_exp = 1;
3054             }
3055         }
3056
3057       if (is_range_exp == 1)
3058         {
3059           end_elem.opr.name = end_name_buf;
3060           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3061                                        dfa, syntax, 1);
3062           if (BE (ret != REG_NOERROR, 0))
3063             {
3064               *err = ret;
3065               goto parse_bracket_exp_free_return;
3066             }
3067
3068           token_len = peek_token_bracket (token, regexp, syntax);
3069
3070 #ifdef _LIBC
3071           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3072                                   &start_elem, &end_elem);
3073 #else
3074 # ifdef RE_ENABLE_I18N
3075           *err = build_range_exp (sbcset,
3076                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3077                                   &range_alloc, &start_elem, &end_elem);
3078 # else
3079           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3080 # endif
3081 #endif /* RE_ENABLE_I18N */
3082           if (BE (*err != REG_NOERROR, 0))
3083             goto parse_bracket_exp_free_return;
3084         }
3085       else
3086         {
3087           switch (start_elem.type)
3088             {
3089             case SB_CHAR:
3090               bitset_set (sbcset, start_elem.opr.ch);
3091               break;
3092 #ifdef RE_ENABLE_I18N
3093             case MB_CHAR:
3094               /* Check whether the array has enough space.  */
3095               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3096                 {
3097                   wchar_t *new_mbchars;
3098                   /* Not enough, realloc it.  */
3099                   /* +1 in case of mbcset->nmbchars is 0.  */
3100                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3101                   /* Use realloc since array is NULL if *alloc == 0.  */
3102                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3103                                             mbchar_alloc);
3104                   if (BE (new_mbchars == NULL, 0))
3105                     goto parse_bracket_exp_espace;
3106                   mbcset->mbchars = new_mbchars;
3107                 }
3108               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3109               break;
3110 #endif /* RE_ENABLE_I18N */
3111             case EQUIV_CLASS:
3112               *err = build_equiv_class (sbcset,
3113 #ifdef RE_ENABLE_I18N
3114                                         mbcset, &equiv_class_alloc,
3115 #endif /* RE_ENABLE_I18N */
3116                                         start_elem.opr.name);
3117               if (BE (*err != REG_NOERROR, 0))
3118                 goto parse_bracket_exp_free_return;
3119               break;
3120             case COLL_SYM:
3121               *err = build_collating_symbol (sbcset,
3122 #ifdef RE_ENABLE_I18N
3123                                              mbcset, &coll_sym_alloc,
3124 #endif /* RE_ENABLE_I18N */
3125                                              start_elem.opr.name);
3126               if (BE (*err != REG_NOERROR, 0))
3127                 goto parse_bracket_exp_free_return;
3128               break;
3129             case CHAR_CLASS:
3130               *err = build_charclass (regexp->trans, sbcset,
3131 #ifdef RE_ENABLE_I18N
3132                                       mbcset, &char_class_alloc,
3133 #endif /* RE_ENABLE_I18N */
3134                                       start_elem.opr.name, syntax);
3135               if (BE (*err != REG_NOERROR, 0))
3136                goto parse_bracket_exp_free_return;
3137               break;
3138             default:
3139               assert (0);
3140               break;
3141             }
3142         }
3143       if (BE (token->type == END_OF_RE, 0))
3144         {
3145           *err = REG_EBRACK;
3146           goto parse_bracket_exp_free_return;
3147         }
3148       if (token->type == OP_CLOSE_BRACKET)
3149         break;
3150     }
3151
3152   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3153
3154   /* If it is non-matching list.  */
3155   if (non_match)
3156     bitset_not (sbcset);
3157
3158 #ifdef RE_ENABLE_I18N
3159   /* Ensure only single byte characters are set.  */
3160   if (dfa->mb_cur_max > 1)
3161     bitset_mask (sbcset, dfa->sb_char);
3162
3163   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3164       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3165                                                      || mbcset->non_match)))
3166     {
3167       bin_tree_t *mbc_tree;
3168       int sbc_idx;
3169       /* Build a tree for complex bracket.  */
3170       dfa->has_mb_node = 1;
3171       br_token.type = COMPLEX_BRACKET;
3172       br_token.opr.mbcset = mbcset;
3173       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3174       if (BE (mbc_tree == NULL, 0))
3175         goto parse_bracket_exp_espace;
3176       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3177         if (sbcset[sbc_idx])
3178           break;
3179       /* If there are no bits set in sbcset, there is no point
3180          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3181       if (sbc_idx < BITSET_UINTS)
3182         {
3183           /* Build a tree for simple bracket.  */
3184           br_token.type = SIMPLE_BRACKET;
3185           br_token.opr.sbcset = sbcset;
3186           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3187           if (BE (work_tree == NULL, 0))
3188             goto parse_bracket_exp_espace;
3189
3190           /* Then join them by ALT node.  */
3191           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3192           if (BE (work_tree == NULL, 0))
3193             goto parse_bracket_exp_espace;
3194         }
3195       else
3196         {
3197           re_free (sbcset);
3198           work_tree = mbc_tree;
3199         }
3200     }
3201   else
3202 #endif /* not RE_ENABLE_I18N */
3203     {
3204 #ifdef RE_ENABLE_I18N
3205       free_charset (mbcset);
3206 #endif
3207       /* Build a tree for simple bracket.  */
3208       br_token.type = SIMPLE_BRACKET;
3209       br_token.opr.sbcset = sbcset;
3210       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3211       if (BE (work_tree == NULL, 0))
3212         goto parse_bracket_exp_espace;
3213     }
3214   return work_tree;
3215
3216  parse_bracket_exp_espace:
3217   *err = REG_ESPACE;
3218  parse_bracket_exp_free_return:
3219   re_free (sbcset);
3220 #ifdef RE_ENABLE_I18N
3221   free_charset (mbcset);
3222 #endif /* RE_ENABLE_I18N */
3223   return NULL;
3224 }
3225
3226 /* Parse an element in the bracket expression.  */
3227
3228 static reg_errcode_t
3229 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3230                        re_token_t *token, int token_len, re_dfa_t *dfa,
3231                        reg_syntax_t syntax, int accept_hyphen)
3232 {
3233 #ifdef RE_ENABLE_I18N
3234   int cur_char_size;
3235   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3236   if (cur_char_size > 1)
3237     {
3238       elem->type = MB_CHAR;
3239       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3240       re_string_skip_bytes (regexp, cur_char_size);
3241       return REG_NOERROR;
3242     }
3243 #endif /* RE_ENABLE_I18N */
3244   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3245   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3246       || token->type == OP_OPEN_EQUIV_CLASS)
3247     return parse_bracket_symbol (elem, regexp, token);
3248   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3249     {
3250       /* A '-' must only appear as anything but a range indicator before
3251          the closing bracket.  Everything else is an error.  */
3252       re_token_t token2;
3253       (void) peek_token_bracket (&token2, regexp, syntax);
3254       if (token2.type != OP_CLOSE_BRACKET)
3255         /* The actual error value is not standardized since this whole
3256            case is undefined.  But ERANGE makes good sense.  */
3257         return REG_ERANGE;
3258     }
3259   elem->type = SB_CHAR;
3260   elem->opr.ch = token->opr.c;
3261   return REG_NOERROR;
3262 }
3263
3264 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3265    such as [:<character_class>:], [.<collating_element>.], and
3266    [=<equivalent_class>=].  */
3267
3268 static reg_errcode_t
3269 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3270                       re_token_t *token)
3271 {
3272   unsigned char ch, delim = token->opr.c;
3273   int i = 0;
3274   if (re_string_eoi(regexp))
3275     return REG_EBRACK;
3276   for (;; ++i)
3277     {
3278       if (i >= BRACKET_NAME_BUF_SIZE)
3279         return REG_EBRACK;
3280       if (token->type == OP_OPEN_CHAR_CLASS)
3281         ch = re_string_fetch_byte_case (regexp);
3282       else
3283         ch = re_string_fetch_byte (regexp);
3284       if (re_string_eoi(regexp))
3285         return REG_EBRACK;
3286       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3287         break;
3288       elem->opr.name[i] = ch;
3289     }
3290   re_string_skip_bytes (regexp, 1);
3291   elem->opr.name[i] = '\0';
3292   switch (token->type)
3293     {
3294     case OP_OPEN_COLL_ELEM:
3295       elem->type = COLL_SYM;
3296       break;
3297     case OP_OPEN_EQUIV_CLASS:
3298       elem->type = EQUIV_CLASS;
3299       break;
3300     case OP_OPEN_CHAR_CLASS:
3301       elem->type = CHAR_CLASS;
3302       break;
3303     default:
3304       break;
3305     }
3306   return REG_NOERROR;
3307 }
3308
3309   /* Helper function for parse_bracket_exp.
3310      Build the equivalence class which is represented by NAME.
3311      The result are written to MBCSET and SBCSET.
3312      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3313      is a pointer argument sinse we may update it.  */
3314
3315 static reg_errcode_t
3316 build_equiv_class (re_bitset_ptr_t sbcset,
3317 #ifdef RE_ENABLE_I18N
3318                    re_charset_t *mbcset, int *equiv_class_alloc,
3319 #endif
3320                    const unsigned char *name)
3321 {
3322 #if defined _LIBC
3323   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3324   if (nrules != 0)
3325     {
3326       const int32_t *table, *indirect;
3327       const unsigned char *weights, *extra, *cp;
3328       unsigned char char_buf[2];
3329       int32_t idx1, idx2;
3330       unsigned int ch;
3331       size_t len;
3332       /* This #include defines a local function!  */
3333 # include <locale/weight.h>
3334       /* Calculate the index for equivalence class.  */
3335       cp = name;
3336       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3337       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3338                                                _NL_COLLATE_WEIGHTMB);
3339       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3340                                                    _NL_COLLATE_EXTRAMB);
3341       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3342                                                 _NL_COLLATE_INDIRECTMB);
3343       idx1 = findidx (&cp);
3344       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3345         /* This isn't a valid character.  */
3346         return REG_ECOLLATE;
3347
3348       /* Build single byte matcing table for this equivalence class.  */
3349       char_buf[1] = (unsigned char) '\0';
3350       len = weights[idx1];
3351       for (ch = 0; ch < SBC_MAX; ++ch)
3352         {
3353           char_buf[0] = ch;
3354           cp = char_buf;
3355           idx2 = findidx (&cp);
3356 /*
3357           idx2 = table[ch];
3358 */
3359           if (idx2 == 0)
3360             /* This isn't a valid character.  */
3361             continue;
3362           if (len == weights[idx2])
3363             {
3364               int cnt = 0;
3365               while (cnt <= len &&
3366                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3367                 ++cnt;
3368
3369               if (cnt > len)
3370                 bitset_set (sbcset, ch);
3371             }
3372         }
3373       /* Check whether the array has enough space.  */
3374       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3375         {
3376           /* Not enough, realloc it.  */
3377           /* +1 in case of mbcset->nequiv_classes is 0.  */
3378           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3379           /* Use realloc since the array is NULL if *alloc == 0.  */
3380           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3381                                                    int32_t,
3382                                                    new_equiv_class_alloc);
3383           if (BE (new_equiv_classes == NULL, 0))
3384             return REG_ESPACE;
3385           mbcset->equiv_classes = new_equiv_classes;
3386           *equiv_class_alloc = new_equiv_class_alloc;
3387         }
3388       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3389     }
3390   else
3391 #endif /* _LIBC */
3392     {
3393       if (BE (strlen ((const char *) name) != 1, 0))
3394         return REG_ECOLLATE;
3395       bitset_set (sbcset, *name);
3396     }
3397   return REG_NOERROR;
3398 }
3399
3400   /* Helper function for parse_bracket_exp.
3401      Build the character class which is represented by NAME.
3402      The result are written to MBCSET and SBCSET.
3403      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3404      is a pointer argument sinse we may update it.  */
3405
3406 static reg_errcode_t
3407 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3408 #ifdef RE_ENABLE_I18N
3409                  re_charset_t *mbcset, int *char_class_alloc,
3410 #endif
3411                  const unsigned char *class_name, reg_syntax_t syntax)
3412 {
3413   int i;
3414   const char *name = (const char *) class_name;
3415
3416   /* In case of REG_ICASE "upper" and "lower" match the both of
3417      upper and lower cases.  */
3418   if ((syntax & REG_IGNORE_CASE)
3419       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3420     name = "alpha";
3421
3422 #ifdef RE_ENABLE_I18N
3423   /* Check the space of the arrays.  */
3424   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3425     {
3426       /* Not enough, realloc it.  */
3427       /* +1 in case of mbcset->nchar_classes is 0.  */
3428       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3429       /* Use realloc since array is NULL if *alloc == 0.  */
3430       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3431                                                new_char_class_alloc);
3432       if (BE (new_char_classes == NULL, 0))
3433         return REG_ESPACE;
3434       mbcset->char_classes = new_char_classes;
3435       *char_class_alloc = new_char_class_alloc;
3436     }
3437   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3438 #endif /* RE_ENABLE_I18N */
3439
3440 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3441     for (i = 0; i < SBC_MAX; ++i)               \
3442       {                                         \
3443         if (ctype_func (i))                     \
3444           {                                     \
3445             int ch = trans ? trans[i] : i;      \
3446             bitset_set (sbcset, ch);            \
3447           }                                     \
3448       }
3449
3450   if (strcmp (name, "alnum") == 0)
3451     BUILD_CHARCLASS_LOOP (isalnum)
3452   else if (strcmp (name, "cntrl") == 0)
3453     BUILD_CHARCLASS_LOOP (iscntrl)
3454   else if (strcmp (name, "lower") == 0)
3455     BUILD_CHARCLASS_LOOP (islower)
3456   else if (strcmp (name, "space") == 0)
3457     BUILD_CHARCLASS_LOOP (isspace)
3458   else if (strcmp (name, "alpha") == 0)
3459     BUILD_CHARCLASS_LOOP (isalpha)
3460   else if (strcmp (name, "digit") == 0)
3461     BUILD_CHARCLASS_LOOP (isdigit)
3462   else if (strcmp (name, "print") == 0)
3463     BUILD_CHARCLASS_LOOP (isprint)
3464   else if (strcmp (name, "upper") == 0)
3465     BUILD_CHARCLASS_LOOP (isupper)
3466   else if (strcmp (name, "blank") == 0)
3467     BUILD_CHARCLASS_LOOP (isblank)
3468   else if (strcmp (name, "graph") == 0)
3469     BUILD_CHARCLASS_LOOP (isgraph)
3470   else if (strcmp (name, "punct") == 0)
3471     BUILD_CHARCLASS_LOOP (ispunct)
3472   else if (strcmp (name, "xdigit") == 0)
3473     BUILD_CHARCLASS_LOOP (isxdigit)
3474   else
3475     return REG_ECTYPE;
3476
3477   return REG_NOERROR;
3478 }
3479
3480 static bin_tree_t *
3481 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3482                     const unsigned char *class_name,
3483                     const unsigned char *extra,
3484                     int non_match, reg_errcode_t *err)
3485 {
3486   re_bitset_ptr_t sbcset;
3487 #ifdef RE_ENABLE_I18N
3488   re_charset_t *mbcset;
3489   int alloc = 0;
3490 #endif /* not RE_ENABLE_I18N */
3491   reg_errcode_t ret;
3492   re_token_t br_token;
3493   bin_tree_t *tree;
3494
3495   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3496 #ifdef RE_ENABLE_I18N
3497   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3498 #endif /* RE_ENABLE_I18N */
3499
3500 #ifdef RE_ENABLE_I18N
3501   if (BE (sbcset == NULL || mbcset == NULL, 0))
3502 #else /* not RE_ENABLE_I18N */
3503   if (BE (sbcset == NULL, 0))
3504 #endif /* not RE_ENABLE_I18N */
3505     {
3506       *err = REG_ESPACE;
3507       return NULL;
3508     }
3509
3510   if (non_match)
3511     {
3512 #ifdef RE_ENABLE_I18N
3513       /*
3514       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3515         bitset_set(cset->sbcset, '\0');
3516       */
3517       mbcset->non_match = 1;
3518 #endif /* not RE_ENABLE_I18N */
3519     }
3520
3521   /* We don't care the syntax in this case.  */
3522   ret = build_charclass (trans, sbcset,
3523 #ifdef RE_ENABLE_I18N
3524                          mbcset, &alloc,
3525 #endif /* RE_ENABLE_I18N */
3526                          class_name, 0);
3527
3528   if (BE (ret != REG_NOERROR, 0))
3529     {
3530       re_free (sbcset);
3531 #ifdef RE_ENABLE_I18N
3532       free_charset (mbcset);
3533 #endif /* RE_ENABLE_I18N */
3534       *err = ret;
3535       return NULL;
3536     }
3537   /* \w match '_' also.  */
3538   for (; *extra; extra++)
3539     bitset_set (sbcset, *extra);
3540
3541   /* If it is non-matching list.  */
3542   if (non_match)
3543     bitset_not (sbcset);
3544
3545 #ifdef RE_ENABLE_I18N
3546   /* Ensure only single byte characters are set.  */
3547   if (dfa->mb_cur_max > 1)
3548     bitset_mask (sbcset, dfa->sb_char);
3549 #endif
3550
3551   /* Build a tree for simple bracket.  */
3552   br_token.type = SIMPLE_BRACKET;
3553   br_token.opr.sbcset = sbcset;
3554   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3555   if (BE (tree == NULL, 0))
3556     goto build_word_op_espace;
3557
3558 #ifdef RE_ENABLE_I18N
3559   if (dfa->mb_cur_max > 1)
3560     {
3561       bin_tree_t *mbc_tree;
3562       /* Build a tree for complex bracket.  */
3563       br_token.type = COMPLEX_BRACKET;
3564       br_token.opr.mbcset = mbcset;
3565       dfa->has_mb_node = 1;
3566       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3567       if (BE (mbc_tree == NULL, 0))
3568         goto build_word_op_espace;
3569       /* Then join them by ALT node.  */
3570       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3571       if (BE (mbc_tree != NULL, 1))
3572         return tree;
3573     }
3574   else
3575     {
3576       free_charset (mbcset);
3577       return tree;
3578     }
3579 #else /* not RE_ENABLE_I18N */
3580   return tree;
3581 #endif /* not RE_ENABLE_I18N */
3582
3583  build_word_op_espace:
3584   re_free (sbcset);
3585 #ifdef RE_ENABLE_I18N
3586   free_charset (mbcset);
3587 #endif /* RE_ENABLE_I18N */
3588   *err = REG_ESPACE;
3589   return NULL;
3590 }
3591
3592 /* This is intended for the expressions like "a{1,3}".
3593    Fetch a number from `input', and return the number.
3594    Return -1, if the number field is empty like "{,1}".
3595    Return -2, If an error is occured.  */
3596
3597 static int
3598 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3599 {
3600   int num = -1;
3601   unsigned char c;
3602   while (1)
3603     {
3604       fetch_token (token, input, syntax);
3605       c = token->opr.c;
3606       if (BE (token->type == END_OF_RE, 0))
3607         return -2;
3608       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3609         break;
3610       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3611              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3612       num = (num > REG_DUP_MAX) ? -2 : num;
3613     }
3614   return num;
3615 }
3616 \f
3617 #ifdef RE_ENABLE_I18N
3618 static void
3619 free_charset (re_charset_t *cset)
3620 {
3621   re_free (cset->mbchars);
3622 # ifdef _LIBC
3623   re_free (cset->coll_syms);
3624   re_free (cset->equiv_classes);
3625   re_free (cset->range_starts);
3626   re_free (cset->range_ends);
3627 # endif
3628   re_free (cset->char_classes);
3629   re_free (cset);
3630 }
3631 #endif /* RE_ENABLE_I18N */
3632 \f
3633 /* Functions for binary tree operation.  */
3634
3635 /* Create a tree node.  */
3636
3637 static bin_tree_t *
3638 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3639              re_token_type_t type)
3640 {
3641   re_token_t t;
3642   t.type = type;
3643   return create_token_tree (dfa, left, right, &t);
3644 }
3645
3646 static bin_tree_t *
3647 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3648                    const re_token_t *token)
3649 {
3650   bin_tree_t *tree;
3651   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3652     {
3653       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3654
3655       if (storage == NULL)
3656         return NULL;
3657       storage->next = dfa->str_tree_storage;
3658       dfa->str_tree_storage = storage;
3659       dfa->str_tree_storage_idx = 0;
3660     }
3661   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3662
3663   tree->parent = NULL;
3664   tree->left = left;
3665   tree->right = right;
3666   tree->token = *token;
3667   tree->token.duplicated = 0;
3668   tree->token.opt_subexp = 0;
3669   tree->first = NULL;
3670   tree->next = NULL;
3671   tree->node_idx = -1;
3672
3673   if (left != NULL)
3674     left->parent = tree;
3675   if (right != NULL)
3676     right->parent = tree;
3677   return tree;
3678 }
3679
3680 /* Mark the tree SRC as an optional subexpression.
3681    To be called from preorder or postorder.  */
3682
3683 static reg_errcode_t
3684 mark_opt_subexp (void *extra, bin_tree_t *node)
3685 {
3686   int idx = (int) (long) extra;
3687   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3688     node->token.opt_subexp = 1;
3689
3690   return REG_NOERROR;
3691 }
3692
3693 /* Free the allocated memory inside NODE. */
3694
3695 static void
3696 free_token (re_token_t *node)
3697 {
3698 #ifdef RE_ENABLE_I18N
3699   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3700     free_charset (node->opr.mbcset);
3701   else
3702 #endif /* RE_ENABLE_I18N */
3703     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3704       re_free (node->opr.sbcset);
3705 }
3706
3707 /* Worker function for tree walking.  Free the allocated memory inside NODE
3708    and its children. */
3709
3710 static reg_errcode_t
3711 free_tree (void *extra, bin_tree_t *node)
3712 {
3713   free_token (&node->token);
3714   return REG_NOERROR;
3715 }
3716
3717
3718 /* Duplicate the node SRC, and return new node.  This is a preorder
3719    visit similar to the one implemented by the generic visitor, but
3720    we need more infrastructure to maintain two parallel trees --- so,
3721    it's easier to duplicate.  */
3722
3723 static bin_tree_t *
3724 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3725 {
3726   const bin_tree_t *node;
3727   bin_tree_t *dup_root;
3728   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3729
3730   for (node = root; ; )
3731     {
3732       /* Create a new tree and link it back to the current parent.  */
3733       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3734       if (*p_new == NULL)
3735         return NULL;
3736       (*p_new)->parent = dup_node;
3737       (*p_new)->token.duplicated = 1;
3738       dup_node = *p_new;
3739
3740       /* Go to the left node, or up and to the right.  */
3741       if (node->left)
3742         {
3743           node = node->left;
3744           p_new = &dup_node->left;
3745         }
3746       else
3747         {
3748           const bin_tree_t *prev = NULL;
3749           while (node->right == prev || node->right == NULL)
3750             {
3751               prev = node;
3752               node = node->parent;
3753               dup_node = dup_node->parent;
3754               if (!node)
3755                 return dup_root;
3756             }
3757           node = node->right;
3758           p_new = &dup_node->right;
3759         }
3760     }
3761 }