lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           Idx length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
  94                                       re_bitset_ptr_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
 103                                       re_bitset_ptr_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        unsigned REG_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (const char *pattern, size_t length,
 214                     struct re_pattern_buffer *bufp)
 215 {
 216   reg_errcode_t ret;
 217
 218   /* And GNU code determines whether or not to get register information
 219      by passing null for the REGS argument to re_match, etc., not by
 220      setting re_no_sub, unless REG_NO_SUB is set.  */
 221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
 222
 223   /* Match anchors at newline.  */
 224   bufp->re_newline_anchor = 1;
 225
 226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 227
 228   if (!ret)
 229     return NULL;
 230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 231 }
 232 #ifdef _LIBC
 233 weak_alias (__re_compile_pattern, re_compile_pattern)
 234 #endif
 235
 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 237    also be assigned to arbitrarily: each pattern buffer stores its own
 238    syntax, so it can be changed between regex compilations.  */
 239 /* This has no initializer because initialized variables in Emacs
 240    become read-only after dumping.  */
 241 reg_syntax_t re_syntax_options;
 242
 243
 244 /* Specify the precise syntax of regexps for compilation.  This provides
 245    for compatibility for various utilities which historically have
 246    different, incompatible syntaxes.
 247
 248    The argument SYNTAX is a bit mask comprised of the various bits
 249    defined in regex.h.  We return the old syntax.  */
 250
 251 reg_syntax_t
 252 re_set_syntax (reg_syntax_t syntax)
 253 {
 254   reg_syntax_t ret = re_syntax_options;
 255
 256   re_syntax_options = syntax;
 257   return ret;
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_set_syntax, re_set_syntax)
 261 #endif
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 267   char *fastmap = bufp->re_fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->re_fastmap_accurate = 1;
 278   return 0;
 279 }
 280 #ifdef _LIBC
 281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 282 #endif
 283
 284 static inline void
 285 __attribute ((always_inline))
 286 re_set_fastmap (char *fastmap, bool icase, int ch)
 287 {
 288   fastmap[ch] = 1;
 289   if (icase)
 290     fastmap[tolower (ch)] = 1;
 291 }
 292
 293 /* Helper function for re_compile_fastmap.
 294    Compile fastmap for the initial_state INIT_STATE.  */
 295
 296 static void
 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 298                          char *fastmap)
 299 {
 300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 301   Idx node_cnt;
 302   bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
 303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 304     {
 305       Idx node = init_state->nodes.elems[node_cnt];
 306       re_token_type_t type = dfa->nodes[node].type;
 307
 308       if (type == CHARACTER)
 309         {
 310           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 311 #ifdef RE_ENABLE_I18N
 312           if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 313             {
 314               unsigned char buf[MB_LEN_MAX];
 315               unsigned char *p;
 316               wchar_t wc;
 317               mbstate_t state;
 318
 319               p = buf;
 320               *p++ = dfa->nodes[node].opr.c;
 321               while (++node < dfa->nodes_len
 322                      && dfa->nodes[node].type == CHARACTER
 323                      && dfa->nodes[node].mb_partial)
 324                 *p++ = dfa->nodes[node].opr.c;
 325               memset (&state, 0, sizeof (state));
 326               if (mbrtowc (&wc, (const char *) buf, p - buf,
 327                            &state) == p - buf
 328                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 329                       != (size_t) -1))
 330                 re_set_fastmap (fastmap, false, buf[0]);
 331             }
 332 #endif
 333         }
 334       else if (type == SIMPLE_BRACKET)
 335         {
 336           int i, j, ch;
 337           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 338             for (j = 0; j < UINT_BITS; ++j, ++ch)
 339               if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
 340                 re_set_fastmap (fastmap, icase, ch);
 341         }
 342 #ifdef RE_ENABLE_I18N
 343       else if (type == COMPLEX_BRACKET)
 344         {
 345           Idx i;
 346           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 347           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 348               || cset->nranges || cset->nchar_classes)
 349             {
 350 # ifdef _LIBC
 351               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 352                 {
 353                   /* In this case we want to catch the bytes which are
 354                      the first byte of any collation elements.
 355                      e.g. In da_DK, we want to catch 'a' since "aa"
 356                           is a valid collation element, and don't catch
 357                           'b' since 'b' is the only collation element
 358                           which starts from 'b'.  */
 359                   int j, ch;
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 363                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 364                       if (table[ch] < 0)
 365                         re_set_fastmap (fastmap, icase, ch);
 366                 }
 367 # else
 368               if (dfa->mb_cur_max > 1)
 369                 for (i = 0; i < SBC_MAX; ++i)
 370                   if (__btowc (i) == WEOF)
 371                     re_set_fastmap (fastmap, icase, i);
 372 # endif /* not _LIBC */
 373             }
 374           for (i = 0; i < cset->nmbchars; ++i)
 375             {
 376               char buf[256];
 377               mbstate_t state;
 378               memset (&state, '\0', sizeof (state));
 379               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 380                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 381               if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 382                 {
 383                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 384                       != (size_t) -1)
 385                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 386                 }
 387             }
 388         }
 389 #endif /* RE_ENABLE_I18N */
 390       else if (type == OP_PERIOD
 391 #ifdef RE_ENABLE_I18N
 392                || type == OP_UTF8_PERIOD
 393 #endif /* RE_ENABLE_I18N */
 394                || type == END_OF_RE)
 395         {
 396           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 397           if (type == END_OF_RE)
 398             bufp->re_can_be_null = 1;
 399           return;
 400         }
 401     }
 402 }
 403 \f
 404 /* Entry point for POSIX code.  */
 405 /* regcomp takes a regular expression as a string and compiles it.
 406
 407    PREG is a regex_t *.  We do not expect any fields to be initialized,
 408    since POSIX says we shouldn't.  Thus, we set
 409
 410      `re_buffer' to the compiled pattern;
 411      `re_used' to the length of the compiled pattern;
 412      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
 413        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 414        REG_SYNTAX_POSIX_BASIC;
 415      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
 416      `re_fastmap' to an allocated space for the fastmap;
 417      `re_fastmap_accurate' to zero;
 418      `re_nsub' to the number of subexpressions in PATTERN.
 419
 420    PATTERN is the address of the pattern string.
 421
 422    CFLAGS is a series of bits which affect compilation.
 423
 424      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 425      use POSIX basic syntax.
 426
 427      If REG_NEWLINE is set, then . and [^...] don't match newline.
 428      Also, regexec will try a match beginning after every newline.
 429
 430      If REG_ICASE is set, then we considers upper- and lowercase
 431      versions of letters to be equivalent when matching.
 432
 433      If REG_NOSUB is set, then when PREG is passed to regexec, that
 434      routine will report only success or failure, and nothing about the
 435      registers.
 436
 437    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 438    the return codes and their meanings.)  */
 439
 440 int
 441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 442 {
 443   reg_errcode_t ret;
 444   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
 445                         : REG_SYNTAX_POSIX_BASIC);
 446
 447   preg->re_buffer = NULL;
 448   preg->re_allocated = 0;
 449   preg->re_used = 0;
 450
 451   /* Try to allocate space for the fastmap.  */
 452   preg->re_fastmap = re_malloc (char, SBC_MAX);
 453   if (BE (preg->re_fastmap == NULL, 0))
 454     return REG_ESPACE;
 455
 456   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
 457
 458   /* If REG_NEWLINE is set, newlines are treated differently.  */
 459   if (cflags & REG_NEWLINE)
 460     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 461       syntax &= ~REG_DOT_NEWLINE;
 462       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
 463       /* It also changes the matching behavior.  */
 464       preg->re_newline_anchor = 1;
 465     }
 466   else
 467     preg->re_newline_anchor = 0;
 468   preg->re_no_sub = !!(cflags & REG_NOSUB);
 469   preg->re_translate = NULL;
 470
 471   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 472
 473   /* POSIX doesn't distinguish between an unmatched open-group and an
 474      unmatched close-group: both are REG_EPAREN.  */
 475   if (ret == REG_ERPAREN)
 476     ret = REG_EPAREN;
 477
 478   /* We have already checked preg->re_fastmap != NULL.  */
 479   if (BE (ret == REG_NOERROR, 1))
 480     /* Compute the fastmap now, since regexec cannot modify the pattern
 481        buffer.  This function never fails in this implementation.  */
 482     (void) re_compile_fastmap (preg);
 483   else
 484     {
 485       /* Some error occurred while compiling the expression.  */
 486       re_free (preg->re_fastmap);
 487       preg->re_fastmap = NULL;
 488     }
 489
 490   return (int) ret;
 491 }
 492 #ifdef _LIBC
 493 weak_alias (__regcomp, regcomp)
 494 #endif
 495
 496 /* Returns a message corresponding to an error code, ERRCODE, returned
 497    from either regcomp or regexec.   We don't use PREG here.  */
 498
 499 size_t
 500 regerror (int errcode, const regex_t *__restrict preg,
 501           char *__restrict errbuf, size_t errbuf_size)
 502 {
 503   const char *msg;
 504   size_t msg_size;
 505
 506   if (BE (errcode < 0
 507           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 508                                / sizeof (__re_error_msgid_idx[0])), 0))
 509     /* Only error codes returned by the rest of the code should be passed
 510        to this routine.  If we are given anything else, or if other regex
 511        code generates an invalid error code, then the program has a bug.
 512        Dump core so we can fix it.  */
 513     abort ();
 514
 515   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 516
 517   msg_size = strlen (msg) + 1; /* Includes the null.  */
 518
 519   if (BE (errbuf_size != 0, 1))
 520     {
 521       if (BE (msg_size > errbuf_size, 0))
 522         {
 523 #if defined HAVE_MEMPCPY || defined _LIBC
 524           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 525 #else
 526           memcpy (errbuf, msg, errbuf_size - 1);
 527           errbuf[errbuf_size - 1] = 0;
 528 #endif
 529         }
 530       else
 531         memcpy (errbuf, msg, msg_size);
 532     }
 533
 534   return msg_size;
 535 }
 536 #ifdef _LIBC
 537 weak_alias (__regerror, regerror)
 538 #endif
 539
 540
 541 #ifdef RE_ENABLE_I18N
 542 /* This static array is used for the map to single-byte characters when
 543    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 544    it the same all the time.  UTF-8 is the preferred encoding so this is
 545    a worthwhile optimization.  */
 546 static const bitset utf8_sb_map =
 547 {
 548   /* Set the first 128 bits.  */
 549 # if UINT_MAX == 0xffffffff
 550   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 551 # else
 552 #  error "Add case for new unsigned int size"
 553 # endif
 554 };
 555 #endif
 556
 557
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
 560 {
 561   Idx i, j;
 562
 563   if (dfa->nodes)
 564     for (i = 0; i < dfa->nodes_len; ++i)
 565       free_token (dfa->nodes + i);
 566   re_free (dfa->nexts);
 567   for (i = 0; i < dfa->nodes_len; ++i)
 568     {
 569       if (dfa->eclosures != NULL)
 570         re_node_set_free (dfa->eclosures + i);
 571       if (dfa->inveclosures != NULL)
 572         re_node_set_free (dfa->inveclosures + i);
 573       if (dfa->edests != NULL)
 574         re_node_set_free (dfa->edests + i);
 575     }
 576   re_free (dfa->edests);
 577   re_free (dfa->eclosures);
 578   re_free (dfa->inveclosures);
 579   re_free (dfa->nodes);
 580
 581   if (dfa->state_table)
 582     for (i = 0; i <= dfa->state_hash_mask; ++i)
 583       {
 584         struct re_state_table_entry *entry = dfa->state_table + i;
 585         for (j = 0; j < entry->num; ++j)
 586           {
 587             re_dfastate_t *state = entry->array[j];
 588             free_state (state);
 589           }
 590         re_free (entry->array);
 591       }
 592   re_free (dfa->state_table);
 593 #ifdef RE_ENABLE_I18N
 594   if (dfa->sb_char != utf8_sb_map)
 595     re_free (dfa->sb_char);
 596 #endif
 597   re_free (dfa->subexp_map);
 598 #ifdef DEBUG
 599   re_free (dfa->re_str);
 600 #endif
 601
 602   re_free (dfa);
 603 }
 604
 605
 606 /* Free dynamically allocated space used by PREG.  */
 607
 608 void
 609 regfree (regex_t *preg)
 610 {
 611   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 612   if (BE (dfa != NULL, 1))
 613     free_dfa_content (dfa);
 614   preg->re_buffer = NULL;
 615   preg->re_allocated = 0;
 616
 617   re_free (preg->re_fastmap);
 618   preg->re_fastmap = NULL;
 619
 620   re_free (preg->re_translate);
 621   preg->re_translate = NULL;
 622 }
 623 #ifdef _LIBC
 624 weak_alias (__regfree, regfree)
 625 #endif
 626 \f
 627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 628    them unless specifically requested.  */
 629
 630 #if defined _REGEX_RE_COMP || defined _LIBC
 631
 632 /* BSD has one and only one pattern buffer.  */
 633 static struct re_pattern_buffer re_comp_buf;
 634
 635 char *
 636 # ifdef _LIBC
 637 /* Make these definitions weak in libc, so POSIX programs can redefine
 638    these names if they don't use our functions, and still use
 639    regcomp/regexec above without link errors.  */
 640 weak_function
 641 # endif
 642 re_comp (const char *s)
 643 {
 644   reg_errcode_t ret;
 645   char *fastmap;
 646
 647   if (!s)
 648     {
 649       if (!re_comp_buf.re_buffer)
 650         return gettext ("No previous regular expression");
 651       return 0;
 652     }
 653
 654   if (re_comp_buf.re_buffer)
 655     {
 656       fastmap = re_comp_buf.re_fastmap;
 657       re_comp_buf.re_fastmap = NULL;
 658       __regfree (&re_comp_buf);
 659       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 660       re_comp_buf.re_fastmap = fastmap;
 661     }
 662
 663   if (re_comp_buf.re_fastmap == NULL)
 664     {
 665       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
 666       if (re_comp_buf.re_fastmap == NULL)
 667         return (char *) gettext (__re_error_msgid
 668                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 669     }
 670
 671   /* Since `re_exec' always passes NULL for the `regs' argument, we
 672      don't need to initialize the pattern buffer fields which affect it.  */
 673
 674   /* Match anchors at newlines.  */
 675   re_comp_buf.re_newline_anchor = 1;
 676
 677   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 678
 679   if (!ret)
 680     return NULL;
 681
 682   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 683   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 684 }
 685
 686 #ifdef _LIBC
 687 libc_freeres_fn (free_mem)
 688 {
 689   __regfree (&re_comp_buf);
 690 }
 691 #endif
 692
 693 #endif /* _REGEX_RE_COMP */
 694 \f
 695 /* Internal entry point.
 696    Compile the regular expression PATTERN, whose length is LENGTH.
 697    SYNTAX indicate regular expression's syntax.  */
 698
 699 static reg_errcode_t
 700 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
 701                      reg_syntax_t syntax)
 702 {
 703   reg_errcode_t err = REG_NOERROR;
 704   re_dfa_t *dfa;
 705   re_string_t regexp;
 706
 707   /* Initialize the pattern buffer.  */
 708   preg->re_fastmap_accurate = 0;
 709   preg->re_syntax = syntax;
 710   preg->re_not_bol = preg->re_not_eol = 0;
 711   preg->re_used = 0;
 712   preg->re_nsub = 0;
 713   preg->re_can_be_null = 0;
 714   preg->re_regs_allocated = REG_UNALLOCATED;
 715
 716   /* Initialize the dfa.  */
 717   dfa = (re_dfa_t *) preg->re_buffer;
 718   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
 719     {
 720       /* If zero allocated, but buffer is non-null, try to realloc
 721          enough space.  This loses if buffer's address is bogus, but
 722          that is the user's responsibility.  If buffer is null this
 723          is a simple allocation.  */
 724       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
 725       if (dfa == NULL)
 726         return REG_ESPACE;
 727       preg->re_allocated = sizeof (re_dfa_t);
 728       preg->re_buffer = (unsigned char *) dfa;
 729     }
 730   preg->re_used = sizeof (re_dfa_t);
 731
 732   __libc_lock_init (dfa->lock);
 733
 734   err = init_dfa (dfa, length);
 735   if (BE (err != REG_NOERROR, 0))
 736     {
 737       free_dfa_content (dfa);
 738       preg->re_buffer = NULL;
 739       preg->re_allocated = 0;
 740       return err;
 741     }
 742 #ifdef DEBUG
 743   dfa->re_str = re_malloc (char, length + 1);
 744   strncpy (dfa->re_str, pattern, length + 1);
 745 #endif
 746
 747   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
 748                              syntax & REG_IGNORE_CASE, dfa);
 749   if (BE (err != REG_NOERROR, 0))
 750     {
 751     re_compile_internal_free_return:
 752       free_workarea_compile (preg);
 753       re_string_destruct (&regexp);
 754       free_dfa_content (dfa);
 755       preg->re_buffer = NULL;
 756       preg->re_allocated = 0;
 757       return err;
 758     }
 759
 760   /* Parse the regular expression, and build a structure tree.  */
 761   preg->re_nsub = 0;
 762   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 763   if (BE (dfa->str_tree == NULL, 0))
 764     goto re_compile_internal_free_return;
 765
 766   /* Analyze the tree and create the nfa.  */
 767   err = analyze (preg);
 768   if (BE (err != REG_NOERROR, 0))
 769     goto re_compile_internal_free_return;
 770
 771 #ifdef RE_ENABLE_I18N
 772   /* If possible, do searching in single byte encoding to speed things up.  */
 773   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
 774     optimize_utf8 (dfa);
 775 #endif
 776
 777   /* Then create the initial state of the dfa.  */
 778   err = create_initial_state (dfa);
 779
 780   /* Release work areas.  */
 781   free_workarea_compile (preg);
 782   re_string_destruct (&regexp);
 783
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->re_buffer = NULL;
 788       preg->re_allocated = 0;
 789     }
 790
 791   return err;
 792 }
 793
 794 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 795    as the initial length of some arrays.  */
 796
 797 static reg_errcode_t
 798 init_dfa (re_dfa_t *dfa, Idx pat_len)
 799 {
 800   __re_size_t table_size;
 801 #ifndef _LIBC
 802   char *codeset_name;
 803 #endif
 804
 805   memset (dfa, '\0', sizeof (re_dfa_t));
 806
 807   /* Force allocation of str_tree_storage the first time.  */
 808   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 809
 810   dfa->nodes_alloc = pat_len + 1;
 811   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 812
 813   /*  table_size = 2 ^ ceil(log pat_len) */
 814   for (table_size = 1; table_size <= pat_len; table_size <<= 1)
 815     if (0 < (Idx) -1 && table_size == 0)
 816       return REG_ESPACE;
 817
 818   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
 819   dfa->state_hash_mask = table_size - 1;
 820
 821   dfa->mb_cur_max = MB_CUR_MAX;
 822 #ifdef _LIBC
 823   if (dfa->mb_cur_max == 6
 824       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 825     dfa->is_utf8 = 1;
 826   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 827                        != 0);
 828 #else
 829 # ifdef HAVE_LANGINFO_CODESET
 830   codeset_name = nl_langinfo (CODESET);
 831 # else
 832   codeset_name = getenv ("LC_ALL");
 833   if (codeset_name == NULL || codeset_name[0] == '\0')
 834     codeset_name = getenv ("LC_CTYPE");
 835   if (codeset_name == NULL || codeset_name[0] == '\0')
 836     codeset_name = getenv ("LANG");
 837   if (codeset_name == NULL)
 838     codeset_name = "";
 839   else if (strchr (codeset_name, '.') !=  NULL)
 840     codeset_name = strchr (codeset_name, '.') + 1;
 841 # endif
 842
 843   if (strcasecmp (codeset_name, "UTF-8") == 0
 844       || strcasecmp (codeset_name, "UTF8") == 0)
 845     dfa->is_utf8 = 1;
 846
 847   /* We check exhaustively in the loop below if this charset is a
 848      superset of ASCII.  */
 849   dfa->map_notascii = 0;
 850 #endif
 851
 852 #ifdef RE_ENABLE_I18N
 853   if (dfa->mb_cur_max > 1)
 854     {
 855       if (dfa->is_utf8)
 856         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 857       else
 858         {
 859           int i, j, ch;
 860
 861           dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
 862           if (BE (dfa->sb_char == NULL, 0))
 863             return REG_ESPACE;
 864
 865           /* Clear all bits by, then set those corresponding to single
 866              byte chars.  */
 867           bitset_empty (dfa->sb_char);
 868
 869           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 870             for (j = 0; j < UINT_BITS; ++j, ++ch)
 871               {
 872                 wint_t wch = __btowc (ch);
 873                 if (wch != WEOF)
 874                   dfa->sb_char[i] |= 1u << j;
 875 # ifndef _LIBC
 876                 if (isascii (ch) && wch != ch)
 877                   dfa->map_notascii = 1;
 878 # endif
 879               }
 880         }
 881     }
 882 #endif
 883
 884   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 885     return REG_ESPACE;
 886   return REG_NOERROR;
 887 }
 888
 889 /* Initialize WORD_CHAR table, which indicate which character is
 890    "word".  In this case "word" means that it is the word construction
 891    character used by some operators like "\<", "\>", etc.  */
 892
 893 static void
 894 init_word_char (re_dfa_t *dfa)
 895 {
 896   int i, j, ch;
 897   dfa->word_ops_used = 1;
 898   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 899     for (j = 0; j < UINT_BITS; ++j, ++ch)
 900       if (isalnum (ch) || ch == '_')
 901         dfa->word_char[i] |= 1u << j;
 902 }
 903
 904 /* Free the work area which are only used while compiling.  */
 905
 906 static void
 907 free_workarea_compile (regex_t *preg)
 908 {
 909   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 910   bin_tree_storage_t *storage, *next;
 911   for (storage = dfa->str_tree_storage; storage; storage = next)
 912     {
 913       next = storage->next;
 914       re_free (storage);
 915     }
 916   dfa->str_tree_storage = NULL;
 917   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 918   dfa->str_tree = NULL;
 919   re_free (dfa->org_indices);
 920   dfa->org_indices = NULL;
 921 }
 922
 923 /* Create initial states for all contexts.  */
 924
 925 static reg_errcode_t
 926 create_initial_state (re_dfa_t *dfa)
 927 {
 928   Idx first, i;
 929   reg_errcode_t err;
 930   re_node_set init_nodes;
 931
 932   /* Initial states have the epsilon closure of the node which is
 933      the first node of the regular expression.  */
 934   first = dfa->str_tree->first->node_idx;
 935   dfa->init_node = first;
 936   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 937   if (BE (err != REG_NOERROR, 0))
 938     return err;
 939
 940   /* The back-references which are in initial states can epsilon transit,
 941      since in this case all of the subexpressions can be null.
 942      Then we add epsilon closures of the nodes which are the next nodes of
 943      the back-references.  */
 944   if (dfa->nbackref > 0)
 945     for (i = 0; i < init_nodes.nelem; ++i)
 946       {
 947         Idx node_idx = init_nodes.elems[i];
 948         re_token_type_t type = dfa->nodes[node_idx].type;
 949
 950         Idx clexp_idx;
 951         if (type != OP_BACK_REF)
 952           continue;
 953         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 954           {
 955             re_token_t *clexp_node;
 956             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 957             if (clexp_node->type == OP_CLOSE_SUBEXP
 958                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 959               break;
 960           }
 961         if (clexp_idx == init_nodes.nelem)
 962           continue;
 963
 964         if (type == OP_BACK_REF)
 965           {
 966             Idx dest_idx = dfa->edests[node_idx].elems[0];
 967             if (!re_node_set_contains (&init_nodes, dest_idx))
 968               {
 969                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 970                 i = 0;
 971               }
 972           }
 973       }
 974
 975   /* It must be the first time to invoke acquire_state.  */
 976   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 977   /* We don't check ERR here, since the initial state must not be NULL.  */
 978   if (BE (dfa->init_state == NULL, 0))
 979     return err;
 980   if (dfa->init_state->has_constraint)
 981     {
 982       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 983                                                        CONTEXT_WORD);
 984       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 985                                                      CONTEXT_NEWLINE);
 986       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 987                                                          &init_nodes,
 988                                                          CONTEXT_NEWLINE
 989                                                          | CONTEXT_BEGBUF);
 990       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 991               || dfa->init_state_begbuf == NULL, 0))
 992         return err;
 993     }
 994   else
 995     dfa->init_state_word = dfa->init_state_nl
 996       = dfa->init_state_begbuf = dfa->init_state;
 997
 998   re_node_set_free (&init_nodes);
 999   return REG_NOERROR;
1000 }
1001 \f
1002 #ifdef RE_ENABLE_I18N
1003 /* If it is possible to do searching in single byte encoding instead of UTF-8
1004    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1005    DFA nodes where needed.  */
1006
1007 static void
1008 optimize_utf8 (re_dfa_t *dfa)
1009 {
1010   Idx node;
1011   int i;
1012   bool mb_chars = false;
1013   bool has_period = false;
1014
1015   for (node = 0; node < dfa->nodes_len; ++node)
1016     switch (dfa->nodes[node].type)
1017       {
1018       case CHARACTER:
1019         if (dfa->nodes[node].opr.c >= 0x80)
1020           mb_chars = true;
1021         break;
1022       case ANCHOR:
1023         switch (dfa->nodes[node].opr.idx)
1024           {
1025           case LINE_FIRST:
1026           case LINE_LAST:
1027           case BUF_FIRST:
1028           case BUF_LAST:
1029             break;
1030           default:
1031             /* Word anchors etc. cannot be handled.  */
1032             return;
1033           }
1034         break;
1035       case OP_PERIOD:
1036         has_period = true;
1037         break;
1038       case OP_BACK_REF:
1039       case OP_ALT:
1040       case END_OF_RE:
1041       case OP_DUP_ASTERISK:
1042       case OP_OPEN_SUBEXP:
1043       case OP_CLOSE_SUBEXP:
1044         break;
1045       case COMPLEX_BRACKET:
1046         return;
1047       case SIMPLE_BRACKET:
1048         /* Just double check.  */
1049         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1050           if (dfa->nodes[node].opr.sbcset[i])
1051             return;
1052         break;
1053       default:
1054         abort ();
1055       }
1056
1057   if (mb_chars || has_period)
1058     for (node = 0; node < dfa->nodes_len; ++node)
1059       {
1060         if (dfa->nodes[node].type == CHARACTER
1061             && dfa->nodes[node].opr.c >= 0x80)
1062           dfa->nodes[node].mb_partial = 0;
1063         else if (dfa->nodes[node].type == OP_PERIOD)
1064           dfa->nodes[node].type = OP_UTF8_PERIOD;
1065       }
1066
1067   /* The search can be in single byte locale.  */
1068   dfa->mb_cur_max = 1;
1069   dfa->is_utf8 = 0;
1070   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1071 }
1072 #endif
1073 \f
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075    "eclosure", and "inveclosure".  */
1076
1077 static reg_errcode_t
1078 analyze (regex_t *preg)
1079 {
1080   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1081   reg_errcode_t ret;
1082
1083   /* Allocate arrays.  */
1084   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1085   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1086   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089           || dfa->eclosures == NULL, 0))
1090     return REG_ESPACE;
1091
1092   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1093   if (dfa->subexp_map != NULL)
1094     {
1095       Idx i;
1096       for (i = 0; i < preg->re_nsub; i++)
1097         dfa->subexp_map[i] = i;
1098       preorder (dfa->str_tree, optimize_subexps, dfa);
1099       for (i = 0; i < preg->re_nsub; i++)
1100         if (dfa->subexp_map[i] != i)
1101           break;
1102       if (i == preg->re_nsub)
1103         {
1104           free (dfa->subexp_map);
1105           dfa->subexp_map = NULL;
1106         }
1107     }
1108
1109   ret = postorder (dfa->str_tree, lower_subexps, preg);
1110   if (BE (ret != REG_NOERROR, 0))
1111     return ret;
1112   ret = postorder (dfa->str_tree, calc_first, dfa);
1113   if (BE (ret != REG_NOERROR, 0))
1114     return ret;
1115   preorder (dfa->str_tree, calc_next, dfa);
1116   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117   if (BE (ret != REG_NOERROR, 0))
1118     return ret;
1119   ret = calc_eclosure (dfa);
1120   if (BE (ret != REG_NOERROR, 0))
1121     return ret;
1122
1123   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1125   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126       || dfa->nbackref)
1127     {
1128       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129       if (BE (dfa->inveclosures == NULL, 0))
1130         return REG_ESPACE;
1131       ret = calc_inveclosure (dfa);
1132     }
1133
1134   return ret;
1135 }
1136
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138    implement parse tree visits.  Instead, we use parent pointers and
1139    some hairy code in these two functions.  */
1140 static reg_errcode_t
1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142            void *extra)
1143 {
1144   bin_tree_t *node, *prev;
1145
1146   for (node = root; ; )
1147     {
1148       /* Descend down the tree, preferably to the left (or to the right
1149          if that's the only child).  */
1150       while (node->left || node->right)
1151         if (node->left)
1152           node = node->left;
1153         else
1154           node = node->right;
1155
1156       do
1157         {
1158           reg_errcode_t err = fn (extra, node);
1159           if (BE (err != REG_NOERROR, 0))
1160             return err;
1161           if (node->parent == NULL)
1162             return REG_NOERROR;
1163           prev = node;
1164           node = node->parent;
1165         }
1166       /* Go up while we have a node that is reached from the right.  */
1167       while (node->right == prev || node->right == NULL);
1168       node = node->right;
1169     }
1170 }
1171
1172 static reg_errcode_t
1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1174           void *extra)
1175 {
1176   bin_tree_t *node;
1177
1178   for (node = root; ; )
1179     {
1180       reg_errcode_t err = fn (extra, node);
1181       if (BE (err != REG_NOERROR, 0))
1182         return err;
1183
1184       /* Go to the left node, or up and to the right.  */
1185       if (node->left)
1186         node = node->left;
1187       else
1188         {
1189           bin_tree_t *prev = NULL;
1190           while (node->right == prev || node->right == NULL)
1191             {
1192               prev = node;
1193               node = node->parent;
1194               if (!node)
1195                 return REG_NOERROR;
1196             }
1197           node = node->right;
1198         }
1199     }
1200 }
1201
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1204    backreferences as well.  Requires a preorder visit.  */
1205 static reg_errcode_t
1206 optimize_subexps (void *extra, bin_tree_t *node)
1207 {
1208   re_dfa_t *dfa = (re_dfa_t *) extra;
1209
1210   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1211     {
1212       int idx = node->token.opr.idx;
1213       node->token.opr.idx = dfa->subexp_map[idx];
1214       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215     }
1216
1217   else if (node->token.type == SUBEXP
1218            && node->left && node->left->token.type == SUBEXP)
1219     {
1220       Idx other_idx = node->left->token.opr.idx;
1221
1222       node->left = node->left->left;
1223       if (node->left)
1224         node->left->parent = node;
1225
1226       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227       if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1228         dfa->used_bkref_map &= ~(1u << other_idx);
1229     }
1230
1231   return REG_NOERROR;
1232 }
1233
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1236 static reg_errcode_t
1237 lower_subexps (void *extra, bin_tree_t *node)
1238 {
1239   regex_t *preg = (regex_t *) extra;
1240   reg_errcode_t err = REG_NOERROR;
1241
1242   if (node->left && node->left->token.type == SUBEXP)
1243     {
1244       node->left = lower_subexp (&err, preg, node->left);
1245       if (node->left)
1246         node->left->parent = node;
1247     }
1248   if (node->right && node->right->token.type == SUBEXP)
1249     {
1250       node->right = lower_subexp (&err, preg, node->right);
1251       if (node->right)
1252         node->right->parent = node;
1253     }
1254
1255   return err;
1256 }
1257
1258 static bin_tree_t *
1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1260 {
1261   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1262   bin_tree_t *body = node->left;
1263   bin_tree_t *op, *cls, *tree1, *tree;
1264
1265   if (preg->re_no_sub
1266       /* We do not optimize empty subexpressions, because otherwise we may
1267          have bad CONCAT nodes with NULL children.  This is obviously not
1268          very common, so we do not lose much.  An example that triggers
1269          this case is the sed "script" /\(\)/x.  */
1270       && node->left != NULL
1271       && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1272           || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1273     return node->left;
1274
1275   /* Convert the SUBEXP node to the concatenation of an
1276      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1277   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1278   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1279   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1280   tree = create_tree (dfa, op, tree1, CONCAT);
1281   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1282     {
1283       *err = REG_ESPACE;
1284       return NULL;
1285     }
1286
1287   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1288   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1289   return tree;
1290 }
1291
1292 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1293    nodes.  Requires a postorder visit.  */
1294 static reg_errcode_t
1295 calc_first (void *extra, bin_tree_t *node)
1296 {
1297   re_dfa_t *dfa = (re_dfa_t *) extra;
1298   if (node->token.type == CONCAT)
1299     {
1300       node->first = node->left->first;
1301       node->node_idx = node->left->node_idx;
1302     }
1303   else
1304     {
1305       node->first = node;
1306       node->node_idx = re_dfa_add_node (dfa, node->token);
1307       if (BE (node->node_idx == REG_MISSING, 0))
1308         return REG_ESPACE;
1309     }
1310   return REG_NOERROR;
1311 }
1312
1313 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1314 static reg_errcode_t
1315 calc_next (void *extra, bin_tree_t *node)
1316 {
1317   switch (node->token.type)
1318     {
1319     case OP_DUP_ASTERISK:
1320       node->left->next = node;
1321       break;
1322     case CONCAT:
1323       node->left->next = node->right->first;
1324       node->right->next = node->next;
1325       break;
1326     default:
1327       if (node->left)
1328         node->left->next = node->next;
1329       if (node->right)
1330         node->right->next = node->next;
1331       break;
1332     }
1333   return REG_NOERROR;
1334 }
1335
1336 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1337 static reg_errcode_t
1338 link_nfa_nodes (void *extra, bin_tree_t *node)
1339 {
1340   re_dfa_t *dfa = (re_dfa_t *) extra;
1341   Idx idx = node->node_idx;
1342   reg_errcode_t err = REG_NOERROR;
1343
1344   switch (node->token.type)
1345     {
1346     case CONCAT:
1347       break;
1348
1349     case END_OF_RE:
1350       assert (node->next == NULL);
1351       break;
1352
1353     case OP_DUP_ASTERISK:
1354     case OP_ALT:
1355       {
1356         Idx left, right;
1357         dfa->has_plural_match = 1;
1358         if (node->left != NULL)
1359           left = node->left->first->node_idx;
1360         else
1361           left = node->next->node_idx;
1362         if (node->right != NULL)
1363           right = node->right->first->node_idx;
1364         else
1365           right = node->next->node_idx;
1366         assert (REG_VALID_INDEX (left));
1367         assert (REG_VALID_INDEX (right));
1368         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1369       }
1370       break;
1371
1372     case ANCHOR:
1373     case OP_OPEN_SUBEXP:
1374     case OP_CLOSE_SUBEXP:
1375       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1376       break;
1377
1378     case OP_BACK_REF:
1379       dfa->nexts[idx] = node->next->node_idx;
1380       if (node->token.type == OP_BACK_REF)
1381         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1382       break;
1383
1384     default:
1385       assert (!IS_EPSILON_NODE (node->token.type));
1386       dfa->nexts[idx] = node->next->node_idx;
1387       break;
1388     }
1389
1390   return err;
1391 }
1392
1393 /* Duplicate the epsilon closure of the node ROOT_NODE.
1394    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1395    to their own constraint.  */
1396
1397 static reg_errcode_t
1398 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1399                         Idx top_clone_node, Idx root_node,
1400                         unsigned int init_constraint)
1401 {
1402   Idx org_node, clone_node;
1403   bool ok;
1404   unsigned int constraint = init_constraint;
1405   for (org_node = top_org_node, clone_node = top_clone_node;;)
1406     {
1407       Idx org_dest, clone_dest;
1408       if (dfa->nodes[org_node].type == OP_BACK_REF)
1409         {
1410           /* If the back reference epsilon-transit, its destination must
1411              also have the constraint.  Then duplicate the epsilon closure
1412              of the destination of the back reference, and store it in
1413              edests of the back reference.  */
1414           org_dest = dfa->nexts[org_node];
1415           re_node_set_empty (dfa->edests + clone_node);
1416           clone_dest = duplicate_node (dfa, org_dest, constraint);
1417           if (BE (clone_dest == REG_MISSING, 0))
1418             return REG_ESPACE;
1419           dfa->nexts[clone_node] = dfa->nexts[org_node];
1420           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1421           if (BE (! ok, 0))
1422             return REG_ESPACE;
1423         }
1424       else if (dfa->edests[org_node].nelem == 0)
1425         {
1426           /* In case of the node can't epsilon-transit, don't duplicate the
1427              destination and store the original destination as the
1428              destination of the node.  */
1429           dfa->nexts[clone_node] = dfa->nexts[org_node];
1430           break;
1431         }
1432       else if (dfa->edests[org_node].nelem == 1)
1433         {
1434           /* In case of the node can epsilon-transit, and it has only one
1435              destination.  */
1436           org_dest = dfa->edests[org_node].elems[0];
1437           re_node_set_empty (dfa->edests + clone_node);
1438           if (dfa->nodes[org_node].type == ANCHOR)
1439             {
1440               /* In case of the node has another constraint, append it.  */
1441               if (org_node == root_node && clone_node != org_node)
1442                 {
1443                   /* ...but if the node is root_node itself, it means the
1444                      epsilon closure have a loop, then tie it to the
1445                      destination of the root_node.  */
1446                   ok = re_node_set_insert (dfa->edests + clone_node,
1447                                             org_dest);
1448                   if (BE (! ok, 0))
1449                     return REG_ESPACE;
1450                   break;
1451                 }
1452               constraint |= dfa->nodes[org_node].opr.ctx_type;
1453             }
1454           clone_dest = duplicate_node (dfa, org_dest, constraint);
1455           if (BE (clone_dest == REG_MISSING, 0))
1456             return REG_ESPACE;
1457           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1458           if (BE (! ok, 0))
1459             return REG_ESPACE;
1460         }
1461       else /* dfa->edests[org_node].nelem == 2 */
1462         {
1463           /* In case of the node can epsilon-transit, and it has two
1464              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1465           org_dest = dfa->edests[org_node].elems[0];
1466           re_node_set_empty (dfa->edests + clone_node);
1467           /* Search for a duplicated node which satisfies the constraint.  */
1468           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1469           if (clone_dest == REG_MISSING)
1470             {
1471               /* There are no such a duplicated node, create a new one.  */
1472               reg_errcode_t err;
1473               clone_dest = duplicate_node (dfa, org_dest, constraint);
1474               if (BE (clone_dest == REG_MISSING, 0))
1475                 return REG_ESPACE;
1476               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1477               if (BE (! ok, 0))
1478                 return REG_ESPACE;
1479               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1480                                             root_node, constraint);
1481               if (BE (err != REG_NOERROR, 0))
1482                 return err;
1483             }
1484           else
1485             {
1486               /* There are a duplicated node which satisfy the constraint,
1487                  use it to avoid infinite loop.  */
1488               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489               if (BE (! ok, 0))
1490                 return REG_ESPACE;
1491             }
1492
1493           org_dest = dfa->edests[org_node].elems[1];
1494           clone_dest = duplicate_node (dfa, org_dest, constraint);
1495           if (BE (clone_dest == REG_MISSING, 0))
1496             return REG_ESPACE;
1497           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498           if (BE (! ok, 0))
1499             return REG_ESPACE;
1500         }
1501       org_node = org_dest;
1502       clone_node = clone_dest;
1503     }
1504   return REG_NOERROR;
1505 }
1506
1507 /* Search for a node which is duplicated from the node ORG_NODE, and
1508    satisfies the constraint CONSTRAINT.  */
1509
1510 static Idx
1511 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1512                         unsigned int constraint)
1513 {
1514   Idx idx;
1515   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1516     {
1517       if (org_node == dfa->org_indices[idx]
1518           && constraint == dfa->nodes[idx].constraint)
1519         return idx; /* Found.  */
1520     }
1521   return REG_MISSING; /* Not found.  */
1522 }
1523
1524 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1525    Return the index of the new node, or REG_MISSING if insufficient storage is
1526    available.  */
1527
1528 static Idx
1529 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1530 {
1531   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1532   if (BE (dup_idx != REG_MISSING, 1))
1533     {
1534       dfa->nodes[dup_idx].constraint = constraint;
1535       if (dfa->nodes[org_idx].type == ANCHOR)
1536         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1537       dfa->nodes[dup_idx].duplicated = 1;
1538
1539       /* Store the index of the original node.  */
1540       dfa->org_indices[dup_idx] = org_idx;
1541     }
1542   return dup_idx;
1543 }
1544
1545 static reg_errcode_t
1546 calc_inveclosure (re_dfa_t *dfa)
1547 {
1548   Idx src, idx;
1549   bool ok;
1550   for (idx = 0; idx < dfa->nodes_len; ++idx)
1551     re_node_set_init_empty (dfa->inveclosures + idx);
1552
1553   for (src = 0; src < dfa->nodes_len; ++src)
1554     {
1555       Idx *elems = dfa->eclosures[src].elems;
1556       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1557         {
1558           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1559           if (BE (! ok, 0))
1560             return REG_ESPACE;
1561         }
1562     }
1563
1564   return REG_NOERROR;
1565 }
1566
1567 /* Calculate "eclosure" for all the node in DFA.  */
1568
1569 static reg_errcode_t
1570 calc_eclosure (re_dfa_t *dfa)
1571 {
1572   Idx node_idx;
1573   bool incomplete;
1574 #ifdef DEBUG
1575   assert (dfa->nodes_len > 0);
1576 #endif
1577   incomplete = false;
1578   /* For each nodes, calculate epsilon closure.  */
1579   for (node_idx = 0; ; ++node_idx)
1580     {
1581       reg_errcode_t err;
1582       re_node_set eclosure_elem;
1583       if (node_idx == dfa->nodes_len)
1584         {
1585           if (!incomplete)
1586             break;
1587           incomplete = false;
1588           node_idx = 0;
1589         }
1590
1591 #ifdef DEBUG
1592       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1593 #endif
1594
1595       /* If we have already calculated, skip it.  */
1596       if (dfa->eclosures[node_idx].nelem != 0)
1597         continue;
1598       /* Calculate epsilon closure of `node_idx'.  */
1599       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1600       if (BE (err != REG_NOERROR, 0))
1601         return err;
1602
1603       if (dfa->eclosures[node_idx].nelem == 0)
1604         {
1605           incomplete = true;
1606           re_node_set_free (&eclosure_elem);
1607         }
1608     }
1609   return REG_NOERROR;
1610 }
1611
1612 /* Calculate epsilon closure of NODE.  */
1613
1614 static reg_errcode_t
1615 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1616 {
1617   reg_errcode_t err;
1618   unsigned int constraint;
1619   Idx i;
1620   bool incomplete;
1621   bool ok;
1622   re_node_set eclosure;
1623   incomplete = false;
1624   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1625   if (BE (err != REG_NOERROR, 0))
1626     return err;
1627
1628   /* This indicates that we are calculating this node now.
1629      We reference this value to avoid infinite loop.  */
1630   dfa->eclosures[node].nelem = REG_MISSING;
1631
1632   constraint = ((dfa->nodes[node].type == ANCHOR)
1633                 ? dfa->nodes[node].opr.ctx_type : 0);
1634   /* If the current node has constraints, duplicate all nodes.
1635      Since they must inherit the constraints.  */
1636   if (constraint
1637       && dfa->edests[node].nelem
1638       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1639     {
1640       Idx org_node, cur_node;
1641       org_node = cur_node = node;
1642       err = duplicate_node_closure (dfa, node, node, node, constraint);
1643       if (BE (err != REG_NOERROR, 0))
1644         return err;
1645     }
1646
1647   /* Expand each epsilon destination nodes.  */
1648   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1649     for (i = 0; i < dfa->edests[node].nelem; ++i)
1650       {
1651         re_node_set eclosure_elem;
1652         Idx edest = dfa->edests[node].elems[i];
1653         /* If calculating the epsilon closure of `edest' is in progress,
1654            return intermediate result.  */
1655         if (dfa->eclosures[edest].nelem == REG_MISSING)
1656           {
1657             incomplete = true;
1658             continue;
1659           }
1660         /* If we haven't calculated the epsilon closure of `edest' yet,
1661            calculate now. Otherwise use calculated epsilon closure.  */
1662         if (dfa->eclosures[edest].nelem == 0)
1663           {
1664             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1665             if (BE (err != REG_NOERROR, 0))
1666               return err;
1667           }
1668         else
1669           eclosure_elem = dfa->eclosures[edest];
1670         /* Merge the epsilon closure of `edest'.  */
1671         re_node_set_merge (&eclosure, &eclosure_elem);
1672         /* If the epsilon closure of `edest' is incomplete,
1673            the epsilon closure of this node is also incomplete.  */
1674         if (dfa->eclosures[edest].nelem == 0)
1675           {
1676             incomplete = true;
1677             re_node_set_free (&eclosure_elem);
1678           }
1679       }
1680
1681   /* Epsilon closures include itself.  */
1682   ok = re_node_set_insert (&eclosure, node);
1683   if (BE (! ok, 0))
1684     return REG_ESPACE;
1685   if (incomplete && !root)
1686     dfa->eclosures[node].nelem = 0;
1687   else
1688     dfa->eclosures[node] = eclosure;
1689   *new_set = eclosure;
1690   return REG_NOERROR;
1691 }
1692 \f
1693 /* Functions for token which are used in the parser.  */
1694
1695 /* Fetch a token from INPUT.
1696    We must not use this function inside bracket expressions.  */
1697
1698 static void
1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1700 {
1701   re_string_skip_bytes (input, peek_token (result, input, syntax));
1702 }
1703
1704 /* Peek a token from INPUT, and return the length of the token.
1705    We must not use this function inside bracket expressions.  */
1706
1707 static int
1708 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1709 {
1710   unsigned char c;
1711
1712   if (re_string_eoi (input))
1713     {
1714       token->type = END_OF_RE;
1715       return 0;
1716     }
1717
1718   c = re_string_peek_byte (input, 0);
1719   token->opr.c = c;
1720
1721   token->word_char = 0;
1722 #ifdef RE_ENABLE_I18N
1723   token->mb_partial = 0;
1724   if (input->mb_cur_max > 1 &&
1725       !re_string_first_byte (input, re_string_cur_idx (input)))
1726     {
1727       token->type = CHARACTER;
1728       token->mb_partial = 1;
1729       return 1;
1730     }
1731 #endif
1732   if (c == '\\')
1733     {
1734       unsigned char c2;
1735       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1736         {
1737           token->type = BACK_SLASH;
1738           return 1;
1739         }
1740
1741       c2 = re_string_peek_byte_case (input, 1);
1742       token->opr.c = c2;
1743       token->type = CHARACTER;
1744 #ifdef RE_ENABLE_I18N
1745       if (input->mb_cur_max > 1)
1746         {
1747           wint_t wc = re_string_wchar_at (input,
1748                                           re_string_cur_idx (input) + 1);
1749           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1750         }
1751       else
1752 #endif
1753         token->word_char = IS_WORD_CHAR (c2) != 0;
1754
1755       switch (c2)
1756         {
1757         case '|':
1758           if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1759             token->type = OP_ALT;
1760           break;
1761         case '1': case '2': case '3': case '4': case '5':
1762         case '6': case '7': case '8': case '9':
1763           if (!(syntax & REG_NO_BK_REFS))
1764             {
1765               token->type = OP_BACK_REF;
1766               token->opr.idx = c2 - '1';
1767             }
1768           break;
1769         case '<':
1770           if (!(syntax & REG_NO_GNU_OPS))
1771             {
1772               token->type = ANCHOR;
1773               token->opr.ctx_type = WORD_FIRST;
1774             }
1775           break;
1776         case '>':
1777           if (!(syntax & REG_NO_GNU_OPS))
1778             {
1779               token->type = ANCHOR;
1780               token->opr.ctx_type = WORD_LAST;
1781             }
1782           break;
1783         case 'b':
1784           if (!(syntax & REG_NO_GNU_OPS))
1785             {
1786               token->type = ANCHOR;
1787               token->opr.ctx_type = WORD_DELIM;
1788             }
1789           break;
1790         case 'B':
1791           if (!(syntax & REG_NO_GNU_OPS))
1792             {
1793               token->type = ANCHOR;
1794               token->opr.ctx_type = NOT_WORD_DELIM;
1795             }
1796           break;
1797         case 'w':
1798           if (!(syntax & REG_NO_GNU_OPS))
1799             token->type = OP_WORD;
1800           break;
1801         case 'W':
1802           if (!(syntax & REG_NO_GNU_OPS))
1803             token->type = OP_NOTWORD;
1804           break;
1805         case 's':
1806           if (!(syntax & REG_NO_GNU_OPS))
1807             token->type = OP_SPACE;
1808           break;
1809         case 'S':
1810           if (!(syntax & REG_NO_GNU_OPS))
1811             token->type = OP_NOTSPACE;
1812           break;
1813         case '`':
1814           if (!(syntax & REG_NO_GNU_OPS))
1815             {
1816               token->type = ANCHOR;
1817               token->opr.ctx_type = BUF_FIRST;
1818             }
1819           break;
1820         case '\'':
1821           if (!(syntax & REG_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = BUF_LAST;
1825             }
1826           break;
1827         case '(':
1828           if (!(syntax & REG_NO_BK_PARENS))
1829             token->type = OP_OPEN_SUBEXP;
1830           break;
1831         case ')':
1832           if (!(syntax & REG_NO_BK_PARENS))
1833             token->type = OP_CLOSE_SUBEXP;
1834           break;
1835         case '+':
1836           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1837             token->type = OP_DUP_PLUS;
1838           break;
1839         case '?':
1840           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1841             token->type = OP_DUP_QUESTION;
1842           break;
1843         case '{':
1844           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1845             token->type = OP_OPEN_DUP_NUM;
1846           break;
1847         case '}':
1848           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1849             token->type = OP_CLOSE_DUP_NUM;
1850           break;
1851         default:
1852           break;
1853         }
1854       return 2;
1855     }
1856
1857   token->type = CHARACTER;
1858 #ifdef RE_ENABLE_I18N
1859   if (input->mb_cur_max > 1)
1860     {
1861       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1862       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1863     }
1864   else
1865 #endif
1866     token->word_char = IS_WORD_CHAR (token->opr.c);
1867
1868   switch (c)
1869     {
1870     case '\n':
1871       if (syntax & REG_NEWLINE_ALT)
1872         token->type = OP_ALT;
1873       break;
1874     case '|':
1875       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1876         token->type = OP_ALT;
1877       break;
1878     case '*':
1879       token->type = OP_DUP_ASTERISK;
1880       break;
1881     case '+':
1882       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1883         token->type = OP_DUP_PLUS;
1884       break;
1885     case '?':
1886       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1887         token->type = OP_DUP_QUESTION;
1888       break;
1889     case '{':
1890       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1891         token->type = OP_OPEN_DUP_NUM;
1892       break;
1893     case '}':
1894       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1895         token->type = OP_CLOSE_DUP_NUM;
1896       break;
1897     case '(':
1898       if (syntax & REG_NO_BK_PARENS)
1899         token->type = OP_OPEN_SUBEXP;
1900       break;
1901     case ')':
1902       if (syntax & REG_NO_BK_PARENS)
1903         token->type = OP_CLOSE_SUBEXP;
1904       break;
1905     case '[':
1906       token->type = OP_OPEN_BRACKET;
1907       break;
1908     case '.':
1909       token->type = OP_PERIOD;
1910       break;
1911     case '^':
1912       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1913           re_string_cur_idx (input) != 0)
1914         {
1915           char prev = re_string_peek_byte (input, -1);
1916           if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1917             break;
1918         }
1919       token->type = ANCHOR;
1920       token->opr.ctx_type = LINE_FIRST;
1921       break;
1922     case '$':
1923       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1924           re_string_cur_idx (input) + 1 != re_string_length (input))
1925         {
1926           re_token_t next;
1927           re_string_skip_bytes (input, 1);
1928           peek_token (&next, input, syntax);
1929           re_string_skip_bytes (input, -1);
1930           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1931             break;
1932         }
1933       token->type = ANCHOR;
1934       token->opr.ctx_type = LINE_LAST;
1935       break;
1936     default:
1937       break;
1938     }
1939   return 1;
1940 }
1941
1942 /* Peek a token from INPUT, and return the length of the token.
1943    We must not use this function out of bracket expressions.  */
1944
1945 static int
1946 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1947 {
1948   unsigned char c;
1949   if (re_string_eoi (input))
1950     {
1951       token->type = END_OF_RE;
1952       return 0;
1953     }
1954   c = re_string_peek_byte (input, 0);
1955   token->opr.c = c;
1956
1957 #ifdef RE_ENABLE_I18N
1958   if (input->mb_cur_max > 1 &&
1959       !re_string_first_byte (input, re_string_cur_idx (input)))
1960     {
1961       token->type = CHARACTER;
1962       return 1;
1963     }
1964 #endif /* RE_ENABLE_I18N */
1965
1966   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1967       && re_string_cur_idx (input) + 1 < re_string_length (input))
1968     {
1969       /* In this case, '\' escape a character.  */
1970       unsigned char c2;
1971       re_string_skip_bytes (input, 1);
1972       c2 = re_string_peek_byte (input, 0);
1973       token->opr.c = c2;
1974       token->type = CHARACTER;
1975       return 1;
1976     }
1977   if (c == '[') /* '[' is a special char in a bracket exps.  */
1978     {
1979       unsigned char c2;
1980       int token_len;
1981       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1982         c2 = re_string_peek_byte (input, 1);
1983       else
1984         c2 = 0;
1985       token->opr.c = c2;
1986       token_len = 2;
1987       switch (c2)
1988         {
1989         case '.':
1990           token->type = OP_OPEN_COLL_ELEM;
1991           break;
1992         case '=':
1993           token->type = OP_OPEN_EQUIV_CLASS;
1994           break;
1995         case ':':
1996           if (syntax & REG_CHAR_CLASSES)
1997             {
1998               token->type = OP_OPEN_CHAR_CLASS;
1999               break;
2000             }
2001           /* else fall through.  */
2002         default:
2003           token->type = CHARACTER;
2004           token->opr.c = c;
2005           token_len = 1;
2006           break;
2007         }
2008       return token_len;
2009     }
2010   switch (c)
2011     {
2012     case '-':
2013       token->type = OP_CHARSET_RANGE;
2014       break;
2015     case ']':
2016       token->type = OP_CLOSE_BRACKET;
2017       break;
2018     case '^':
2019       token->type = OP_NON_MATCH_LIST;
2020       break;
2021     default:
2022       token->type = CHARACTER;
2023     }
2024   return 1;
2025 }
2026 \f
2027 /* Functions for parser.  */
2028
2029 /* Entry point of the parser.
2030    Parse the regular expression REGEXP and return the structure tree.
2031    If an error is occured, ERR is set by error code, and return NULL.
2032    This function build the following tree, from regular expression <reg_exp>:
2033            CAT
2034            / \
2035           /   \
2036    <reg_exp>  EOR
2037
2038    CAT means concatenation.
2039    EOR means end of regular expression.  */
2040
2041 static bin_tree_t *
2042 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2043        reg_errcode_t *err)
2044 {
2045   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2046   bin_tree_t *tree, *eor, *root;
2047   re_token_t current_token;
2048   dfa->syntax = syntax;
2049   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2050   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2051   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2052     return NULL;
2053   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2054   if (tree != NULL)
2055     root = create_tree (dfa, tree, eor, CONCAT);
2056   else
2057     root = eor;
2058   if (BE (eor == NULL || root == NULL, 0))
2059     {
2060       *err = REG_ESPACE;
2061       return NULL;
2062     }
2063   return root;
2064 }
2065
2066 /* This function build the following tree, from regular expression
2067    <branch1>|<branch2>:
2068            ALT
2069            / \
2070           /   \
2071    <branch1> <branch2>
2072
2073    ALT means alternative, which represents the operator `|'.  */
2074
2075 static bin_tree_t *
2076 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2077                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2078 {
2079   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2080   bin_tree_t *tree, *branch = NULL;
2081   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2082   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083     return NULL;
2084
2085   while (token->type == OP_ALT)
2086     {
2087       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2088       if (token->type != OP_ALT && token->type != END_OF_RE
2089           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2090         {
2091           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2092           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2093             return NULL;
2094         }
2095       else
2096         branch = NULL;
2097       tree = create_tree (dfa, tree, branch, OP_ALT);
2098       if (BE (tree == NULL, 0))
2099         {
2100           *err = REG_ESPACE;
2101           return NULL;
2102         }
2103     }
2104   return tree;
2105 }
2106
2107 /* This function build the following tree, from regular expression
2108    <exp1><exp2>:
2109         CAT
2110         / \
2111        /   \
2112    <exp1> <exp2>
2113
2114    CAT means concatenation.  */
2115
2116 static bin_tree_t *
2117 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2118               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2119 {
2120   bin_tree_t *tree, *exp;
2121   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2122   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2123   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124     return NULL;
2125
2126   while (token->type != OP_ALT && token->type != END_OF_RE
2127          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128     {
2129       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2130       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2131         {
2132           return NULL;
2133         }
2134       if (tree != NULL && exp != NULL)
2135         {
2136           tree = create_tree (dfa, tree, exp, CONCAT);
2137           if (tree == NULL)
2138             {
2139               *err = REG_ESPACE;
2140               return NULL;
2141             }
2142         }
2143       else if (tree == NULL)
2144         tree = exp;
2145       /* Otherwise exp == NULL, we don't need to create new tree.  */
2146     }
2147   return tree;
2148 }
2149
2150 /* This function build the following tree, from regular expression a*:
2151          *
2152          |
2153          a
2154 */
2155
2156 static bin_tree_t *
2157 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2158                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2159 {
2160   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2161   bin_tree_t *tree;
2162   switch (token->type)
2163     {
2164     case CHARACTER:
2165       tree = create_token_tree (dfa, NULL, NULL, token);
2166       if (BE (tree == NULL, 0))
2167         {
2168           *err = REG_ESPACE;
2169           return NULL;
2170         }
2171 #ifdef RE_ENABLE_I18N
2172       if (dfa->mb_cur_max > 1)
2173         {
2174           while (!re_string_eoi (regexp)
2175                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2176             {
2177               bin_tree_t *mbc_remain;
2178               fetch_token (token, regexp, syntax);
2179               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2180               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2181               if (BE (mbc_remain == NULL || tree == NULL, 0))
2182                 {
2183                   *err = REG_ESPACE;
2184                   return NULL;
2185                 }
2186             }
2187         }
2188 #endif
2189       break;
2190     case OP_OPEN_SUBEXP:
2191       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2192       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193         return NULL;
2194       break;
2195     case OP_OPEN_BRACKET:
2196       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2197       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198         return NULL;
2199       break;
2200     case OP_BACK_REF:
2201       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2202         {
2203           *err = REG_ESUBREG;
2204           return NULL;
2205         }
2206       dfa->used_bkref_map |= 1 << token->opr.idx;
2207       tree = create_token_tree (dfa, NULL, NULL, token);
2208       if (BE (tree == NULL, 0))
2209         {
2210           *err = REG_ESPACE;
2211           return NULL;
2212         }
2213       ++dfa->nbackref;
2214       dfa->has_mb_node = 1;
2215       break;
2216     case OP_OPEN_DUP_NUM:
2217       if (syntax & REG_CONTEXT_INVALID_DUP)
2218         {
2219           *err = REG_BADRPT;
2220           return NULL;
2221         }
2222       /* FALLTHROUGH */
2223     case OP_DUP_ASTERISK:
2224     case OP_DUP_PLUS:
2225     case OP_DUP_QUESTION:
2226       if (syntax & REG_CONTEXT_INVALID_OPS)
2227         {
2228           *err = REG_BADRPT;
2229           return NULL;
2230         }
2231       else if (syntax & REG_CONTEXT_INDEP_OPS)
2232         {
2233           fetch_token (token, regexp, syntax);
2234           return parse_expression (regexp, preg, token, syntax, nest, err);
2235         }
2236       /* else fall through  */
2237     case OP_CLOSE_SUBEXP:
2238       if ((token->type == OP_CLOSE_SUBEXP) &&
2239           !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2240         {
2241           *err = REG_ERPAREN;
2242           return NULL;
2243         }
2244       /* else fall through  */
2245     case OP_CLOSE_DUP_NUM:
2246       /* We treat it as a normal character.  */
2247
2248       /* Then we can these characters as normal characters.  */
2249       token->type = CHARACTER;
2250       /* mb_partial and word_char bits should be initialized already
2251          by peek_token.  */
2252       tree = create_token_tree (dfa, NULL, NULL, token);
2253       if (BE (tree == NULL, 0))
2254         {
2255           *err = REG_ESPACE;
2256           return NULL;
2257         }
2258       break;
2259     case ANCHOR:
2260       if ((token->opr.ctx_type
2261            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2262           && dfa->word_ops_used == 0)
2263         init_word_char (dfa);
2264       if (token->opr.ctx_type == WORD_DELIM
2265           || token->opr.ctx_type == NOT_WORD_DELIM)
2266         {
2267           bin_tree_t *tree_first, *tree_last;
2268           if (token->opr.ctx_type == WORD_DELIM)
2269             {
2270               token->opr.ctx_type = WORD_FIRST;
2271               tree_first = create_token_tree (dfa, NULL, NULL, token);
2272               token->opr.ctx_type = WORD_LAST;
2273             }
2274           else
2275             {
2276               token->opr.ctx_type = INSIDE_WORD;
2277               tree_first = create_token_tree (dfa, NULL, NULL, token);
2278               token->opr.ctx_type = INSIDE_NOTWORD;
2279             }
2280           tree_last = create_token_tree (dfa, NULL, NULL, token);
2281           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2282           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2283             {
2284               *err = REG_ESPACE;
2285               return NULL;
2286             }
2287         }
2288       else
2289         {
2290           tree = create_token_tree (dfa, NULL, NULL, token);
2291           if (BE (tree == NULL, 0))
2292             {
2293               *err = REG_ESPACE;
2294               return NULL;
2295             }
2296         }
2297       /* We must return here, since ANCHORs can't be followed
2298          by repetition operators.
2299          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2300              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2301       fetch_token (token, regexp, syntax);
2302       return tree;
2303     case OP_PERIOD:
2304       tree = create_token_tree (dfa, NULL, NULL, token);
2305       if (BE (tree == NULL, 0))
2306         {
2307           *err = REG_ESPACE;
2308           return NULL;
2309         }
2310       if (dfa->mb_cur_max > 1)
2311         dfa->has_mb_node = 1;
2312       break;
2313     case OP_WORD:
2314     case OP_NOTWORD:
2315       tree = build_charclass_op (dfa, regexp->trans,
2316                                  (const unsigned char *) "alnum",
2317                                  (const unsigned char *) "_",
2318                                  token->type == OP_NOTWORD, err);
2319       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2320         return NULL;
2321       break;
2322     case OP_SPACE:
2323     case OP_NOTSPACE:
2324       tree = build_charclass_op (dfa, regexp->trans,
2325                                  (const unsigned char *) "space",
2326                                  (const unsigned char *) "",
2327                                  token->type == OP_NOTSPACE, err);
2328       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2329         return NULL;
2330       break;
2331     case OP_ALT:
2332     case END_OF_RE:
2333       return NULL;
2334     case BACK_SLASH:
2335       *err = REG_EESCAPE;
2336       return NULL;
2337     default:
2338       /* Must not happen?  */
2339 #ifdef DEBUG
2340       assert (0);
2341 #endif
2342       return NULL;
2343     }
2344   fetch_token (token, regexp, syntax);
2345
2346   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2347          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2348     {
2349       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2350       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2351         return NULL;
2352       /* In BRE consecutive duplications are not allowed.  */
2353       if ((syntax & REG_CONTEXT_INVALID_DUP)
2354           && (token->type == OP_DUP_ASTERISK
2355               || token->type == OP_OPEN_DUP_NUM))
2356         {
2357           *err = REG_BADRPT;
2358           return NULL;
2359         }
2360     }
2361
2362   return tree;
2363 }
2364
2365 /* This function build the following tree, from regular expression
2366    (<reg_exp>):
2367          SUBEXP
2368             |
2369         <reg_exp>
2370 */
2371
2372 static bin_tree_t *
2373 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2374                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2375 {
2376   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2377   bin_tree_t *tree;
2378   size_t cur_nsub;
2379   cur_nsub = preg->re_nsub++;
2380
2381   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2382
2383   /* The subexpression may be a null string.  */
2384   if (token->type == OP_CLOSE_SUBEXP)
2385     tree = NULL;
2386   else
2387     {
2388       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2389       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2390         *err = REG_EPAREN;
2391       if (BE (*err != REG_NOERROR, 0))
2392         return NULL;
2393     }
2394
2395   if (cur_nsub <= '9' - '1')
2396     dfa->completed_bkref_map |= 1 << cur_nsub;
2397
2398   tree = create_tree (dfa, tree, NULL, SUBEXP);
2399   if (BE (tree == NULL, 0))
2400     {
2401       *err = REG_ESPACE;
2402       return NULL;
2403     }
2404   tree->token.opr.idx = cur_nsub;
2405   return tree;
2406 }
2407
2408 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2409
2410 static bin_tree_t *
2411 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2412               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2413 {
2414   bin_tree_t *tree = NULL, *old_tree = NULL;
2415   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2416   re_token_t start_token = *token;
2417
2418   if (token->type == OP_OPEN_DUP_NUM)
2419     {
2420       end = 0;
2421       start = fetch_number (regexp, token, syntax);
2422       if (start == REG_MISSING)
2423         {
2424           if (token->type == CHARACTER && token->opr.c == ',')
2425             start = 0; /* We treat "{,m}" as "{0,m}".  */
2426           else
2427             {
2428               *err = REG_BADBR; /* <re>{} is invalid.  */
2429               return NULL;
2430             }
2431         }
2432       if (BE (start != REG_ERROR, 1))
2433         {
2434           /* We treat "{n}" as "{n,n}".  */
2435           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2436                  : ((token->type == CHARACTER && token->opr.c == ',')
2437                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2438         }
2439       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2440         {
2441           /* Invalid sequence.  */
2442           if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2443             {
2444               if (token->type == END_OF_RE)
2445                 *err = REG_EBRACE;
2446               else
2447                 *err = REG_BADBR;
2448
2449               return NULL;
2450             }
2451
2452           /* If the syntax bit is set, rollback.  */
2453           re_string_set_index (regexp, start_idx);
2454           *token = start_token;
2455           token->type = CHARACTER;
2456           /* mb_partial and word_char bits should be already initialized by
2457              peek_token.  */
2458           return elem;
2459         }
2460
2461       if (BE (end != REG_MISSING && start > end, 0))
2462         {
2463           /* First number greater than second.  */
2464           *err = REG_BADBR;
2465           return NULL;
2466         }
2467     }
2468   else
2469     {
2470       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2471       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2472     }
2473
2474   fetch_token (token, regexp, syntax);
2475
2476   if (BE (elem == NULL, 0))
2477     return NULL;
2478   if (BE (start == 0 && end == 0, 0))
2479     {
2480       postorder (elem, free_tree, NULL);
2481       return NULL;
2482     }
2483
2484   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2485   if (BE (start > 0, 0))
2486     {
2487       tree = elem;
2488       for (i = 2; i <= start; ++i)
2489         {
2490           elem = duplicate_tree (elem, dfa);
2491           tree = create_tree (dfa, tree, elem, CONCAT);
2492           if (BE (elem == NULL || tree == NULL, 0))
2493             goto parse_dup_op_espace;
2494         }
2495
2496       if (start == end)
2497         return tree;
2498
2499       /* Duplicate ELEM before it is marked optional.  */
2500       elem = duplicate_tree (elem, dfa);
2501       old_tree = tree;
2502     }
2503   else
2504     old_tree = NULL;
2505
2506   if (elem->token.type == SUBEXP)
2507     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2508
2509   tree = create_tree (dfa, elem, NULL,
2510                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2511   if (BE (tree == NULL, 0))
2512     goto parse_dup_op_espace;
2513
2514   /* This loop is actually executed only when end != REG_MISSING,
2515      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2516      already created the start+1-th copy.  */
2517   if ((Idx) -1 < 0 || end != REG_MISSING)
2518     for (i = start + 2; i <= end; ++i)
2519       {
2520         elem = duplicate_tree (elem, dfa);
2521         tree = create_tree (dfa, tree, elem, CONCAT);
2522         if (BE (elem == NULL || tree == NULL, 0))
2523           goto parse_dup_op_espace;
2524
2525         tree = create_tree (dfa, tree, NULL, OP_ALT);
2526         if (BE (tree == NULL, 0))
2527           goto parse_dup_op_espace;
2528       }
2529
2530   if (old_tree)
2531     tree = create_tree (dfa, old_tree, tree, CONCAT);
2532
2533   return tree;
2534
2535  parse_dup_op_espace:
2536   *err = REG_ESPACE;
2537   return NULL;
2538 }
2539
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541    I'm not sure, but maybe enough.  */
2542 #define BRACKET_NAME_BUF_SIZE 32
2543
2544 #ifndef _LIBC
2545   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546      Build the range expression which starts from START_ELEM, and ends
2547      at END_ELEM.  The result are written to MBCSET and SBCSET.
2548      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549      mbcset->range_ends, is a pointer argument sinse we may
2550      update it.  */
2551
2552 static reg_errcode_t
2553 build_range_exp (re_bitset_ptr_t sbcset,
2554 # ifdef RE_ENABLE_I18N
2555                  re_charset_t *mbcset, Idx *range_alloc,
2556 # endif
2557                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2558 {
2559   unsigned int start_ch, end_ch;
2560   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2561   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2562           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2563           0))
2564     return REG_ERANGE;
2565
2566   /* We can handle no multi character collating elements without libc
2567      support.  */
2568   if (BE ((start_elem->type == COLL_SYM
2569            && strlen ((char *) start_elem->opr.name) > 1)
2570           || (end_elem->type == COLL_SYM
2571               && strlen ((char *) end_elem->opr.name) > 1), 0))
2572     return REG_ECOLLATE;
2573
2574 # ifdef RE_ENABLE_I18N
2575   {
2576     wchar_t wc;
2577     wint_t start_wc, end_wc;
2578     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2579
2580     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2581                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2582                    : 0));
2583     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2584               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2585                  : 0));
2586     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2587                 ? __btowc (start_ch) : start_elem->opr.wch);
2588     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2589               ? __btowc (end_ch) : end_elem->opr.wch);
2590     if (start_wc == WEOF || end_wc == WEOF)
2591       return REG_ECOLLATE;
2592     cmp_buf[0] = start_wc;
2593     cmp_buf[4] = end_wc;
2594     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2595       return REG_ERANGE;
2596
2597     /* Got valid collation sequence values, add them as a new entry.
2598        However, for !_LIBC we have no collation elements: if the
2599        character set is single byte, the single byte character set
2600        that we build below suffices.  parse_bracket_exp passes
2601        no MBCSET if dfa->mb_cur_max == 1.  */
2602     if (mbcset)
2603       {
2604         /* Check the space of the arrays.  */
2605         if (BE (*range_alloc == mbcset->nranges, 0))
2606           {
2607             /* There is not enough space, need realloc.  */
2608             wchar_t *new_array_start, *new_array_end;
2609             Idx new_nranges;
2610
2611             /* +1 in case of mbcset->nranges is 0.  */
2612             new_nranges = 2 * mbcset->nranges + 1;
2613             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2614                are NULL if *range_alloc == 0.  */
2615             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2616                                           new_nranges);
2617             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2618                                         new_nranges);
2619
2620             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2621               return REG_ESPACE;
2622
2623             mbcset->range_starts = new_array_start;
2624             mbcset->range_ends = new_array_end;
2625             *range_alloc = new_nranges;
2626           }
2627
2628         mbcset->range_starts[mbcset->nranges] = start_wc;
2629         mbcset->range_ends[mbcset->nranges++] = end_wc;
2630       }
2631
2632     /* Build the table for single byte characters.  */
2633     for (wc = 0; wc < SBC_MAX; ++wc)
2634       {
2635         cmp_buf[2] = wc;
2636         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2637             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2638           bitset_set (sbcset, wc);
2639       }
2640   }
2641 # else /* not RE_ENABLE_I18N */
2642   {
2643     unsigned int ch;
2644     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2645                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2646                    : 0));
2647     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2648               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2649                  : 0));
2650     if (start_ch > end_ch)
2651       return REG_ERANGE;
2652     /* Build the table for single byte characters.  */
2653     for (ch = 0; ch < SBC_MAX; ++ch)
2654       if (start_ch <= ch  && ch <= end_ch)
2655         bitset_set (sbcset, ch);
2656   }
2657 # endif /* not RE_ENABLE_I18N */
2658   return REG_NOERROR;
2659 }
2660 #endif /* not _LIBC */
2661
2662 #ifndef _LIBC
2663 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2664    Build the collating element which is represented by NAME.
2665    The result are written to MBCSET and SBCSET.
2666    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2667    pointer argument since we may update it.  */
2668
2669 static reg_errcode_t
2670 build_collating_symbol (re_bitset_ptr_t sbcset,
2671 # ifdef RE_ENABLE_I18N
2672                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2673 # endif
2674                         const unsigned char *name)
2675 {
2676   size_t name_len = strlen ((const char *) name);
2677   if (BE (name_len != 1, 0))
2678     return REG_ECOLLATE;
2679   else
2680     {
2681       bitset_set (sbcset, name[0]);
2682       return REG_NOERROR;
2683     }
2684 }
2685 #endif /* not _LIBC */
2686
2687 /* This function parse bracket expression like "[abc]", "[a-c]",
2688    "[[.a-a.]]" etc.  */
2689
2690 static bin_tree_t *
2691 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2692                    reg_syntax_t syntax, reg_errcode_t *err)
2693 {
2694 #ifdef _LIBC
2695   const unsigned char *collseqmb;
2696   const char *collseqwc;
2697   uint32_t nrules;
2698   int32_t table_size;
2699   const int32_t *symb_table;
2700   const unsigned char *extra;
2701
2702   /* Local function for parse_bracket_exp used in _LIBC environement.
2703      Seek the collating symbol entry correspondings to NAME.
2704      Return the index of the symbol in the SYMB_TABLE.  */
2705
2706   auto inline int32_t
2707   __attribute ((always_inline))
2708   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2709     {
2710       int32_t hash = elem_hash ((const char *) name, name_len);
2711       int32_t elem = hash % table_size;
2712       int32_t second = hash % (table_size - 2);
2713       while (symb_table[2 * elem] != 0)
2714         {
2715           /* First compare the hashing value.  */
2716           if (symb_table[2 * elem] == hash
2717               /* Compare the length of the name.  */
2718               && name_len == extra[symb_table[2 * elem + 1]]
2719               /* Compare the name.  */
2720               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2721                          name_len) == 0)
2722             {
2723               /* Yep, this is the entry.  */
2724               break;
2725             }
2726
2727           /* Next entry.  */
2728           elem += second;
2729         }
2730       return elem;
2731     }
2732
2733   /* Local function for parse_bracket_exp used in _LIBC environement.
2734      Look up the collation sequence value of BR_ELEM.
2735      Return the value if succeeded, UINT_MAX otherwise.  */
2736
2737   auto inline unsigned int
2738   __attribute ((always_inline))
2739   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2740     {
2741       if (br_elem->type == SB_CHAR)
2742         {
2743           /*
2744           if (MB_CUR_MAX == 1)
2745           */
2746           if (nrules == 0)
2747             return collseqmb[br_elem->opr.ch];
2748           else
2749             {
2750               wint_t wc = __btowc (br_elem->opr.ch);
2751               return __collseq_table_lookup (collseqwc, wc);
2752             }
2753         }
2754       else if (br_elem->type == MB_CHAR)
2755         {
2756           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2757         }
2758       else if (br_elem->type == COLL_SYM)
2759         {
2760           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2761           if (nrules != 0)
2762             {
2763               int32_t elem, idx;
2764               elem = seek_collating_symbol_entry (br_elem->opr.name,
2765                                                   sym_name_len);
2766               if (symb_table[2 * elem] != 0)
2767                 {
2768                   /* We found the entry.  */
2769                   idx = symb_table[2 * elem + 1];
2770                   /* Skip the name of collating element name.  */
2771                   idx += 1 + extra[idx];
2772                   /* Skip the byte sequence of the collating element.  */
2773                   idx += 1 + extra[idx];
2774                   /* Adjust for the alignment.  */
2775                   idx = (idx + 3) & ~3;
2776                   /* Skip the multibyte collation sequence value.  */
2777                   idx += sizeof (unsigned int);
2778                   /* Skip the wide char sequence of the collating element.  */
2779                   idx += sizeof (unsigned int) *
2780                     (1 + *(unsigned int *) (extra + idx));
2781                   /* Return the collation sequence value.  */
2782                   return *(unsigned int *) (extra + idx);
2783                 }
2784               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2785                 {
2786                   /* No valid character.  Match it as a single byte
2787                      character.  */
2788                   return collseqmb[br_elem->opr.name[0]];
2789                 }
2790             }
2791           else if (sym_name_len == 1)
2792             return collseqmb[br_elem->opr.name[0]];
2793         }
2794       return UINT_MAX;
2795     }
2796
2797   /* Local function for parse_bracket_exp used in _LIBC environement.
2798      Build the range expression which starts from START_ELEM, and ends
2799      at END_ELEM.  The result are written to MBCSET and SBCSET.
2800      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2801      mbcset->range_ends, is a pointer argument sinse we may
2802      update it.  */
2803
2804   auto inline reg_errcode_t
2805   __attribute ((always_inline))
2806   build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2807                    Idx *range_alloc,
2808                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2809     {
2810       unsigned int ch;
2811       uint32_t start_collseq;
2812       uint32_t end_collseq;
2813
2814       /* Equivalence Classes and Character Classes can't be a range
2815          start/end.  */
2816       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2817               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2818               0))
2819         return REG_ERANGE;
2820
2821       start_collseq = lookup_collation_sequence_value (start_elem);
2822       end_collseq = lookup_collation_sequence_value (end_elem);
2823       /* Check start/end collation sequence values.  */
2824       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2825         return REG_ECOLLATE;
2826       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2827         return REG_ERANGE;
2828
2829       /* Got valid collation sequence values, add them as a new entry.
2830          However, if we have no collation elements, and the character set
2831          is single byte, the single byte character set that we
2832          build below suffices. */
2833       if (nrules > 0 || dfa->mb_cur_max > 1)
2834         {
2835           /* Check the space of the arrays.  */
2836           if (BE (*range_alloc == mbcset->nranges, 0))
2837             {
2838               /* There is not enough space, need realloc.  */
2839               uint32_t *new_array_start;
2840               uint32_t *new_array_end;
2841               Idx new_nranges;
2842
2843               /* +1 in case of mbcset->nranges is 0.  */
2844               new_nranges = 2 * mbcset->nranges + 1;
2845               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2846                                             new_nranges);
2847               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2848                                           new_nranges);
2849
2850               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2851                 return REG_ESPACE;
2852
2853               mbcset->range_starts = new_array_start;
2854               mbcset->range_ends = new_array_end;
2855               *range_alloc = new_nranges;
2856             }
2857
2858           mbcset->range_starts[mbcset->nranges] = start_collseq;
2859           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2860         }
2861
2862       /* Build the table for single byte characters.  */
2863       for (ch = 0; ch < SBC_MAX; ch++)
2864         {
2865           uint32_t ch_collseq;
2866           /*
2867           if (MB_CUR_MAX == 1)
2868           */
2869           if (nrules == 0)
2870             ch_collseq = collseqmb[ch];
2871           else
2872             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2873           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2874             bitset_set (sbcset, ch);
2875         }
2876       return REG_NOERROR;
2877     }
2878
2879   /* Local function for parse_bracket_exp used in _LIBC environement.
2880      Build the collating element which is represented by NAME.
2881      The result are written to MBCSET and SBCSET.
2882      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2883      pointer argument sinse we may update it.  */
2884
2885   auto inline reg_errcode_t
2886   __attribute ((always_inline))
2887   build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2888                           Idx *coll_sym_alloc, const unsigned char *name)
2889     {
2890       int32_t elem, idx;
2891       size_t name_len = strlen ((const char *) name);
2892       if (nrules != 0)
2893         {
2894           elem = seek_collating_symbol_entry (name, name_len);
2895           if (symb_table[2 * elem] != 0)
2896             {
2897               /* We found the entry.  */
2898               idx = symb_table[2 * elem + 1];
2899               /* Skip the name of collating element name.  */
2900               idx += 1 + extra[idx];
2901             }
2902           else if (symb_table[2 * elem] == 0 && name_len == 1)
2903             {
2904               /* No valid character, treat it as a normal
2905                  character.  */
2906               bitset_set (sbcset, name[0]);
2907               return REG_NOERROR;
2908             }
2909           else
2910             return REG_ECOLLATE;
2911
2912           /* Got valid collation sequence, add it as a new entry.  */
2913           /* Check the space of the arrays.  */
2914           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2915             {
2916               /* Not enough, realloc it.  */
2917               /* +1 in case of mbcset->ncoll_syms is 0.  */
2918               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2919               /* Use realloc since mbcset->coll_syms is NULL
2920                  if *alloc == 0.  */
2921               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2922                                                    new_coll_sym_alloc);
2923               if (BE (new_coll_syms == NULL, 0))
2924                 return REG_ESPACE;
2925               mbcset->coll_syms = new_coll_syms;
2926               *coll_sym_alloc = new_coll_sym_alloc;
2927             }
2928           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2929           return REG_NOERROR;
2930         }
2931       else
2932         {
2933           if (BE (name_len != 1, 0))
2934             return REG_ECOLLATE;
2935           else
2936             {
2937               bitset_set (sbcset, name[0]);
2938               return REG_NOERROR;
2939             }
2940         }
2941     }
2942 #endif
2943
2944   re_token_t br_token;
2945   re_bitset_ptr_t sbcset;
2946 #ifdef RE_ENABLE_I18N
2947   re_charset_t *mbcset;
2948   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2949   Idx equiv_class_alloc = 0, char_class_alloc = 0;
2950 #endif /* not RE_ENABLE_I18N */
2951   bool non_match = false;
2952   bin_tree_t *work_tree;
2953   int token_len;
2954   bool first_round = true;
2955 #ifdef _LIBC
2956   collseqmb = (const unsigned char *)
2957     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2958   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2959   if (nrules)
2960     {
2961       /*
2962       if (MB_CUR_MAX > 1)
2963       */
2964         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2965       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2966       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2967                                                   _NL_COLLATE_SYMB_TABLEMB);
2968       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2969                                                    _NL_COLLATE_SYMB_EXTRAMB);
2970     }
2971 #endif
2972   sbcset = re_calloc (unsigned int, BITSET_UINTS);
2973 #ifdef RE_ENABLE_I18N
2974   mbcset = re_calloc (re_charset_t, 1);
2975 #endif /* RE_ENABLE_I18N */
2976 #ifdef RE_ENABLE_I18N
2977   if (BE (sbcset == NULL || mbcset == NULL, 0))
2978 #else
2979   if (BE (sbcset == NULL, 0))
2980 #endif /* RE_ENABLE_I18N */
2981     {
2982       *err = REG_ESPACE;
2983       return NULL;
2984     }
2985
2986   token_len = peek_token_bracket (token, regexp, syntax);
2987   if (BE (token->type == END_OF_RE, 0))
2988     {
2989       *err = REG_BADPAT;
2990       goto parse_bracket_exp_free_return;
2991     }
2992   if (token->type == OP_NON_MATCH_LIST)
2993     {
2994 #ifdef RE_ENABLE_I18N
2995       mbcset->non_match = 1;
2996 #endif /* not RE_ENABLE_I18N */
2997       non_match = true;
2998       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2999         bitset_set (sbcset, '\0');
3000       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3001       token_len = peek_token_bracket (token, regexp, syntax);
3002       if (BE (token->type == END_OF_RE, 0))
3003         {
3004           *err = REG_BADPAT;
3005           goto parse_bracket_exp_free_return;
3006         }
3007     }
3008
3009   /* We treat the first ']' as a normal character.  */
3010   if (token->type == OP_CLOSE_BRACKET)
3011     token->type = CHARACTER;
3012
3013   while (1)
3014     {
3015       bracket_elem_t start_elem, end_elem;
3016       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3017       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3018       reg_errcode_t ret;
3019       int token_len2 = 0;
3020       bool is_range_exp = false;
3021       re_token_t token2;
3022
3023       start_elem.opr.name = start_name_buf;
3024       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3025                                    syntax, first_round);
3026       if (BE (ret != REG_NOERROR, 0))
3027         {
3028           *err = ret;
3029           goto parse_bracket_exp_free_return;
3030         }
3031       first_round = false;
3032
3033       /* Get information about the next token.  We need it in any case.  */
3034       token_len = peek_token_bracket (token, regexp, syntax);
3035
3036       /* Do not check for ranges if we know they are not allowed.  */
3037       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3038         {
3039           if (BE (token->type == END_OF_RE, 0))
3040             {
3041               *err = REG_EBRACK;
3042               goto parse_bracket_exp_free_return;
3043             }
3044           if (token->type == OP_CHARSET_RANGE)
3045             {
3046               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3047               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3048               if (BE (token2.type == END_OF_RE, 0))
3049                 {
3050                   *err = REG_EBRACK;
3051                   goto parse_bracket_exp_free_return;
3052                 }
3053               if (token2.type == OP_CLOSE_BRACKET)
3054                 {
3055                   /* We treat the last '-' as a normal character.  */
3056                   re_string_skip_bytes (regexp, -token_len);
3057                   token->type = CHARACTER;
3058                 }
3059               else
3060                 is_range_exp = true;
3061             }
3062         }
3063
3064       if (is_range_exp == true)
3065         {
3066           end_elem.opr.name = end_name_buf;
3067           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3068                                        dfa, syntax, true);
3069           if (BE (ret != REG_NOERROR, 0))
3070             {
3071               *err = ret;
3072               goto parse_bracket_exp_free_return;
3073             }
3074
3075           token_len = peek_token_bracket (token, regexp, syntax);
3076
3077 #ifdef _LIBC
3078           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3079                                   &start_elem, &end_elem);
3080 #else
3081 # ifdef RE_ENABLE_I18N
3082           *err = build_range_exp (sbcset,
3083                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3084                                   &range_alloc, &start_elem, &end_elem);
3085 # else
3086           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3087 # endif
3088 #endif /* RE_ENABLE_I18N */
3089           if (BE (*err != REG_NOERROR, 0))
3090             goto parse_bracket_exp_free_return;
3091         }
3092       else
3093         {
3094           switch (start_elem.type)
3095             {
3096             case SB_CHAR:
3097               bitset_set (sbcset, start_elem.opr.ch);
3098               break;
3099 #ifdef RE_ENABLE_I18N
3100             case MB_CHAR:
3101               /* Check whether the array has enough space.  */
3102               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3103                 {
3104                   wchar_t *new_mbchars;
3105                   /* Not enough, realloc it.  */
3106                   /* +1 in case of mbcset->nmbchars is 0.  */
3107                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3108                   /* Use realloc since array is NULL if *alloc == 0.  */
3109                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3110                                             mbchar_alloc);
3111                   if (BE (new_mbchars == NULL, 0))
3112                     goto parse_bracket_exp_espace;
3113                   mbcset->mbchars = new_mbchars;
3114                 }
3115               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3116               break;
3117 #endif /* RE_ENABLE_I18N */
3118             case EQUIV_CLASS:
3119               *err = build_equiv_class (sbcset,
3120 #ifdef RE_ENABLE_I18N
3121                                         mbcset, &equiv_class_alloc,
3122 #endif /* RE_ENABLE_I18N */
3123                                         start_elem.opr.name);
3124               if (BE (*err != REG_NOERROR, 0))
3125                 goto parse_bracket_exp_free_return;
3126               break;
3127             case COLL_SYM:
3128               *err = build_collating_symbol (sbcset,
3129 #ifdef RE_ENABLE_I18N
3130                                              mbcset, &coll_sym_alloc,
3131 #endif /* RE_ENABLE_I18N */
3132                                              start_elem.opr.name);
3133               if (BE (*err != REG_NOERROR, 0))
3134                 goto parse_bracket_exp_free_return;
3135               break;
3136             case CHAR_CLASS:
3137               *err = build_charclass (regexp->trans, sbcset,
3138 #ifdef RE_ENABLE_I18N
3139                                       mbcset, &char_class_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141                                       start_elem.opr.name, syntax);
3142               if (BE (*err != REG_NOERROR, 0))
3143                goto parse_bracket_exp_free_return;
3144               break;
3145             default:
3146               assert (0);
3147               break;
3148             }
3149         }
3150       if (BE (token->type == END_OF_RE, 0))
3151         {
3152           *err = REG_EBRACK;
3153           goto parse_bracket_exp_free_return;
3154         }
3155       if (token->type == OP_CLOSE_BRACKET)
3156         break;
3157     }
3158
3159   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3160
3161   /* If it is non-matching list.  */
3162   if (non_match)
3163     bitset_not (sbcset);
3164
3165 #ifdef RE_ENABLE_I18N
3166   /* Ensure only single byte characters are set.  */
3167   if (dfa->mb_cur_max > 1)
3168     bitset_mask (sbcset, dfa->sb_char);
3169
3170   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3171       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3172                                                      || mbcset->non_match)))
3173     {
3174       bin_tree_t *mbc_tree;
3175       int sbc_idx;
3176       /* Build a tree for complex bracket.  */
3177       dfa->has_mb_node = 1;
3178       br_token.type = COMPLEX_BRACKET;
3179       br_token.opr.mbcset = mbcset;
3180       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3181       if (BE (mbc_tree == NULL, 0))
3182         goto parse_bracket_exp_espace;
3183       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3184         if (sbcset[sbc_idx])
3185           break;
3186       /* If there are no bits set in sbcset, there is no point
3187          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3188       if (sbc_idx < BITSET_UINTS)
3189         {
3190           /* Build a tree for simple bracket.  */
3191           br_token.type = SIMPLE_BRACKET;
3192           br_token.opr.sbcset = sbcset;
3193           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3194           if (BE (work_tree == NULL, 0))
3195             goto parse_bracket_exp_espace;
3196
3197           /* Then join them by ALT node.  */
3198           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3199           if (BE (work_tree == NULL, 0))
3200             goto parse_bracket_exp_espace;
3201         }
3202       else
3203         {
3204           re_free (sbcset);
3205           work_tree = mbc_tree;
3206         }
3207     }
3208   else
3209 #endif /* not RE_ENABLE_I18N */
3210     {
3211 #ifdef RE_ENABLE_I18N
3212       free_charset (mbcset);
3213 #endif
3214       /* Build a tree for simple bracket.  */
3215       br_token.type = SIMPLE_BRACKET;
3216       br_token.opr.sbcset = sbcset;
3217       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218       if (BE (work_tree == NULL, 0))
3219         goto parse_bracket_exp_espace;
3220     }
3221   return work_tree;
3222
3223  parse_bracket_exp_espace:
3224   *err = REG_ESPACE;
3225  parse_bracket_exp_free_return:
3226   re_free (sbcset);
3227 #ifdef RE_ENABLE_I18N
3228   free_charset (mbcset);
3229 #endif /* RE_ENABLE_I18N */
3230   return NULL;
3231 }
3232
3233 /* Parse an element in the bracket expression.  */
3234
3235 static reg_errcode_t
3236 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3237                        re_token_t *token, int token_len, re_dfa_t *dfa,
3238                        reg_syntax_t syntax, bool accept_hyphen)
3239 {
3240 #ifdef RE_ENABLE_I18N
3241   int cur_char_size;
3242   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3243   if (cur_char_size > 1)
3244     {
3245       elem->type = MB_CHAR;
3246       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3247       re_string_skip_bytes (regexp, cur_char_size);
3248       return REG_NOERROR;
3249     }
3250 #endif /* RE_ENABLE_I18N */
3251   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3252   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3253       || token->type == OP_OPEN_EQUIV_CLASS)
3254     return parse_bracket_symbol (elem, regexp, token);
3255   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3256     {
3257       /* A '-' must only appear as anything but a range indicator before
3258          the closing bracket.  Everything else is an error.  */
3259       re_token_t token2;
3260       (void) peek_token_bracket (&token2, regexp, syntax);
3261       if (token2.type != OP_CLOSE_BRACKET)
3262         /* The actual error value is not standardized since this whole
3263            case is undefined.  But ERANGE makes good sense.  */
3264         return REG_ERANGE;
3265     }
3266   elem->type = SB_CHAR;
3267   elem->opr.ch = token->opr.c;
3268   return REG_NOERROR;
3269 }
3270
3271 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3272    such as [:<character_class>:], [.<collating_element>.], and
3273    [=<equivalent_class>=].  */
3274
3275 static reg_errcode_t
3276 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3277                       re_token_t *token)
3278 {
3279   unsigned char ch, delim = token->opr.c;
3280   int i = 0;
3281   if (re_string_eoi(regexp))
3282     return REG_EBRACK;
3283   for (;; ++i)
3284     {
3285       if (i >= BRACKET_NAME_BUF_SIZE)
3286         return REG_EBRACK;
3287       if (token->type == OP_OPEN_CHAR_CLASS)
3288         ch = re_string_fetch_byte_case (regexp);
3289       else
3290         ch = re_string_fetch_byte (regexp);
3291       if (re_string_eoi(regexp))
3292         return REG_EBRACK;
3293       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3294         break;
3295       elem->opr.name[i] = ch;
3296     }
3297   re_string_skip_bytes (regexp, 1);
3298   elem->opr.name[i] = '\0';
3299   switch (token->type)
3300     {
3301     case OP_OPEN_COLL_ELEM:
3302       elem->type = COLL_SYM;
3303       break;
3304     case OP_OPEN_EQUIV_CLASS:
3305       elem->type = EQUIV_CLASS;
3306       break;
3307     case OP_OPEN_CHAR_CLASS:
3308       elem->type = CHAR_CLASS;
3309       break;
3310     default:
3311       break;
3312     }
3313   return REG_NOERROR;
3314 }
3315
3316   /* Helper function for parse_bracket_exp.
3317      Build the equivalence class which is represented by NAME.
3318      The result are written to MBCSET and SBCSET.
3319      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3320      is a pointer argument sinse we may update it.  */
3321
3322 static reg_errcode_t
3323 build_equiv_class (re_bitset_ptr_t sbcset,
3324 #ifdef RE_ENABLE_I18N
3325                    re_charset_t *mbcset, Idx *equiv_class_alloc,
3326 #endif
3327                    const unsigned char *name)
3328 {
3329 #if defined _LIBC
3330   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3331   if (nrules != 0)
3332     {
3333       const int32_t *table, *indirect;
3334       const unsigned char *weights, *extra, *cp;
3335       unsigned char char_buf[2];
3336       int32_t idx1, idx2;
3337       unsigned int ch;
3338       size_t len;
3339       /* This #include defines a local function!  */
3340 # include <locale/weight.h>
3341       /* Calculate the index for equivalence class.  */
3342       cp = name;
3343       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3344       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3345                                                _NL_COLLATE_WEIGHTMB);
3346       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3347                                                    _NL_COLLATE_EXTRAMB);
3348       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3349                                                 _NL_COLLATE_INDIRECTMB);
3350       idx1 = findidx (&cp);
3351       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3352         /* This isn't a valid character.  */
3353         return REG_ECOLLATE;
3354
3355       /* Build single byte matcing table for this equivalence class.  */
3356       char_buf[1] = (unsigned char) '\0';
3357       len = weights[idx1];
3358       for (ch = 0; ch < SBC_MAX; ++ch)
3359         {
3360           char_buf[0] = ch;
3361           cp = char_buf;
3362           idx2 = findidx (&cp);
3363 /*
3364           idx2 = table[ch];
3365 */
3366           if (idx2 == 0)
3367             /* This isn't a valid character.  */
3368             continue;
3369           if (len == weights[idx2])
3370             {
3371               int cnt = 0;
3372               while (cnt <= len &&
3373                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3374                 ++cnt;
3375
3376               if (cnt > len)
3377                 bitset_set (sbcset, ch);
3378             }
3379         }
3380       /* Check whether the array has enough space.  */
3381       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3382         {
3383           /* Not enough, realloc it.  */
3384           /* +1 in case of mbcset->nequiv_classes is 0.  */
3385           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3386           /* Use realloc since the array is NULL if *alloc == 0.  */
3387           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3388                                                    int32_t,
3389                                                    new_equiv_class_alloc);
3390           if (BE (new_equiv_classes == NULL, 0))
3391             return REG_ESPACE;
3392           mbcset->equiv_classes = new_equiv_classes;
3393           *equiv_class_alloc = new_equiv_class_alloc;
3394         }
3395       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3396     }
3397   else
3398 #endif /* _LIBC */
3399     {
3400       if (BE (strlen ((const char *) name) != 1, 0))
3401         return REG_ECOLLATE;
3402       bitset_set (sbcset, *name);
3403     }
3404   return REG_NOERROR;
3405 }
3406
3407   /* Helper function for parse_bracket_exp.
3408      Build the character class which is represented by NAME.
3409      The result are written to MBCSET and SBCSET.
3410      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3411      is a pointer argument sinse we may update it.  */
3412
3413 static reg_errcode_t
3414 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3415 #ifdef RE_ENABLE_I18N
3416                  re_charset_t *mbcset, Idx *char_class_alloc,
3417 #endif
3418                  const unsigned char *class_name, reg_syntax_t syntax)
3419 {
3420   int i;
3421   const char *name = (const char *) class_name;
3422
3423   /* In case of REG_ICASE "upper" and "lower" match the both of
3424      upper and lower cases.  */
3425   if ((syntax & REG_IGNORE_CASE)
3426       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3427     name = "alpha";
3428
3429 #ifdef RE_ENABLE_I18N
3430   /* Check the space of the arrays.  */
3431   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3432     {
3433       /* Not enough, realloc it.  */
3434       /* +1 in case of mbcset->nchar_classes is 0.  */
3435       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3436       /* Use realloc since array is NULL if *alloc == 0.  */
3437       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3438                                                new_char_class_alloc);
3439       if (BE (new_char_classes == NULL, 0))
3440         return REG_ESPACE;
3441       mbcset->char_classes = new_char_classes;
3442       *char_class_alloc = new_char_class_alloc;
3443     }
3444   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3445 #endif /* RE_ENABLE_I18N */
3446
3447 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3448     for (i = 0; i < SBC_MAX; ++i)               \
3449       {                                         \
3450         if (ctype_func (i))                     \
3451           {                                     \
3452             int ch = trans ? trans[i] : i;      \
3453             bitset_set (sbcset, ch);            \
3454           }                                     \
3455       }
3456
3457   if (strcmp (name, "alnum") == 0)
3458     BUILD_CHARCLASS_LOOP (isalnum)
3459   else if (strcmp (name, "cntrl") == 0)
3460     BUILD_CHARCLASS_LOOP (iscntrl)
3461   else if (strcmp (name, "lower") == 0)
3462     BUILD_CHARCLASS_LOOP (islower)
3463   else if (strcmp (name, "space") == 0)
3464     BUILD_CHARCLASS_LOOP (isspace)
3465   else if (strcmp (name, "alpha") == 0)
3466     BUILD_CHARCLASS_LOOP (isalpha)
3467   else if (strcmp (name, "digit") == 0)
3468     BUILD_CHARCLASS_LOOP (isdigit)
3469   else if (strcmp (name, "print") == 0)
3470     BUILD_CHARCLASS_LOOP (isprint)
3471   else if (strcmp (name, "upper") == 0)
3472     BUILD_CHARCLASS_LOOP (isupper)
3473   else if (strcmp (name, "blank") == 0)
3474     BUILD_CHARCLASS_LOOP (isblank)
3475   else if (strcmp (name, "graph") == 0)
3476     BUILD_CHARCLASS_LOOP (isgraph)
3477   else if (strcmp (name, "punct") == 0)
3478     BUILD_CHARCLASS_LOOP (ispunct)
3479   else if (strcmp (name, "xdigit") == 0)
3480     BUILD_CHARCLASS_LOOP (isxdigit)
3481   else
3482     return REG_ECTYPE;
3483
3484   return REG_NOERROR;
3485 }
3486
3487 static bin_tree_t *
3488 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3489                     const unsigned char *class_name,
3490                     const unsigned char *extra,
3491                     bool non_match, reg_errcode_t *err)
3492 {
3493   re_bitset_ptr_t sbcset;
3494 #ifdef RE_ENABLE_I18N
3495   re_charset_t *mbcset;
3496   Idx alloc = 0;
3497 #endif /* not RE_ENABLE_I18N */
3498   reg_errcode_t ret;
3499   re_token_t br_token;
3500   bin_tree_t *tree;
3501
3502   sbcset = re_calloc (unsigned int, BITSET_UINTS);
3503 #ifdef RE_ENABLE_I18N
3504   mbcset = re_calloc (re_charset_t, 1);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #ifdef RE_ENABLE_I18N
3508   if (BE (sbcset == NULL || mbcset == NULL, 0))
3509 #else /* not RE_ENABLE_I18N */
3510   if (BE (sbcset == NULL, 0))
3511 #endif /* not RE_ENABLE_I18N */
3512     {
3513       *err = REG_ESPACE;
3514       return NULL;
3515     }
3516
3517   if (non_match)
3518     {
3519 #ifdef RE_ENABLE_I18N
3520       /*
3521       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3522         bitset_set(cset->sbcset, '\0');
3523       */
3524       mbcset->non_match = 1;
3525 #endif /* not RE_ENABLE_I18N */
3526     }
3527
3528   /* We don't care the syntax in this case.  */
3529   ret = build_charclass (trans, sbcset,
3530 #ifdef RE_ENABLE_I18N
3531                          mbcset, &alloc,
3532 #endif /* RE_ENABLE_I18N */
3533                          class_name, 0);
3534
3535   if (BE (ret != REG_NOERROR, 0))
3536     {
3537       re_free (sbcset);
3538 #ifdef RE_ENABLE_I18N
3539       free_charset (mbcset);
3540 #endif /* RE_ENABLE_I18N */
3541       *err = ret;
3542       return NULL;
3543     }
3544   /* \w match '_' also.  */
3545   for (; *extra; extra++)
3546     bitset_set (sbcset, *extra);
3547
3548   /* If it is non-matching list.  */
3549   if (non_match)
3550     bitset_not (sbcset);
3551
3552 #ifdef RE_ENABLE_I18N
3553   /* Ensure only single byte characters are set.  */
3554   if (dfa->mb_cur_max > 1)
3555     bitset_mask (sbcset, dfa->sb_char);
3556 #endif
3557
3558   /* Build a tree for simple bracket.  */
3559   br_token.type = SIMPLE_BRACKET;
3560   br_token.opr.sbcset = sbcset;
3561   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3562   if (BE (tree == NULL, 0))
3563     goto build_word_op_espace;
3564
3565 #ifdef RE_ENABLE_I18N
3566   if (dfa->mb_cur_max > 1)
3567     {
3568       bin_tree_t *mbc_tree;
3569       /* Build a tree for complex bracket.  */
3570       br_token.type = COMPLEX_BRACKET;
3571       br_token.opr.mbcset = mbcset;
3572       dfa->has_mb_node = 1;
3573       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3574       if (BE (mbc_tree == NULL, 0))
3575         goto build_word_op_espace;
3576       /* Then join them by ALT node.  */
3577       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3578       if (BE (mbc_tree != NULL, 1))
3579         return tree;
3580     }
3581   else
3582     {
3583       free_charset (mbcset);
3584       return tree;
3585     }
3586 #else /* not RE_ENABLE_I18N */
3587   return tree;
3588 #endif /* not RE_ENABLE_I18N */
3589
3590  build_word_op_espace:
3591   re_free (sbcset);
3592 #ifdef RE_ENABLE_I18N
3593   free_charset (mbcset);
3594 #endif /* RE_ENABLE_I18N */
3595   *err = REG_ESPACE;
3596   return NULL;
3597 }
3598
3599 /* This is intended for the expressions like "a{1,3}".
3600    Fetch a number from `input', and return the number.
3601    Return REG_MISSING if the number field is empty like "{,1}".
3602    Return REG_ERROR if an error occurred.  */
3603
3604 static Idx
3605 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3606 {
3607   Idx num = REG_MISSING;
3608   unsigned char c;
3609   while (1)
3610     {
3611       fetch_token (token, input, syntax);
3612       c = token->opr.c;
3613       if (BE (token->type == END_OF_RE, 0))
3614         return REG_ERROR;
3615       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3616         break;
3617       num = ((token->type != CHARACTER || c < '0' || '9' < c
3618               || num == REG_ERROR)
3619              ? REG_ERROR
3620              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3621       num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3622     }
3623   return num;
3624 }
3625 \f
3626 #ifdef RE_ENABLE_I18N
3627 static void
3628 free_charset (re_charset_t *cset)
3629 {
3630   re_free (cset->mbchars);
3631 # ifdef _LIBC
3632   re_free (cset->coll_syms);
3633   re_free (cset->equiv_classes);
3634   re_free (cset->range_starts);
3635   re_free (cset->range_ends);
3636 # endif
3637   re_free (cset->char_classes);
3638   re_free (cset);
3639 }
3640 #endif /* RE_ENABLE_I18N */
3641 \f
3642 /* Functions for binary tree operation.  */
3643
3644 /* Create a tree node.  */
3645
3646 static bin_tree_t *
3647 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3648              re_token_type_t type)
3649 {
3650   re_token_t t;
3651   t.type = type;
3652   return create_token_tree (dfa, left, right, &t);
3653 }
3654
3655 static bin_tree_t *
3656 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657                    const re_token_t *token)
3658 {
3659   bin_tree_t *tree;
3660   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3661     {
3662       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3663
3664       if (storage == NULL)
3665         return NULL;
3666       storage->next = dfa->str_tree_storage;
3667       dfa->str_tree_storage = storage;
3668       dfa->str_tree_storage_idx = 0;
3669     }
3670   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3671
3672   tree->parent = NULL;
3673   tree->left = left;
3674   tree->right = right;
3675   tree->token = *token;
3676   tree->token.duplicated = 0;
3677   tree->token.opt_subexp = 0;
3678   tree->first = NULL;
3679   tree->next = NULL;
3680   tree->node_idx = REG_MISSING;
3681
3682   if (left != NULL)
3683     left->parent = tree;
3684   if (right != NULL)
3685     right->parent = tree;
3686   return tree;
3687 }
3688
3689 /* Mark the tree SRC as an optional subexpression.
3690    To be called from preorder or postorder.  */
3691
3692 static reg_errcode_t
3693 mark_opt_subexp (void *extra, bin_tree_t *node)
3694 {
3695   Idx idx = (Idx) (long) extra;
3696   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3697     node->token.opt_subexp = 1;
3698
3699   return REG_NOERROR;
3700 }
3701
3702 /* Free the allocated memory inside NODE. */
3703
3704 static void
3705 free_token (re_token_t *node)
3706 {
3707 #ifdef RE_ENABLE_I18N
3708   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3709     free_charset (node->opr.mbcset);
3710   else
3711 #endif /* RE_ENABLE_I18N */
3712     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3713       re_free (node->opr.sbcset);
3714 }
3715
3716 /* Worker function for tree walking.  Free the allocated memory inside NODE
3717    and its children. */
3718
3719 static reg_errcode_t
3720 free_tree (void *extra, bin_tree_t *node)
3721 {
3722   free_token (&node->token);
3723   return REG_NOERROR;
3724 }
3725
3726
3727 /* Duplicate the node SRC, and return new node.  This is a preorder
3728    visit similar to the one implemented by the generic visitor, but
3729    we need more infrastructure to maintain two parallel trees --- so,
3730    it's easier to duplicate.  */
3731
3732 static bin_tree_t *
3733 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3734 {
3735   const bin_tree_t *node;
3736   bin_tree_t *dup_root;
3737   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3738
3739   for (node = root; ; )
3740     {
3741       /* Create a new tree and link it back to the current parent.  */
3742       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3743       if (*p_new == NULL)
3744         return NULL;
3745       (*p_new)->parent = dup_node;
3746       (*p_new)->token.duplicated = 1;
3747       dup_node = *p_new;
3748
3749       /* Go to the left node, or up and to the right.  */
3750       if (node->left)
3751         {
3752           node = node->left;
3753           p_new = &dup_node->left;
3754         }
3755       else
3756         {
3757           const bin_tree_t *prev = NULL;
3758           while (node->right == prev || node->right == NULL)
3759             {
3760               prev = node;
3761               node = node->parent;
3762               dup_node = dup_node->parent;
3763               if (!node)
3764                 return dup_root;
3765             }
3766           node = node->right;
3767           p_new = &dup_node->right;
3768         }
3769     }
3770 }