lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 static void init_word_char (re_dfa_t *dfa);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  51                                              int top_clone_node, int root_node,
  52                                              unsigned int constraint);
  53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  55                                    unsigned int constraint);
  56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  58                                          int node, int root);
  59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  60 static int fetch_number (re_string_t *input, re_token_t *token,
  61                          reg_syntax_t syntax);
  62 static void fetch_token (re_token_t *result, re_string_t *input,
  63                          reg_syntax_t syntax);
  64 static int peek_token (re_token_t *token, re_string_t *input,
  65                         reg_syntax_t syntax);
  66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  67                                reg_syntax_t syntax);
  68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  69                           reg_syntax_t syntax, reg_errcode_t *err);
  70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  80                                   re_token_t *token, reg_syntax_t syntax,
  81                                   int nest, reg_errcode_t *err);
  82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  83                                  re_dfa_t *dfa, re_token_t *token,
  84                                  reg_syntax_t syntax, reg_errcode_t *err);
  85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  86                                       re_token_t *token, reg_syntax_t syntax,
  87                                       reg_errcode_t *err);
  88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  89                                             re_string_t *regexp,
  90                                             re_token_t *token, int token_len,
  91                                             re_dfa_t *dfa,
  92                                             reg_syntax_t syntax,
  93                                             int accept_hyphen);
  94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  95                                           re_string_t *regexp,
  96                                           re_token_t *token);
  97 #ifndef _LIBC
  98 # ifdef RE_ENABLE_I18N
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       re_charset_t *mbcset, int *range_alloc,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              re_charset_t *mbcset,
 105                                              int *coll_sym_alloc,
 106                                              const unsigned char *name);
 107 # else /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 109                                       bracket_elem_t *start_elem,
 110                                       bracket_elem_t *end_elem);
 111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 112                                              const unsigned char *name);
 113 # endif /* not RE_ENABLE_I18N */
 114 #endif /* not _LIBC */
 115 #ifdef RE_ENABLE_I18N
 116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 117                                         re_charset_t *mbcset,
 118                                         int *equiv_class_alloc,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       re_charset_t *mbcset,
 123                                       int *char_class_alloc,
 124                                       const unsigned char *class_name,
 125                                       reg_syntax_t syntax);
 126 #else  /* not RE_ENABLE_I18N */
 127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 128                                         const unsigned char *name);
 129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 130                                       re_bitset_ptr_t sbcset,
 131                                       const unsigned char *class_name,
 132                                       reg_syntax_t syntax);
 133 #endif /* not RE_ENABLE_I18N */
 134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 135                                        unsigned RE_TRANSLATE_TYPE trans,
 136                                        const unsigned char *class_name,
 137                                        const unsigned char *extra,
 138                                        int non_match, reg_errcode_t *err);
 139 static bin_tree_t *create_tree (re_dfa_t *dfa,
 140                                 bin_tree_t *left, bin_tree_t *right,
 141                                 re_token_type_t type);
 142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 143                                       bin_tree_t *left, bin_tree_t *right,
 144                                       const re_token_t *token);
 145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 146 static void free_token (re_token_t *node);
 147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 149 \f
 150 /* This table gives an error message for each of the error codes listed
 151    in regex.h.  Obviously the order here has to be same as there.
 152    POSIX doesn't require that we do anything for REG_NOERROR,
 153    but why not be nice?  */
 154
 155 const char __re_error_msgid[] attribute_hidden =
 156   {
 157 #define REG_NOERROR_IDX 0
 158     gettext_noop ("Success")    /* REG_NOERROR */
 159     "\0"
 160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 161     gettext_noop ("No match")   /* REG_NOMATCH */
 162     "\0"
 163 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 164     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 165     "\0"
 166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 167     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 168     "\0"
 169 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 170     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 171     "\0"
 172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 173     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 174     "\0"
 175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 176     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 177     "\0"
 178 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 179     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 180     "\0"
 181 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 182     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 183     "\0"
 184 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 185     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 186     "\0"
 187 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 188     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 189     "\0"
 190 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 191     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 192     "\0"
 193 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 194     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 195     "\0"
 196 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 197     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 198     "\0"
 199 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 200     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 201     "\0"
 202 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 203     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 204     "\0"
 205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 206     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 207   };
 208
 209 const size_t __re_error_msgid_idx[] attribute_hidden =
 210   {
 211     REG_NOERROR_IDX,
 212     REG_NOMATCH_IDX,
 213     REG_BADPAT_IDX,
 214     REG_ECOLLATE_IDX,
 215     REG_ECTYPE_IDX,
 216     REG_EESCAPE_IDX,
 217     REG_ESUBREG_IDX,
 218     REG_EBRACK_IDX,
 219     REG_EPAREN_IDX,
 220     REG_EBRACE_IDX,
 221     REG_BADBR_IDX,
 222     REG_ERANGE_IDX,
 223     REG_ESPACE_IDX,
 224     REG_BADRPT_IDX,
 225     REG_EEND_IDX,
 226     REG_ESIZE_IDX,
 227     REG_ERPAREN_IDX
 228   };
 229 \f
 230 /* Entry points for GNU code.  */
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (const char *pattern, size_t length,
 241                     struct re_pattern_buffer *bufp)
 242 {
 243   reg_errcode_t ret;
 244
 245   /* And GNU code determines whether or not to get register information
 246      by passing null for the REGS argument to re_match, etc., not by
 247      setting no_sub, unless RE_NO_SUB is set.  */
 248   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 249
 250   /* Match anchors at newline.  */
 251   bufp->newline_anchor = 1;
 252
 253   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 254
 255   if (!ret)
 256     return NULL;
 257   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_compile_pattern, re_compile_pattern)
 261 #endif
 262
 263 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 264    also be assigned to arbitrarily: each pattern buffer stores its own
 265    syntax, so it can be changed between regex compilations.  */
 266 /* This has no initializer because initialized variables in Emacs
 267    become read-only after dumping.  */
 268 reg_syntax_t re_syntax_options;
 269
 270
 271 /* Specify the precise syntax of regexps for compilation.  This provides
 272    for compatibility for various utilities which historically have
 273    different, incompatible syntaxes.
 274
 275    The argument SYNTAX is a bit mask comprised of the various bits
 276    defined in regex.h.  We return the old syntax.  */
 277
 278 reg_syntax_t
 279 re_set_syntax (reg_syntax_t syntax)
 280 {
 281   reg_syntax_t ret = re_syntax_options;
 282
 283   re_syntax_options = syntax;
 284   return ret;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_set_syntax, re_set_syntax)
 288 #endif
 289
 290 int
 291 re_compile_fastmap (struct re_pattern_buffer *bufp)
 292 {
 293   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 294   char *fastmap = bufp->fastmap;
 295
 296   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 297   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 298   if (dfa->init_state != dfa->init_state_word)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 300   if (dfa->init_state != dfa->init_state_nl)
 301     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 302   if (dfa->init_state != dfa->init_state_begbuf)
 303     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 304   bufp->fastmap_accurate = 1;
 305   return 0;
 306 }
 307 #ifdef _LIBC
 308 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 309 #endif
 310
 311 static inline void
 312 __attribute ((always_inline))
 313 re_set_fastmap (char *fastmap, int icase, int ch)
 314 {
 315   fastmap[ch] = 1;
 316   if (icase)
 317     fastmap[tolower (ch)] = 1;
 318 }
 319
 320 /* Helper function for re_compile_fastmap.
 321    Compile fastmap for the initial_state INIT_STATE.  */
 322
 323 static void
 324 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 325                          char *fastmap)
 326 {
 327   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 328   int node_cnt;
 329   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 330   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 331     {
 332       int node = init_state->nodes.elems[node_cnt];
 333       re_token_type_t type = dfa->nodes[node].type;
 334
 335       if (type == CHARACTER)
 336         {
 337           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 338 #ifdef RE_ENABLE_I18N
 339           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 340             {
 341               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 342               wchar_t wc;
 343               mbstate_t state;
 344
 345               p = buf;
 346               *p++ = dfa->nodes[node].opr.c;
 347               while (++node < dfa->nodes_len
 348                      && dfa->nodes[node].type == CHARACTER
 349                      && dfa->nodes[node].mb_partial)
 350                 *p++ = dfa->nodes[node].opr.c;
 351               memset (&state, 0, sizeof (state));
 352               if (mbrtowc (&wc, (const char *) buf, p - buf,
 353                            &state) == p - buf
 354                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 355                       != (size_t) -1))
 356                 re_set_fastmap (fastmap, 0, buf[0]);
 357             }
 358 #endif
 359         }
 360       else if (type == SIMPLE_BRACKET)
 361         {
 362           int i, j, ch;
 363           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 364             for (j = 0; j < UINT_BITS; ++j, ++ch)
 365               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 366                 re_set_fastmap (fastmap, icase, ch);
 367         }
 368 #ifdef RE_ENABLE_I18N
 369       else if (type == COMPLEX_BRACKET)
 370         {
 371           int i;
 372           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 373           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 374               || cset->nranges || cset->nchar_classes)
 375             {
 376 # ifdef _LIBC
 377               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 378                 {
 379                   /* In this case we want to catch the bytes which are
 380                      the first byte of any collation elements.
 381                      e.g. In da_DK, we want to catch 'a' since "aa"
 382                           is a valid collation element, and don't catch
 383                           'b' since 'b' is the only collation element
 384                           which starts from 'b'.  */
 385                   int j, ch;
 386                   const int32_t *table = (const int32_t *)
 387                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 389                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 390                       if (table[ch] < 0)
 391                         re_set_fastmap (fastmap, icase, ch);
 392                 }
 393 # else
 394               if (dfa->mb_cur_max > 1)
 395                 for (i = 0; i < SBC_MAX; ++i)
 396                   if (__btowc (i) == WEOF)
 397                     re_set_fastmap (fastmap, icase, i);
 398 # endif /* not _LIBC */
 399             }
 400           for (i = 0; i < cset->nmbchars; ++i)
 401             {
 402               char buf[256];
 403               mbstate_t state;
 404               memset (&state, '\0', sizeof (state));
 405               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                 {
 409                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 410                       != (size_t) -1)
 411                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      `buffer' to the compiled pattern;
 437      `used' to the length of the compiled pattern;
 438      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      `fastmap' to an allocated space for the fastmap;
 443      `fastmap_accurate' to zero;
 444      `re_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function never fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 weak_alias (__regcomp, regcomp)
 520 #endif
 521
 522 /* Returns a message corresponding to an error code, ERRCODE, returned
 523    from either regcomp or regexec.   We don't use PREG here.  */
 524
 525 size_t
 526 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
 527 {
 528   const char *msg;
 529   size_t msg_size;
 530
 531   if (BE (errcode < 0
 532           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 533                                / sizeof (__re_error_msgid_idx[0])), 0))
 534     /* Only error codes returned by the rest of the code should be passed
 535        to this routine.  If we are given anything else, or if other regex
 536        code generates an invalid error code, then the program has a bug.
 537        Dump core so we can fix it.  */
 538     abort ();
 539
 540   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 541
 542   msg_size = strlen (msg) + 1; /* Includes the null.  */
 543
 544   if (BE (errbuf_size != 0, 1))
 545     {
 546       if (BE (msg_size > errbuf_size, 0))
 547         {
 548 #if defined HAVE_MEMPCPY || defined _LIBC
 549           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 550 #else
 551           memcpy (errbuf, msg, errbuf_size - 1);
 552           errbuf[errbuf_size - 1] = 0;
 553 #endif
 554         }
 555       else
 556         memcpy (errbuf, msg, msg_size);
 557     }
 558
 559   return msg_size;
 560 }
 561 #ifdef _LIBC
 562 weak_alias (__regerror, regerror)
 563 #endif
 564
 565
 566 #ifdef RE_ENABLE_I18N
 567 /* This static array is used for the map to single-byte characters when
 568    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 569    it the same all the time.  UTF-8 is the preferred encoding so this is
 570    a worthwhile optimization.  */
 571 static const bitset utf8_sb_map =
 572 {
 573   /* Set the first 128 bits.  */
 574 # if UINT_MAX == 0xffffffff
 575   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 576 # else
 577 #  error "Add case for new unsigned int size"
 578 # endif
 579 };
 580 #endif
 581
 582
 583 static void
 584 free_dfa_content (re_dfa_t *dfa)
 585 {
 586   int i, j;
 587
 588   if (dfa->nodes)
 589     for (i = 0; i < dfa->nodes_len; ++i)
 590       free_token (dfa->nodes + i);
 591   re_free (dfa->nexts);
 592   for (i = 0; i < dfa->nodes_len; ++i)
 593     {
 594       if (dfa->eclosures != NULL)
 595         re_node_set_free (dfa->eclosures + i);
 596       if (dfa->inveclosures != NULL)
 597         re_node_set_free (dfa->inveclosures + i);
 598       if (dfa->edests != NULL)
 599         re_node_set_free (dfa->edests + i);
 600     }
 601   re_free (dfa->edests);
 602   re_free (dfa->eclosures);
 603   re_free (dfa->inveclosures);
 604   re_free (dfa->nodes);
 605
 606   if (dfa->state_table)
 607     for (i = 0; i <= dfa->state_hash_mask; ++i)
 608       {
 609         struct re_state_table_entry *entry = dfa->state_table + i;
 610         for (j = 0; j < entry->num; ++j)
 611           {
 612             re_dfastate_t *state = entry->array[j];
 613             free_state (state);
 614           }
 615         re_free (entry->array);
 616       }
 617   re_free (dfa->state_table);
 618 #ifdef RE_ENABLE_I18N
 619   if (dfa->sb_char != utf8_sb_map)
 620     re_free (dfa->sb_char);
 621 #endif
 622   re_free (dfa->subexp_map);
 623 #ifdef DEBUG
 624   re_free (dfa->re_str);
 625 #endif
 626
 627   re_free (dfa);
 628 }
 629
 630
 631 /* Free dynamically allocated space used by PREG.  */
 632
 633 void
 634 regfree (regex_t *preg)
 635 {
 636   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 637   if (BE (dfa != NULL, 1))
 638     free_dfa_content (dfa);
 639   preg->buffer = NULL;
 640   preg->allocated = 0;
 641
 642   re_free (preg->fastmap);
 643   preg->fastmap = NULL;
 644
 645   re_free (preg->translate);
 646   preg->translate = NULL;
 647 }
 648 #ifdef _LIBC
 649 weak_alias (__regfree, regfree)
 650 #endif
 651 \f
 652 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 653    them unless specifically requested.  */
 654
 655 #if defined _REGEX_RE_COMP || defined _LIBC
 656
 657 /* BSD has one and only one pattern buffer.  */
 658 static struct re_pattern_buffer re_comp_buf;
 659
 660 char *
 661 # ifdef _LIBC
 662 /* Make these definitions weak in libc, so POSIX programs can redefine
 663    these names if they don't use our functions, and still use
 664    regcomp/regexec above without link errors.  */
 665 weak_function
 666 # endif
 667 re_comp (s)
 668      const char *s;
 669 {
 670   reg_errcode_t ret;
 671   char *fastmap;
 672
 673   if (!s)
 674     {
 675       if (!re_comp_buf.buffer)
 676         return gettext ("No previous regular expression");
 677       return 0;
 678     }
 679
 680   if (re_comp_buf.buffer)
 681     {
 682       fastmap = re_comp_buf.fastmap;
 683       re_comp_buf.fastmap = NULL;
 684       __regfree (&re_comp_buf);
 685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 686       re_comp_buf.fastmap = fastmap;
 687     }
 688
 689   if (re_comp_buf.fastmap == NULL)
 690     {
 691       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 692       if (re_comp_buf.fastmap == NULL)
 693         return (char *) gettext (__re_error_msgid
 694                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 695     }
 696
 697   /* Since `re_exec' always passes NULL for the `regs' argument, we
 698      don't need to initialize the pattern buffer fields which affect it.  */
 699
 700   /* Match anchors at newlines.  */
 701   re_comp_buf.newline_anchor = 1;
 702
 703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 704
 705   if (!ret)
 706     return NULL;
 707
 708   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 710 }
 711
 712 #ifdef _LIBC
 713 libc_freeres_fn (free_mem)
 714 {
 715   __regfree (&re_comp_buf);
 716 }
 717 #endif
 718
 719 #endif /* _REGEX_RE_COMP */
 720 \f
 721 /* Internal entry point.
 722    Compile the regular expression PATTERN, whose length is LENGTH.
 723    SYNTAX indicate regular expression's syntax.  */
 724
 725 static reg_errcode_t
 726 re_compile_internal (regex_t *preg, const char * pattern, int length,
 727                      reg_syntax_t syntax)
 728 {
 729   reg_errcode_t err = REG_NOERROR;
 730   re_dfa_t *dfa;
 731   re_string_t regexp;
 732
 733   /* Initialize the pattern buffer.  */
 734   preg->fastmap_accurate = 0;
 735   preg->syntax = syntax;
 736   preg->not_bol = preg->not_eol = 0;
 737   preg->used = 0;
 738   preg->re_nsub = 0;
 739   preg->can_be_null = 0;
 740   preg->regs_allocated = REGS_UNALLOCATED;
 741
 742   /* Initialize the dfa.  */
 743   dfa = (re_dfa_t *) preg->buffer;
 744   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 745     {
 746       /* If zero allocated, but buffer is non-null, try to realloc
 747          enough space.  This loses if buffer's address is bogus, but
 748          that is the user's responsibility.  If ->buffer is NULL this
 749          is a simple allocation.  */
 750       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 751       if (dfa == NULL)
 752         return REG_ESPACE;
 753       preg->allocated = sizeof (re_dfa_t);
 754       preg->buffer = (unsigned char *) dfa;
 755     }
 756   preg->used = sizeof (re_dfa_t);
 757
 758   __libc_lock_init (dfa->lock);
 759
 760   err = init_dfa (dfa, length);
 761   if (BE (err != REG_NOERROR, 0))
 762     {
 763       free_dfa_content (dfa);
 764       preg->buffer = NULL;
 765       preg->allocated = 0;
 766       return err;
 767     }
 768 #ifdef DEBUG
 769   dfa->re_str = re_malloc (char, length + 1);
 770   strncpy (dfa->re_str, pattern, length + 1);
 771 #endif
 772
 773   err = re_string_construct (&regexp, pattern, length, preg->translate,
 774                              syntax & RE_ICASE, dfa);
 775   if (BE (err != REG_NOERROR, 0))
 776     {
 777     re_compile_internal_free_return:
 778       free_workarea_compile (preg);
 779       re_string_destruct (&regexp);
 780       free_dfa_content (dfa);
 781       preg->buffer = NULL;
 782       preg->allocated = 0;
 783       return err;
 784     }
 785
 786   /* Parse the regular expression, and build a structure tree.  */
 787   preg->re_nsub = 0;
 788   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 789   if (BE (dfa->str_tree == NULL, 0))
 790     goto re_compile_internal_free_return;
 791
 792   /* Analyze the tree and create the nfa.  */
 793   err = analyze (preg);
 794   if (BE (err != REG_NOERROR, 0))
 795     goto re_compile_internal_free_return;
 796
 797 #ifdef RE_ENABLE_I18N
 798   /* If possible, do searching in single byte encoding to speed things up.  */
 799   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 800     optimize_utf8 (dfa);
 801 #endif
 802
 803   /* Then create the initial state of the dfa.  */
 804   err = create_initial_state (dfa);
 805
 806   /* Release work areas.  */
 807   free_workarea_compile (preg);
 808   re_string_destruct (&regexp);
 809
 810   if (BE (err != REG_NOERROR, 0))
 811     {
 812       free_dfa_content (dfa);
 813       preg->buffer = NULL;
 814       preg->allocated = 0;
 815     }
 816
 817   return err;
 818 }
 819
 820 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 821    as the initial length of some arrays.  */
 822
 823 static reg_errcode_t
 824 init_dfa (re_dfa_t *dfa, int pat_len)
 825 {
 826   int table_size;
 827 #ifndef _LIBC
 828   char *codeset_name;
 829 #endif
 830
 831   memset (dfa, '\0', sizeof (re_dfa_t));
 832
 833   /* Force allocation of str_tree_storage the first time.  */
 834   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 835
 836   dfa->nodes_alloc = pat_len + 1;
 837   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 838
 839   dfa->states_alloc = pat_len + 1;
 840
 841   /*  table_size = 2 ^ ceil(log pat_len) */
 842   for (table_size = 1; table_size > 0; table_size <<= 1)
 843     if (table_size > pat_len)
 844       break;
 845
 846   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 847   dfa->state_hash_mask = table_size - 1;
 848
 849   dfa->mb_cur_max = MB_CUR_MAX;
 850 #ifdef _LIBC
 851   if (dfa->mb_cur_max == 6
 852       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 853     dfa->is_utf8 = 1;
 854   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 855                        != 0);
 856 #else
 857 # ifdef HAVE_LANGINFO_CODESET
 858   codeset_name = nl_langinfo (CODESET);
 859 # else
 860   codeset_name = getenv ("LC_ALL");
 861   if (codeset_name == NULL || codeset_name[0] == '\0')
 862     codeset_name = getenv ("LC_CTYPE");
 863   if (codeset_name == NULL || codeset_name[0] == '\0')
 864     codeset_name = getenv ("LANG");
 865   if (codeset_name == NULL)
 866     codeset_name = "";
 867   else if (strchr (codeset_name, '.') !=  NULL)
 868     codeset_name = strchr (codeset_name, '.') + 1;
 869 # endif
 870
 871   if (strcasecmp (codeset_name, "UTF-8") == 0
 872       || strcasecmp (codeset_name, "UTF8") == 0)
 873     dfa->is_utf8 = 1;
 874
 875   /* We check exhaustively in the loop below if this charset is a
 876      superset of ASCII.  */
 877   dfa->map_notascii = 0;
 878 #endif
 879
 880 #ifdef RE_ENABLE_I18N
 881   if (dfa->mb_cur_max > 1)
 882     {
 883       if (dfa->is_utf8)
 884         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 885       else
 886         {
 887           int i, j, ch;
 888
 889           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 890           if (BE (dfa->sb_char == NULL, 0))
 891             return REG_ESPACE;
 892
 893           /* Clear all bits by, then set those corresponding to single
 894              byte chars.  */
 895           bitset_empty (dfa->sb_char);
 896
 897           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 898             for (j = 0; j < UINT_BITS; ++j, ++ch)
 899               {
 900                 wint_t wch = __btowc (ch);
 901                 if (wch != WEOF)
 902                   dfa->sb_char[i] |= 1 << j;
 903 # ifndef _LIBC
 904                 if (isascii (ch) && wch != ch)
 905                   dfa->map_notascii = 1;
 906 # endif
 907               }
 908         }
 909     }
 910 #endif
 911
 912   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 913     return REG_ESPACE;
 914   return REG_NOERROR;
 915 }
 916
 917 /* Initialize WORD_CHAR table, which indicate which character is
 918    "word".  In this case "word" means that it is the word construction
 919    character used by some operators like "\<", "\>", etc.  */
 920
 921 static void
 922 init_word_char (re_dfa_t *dfa)
 923 {
 924   int i, j, ch;
 925   dfa->word_ops_used = 1;
 926   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 927     for (j = 0; j < UINT_BITS; ++j, ++ch)
 928       if (isalnum (ch) || ch == '_')
 929         dfa->word_char[i] |= 1 << j;
 930 }
 931
 932 /* Free the work area which are only used while compiling.  */
 933
 934 static void
 935 free_workarea_compile (regex_t *preg)
 936 {
 937   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 938   bin_tree_storage_t *storage, *next;
 939   for (storage = dfa->str_tree_storage; storage; storage = next)
 940     {
 941       next = storage->next;
 942       re_free (storage);
 943     }
 944   dfa->str_tree_storage = NULL;
 945   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 946   dfa->str_tree = NULL;
 947   re_free (dfa->org_indices);
 948   dfa->org_indices = NULL;
 949 }
 950
 951 /* Create initial states for all contexts.  */
 952
 953 static reg_errcode_t
 954 create_initial_state (re_dfa_t *dfa)
 955 {
 956   int first, i;
 957   reg_errcode_t err;
 958   re_node_set init_nodes;
 959
 960   /* Initial states have the epsilon closure of the node which is
 961      the first node of the regular expression.  */
 962   first = dfa->str_tree->first->node_idx;
 963   dfa->init_node = first;
 964   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 965   if (BE (err != REG_NOERROR, 0))
 966     return err;
 967
 968   /* The back-references which are in initial states can epsilon transit,
 969      since in this case all of the subexpressions can be null.
 970      Then we add epsilon closures of the nodes which are the next nodes of
 971      the back-references.  */
 972   if (dfa->nbackref > 0)
 973     for (i = 0; i < init_nodes.nelem; ++i)
 974       {
 975         int node_idx = init_nodes.elems[i];
 976         re_token_type_t type = dfa->nodes[node_idx].type;
 977
 978         int clexp_idx;
 979         if (type != OP_BACK_REF)
 980           continue;
 981         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 982           {
 983             re_token_t *clexp_node;
 984             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 985             if (clexp_node->type == OP_CLOSE_SUBEXP
 986                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 987               break;
 988           }
 989         if (clexp_idx == init_nodes.nelem)
 990           continue;
 991
 992         if (type == OP_BACK_REF)
 993           {
 994             int dest_idx = dfa->edests[node_idx].elems[0];
 995             if (!re_node_set_contains (&init_nodes, dest_idx))
 996               {
 997                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 998                 i = 0;
 999               }
1000           }
1001       }
1002
1003   /* It must be the first time to invoke acquire_state.  */
1004   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1005   /* We don't check ERR here, since the initial state must not be NULL.  */
1006   if (BE (dfa->init_state == NULL, 0))
1007     return err;
1008   if (dfa->init_state->has_constraint)
1009     {
1010       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1011                                                        CONTEXT_WORD);
1012       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1013                                                      CONTEXT_NEWLINE);
1014       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1015                                                          &init_nodes,
1016                                                          CONTEXT_NEWLINE
1017                                                          | CONTEXT_BEGBUF);
1018       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1019               || dfa->init_state_begbuf == NULL, 0))
1020         return err;
1021     }
1022   else
1023     dfa->init_state_word = dfa->init_state_nl
1024       = dfa->init_state_begbuf = dfa->init_state;
1025
1026   re_node_set_free (&init_nodes);
1027   return REG_NOERROR;
1028 }
1029 \f
1030 #ifdef RE_ENABLE_I18N
1031 /* If it is possible to do searching in single byte encoding instead of UTF-8
1032    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1033    DFA nodes where needed.  */
1034
1035 static void
1036 optimize_utf8 (re_dfa_t *dfa)
1037 {
1038   int node, i, mb_chars = 0, has_period = 0;
1039
1040   for (node = 0; node < dfa->nodes_len; ++node)
1041     switch (dfa->nodes[node].type)
1042       {
1043       case CHARACTER:
1044         if (dfa->nodes[node].opr.c >= 0x80)
1045           mb_chars = 1;
1046         break;
1047       case ANCHOR:
1048         switch (dfa->nodes[node].opr.idx)
1049           {
1050           case LINE_FIRST:
1051           case LINE_LAST:
1052           case BUF_FIRST:
1053           case BUF_LAST:
1054             break;
1055           default:
1056             /* Word anchors etc. cannot be handled.  */
1057             return;
1058           }
1059         break;
1060       case OP_PERIOD:
1061         has_period = 1;
1062         break;
1063       case OP_BACK_REF:
1064       case OP_ALT:
1065       case END_OF_RE:
1066       case OP_DUP_ASTERISK:
1067       case OP_OPEN_SUBEXP:
1068       case OP_CLOSE_SUBEXP:
1069         break;
1070       case COMPLEX_BRACKET:
1071         return;
1072       case SIMPLE_BRACKET:
1073         /* Just double check.  */
1074         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1075           if (dfa->nodes[node].opr.sbcset[i])
1076             return;
1077         break;
1078       default:
1079         abort ();
1080       }
1081
1082   if (mb_chars || has_period)
1083     for (node = 0; node < dfa->nodes_len; ++node)
1084       {
1085         if (dfa->nodes[node].type == CHARACTER
1086             && dfa->nodes[node].opr.c >= 0x80)
1087           dfa->nodes[node].mb_partial = 0;
1088         else if (dfa->nodes[node].type == OP_PERIOD)
1089           dfa->nodes[node].type = OP_UTF8_PERIOD;
1090       }
1091
1092   /* The search can be in single byte locale.  */
1093   dfa->mb_cur_max = 1;
1094   dfa->is_utf8 = 0;
1095   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1096 }
1097 #endif
1098 \f
1099 /* Analyze the structure tree, and calculate "first", "next", "edest",
1100    "eclosure", and "inveclosure".  */
1101
1102 static reg_errcode_t
1103 analyze (regex_t *preg)
1104 {
1105   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1106   reg_errcode_t ret;
1107
1108   /* Allocate arrays.  */
1109   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1110   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1111   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1112   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1113   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1114           || dfa->eclosures == NULL, 0))
1115     return REG_ESPACE;
1116
1117   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1118   if (dfa->subexp_map != NULL)
1119     {
1120       int i;
1121       for (i = 0; i < preg->re_nsub; i++)
1122         dfa->subexp_map[i] = i;
1123       preorder (dfa->str_tree, optimize_subexps, dfa);
1124       for (i = 0; i < preg->re_nsub; i++)
1125         if (dfa->subexp_map[i] != i)
1126           break;
1127       if (i == preg->re_nsub)
1128         {
1129           free (dfa->subexp_map);
1130           dfa->subexp_map = NULL;
1131         }
1132     }
1133
1134   ret = postorder (dfa->str_tree, lower_subexps, preg);
1135   if (BE (ret != REG_NOERROR, 0))
1136     return ret;
1137   ret = postorder (dfa->str_tree, calc_first, dfa);
1138   if (BE (ret != REG_NOERROR, 0))
1139     return ret;
1140   preorder (dfa->str_tree, calc_next, dfa);
1141   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1142   if (BE (ret != REG_NOERROR, 0))
1143     return ret;
1144   ret = calc_eclosure (dfa);
1145   if (BE (ret != REG_NOERROR, 0))
1146     return ret;
1147
1148   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1149      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1150   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1151       || dfa->nbackref)
1152     {
1153       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1154       if (BE (dfa->inveclosures == NULL, 0))
1155         return REG_ESPACE;
1156       ret = calc_inveclosure (dfa);
1157     }
1158
1159   return ret;
1160 }
1161
1162 /* Our parse trees are very unbalanced, so we cannot use a stack to
1163    implement parse tree visits.  Instead, we use parent pointers and
1164    some hairy code in these two functions.  */
1165 static reg_errcode_t
1166 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1167            void *extra)
1168 {
1169   bin_tree_t *node, *prev;
1170
1171   for (node = root; ; )
1172     {
1173       /* Descend down the tree, preferably to the left (or to the right
1174          if that's the only child).  */
1175       while (node->left || node->right)
1176         if (node->left)
1177           node = node->left;
1178         else
1179           node = node->right;
1180
1181       do
1182         {
1183           reg_errcode_t err = fn (extra, node);
1184           if (BE (err != REG_NOERROR, 0))
1185             return err;
1186           if (node->parent == NULL)
1187             return REG_NOERROR;
1188           prev = node;
1189           node = node->parent;
1190         }
1191       /* Go up while we have a node that is reached from the right.  */
1192       while (node->right == prev || node->right == NULL);
1193       node = node->right;
1194     }
1195 }
1196
1197 static reg_errcode_t
1198 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1199           void *extra)
1200 {
1201   bin_tree_t *node;
1202
1203   for (node = root; ; )
1204     {
1205       reg_errcode_t err = fn (extra, node);
1206       if (BE (err != REG_NOERROR, 0))
1207         return err;
1208
1209       /* Go to the left node, or up and to the right.  */
1210       if (node->left)
1211         node = node->left;
1212       else
1213         {
1214           bin_tree_t *prev = NULL;
1215           while (node->right == prev || node->right == NULL)
1216             {
1217               prev = node;
1218               node = node->parent;
1219               if (!node)
1220                 return REG_NOERROR;
1221             }
1222           node = node->right;
1223         }
1224     }
1225 }
1226
1227 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1228    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1229    backreferences as well.  Requires a preorder visit.  */
1230 static reg_errcode_t
1231 optimize_subexps (void *extra, bin_tree_t *node)
1232 {
1233   re_dfa_t *dfa = (re_dfa_t *) extra;
1234
1235   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1236     {
1237       int idx = node->token.opr.idx;
1238       node->token.opr.idx = dfa->subexp_map[idx];
1239       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1240     }
1241
1242   else if (node->token.type == SUBEXP
1243            && node->left && node->left->token.type == SUBEXP)
1244     {
1245       int other_idx = node->left->token.opr.idx;
1246
1247       node->left = node->left->left;
1248       if (node->left)
1249         node->left->parent = node;
1250
1251       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1252       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1253         dfa->used_bkref_map &= ~(1 << other_idx);
1254     }
1255
1256   return REG_NOERROR;
1257 }
1258
1259 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1260    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1261 static reg_errcode_t
1262 lower_subexps (void *extra, bin_tree_t *node)
1263 {
1264   regex_t *preg = (regex_t *) extra;
1265   reg_errcode_t err = REG_NOERROR;
1266
1267   if (node->left && node->left->token.type == SUBEXP)
1268     {
1269       node->left = lower_subexp (&err, preg, node->left);
1270       if (node->left)
1271         node->left->parent = node;
1272     }
1273   if (node->right && node->right->token.type == SUBEXP)
1274     {
1275       node->right = lower_subexp (&err, preg, node->right);
1276       if (node->right)
1277         node->right->parent = node;
1278     }
1279
1280   return err;
1281 }
1282
1283 static bin_tree_t *
1284 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1285 {
1286   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1287   bin_tree_t *body = node->left;
1288   bin_tree_t *op, *cls, *tree1, *tree;
1289
1290   if (preg->no_sub
1291       /* We do not optimize empty subexpressions, because otherwise we may
1292          have bad CONCAT nodes with NULL children.  This is obviously not
1293          very common, so we do not lose much.  An example that triggers
1294          this case is the sed "script" /\(\)/x.  */
1295       && node->left != NULL
1296       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1297           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1298     return node->left;
1299
1300   /* Convert the SUBEXP node to the concatenation of an
1301      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1302   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1303   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1304   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1305   tree = create_tree (dfa, op, tree1, CONCAT);
1306   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1307     {
1308       *err = REG_ESPACE;
1309       return NULL;
1310     }
1311
1312   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1313   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1314   return tree;
1315 }
1316
1317 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1318    nodes.  Requires a postorder visit.  */
1319 static reg_errcode_t
1320 calc_first (void *extra, bin_tree_t *node)
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) extra;
1323   if (node->token.type == CONCAT)
1324     {
1325       node->first = node->left->first;
1326       node->node_idx = node->left->node_idx;
1327     }
1328   else
1329     {
1330       node->first = node;
1331       node->node_idx = re_dfa_add_node (dfa, node->token);
1332       if (BE (node->node_idx == -1, 0))
1333         return REG_ESPACE;
1334     }
1335   return REG_NOERROR;
1336 }
1337
1338 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1339 static reg_errcode_t
1340 calc_next (void *extra, bin_tree_t *node)
1341 {
1342   switch (node->token.type)
1343     {
1344     case OP_DUP_ASTERISK:
1345       node->left->next = node;
1346       break;
1347     case CONCAT:
1348       node->left->next = node->right->first;
1349       node->right->next = node->next;
1350       break;
1351     default:
1352       if (node->left)
1353         node->left->next = node->next;
1354       if (node->right)
1355         node->right->next = node->next;
1356       break;
1357     }
1358   return REG_NOERROR;
1359 }
1360
1361 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1362 static reg_errcode_t
1363 link_nfa_nodes (void *extra, bin_tree_t *node)
1364 {
1365   re_dfa_t *dfa = (re_dfa_t *) extra;
1366   int idx = node->node_idx;
1367   reg_errcode_t err = REG_NOERROR;
1368
1369   switch (node->token.type)
1370     {
1371     case CONCAT:
1372       break;
1373
1374     case END_OF_RE:
1375       assert (node->next == NULL);
1376       break;
1377
1378     case OP_DUP_ASTERISK:
1379     case OP_ALT:
1380       {
1381         int left, right;
1382         dfa->has_plural_match = 1;
1383         if (node->left != NULL)
1384           left = node->left->first->node_idx;
1385         else
1386           left = node->next->node_idx;
1387         if (node->right != NULL)
1388           right = node->right->first->node_idx;
1389         else
1390           right = node->next->node_idx;
1391         assert (left > -1);
1392         assert (right > -1);
1393         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1394       }
1395       break;
1396
1397     case ANCHOR:
1398     case OP_OPEN_SUBEXP:
1399     case OP_CLOSE_SUBEXP:
1400       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1401       break;
1402
1403     case OP_BACK_REF:
1404       dfa->nexts[idx] = node->next->node_idx;
1405       if (node->token.type == OP_BACK_REF)
1406         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1407       break;
1408
1409     default:
1410       assert (!IS_EPSILON_NODE (node->token.type));
1411       dfa->nexts[idx] = node->next->node_idx;
1412       break;
1413     }
1414
1415   return err;
1416 }
1417
1418 /* Duplicate the epsilon closure of the node ROOT_NODE.
1419    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1420    to their own constraint.  */
1421
1422 static reg_errcode_t
1423 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1424                         int root_node, unsigned int init_constraint)
1425 {
1426   int org_node, clone_node, ret;
1427   unsigned int constraint = init_constraint;
1428   for (org_node = top_org_node, clone_node = top_clone_node;;)
1429     {
1430       int org_dest, clone_dest;
1431       if (dfa->nodes[org_node].type == OP_BACK_REF)
1432         {
1433           /* If the back reference epsilon-transit, its destination must
1434              also have the constraint.  Then duplicate the epsilon closure
1435              of the destination of the back reference, and store it in
1436              edests of the back reference.  */
1437           org_dest = dfa->nexts[org_node];
1438           re_node_set_empty (dfa->edests + clone_node);
1439           clone_dest = duplicate_node (dfa, org_dest, constraint);
1440           if (BE (clone_dest == -1, 0))
1441             return REG_ESPACE;
1442           dfa->nexts[clone_node] = dfa->nexts[org_node];
1443           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1444           if (BE (ret < 0, 0))
1445             return REG_ESPACE;
1446         }
1447       else if (dfa->edests[org_node].nelem == 0)
1448         {
1449           /* In case of the node can't epsilon-transit, don't duplicate the
1450              destination and store the original destination as the
1451              destination of the node.  */
1452           dfa->nexts[clone_node] = dfa->nexts[org_node];
1453           break;
1454         }
1455       else if (dfa->edests[org_node].nelem == 1)
1456         {
1457           /* In case of the node can epsilon-transit, and it has only one
1458              destination.  */
1459           org_dest = dfa->edests[org_node].elems[0];
1460           re_node_set_empty (dfa->edests + clone_node);
1461           if (dfa->nodes[org_node].type == ANCHOR)
1462             {
1463               /* In case of the node has another constraint, append it.  */
1464               if (org_node == root_node && clone_node != org_node)
1465                 {
1466                   /* ...but if the node is root_node itself, it means the
1467                      epsilon closure have a loop, then tie it to the
1468                      destination of the root_node.  */
1469                   ret = re_node_set_insert (dfa->edests + clone_node,
1470                                             org_dest);
1471                   if (BE (ret < 0, 0))
1472                     return REG_ESPACE;
1473                   break;
1474                 }
1475               constraint |= dfa->nodes[org_node].opr.ctx_type;
1476             }
1477           clone_dest = duplicate_node (dfa, org_dest, constraint);
1478           if (BE (clone_dest == -1, 0))
1479             return REG_ESPACE;
1480           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1481           if (BE (ret < 0, 0))
1482             return REG_ESPACE;
1483         }
1484       else /* dfa->edests[org_node].nelem == 2 */
1485         {
1486           /* In case of the node can epsilon-transit, and it has two
1487              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1488           org_dest = dfa->edests[org_node].elems[0];
1489           re_node_set_empty (dfa->edests + clone_node);
1490           /* Search for a duplicated node which satisfies the constraint.  */
1491           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1492           if (clone_dest == -1)
1493             {
1494               /* There are no such a duplicated node, create a new one.  */
1495               reg_errcode_t err;
1496               clone_dest = duplicate_node (dfa, org_dest, constraint);
1497               if (BE (clone_dest == -1, 0))
1498                 return REG_ESPACE;
1499               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500               if (BE (ret < 0, 0))
1501                 return REG_ESPACE;
1502               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1503                                             root_node, constraint);
1504               if (BE (err != REG_NOERROR, 0))
1505                 return err;
1506             }
1507           else
1508             {
1509               /* There are a duplicated node which satisfy the constraint,
1510                  use it to avoid infinite loop.  */
1511               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1512               if (BE (ret < 0, 0))
1513                 return REG_ESPACE;
1514             }
1515
1516           org_dest = dfa->edests[org_node].elems[1];
1517           clone_dest = duplicate_node (dfa, org_dest, constraint);
1518           if (BE (clone_dest == -1, 0))
1519             return REG_ESPACE;
1520           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521           if (BE (ret < 0, 0))
1522             return REG_ESPACE;
1523         }
1524       org_node = org_dest;
1525       clone_node = clone_dest;
1526     }
1527   return REG_NOERROR;
1528 }
1529
1530 /* Search for a node which is duplicated from the node ORG_NODE, and
1531    satisfies the constraint CONSTRAINT.  */
1532
1533 static int
1534 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1535 {
1536   int idx;
1537   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1538     {
1539       if (org_node == dfa->org_indices[idx]
1540           && constraint == dfa->nodes[idx].constraint)
1541         return idx; /* Found.  */
1542     }
1543   return -1; /* Not found.  */
1544 }
1545
1546 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1547    Return the index of the new node, or -1 if insufficient storage is
1548    available.  */
1549
1550 static int
1551 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1552 {
1553   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1554   if (BE (dup_idx != -1, 1))
1555     {
1556       dfa->nodes[dup_idx].constraint = constraint;
1557       if (dfa->nodes[org_idx].type == ANCHOR)
1558         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1559       dfa->nodes[dup_idx].duplicated = 1;
1560
1561       /* Store the index of the original node.  */
1562       dfa->org_indices[dup_idx] = org_idx;
1563     }
1564   return dup_idx;
1565 }
1566
1567 static reg_errcode_t
1568 calc_inveclosure (re_dfa_t *dfa)
1569 {
1570   int src, idx, ret;
1571   for (idx = 0; idx < dfa->nodes_len; ++idx)
1572     re_node_set_init_empty (dfa->inveclosures + idx);
1573
1574   for (src = 0; src < dfa->nodes_len; ++src)
1575     {
1576       int *elems = dfa->eclosures[src].elems;
1577       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1578         {
1579           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1580           if (BE (ret == -1, 0))
1581             return REG_ESPACE;
1582         }
1583     }
1584
1585   return REG_NOERROR;
1586 }
1587
1588 /* Calculate "eclosure" for all the node in DFA.  */
1589
1590 static reg_errcode_t
1591 calc_eclosure (re_dfa_t *dfa)
1592 {
1593   int node_idx, incomplete;
1594 #ifdef DEBUG
1595   assert (dfa->nodes_len > 0);
1596 #endif
1597   incomplete = 0;
1598   /* For each nodes, calculate epsilon closure.  */
1599   for (node_idx = 0; ; ++node_idx)
1600     {
1601       reg_errcode_t err;
1602       re_node_set eclosure_elem;
1603       if (node_idx == dfa->nodes_len)
1604         {
1605           if (!incomplete)
1606             break;
1607           incomplete = 0;
1608           node_idx = 0;
1609         }
1610
1611 #ifdef DEBUG
1612       assert (dfa->eclosures[node_idx].nelem != -1);
1613 #endif
1614
1615       /* If we have already calculated, skip it.  */
1616       if (dfa->eclosures[node_idx].nelem != 0)
1617         continue;
1618       /* Calculate epsilon closure of `node_idx'.  */
1619       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1620       if (BE (err != REG_NOERROR, 0))
1621         return err;
1622
1623       if (dfa->eclosures[node_idx].nelem == 0)
1624         {
1625           incomplete = 1;
1626           re_node_set_free (&eclosure_elem);
1627         }
1628     }
1629   return REG_NOERROR;
1630 }
1631
1632 /* Calculate epsilon closure of NODE.  */
1633
1634 static reg_errcode_t
1635 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1636 {
1637   reg_errcode_t err;
1638   unsigned int constraint;
1639   int i, incomplete;
1640   re_node_set eclosure;
1641   incomplete = 0;
1642   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1643   if (BE (err != REG_NOERROR, 0))
1644     return err;
1645
1646   /* This indicates that we are calculating this node now.
1647      We reference this value to avoid infinite loop.  */
1648   dfa->eclosures[node].nelem = -1;
1649
1650   constraint = ((dfa->nodes[node].type == ANCHOR)
1651                 ? dfa->nodes[node].opr.ctx_type : 0);
1652   /* If the current node has constraints, duplicate all nodes.
1653      Since they must inherit the constraints.  */
1654   if (constraint
1655       && dfa->edests[node].nelem
1656       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1657     {
1658       int org_node, cur_node;
1659       org_node = cur_node = node;
1660       err = duplicate_node_closure (dfa, node, node, node, constraint);
1661       if (BE (err != REG_NOERROR, 0))
1662         return err;
1663     }
1664
1665   /* Expand each epsilon destination nodes.  */
1666   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1667     for (i = 0; i < dfa->edests[node].nelem; ++i)
1668       {
1669         re_node_set eclosure_elem;
1670         int edest = dfa->edests[node].elems[i];
1671         /* If calculating the epsilon closure of `edest' is in progress,
1672            return intermediate result.  */
1673         if (dfa->eclosures[edest].nelem == -1)
1674           {
1675             incomplete = 1;
1676             continue;
1677           }
1678         /* If we haven't calculated the epsilon closure of `edest' yet,
1679            calculate now. Otherwise use calculated epsilon closure.  */
1680         if (dfa->eclosures[edest].nelem == 0)
1681           {
1682             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1683             if (BE (err != REG_NOERROR, 0))
1684               return err;
1685           }
1686         else
1687           eclosure_elem = dfa->eclosures[edest];
1688         /* Merge the epsilon closure of `edest'.  */
1689         re_node_set_merge (&eclosure, &eclosure_elem);
1690         /* If the epsilon closure of `edest' is incomplete,
1691            the epsilon closure of this node is also incomplete.  */
1692         if (dfa->eclosures[edest].nelem == 0)
1693           {
1694             incomplete = 1;
1695             re_node_set_free (&eclosure_elem);
1696           }
1697       }
1698
1699   /* Epsilon closures include itself.  */
1700   re_node_set_insert (&eclosure, node);
1701   if (incomplete && !root)
1702     dfa->eclosures[node].nelem = 0;
1703   else
1704     dfa->eclosures[node] = eclosure;
1705   *new_set = eclosure;
1706   return REG_NOERROR;
1707 }
1708 \f
1709 /* Functions for token which are used in the parser.  */
1710
1711 /* Fetch a token from INPUT.
1712    We must not use this function inside bracket expressions.  */
1713
1714 static void
1715 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1716 {
1717   re_string_skip_bytes (input, peek_token (result, input, syntax));
1718 }
1719
1720 /* Peek a token from INPUT, and return the length of the token.
1721    We must not use this function inside bracket expressions.  */
1722
1723 static int
1724 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1725 {
1726   unsigned char c;
1727
1728   if (re_string_eoi (input))
1729     {
1730       token->type = END_OF_RE;
1731       return 0;
1732     }
1733
1734   c = re_string_peek_byte (input, 0);
1735   token->opr.c = c;
1736
1737   token->word_char = 0;
1738 #ifdef RE_ENABLE_I18N
1739   token->mb_partial = 0;
1740   if (input->mb_cur_max > 1 &&
1741       !re_string_first_byte (input, re_string_cur_idx (input)))
1742     {
1743       token->type = CHARACTER;
1744       token->mb_partial = 1;
1745       return 1;
1746     }
1747 #endif
1748   if (c == '\\')
1749     {
1750       unsigned char c2;
1751       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1752         {
1753           token->type = BACK_SLASH;
1754           return 1;
1755         }
1756
1757       c2 = re_string_peek_byte_case (input, 1);
1758       token->opr.c = c2;
1759       token->type = CHARACTER;
1760 #ifdef RE_ENABLE_I18N
1761       if (input->mb_cur_max > 1)
1762         {
1763           wint_t wc = re_string_wchar_at (input,
1764                                           re_string_cur_idx (input) + 1);
1765           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1766         }
1767       else
1768 #endif
1769         token->word_char = IS_WORD_CHAR (c2) != 0;
1770
1771       switch (c2)
1772         {
1773         case '|':
1774           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1775             token->type = OP_ALT;
1776           break;
1777         case '1': case '2': case '3': case '4': case '5':
1778         case '6': case '7': case '8': case '9':
1779           if (!(syntax & RE_NO_BK_REFS))
1780             {
1781               token->type = OP_BACK_REF;
1782               token->opr.idx = c2 - '1';
1783             }
1784           break;
1785         case '<':
1786           if (!(syntax & RE_NO_GNU_OPS))
1787             {
1788               token->type = ANCHOR;
1789               token->opr.ctx_type = WORD_FIRST;
1790             }
1791           break;
1792         case '>':
1793           if (!(syntax & RE_NO_GNU_OPS))
1794             {
1795               token->type = ANCHOR;
1796               token->opr.ctx_type = WORD_LAST;
1797             }
1798           break;
1799         case 'b':
1800           if (!(syntax & RE_NO_GNU_OPS))
1801             {
1802               token->type = ANCHOR;
1803               token->opr.ctx_type = WORD_DELIM;
1804             }
1805           break;
1806         case 'B':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             {
1809               token->type = ANCHOR;
1810               token->opr.ctx_type = NOT_WORD_DELIM;
1811             }
1812           break;
1813         case 'w':
1814           if (!(syntax & RE_NO_GNU_OPS))
1815             token->type = OP_WORD;
1816           break;
1817         case 'W':
1818           if (!(syntax & RE_NO_GNU_OPS))
1819             token->type = OP_NOTWORD;
1820           break;
1821         case 's':
1822           if (!(syntax & RE_NO_GNU_OPS))
1823             token->type = OP_SPACE;
1824           break;
1825         case 'S':
1826           if (!(syntax & RE_NO_GNU_OPS))
1827             token->type = OP_NOTSPACE;
1828           break;
1829         case '`':
1830           if (!(syntax & RE_NO_GNU_OPS))
1831             {
1832               token->type = ANCHOR;
1833               token->opr.ctx_type = BUF_FIRST;
1834             }
1835           break;
1836         case '\'':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             {
1839               token->type = ANCHOR;
1840               token->opr.ctx_type = BUF_LAST;
1841             }
1842           break;
1843         case '(':
1844           if (!(syntax & RE_NO_BK_PARENS))
1845             token->type = OP_OPEN_SUBEXP;
1846           break;
1847         case ')':
1848           if (!(syntax & RE_NO_BK_PARENS))
1849             token->type = OP_CLOSE_SUBEXP;
1850           break;
1851         case '+':
1852           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1853             token->type = OP_DUP_PLUS;
1854           break;
1855         case '?':
1856           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1857             token->type = OP_DUP_QUESTION;
1858           break;
1859         case '{':
1860           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1861             token->type = OP_OPEN_DUP_NUM;
1862           break;
1863         case '}':
1864           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1865             token->type = OP_CLOSE_DUP_NUM;
1866           break;
1867         default:
1868           break;
1869         }
1870       return 2;
1871     }
1872
1873   token->type = CHARACTER;
1874 #ifdef RE_ENABLE_I18N
1875   if (input->mb_cur_max > 1)
1876     {
1877       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1878       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1879     }
1880   else
1881 #endif
1882     token->word_char = IS_WORD_CHAR (token->opr.c);
1883
1884   switch (c)
1885     {
1886     case '\n':
1887       if (syntax & RE_NEWLINE_ALT)
1888         token->type = OP_ALT;
1889       break;
1890     case '|':
1891       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1892         token->type = OP_ALT;
1893       break;
1894     case '*':
1895       token->type = OP_DUP_ASTERISK;
1896       break;
1897     case '+':
1898       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1899         token->type = OP_DUP_PLUS;
1900       break;
1901     case '?':
1902       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1903         token->type = OP_DUP_QUESTION;
1904       break;
1905     case '{':
1906       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1907         token->type = OP_OPEN_DUP_NUM;
1908       break;
1909     case '}':
1910       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1911         token->type = OP_CLOSE_DUP_NUM;
1912       break;
1913     case '(':
1914       if (syntax & RE_NO_BK_PARENS)
1915         token->type = OP_OPEN_SUBEXP;
1916       break;
1917     case ')':
1918       if (syntax & RE_NO_BK_PARENS)
1919         token->type = OP_CLOSE_SUBEXP;
1920       break;
1921     case '[':
1922       token->type = OP_OPEN_BRACKET;
1923       break;
1924     case '.':
1925       token->type = OP_PERIOD;
1926       break;
1927     case '^':
1928       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1929           re_string_cur_idx (input) != 0)
1930         {
1931           char prev = re_string_peek_byte (input, -1);
1932           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1933             break;
1934         }
1935       token->type = ANCHOR;
1936       token->opr.ctx_type = LINE_FIRST;
1937       break;
1938     case '$':
1939       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1940           re_string_cur_idx (input) + 1 != re_string_length (input))
1941         {
1942           re_token_t next;
1943           re_string_skip_bytes (input, 1);
1944           peek_token (&next, input, syntax);
1945           re_string_skip_bytes (input, -1);
1946           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1947             break;
1948         }
1949       token->type = ANCHOR;
1950       token->opr.ctx_type = LINE_LAST;
1951       break;
1952     default:
1953       break;
1954     }
1955   return 1;
1956 }
1957
1958 /* Peek a token from INPUT, and return the length of the token.
1959    We must not use this function out of bracket expressions.  */
1960
1961 static int
1962 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1963 {
1964   unsigned char c;
1965   if (re_string_eoi (input))
1966     {
1967       token->type = END_OF_RE;
1968       return 0;
1969     }
1970   c = re_string_peek_byte (input, 0);
1971   token->opr.c = c;
1972
1973 #ifdef RE_ENABLE_I18N
1974   if (input->mb_cur_max > 1 &&
1975       !re_string_first_byte (input, re_string_cur_idx (input)))
1976     {
1977       token->type = CHARACTER;
1978       return 1;
1979     }
1980 #endif /* RE_ENABLE_I18N */
1981
1982   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1983       && re_string_cur_idx (input) + 1 < re_string_length (input))
1984     {
1985       /* In this case, '\' escape a character.  */
1986       unsigned char c2;
1987       re_string_skip_bytes (input, 1);
1988       c2 = re_string_peek_byte (input, 0);
1989       token->opr.c = c2;
1990       token->type = CHARACTER;
1991       return 1;
1992     }
1993   if (c == '[') /* '[' is a special char in a bracket exps.  */
1994     {
1995       unsigned char c2;
1996       int token_len;
1997       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1998         c2 = re_string_peek_byte (input, 1);
1999       else
2000         c2 = 0;
2001       token->opr.c = c2;
2002       token_len = 2;
2003       switch (c2)
2004         {
2005         case '.':
2006           token->type = OP_OPEN_COLL_ELEM;
2007           break;
2008         case '=':
2009           token->type = OP_OPEN_EQUIV_CLASS;
2010           break;
2011         case ':':
2012           if (syntax & RE_CHAR_CLASSES)
2013             {
2014               token->type = OP_OPEN_CHAR_CLASS;
2015               break;
2016             }
2017           /* else fall through.  */
2018         default:
2019           token->type = CHARACTER;
2020           token->opr.c = c;
2021           token_len = 1;
2022           break;
2023         }
2024       return token_len;
2025     }
2026   switch (c)
2027     {
2028     case '-':
2029       token->type = OP_CHARSET_RANGE;
2030       break;
2031     case ']':
2032       token->type = OP_CLOSE_BRACKET;
2033       break;
2034     case '^':
2035       token->type = OP_NON_MATCH_LIST;
2036       break;
2037     default:
2038       token->type = CHARACTER;
2039     }
2040   return 1;
2041 }
2042 \f
2043 /* Functions for parser.  */
2044
2045 /* Entry point of the parser.
2046    Parse the regular expression REGEXP and return the structure tree.
2047    If an error is occured, ERR is set by error code, and return NULL.
2048    This function build the following tree, from regular expression <reg_exp>:
2049            CAT
2050            / \
2051           /   \
2052    <reg_exp>  EOR
2053
2054    CAT means concatenation.
2055    EOR means end of regular expression.  */
2056
2057 static bin_tree_t *
2058 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2059        reg_errcode_t *err)
2060 {
2061   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2062   bin_tree_t *tree, *eor, *root;
2063   re_token_t current_token;
2064   dfa->syntax = syntax;
2065   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2066   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2067   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2068     return NULL;
2069   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2070   if (tree != NULL)
2071     root = create_tree (dfa, tree, eor, CONCAT);
2072   else
2073     root = eor;
2074   if (BE (eor == NULL || root == NULL, 0))
2075     {
2076       *err = REG_ESPACE;
2077       return NULL;
2078     }
2079   return root;
2080 }
2081
2082 /* This function build the following tree, from regular expression
2083    <branch1>|<branch2>:
2084            ALT
2085            / \
2086           /   \
2087    <branch1> <branch2>
2088
2089    ALT means alternative, which represents the operator `|'.  */
2090
2091 static bin_tree_t *
2092 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2093                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2094 {
2095   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2096   bin_tree_t *tree, *branch = NULL;
2097   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2098   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2099     return NULL;
2100
2101   while (token->type == OP_ALT)
2102     {
2103       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2104       if (token->type != OP_ALT && token->type != END_OF_RE
2105           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2106         {
2107           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2108           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2109             return NULL;
2110         }
2111       else
2112         branch = NULL;
2113       tree = create_tree (dfa, tree, branch, OP_ALT);
2114       if (BE (tree == NULL, 0))
2115         {
2116           *err = REG_ESPACE;
2117           return NULL;
2118         }
2119     }
2120   return tree;
2121 }
2122
2123 /* This function build the following tree, from regular expression
2124    <exp1><exp2>:
2125         CAT
2126         / \
2127        /   \
2128    <exp1> <exp2>
2129
2130    CAT means concatenation.  */
2131
2132 static bin_tree_t *
2133 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2135 {
2136   bin_tree_t *tree, *exp;
2137   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2139   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2140     return NULL;
2141
2142   while (token->type != OP_ALT && token->type != END_OF_RE
2143          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2144     {
2145       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2146       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2147         {
2148           return NULL;
2149         }
2150       if (tree != NULL && exp != NULL)
2151         {
2152           tree = create_tree (dfa, tree, exp, CONCAT);
2153           if (tree == NULL)
2154             {
2155               *err = REG_ESPACE;
2156               return NULL;
2157             }
2158         }
2159       else if (tree == NULL)
2160         tree = exp;
2161       /* Otherwise exp == NULL, we don't need to create new tree.  */
2162     }
2163   return tree;
2164 }
2165
2166 /* This function build the following tree, from regular expression a*:
2167          *
2168          |
2169          a
2170 */
2171
2172 static bin_tree_t *
2173 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2175 {
2176   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177   bin_tree_t *tree;
2178   switch (token->type)
2179     {
2180     case CHARACTER:
2181       tree = create_token_tree (dfa, NULL, NULL, token);
2182       if (BE (tree == NULL, 0))
2183         {
2184           *err = REG_ESPACE;
2185           return NULL;
2186         }
2187 #ifdef RE_ENABLE_I18N
2188       if (dfa->mb_cur_max > 1)
2189         {
2190           while (!re_string_eoi (regexp)
2191                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2192             {
2193               bin_tree_t *mbc_remain;
2194               fetch_token (token, regexp, syntax);
2195               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2196               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2197               if (BE (mbc_remain == NULL || tree == NULL, 0))
2198                 {
2199                   *err = REG_ESPACE;
2200                   return NULL;
2201                 }
2202             }
2203         }
2204 #endif
2205       break;
2206     case OP_OPEN_SUBEXP:
2207       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2208       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2209         return NULL;
2210       break;
2211     case OP_OPEN_BRACKET:
2212       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2213       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2214         return NULL;
2215       break;
2216     case OP_BACK_REF:
2217       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2218         {
2219           *err = REG_ESUBREG;
2220           return NULL;
2221         }
2222       dfa->used_bkref_map |= 1 << token->opr.idx;
2223       tree = create_token_tree (dfa, NULL, NULL, token);
2224       if (BE (tree == NULL, 0))
2225         {
2226           *err = REG_ESPACE;
2227           return NULL;
2228         }
2229       ++dfa->nbackref;
2230       dfa->has_mb_node = 1;
2231       break;
2232     case OP_OPEN_DUP_NUM:
2233       if (syntax & RE_CONTEXT_INVALID_DUP)
2234         {
2235           *err = REG_BADRPT;
2236           return NULL;
2237         }
2238       /* FALLTHROUGH */
2239     case OP_DUP_ASTERISK:
2240     case OP_DUP_PLUS:
2241     case OP_DUP_QUESTION:
2242       if (syntax & RE_CONTEXT_INVALID_OPS)
2243         {
2244           *err = REG_BADRPT;
2245           return NULL;
2246         }
2247       else if (syntax & RE_CONTEXT_INDEP_OPS)
2248         {
2249           fetch_token (token, regexp, syntax);
2250           return parse_expression (regexp, preg, token, syntax, nest, err);
2251         }
2252       /* else fall through  */
2253     case OP_CLOSE_SUBEXP:
2254       if ((token->type == OP_CLOSE_SUBEXP) &&
2255           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2256         {
2257           *err = REG_ERPAREN;
2258           return NULL;
2259         }
2260       /* else fall through  */
2261     case OP_CLOSE_DUP_NUM:
2262       /* We treat it as a normal character.  */
2263
2264       /* Then we can these characters as normal characters.  */
2265       token->type = CHARACTER;
2266       /* mb_partial and word_char bits should be initialized already
2267          by peek_token.  */
2268       tree = create_token_tree (dfa, NULL, NULL, token);
2269       if (BE (tree == NULL, 0))
2270         {
2271           *err = REG_ESPACE;
2272           return NULL;
2273         }
2274       break;
2275     case ANCHOR:
2276       if ((token->opr.ctx_type
2277            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2278           && dfa->word_ops_used == 0)
2279         init_word_char (dfa);
2280       if (token->opr.ctx_type == WORD_DELIM
2281           || token->opr.ctx_type == NOT_WORD_DELIM)
2282         {
2283           bin_tree_t *tree_first, *tree_last;
2284           if (token->opr.ctx_type == WORD_DELIM)
2285             {
2286               token->opr.ctx_type = WORD_FIRST;
2287               tree_first = create_token_tree (dfa, NULL, NULL, token);
2288               token->opr.ctx_type = WORD_LAST;
2289             }
2290           else
2291             {
2292               token->opr.ctx_type = INSIDE_WORD;
2293               tree_first = create_token_tree (dfa, NULL, NULL, token);
2294               token->opr.ctx_type = INSIDE_NOTWORD;
2295             }
2296           tree_last = create_token_tree (dfa, NULL, NULL, token);
2297           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2298           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2299             {
2300               *err = REG_ESPACE;
2301               return NULL;
2302             }
2303         }
2304       else
2305         {
2306           tree = create_token_tree (dfa, NULL, NULL, token);
2307           if (BE (tree == NULL, 0))
2308             {
2309               *err = REG_ESPACE;
2310               return NULL;
2311             }
2312         }
2313       /* We must return here, since ANCHORs can't be followed
2314          by repetition operators.
2315          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2316              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2317       fetch_token (token, regexp, syntax);
2318       return tree;
2319     case OP_PERIOD:
2320       tree = create_token_tree (dfa, NULL, NULL, token);
2321       if (BE (tree == NULL, 0))
2322         {
2323           *err = REG_ESPACE;
2324           return NULL;
2325         }
2326       if (dfa->mb_cur_max > 1)
2327         dfa->has_mb_node = 1;
2328       break;
2329     case OP_WORD:
2330     case OP_NOTWORD:
2331       tree = build_charclass_op (dfa, regexp->trans,
2332                                  (const unsigned char *) "alnum",
2333                                  (const unsigned char *) "_",
2334                                  token->type == OP_NOTWORD, err);
2335       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2336         return NULL;
2337       break;
2338     case OP_SPACE:
2339     case OP_NOTSPACE:
2340       tree = build_charclass_op (dfa, regexp->trans,
2341                                  (const unsigned char *) "space",
2342                                  (const unsigned char *) "",
2343                                  token->type == OP_NOTSPACE, err);
2344       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2345         return NULL;
2346       break;
2347     case OP_ALT:
2348     case END_OF_RE:
2349       return NULL;
2350     case BACK_SLASH:
2351       *err = REG_EESCAPE;
2352       return NULL;
2353     default:
2354       /* Must not happen?  */
2355 #ifdef DEBUG
2356       assert (0);
2357 #endif
2358       return NULL;
2359     }
2360   fetch_token (token, regexp, syntax);
2361
2362   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2363          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2364     {
2365       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2366       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367         return NULL;
2368       /* In BRE consecutive duplications are not allowed.  */
2369       if ((syntax & RE_CONTEXT_INVALID_DUP)
2370           && (token->type == OP_DUP_ASTERISK
2371               || token->type == OP_OPEN_DUP_NUM))
2372         {
2373           *err = REG_BADRPT;
2374           return NULL;
2375         }
2376     }
2377
2378   return tree;
2379 }
2380
2381 /* This function build the following tree, from regular expression
2382    (<reg_exp>):
2383          SUBEXP
2384             |
2385         <reg_exp>
2386 */
2387
2388 static bin_tree_t *
2389 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2390                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2391 {
2392   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2393   bin_tree_t *tree;
2394   size_t cur_nsub;
2395   cur_nsub = preg->re_nsub++;
2396
2397   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2398
2399   /* The subexpression may be a null string.  */
2400   if (token->type == OP_CLOSE_SUBEXP)
2401     tree = NULL;
2402   else
2403     {
2404       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2405       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2406         *err = REG_EPAREN;
2407       if (BE (*err != REG_NOERROR, 0))
2408         return NULL;
2409     }
2410   dfa->completed_bkref_map |= 1 << cur_nsub;
2411
2412   tree = create_tree (dfa, tree, NULL, SUBEXP);
2413   if (BE (tree == NULL, 0))
2414     {
2415       *err = REG_ESPACE;
2416       return NULL;
2417     }
2418   tree->token.opr.idx = cur_nsub;
2419   return tree;
2420 }
2421
2422 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2423
2424 static bin_tree_t *
2425 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2426               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2427 {
2428   bin_tree_t *tree = NULL, *old_tree = NULL;
2429   int i, start, end, start_idx = re_string_cur_idx (regexp);
2430   re_token_t start_token = *token;
2431
2432   if (token->type == OP_OPEN_DUP_NUM)
2433     {
2434       end = 0;
2435       start = fetch_number (regexp, token, syntax);
2436       if (start == -1)
2437         {
2438           if (token->type == CHARACTER && token->opr.c == ',')
2439             start = 0; /* We treat "{,m}" as "{0,m}".  */
2440           else
2441             {
2442               *err = REG_BADBR; /* <re>{} is invalid.  */
2443               return NULL;
2444             }
2445         }
2446       if (BE (start != -2, 1))
2447         {
2448           /* We treat "{n}" as "{n,n}".  */
2449           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2450                  : ((token->type == CHARACTER && token->opr.c == ',')
2451                     ? fetch_number (regexp, token, syntax) : -2));
2452         }
2453       if (BE (start == -2 || end == -2, 0))
2454         {
2455           /* Invalid sequence.  */
2456           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2457             {
2458               if (token->type == END_OF_RE)
2459                 *err = REG_EBRACE;
2460               else
2461                 *err = REG_BADBR;
2462
2463               return NULL;
2464             }
2465
2466           /* If the syntax bit is set, rollback.  */
2467           re_string_set_index (regexp, start_idx);
2468           *token = start_token;
2469           token->type = CHARACTER;
2470           /* mb_partial and word_char bits should be already initialized by
2471              peek_token.  */
2472           return elem;
2473         }
2474
2475       if (BE (end != -1 && start > end, 0))
2476         {
2477           /* First number greater than second.  */
2478           *err = REG_BADBR;
2479           return NULL;
2480         }
2481     }
2482   else
2483     {
2484       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2485       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2486     }
2487
2488   fetch_token (token, regexp, syntax);
2489
2490   if (BE (elem == NULL, 0))
2491     return NULL;
2492   if (BE (start == 0 && end == 0, 0))
2493     {
2494       postorder (elem, free_tree, NULL);
2495       return NULL;
2496     }
2497
2498   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2499   if (BE (start > 0, 0))
2500     {
2501       tree = elem;
2502       for (i = 2; i <= start; ++i)
2503         {
2504           elem = duplicate_tree (elem, dfa);
2505           tree = create_tree (dfa, tree, elem, CONCAT);
2506           if (BE (elem == NULL || tree == NULL, 0))
2507             goto parse_dup_op_espace;
2508         }
2509
2510       if (start == end)
2511         return tree;
2512
2513       /* Duplicate ELEM before it is marked optional.  */
2514       elem = duplicate_tree (elem, dfa);
2515       old_tree = tree;
2516     }
2517   else
2518     old_tree = NULL;
2519
2520   if (elem->token.type == SUBEXP)
2521     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2522
2523   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2524   if (BE (tree == NULL, 0))
2525     goto parse_dup_op_espace;
2526
2527   /* This loop is actually executed only when end != -1,
2528      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2529      already created the start+1-th copy.  */
2530   for (i = start + 2; i <= end; ++i)
2531     {
2532       elem = duplicate_tree (elem, dfa);
2533       tree = create_tree (dfa, tree, elem, CONCAT);
2534       if (BE (elem == NULL || tree == NULL, 0))
2535         goto parse_dup_op_espace;
2536
2537       tree = create_tree (dfa, tree, NULL, OP_ALT);
2538       if (BE (tree == NULL, 0))
2539         goto parse_dup_op_espace;
2540     }
2541
2542   if (old_tree)
2543     tree = create_tree (dfa, old_tree, tree, CONCAT);
2544
2545   return tree;
2546
2547  parse_dup_op_espace:
2548   *err = REG_ESPACE;
2549   return NULL;
2550 }
2551
2552 /* Size of the names for collating symbol/equivalence_class/character_class.
2553    I'm not sure, but maybe enough.  */
2554 #define BRACKET_NAME_BUF_SIZE 32
2555
2556 #ifndef _LIBC
2557   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2558      Build the range expression which starts from START_ELEM, and ends
2559      at END_ELEM.  The result are written to MBCSET and SBCSET.
2560      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2561      mbcset->range_ends, is a pointer argument sinse we may
2562      update it.  */
2563
2564 static reg_errcode_t
2565 build_range_exp (re_bitset_ptr_t sbcset,
2566 # ifdef RE_ENABLE_I18N
2567                  re_charset_t *mbcset, int *range_alloc,
2568 # endif
2569                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2570 {
2571   unsigned int start_ch, end_ch;
2572   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2573   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2574           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2575           0))
2576     return REG_ERANGE;
2577
2578   /* We can handle no multi character collating elements without libc
2579      support.  */
2580   if (BE ((start_elem->type == COLL_SYM
2581            && strlen ((char *) start_elem->opr.name) > 1)
2582           || (end_elem->type == COLL_SYM
2583               && strlen ((char *) end_elem->opr.name) > 1), 0))
2584     return REG_ECOLLATE;
2585
2586 # ifdef RE_ENABLE_I18N
2587   {
2588     wchar_t wc;
2589     wint_t start_wc, end_wc;
2590     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2591
2592     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2593                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2594                    : 0));
2595     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2596               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2597                  : 0));
2598     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2599                 ? __btowc (start_ch) : start_elem->opr.wch);
2600     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2601               ? __btowc (end_ch) : end_elem->opr.wch);
2602     if (start_wc == WEOF || end_wc == WEOF)
2603       return REG_ECOLLATE;
2604     cmp_buf[0] = start_wc;
2605     cmp_buf[4] = end_wc;
2606     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2607       return REG_ERANGE;
2608
2609     /* Got valid collation sequence values, add them as a new entry.
2610        However, for !_LIBC we have no collation elements: if the
2611        character set is single byte, the single byte character set
2612        that we build below suffices.  parse_bracket_exp passes
2613        no MBCSET if dfa->mb_cur_max == 1.  */
2614     if (mbcset)
2615       {
2616         /* Check the space of the arrays.  */
2617         if (BE (*range_alloc == mbcset->nranges, 0))
2618           {
2619             /* There is not enough space, need realloc.  */
2620             wchar_t *new_array_start, *new_array_end;
2621             int new_nranges;
2622
2623             /* +1 in case of mbcset->nranges is 0.  */
2624             new_nranges = 2 * mbcset->nranges + 1;
2625             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2626                are NULL if *range_alloc == 0.  */
2627             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2628                                           new_nranges);
2629             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2630                                         new_nranges);
2631
2632             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2633               return REG_ESPACE;
2634
2635             mbcset->range_starts = new_array_start;
2636             mbcset->range_ends = new_array_end;
2637             *range_alloc = new_nranges;
2638           }
2639
2640         mbcset->range_starts[mbcset->nranges] = start_wc;
2641         mbcset->range_ends[mbcset->nranges++] = end_wc;
2642       }
2643
2644     /* Build the table for single byte characters.  */
2645     for (wc = 0; wc < SBC_MAX; ++wc)
2646       {
2647         cmp_buf[2] = wc;
2648         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2649             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2650           bitset_set (sbcset, wc);
2651       }
2652   }
2653 # else /* not RE_ENABLE_I18N */
2654   {
2655     unsigned int ch;
2656     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2657                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658                    : 0));
2659     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2660               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661                  : 0));
2662     if (start_ch > end_ch)
2663       return REG_ERANGE;
2664     /* Build the table for single byte characters.  */
2665     for (ch = 0; ch < SBC_MAX; ++ch)
2666       if (start_ch <= ch  && ch <= end_ch)
2667         bitset_set (sbcset, ch);
2668   }
2669 # endif /* not RE_ENABLE_I18N */
2670   return REG_NOERROR;
2671 }
2672 #endif /* not _LIBC */
2673
2674 #ifndef _LIBC
2675 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2676    Build the collating element which is represented by NAME.
2677    The result are written to MBCSET and SBCSET.
2678    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2679    pointer argument since we may update it.  */
2680
2681 static reg_errcode_t
2682 build_collating_symbol (re_bitset_ptr_t sbcset,
2683 # ifdef RE_ENABLE_I18N
2684                         re_charset_t *mbcset, int *coll_sym_alloc,
2685 # endif
2686                         const unsigned char *name)
2687 {
2688   size_t name_len = strlen ((const char *) name);
2689   if (BE (name_len != 1, 0))
2690     return REG_ECOLLATE;
2691   else
2692     {
2693       bitset_set (sbcset, name[0]);
2694       return REG_NOERROR;
2695     }
2696 }
2697 #endif /* not _LIBC */
2698
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2700    "[[.a-a.]]" etc.  */
2701
2702 static bin_tree_t *
2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704                    reg_syntax_t syntax, reg_errcode_t *err)
2705 {
2706 #ifdef _LIBC
2707   const unsigned char *collseqmb;
2708   const char *collseqwc;
2709   uint32_t nrules;
2710   int32_t table_size;
2711   const int32_t *symb_table;
2712   const unsigned char *extra;
2713
2714   /* Local function for parse_bracket_exp used in _LIBC environement.
2715      Seek the collating symbol entry correspondings to NAME.
2716      Return the index of the symbol in the SYMB_TABLE.  */
2717
2718   auto inline int32_t
2719   __attribute ((always_inline))
2720   seek_collating_symbol_entry (name, name_len)
2721          const unsigned char *name;
2722          size_t name_len;
2723     {
2724       int32_t hash = elem_hash ((const char *) name, name_len);
2725       int32_t elem = hash % table_size;
2726       int32_t second = hash % (table_size - 2);
2727       while (symb_table[2 * elem] != 0)
2728         {
2729           /* First compare the hashing value.  */
2730           if (symb_table[2 * elem] == hash
2731               /* Compare the length of the name.  */
2732               && name_len == extra[symb_table[2 * elem + 1]]
2733               /* Compare the name.  */
2734               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2735                          name_len) == 0)
2736             {
2737               /* Yep, this is the entry.  */
2738               break;
2739             }
2740
2741           /* Next entry.  */
2742           elem += second;
2743         }
2744       return elem;
2745     }
2746
2747   /* Local function for parse_bracket_exp used in _LIBC environement.
2748      Look up the collation sequence value of BR_ELEM.
2749      Return the value if succeeded, UINT_MAX otherwise.  */
2750
2751   auto inline unsigned int
2752   __attribute ((always_inline))
2753   lookup_collation_sequence_value (br_elem)
2754          bracket_elem_t *br_elem;
2755     {
2756       if (br_elem->type == SB_CHAR)
2757         {
2758           /*
2759           if (MB_CUR_MAX == 1)
2760           */
2761           if (nrules == 0)
2762             return collseqmb[br_elem->opr.ch];
2763           else
2764             {
2765               wint_t wc = __btowc (br_elem->opr.ch);
2766               return __collseq_table_lookup (collseqwc, wc);
2767             }
2768         }
2769       else if (br_elem->type == MB_CHAR)
2770         {
2771           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2772         }
2773       else if (br_elem->type == COLL_SYM)
2774         {
2775           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2776           if (nrules != 0)
2777             {
2778               int32_t elem, idx;
2779               elem = seek_collating_symbol_entry (br_elem->opr.name,
2780                                                   sym_name_len);
2781               if (symb_table[2 * elem] != 0)
2782                 {
2783                   /* We found the entry.  */
2784                   idx = symb_table[2 * elem + 1];
2785                   /* Skip the name of collating element name.  */
2786                   idx += 1 + extra[idx];
2787                   /* Skip the byte sequence of the collating element.  */
2788                   idx += 1 + extra[idx];
2789                   /* Adjust for the alignment.  */
2790                   idx = (idx + 3) & ~3;
2791                   /* Skip the multibyte collation sequence value.  */
2792                   idx += sizeof (unsigned int);
2793                   /* Skip the wide char sequence of the collating element.  */
2794                   idx += sizeof (unsigned int) *
2795                     (1 + *(unsigned int *) (extra + idx));
2796                   /* Return the collation sequence value.  */
2797                   return *(unsigned int *) (extra + idx);
2798                 }
2799               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2800                 {
2801                   /* No valid character.  Match it as a single byte
2802                      character.  */
2803                   return collseqmb[br_elem->opr.name[0]];
2804                 }
2805             }
2806           else if (sym_name_len == 1)
2807             return collseqmb[br_elem->opr.name[0]];
2808         }
2809       return UINT_MAX;
2810     }
2811
2812   /* Local function for parse_bracket_exp used in _LIBC environement.
2813      Build the range expression which starts from START_ELEM, and ends
2814      at END_ELEM.  The result are written to MBCSET and SBCSET.
2815      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2816      mbcset->range_ends, is a pointer argument sinse we may
2817      update it.  */
2818
2819   auto inline reg_errcode_t
2820   __attribute ((always_inline))
2821   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2822          re_charset_t *mbcset;
2823          int *range_alloc;
2824          re_bitset_ptr_t sbcset;
2825          bracket_elem_t *start_elem, *end_elem;
2826     {
2827       unsigned int ch;
2828       uint32_t start_collseq;
2829       uint32_t end_collseq;
2830
2831       /* Equivalence Classes and Character Classes can't be a range
2832          start/end.  */
2833       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2835               0))
2836         return REG_ERANGE;
2837
2838       start_collseq = lookup_collation_sequence_value (start_elem);
2839       end_collseq = lookup_collation_sequence_value (end_elem);
2840       /* Check start/end collation sequence values.  */
2841       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842         return REG_ECOLLATE;
2843       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2844         return REG_ERANGE;
2845
2846       /* Got valid collation sequence values, add them as a new entry.
2847          However, if we have no collation elements, and the character set
2848          is single byte, the single byte character set that we
2849          build below suffices. */
2850       if (nrules > 0 || dfa->mb_cur_max > 1)
2851         {
2852           /* Check the space of the arrays.  */
2853           if (BE (*range_alloc == mbcset->nranges, 0))
2854             {
2855               /* There is not enough space, need realloc.  */
2856               uint32_t *new_array_start;
2857               uint32_t *new_array_end;
2858               int new_nranges;
2859
2860               /* +1 in case of mbcset->nranges is 0.  */
2861               new_nranges = 2 * mbcset->nranges + 1;
2862               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2863                                             new_nranges);
2864               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2865                                           new_nranges);
2866
2867               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2868                 return REG_ESPACE;
2869
2870               mbcset->range_starts = new_array_start;
2871               mbcset->range_ends = new_array_end;
2872               *range_alloc = new_nranges;
2873             }
2874
2875           mbcset->range_starts[mbcset->nranges] = start_collseq;
2876           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2877         }
2878
2879       /* Build the table for single byte characters.  */
2880       for (ch = 0; ch < SBC_MAX; ch++)
2881         {
2882           uint32_t ch_collseq;
2883           /*
2884           if (MB_CUR_MAX == 1)
2885           */
2886           if (nrules == 0)
2887             ch_collseq = collseqmb[ch];
2888           else
2889             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2890           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2891             bitset_set (sbcset, ch);
2892         }
2893       return REG_NOERROR;
2894     }
2895
2896   /* Local function for parse_bracket_exp used in _LIBC environement.
2897      Build the collating element which is represented by NAME.
2898      The result are written to MBCSET and SBCSET.
2899      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2900      pointer argument sinse we may update it.  */
2901
2902   auto inline reg_errcode_t
2903   __attribute ((always_inline))
2904   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2905          re_charset_t *mbcset;
2906          int *coll_sym_alloc;
2907          re_bitset_ptr_t sbcset;
2908          const unsigned char *name;
2909     {
2910       int32_t elem, idx;
2911       size_t name_len = strlen ((const char *) name);
2912       if (nrules != 0)
2913         {
2914           elem = seek_collating_symbol_entry (name, name_len);
2915           if (symb_table[2 * elem] != 0)
2916             {
2917               /* We found the entry.  */
2918               idx = symb_table[2 * elem + 1];
2919               /* Skip the name of collating element name.  */
2920               idx += 1 + extra[idx];
2921             }
2922           else if (symb_table[2 * elem] == 0 && name_len == 1)
2923             {
2924               /* No valid character, treat it as a normal
2925                  character.  */
2926               bitset_set (sbcset, name[0]);
2927               return REG_NOERROR;
2928             }
2929           else
2930             return REG_ECOLLATE;
2931
2932           /* Got valid collation sequence, add it as a new entry.  */
2933           /* Check the space of the arrays.  */
2934           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2935             {
2936               /* Not enough, realloc it.  */
2937               /* +1 in case of mbcset->ncoll_syms is 0.  */
2938               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2939               /* Use realloc since mbcset->coll_syms is NULL
2940                  if *alloc == 0.  */
2941               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2942                                                    new_coll_sym_alloc);
2943               if (BE (new_coll_syms == NULL, 0))
2944                 return REG_ESPACE;
2945               mbcset->coll_syms = new_coll_syms;
2946               *coll_sym_alloc = new_coll_sym_alloc;
2947             }
2948           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2949           return REG_NOERROR;
2950         }
2951       else
2952         {
2953           if (BE (name_len != 1, 0))
2954             return REG_ECOLLATE;
2955           else
2956             {
2957               bitset_set (sbcset, name[0]);
2958               return REG_NOERROR;
2959             }
2960         }
2961     }
2962 #endif
2963
2964   re_token_t br_token;
2965   re_bitset_ptr_t sbcset;
2966 #ifdef RE_ENABLE_I18N
2967   re_charset_t *mbcset;
2968   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2969   int equiv_class_alloc = 0, char_class_alloc = 0;
2970 #endif /* not RE_ENABLE_I18N */
2971   int non_match = 0;
2972   bin_tree_t *work_tree;
2973   int token_len;
2974   int first_round = 1;
2975 #ifdef _LIBC
2976   collseqmb = (const unsigned char *)
2977     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2978   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2979   if (nrules)
2980     {
2981       /*
2982       if (MB_CUR_MAX > 1)
2983       */
2984         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2985       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2986       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2987                                                   _NL_COLLATE_SYMB_TABLEMB);
2988       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2989                                                    _NL_COLLATE_SYMB_EXTRAMB);
2990     }
2991 #endif
2992   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2993 #ifdef RE_ENABLE_I18N
2994   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2995 #endif /* RE_ENABLE_I18N */
2996 #ifdef RE_ENABLE_I18N
2997   if (BE (sbcset == NULL || mbcset == NULL, 0))
2998 #else
2999   if (BE (sbcset == NULL, 0))
3000 #endif /* RE_ENABLE_I18N */
3001     {
3002       *err = REG_ESPACE;
3003       return NULL;
3004     }
3005
3006   token_len = peek_token_bracket (token, regexp, syntax);
3007   if (BE (token->type == END_OF_RE, 0))
3008     {
3009       *err = REG_BADPAT;
3010       goto parse_bracket_exp_free_return;
3011     }
3012   if (token->type == OP_NON_MATCH_LIST)
3013     {
3014 #ifdef RE_ENABLE_I18N
3015       mbcset->non_match = 1;
3016 #endif /* not RE_ENABLE_I18N */
3017       non_match = 1;
3018       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3019         bitset_set (sbcset, '\0');
3020       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3021       token_len = peek_token_bracket (token, regexp, syntax);
3022       if (BE (token->type == END_OF_RE, 0))
3023         {
3024           *err = REG_BADPAT;
3025           goto parse_bracket_exp_free_return;
3026         }
3027     }
3028
3029   /* We treat the first ']' as a normal character.  */
3030   if (token->type == OP_CLOSE_BRACKET)
3031     token->type = CHARACTER;
3032
3033   while (1)
3034     {
3035       bracket_elem_t start_elem, end_elem;
3036       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3037       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3038       reg_errcode_t ret;
3039       int token_len2 = 0, is_range_exp = 0;
3040       re_token_t token2;
3041
3042       start_elem.opr.name = start_name_buf;
3043       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3044                                    syntax, first_round);
3045       if (BE (ret != REG_NOERROR, 0))
3046         {
3047           *err = ret;
3048           goto parse_bracket_exp_free_return;
3049         }
3050       first_round = 0;
3051
3052       /* Get information about the next token.  We need it in any case.  */
3053       token_len = peek_token_bracket (token, regexp, syntax);
3054
3055       /* Do not check for ranges if we know they are not allowed.  */
3056       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3057         {
3058           if (BE (token->type == END_OF_RE, 0))
3059             {
3060               *err = REG_EBRACK;
3061               goto parse_bracket_exp_free_return;
3062             }
3063           if (token->type == OP_CHARSET_RANGE)
3064             {
3065               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3066               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3067               if (BE (token2.type == END_OF_RE, 0))
3068                 {
3069                   *err = REG_EBRACK;
3070                   goto parse_bracket_exp_free_return;
3071                 }
3072               if (token2.type == OP_CLOSE_BRACKET)
3073                 {
3074                   /* We treat the last '-' as a normal character.  */
3075                   re_string_skip_bytes (regexp, -token_len);
3076                   token->type = CHARACTER;
3077                 }
3078               else
3079                 is_range_exp = 1;
3080             }
3081         }
3082
3083       if (is_range_exp == 1)
3084         {
3085           end_elem.opr.name = end_name_buf;
3086           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3087                                        dfa, syntax, 1);
3088           if (BE (ret != REG_NOERROR, 0))
3089             {
3090               *err = ret;
3091               goto parse_bracket_exp_free_return;
3092             }
3093
3094           token_len = peek_token_bracket (token, regexp, syntax);
3095
3096 #ifdef _LIBC
3097           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3098                                   &start_elem, &end_elem);
3099 #else
3100 # ifdef RE_ENABLE_I18N
3101           *err = build_range_exp (sbcset,
3102                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3103                                   &range_alloc, &start_elem, &end_elem);
3104 # else
3105           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3106 # endif
3107 #endif /* RE_ENABLE_I18N */
3108           if (BE (*err != REG_NOERROR, 0))
3109             goto parse_bracket_exp_free_return;
3110         }
3111       else
3112         {
3113           switch (start_elem.type)
3114             {
3115             case SB_CHAR:
3116               bitset_set (sbcset, start_elem.opr.ch);
3117               break;
3118 #ifdef RE_ENABLE_I18N
3119             case MB_CHAR:
3120               /* Check whether the array has enough space.  */
3121               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3122                 {
3123                   wchar_t *new_mbchars;
3124                   /* Not enough, realloc it.  */
3125                   /* +1 in case of mbcset->nmbchars is 0.  */
3126                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3127                   /* Use realloc since array is NULL if *alloc == 0.  */
3128                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3129                                             mbchar_alloc);
3130                   if (BE (new_mbchars == NULL, 0))
3131                     goto parse_bracket_exp_espace;
3132                   mbcset->mbchars = new_mbchars;
3133                 }
3134               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3135               break;
3136 #endif /* RE_ENABLE_I18N */
3137             case EQUIV_CLASS:
3138               *err = build_equiv_class (sbcset,
3139 #ifdef RE_ENABLE_I18N
3140                                         mbcset, &equiv_class_alloc,
3141 #endif /* RE_ENABLE_I18N */
3142                                         start_elem.opr.name);
3143               if (BE (*err != REG_NOERROR, 0))
3144                 goto parse_bracket_exp_free_return;
3145               break;
3146             case COLL_SYM:
3147               *err = build_collating_symbol (sbcset,
3148 #ifdef RE_ENABLE_I18N
3149                                              mbcset, &coll_sym_alloc,
3150 #endif /* RE_ENABLE_I18N */
3151                                              start_elem.opr.name);
3152               if (BE (*err != REG_NOERROR, 0))
3153                 goto parse_bracket_exp_free_return;
3154               break;
3155             case CHAR_CLASS:
3156               *err = build_charclass (regexp->trans, sbcset,
3157 #ifdef RE_ENABLE_I18N
3158                                       mbcset, &char_class_alloc,
3159 #endif /* RE_ENABLE_I18N */
3160                                       start_elem.opr.name, syntax);
3161               if (BE (*err != REG_NOERROR, 0))
3162                goto parse_bracket_exp_free_return;
3163               break;
3164             default:
3165               assert (0);
3166               break;
3167             }
3168         }
3169       if (BE (token->type == END_OF_RE, 0))
3170         {
3171           *err = REG_EBRACK;
3172           goto parse_bracket_exp_free_return;
3173         }
3174       if (token->type == OP_CLOSE_BRACKET)
3175         break;
3176     }
3177
3178   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3179
3180   /* If it is non-matching list.  */
3181   if (non_match)
3182     bitset_not (sbcset);
3183
3184 #ifdef RE_ENABLE_I18N
3185   /* Ensure only single byte characters are set.  */
3186   if (dfa->mb_cur_max > 1)
3187     bitset_mask (sbcset, dfa->sb_char);
3188
3189   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3190       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3191                                                      || mbcset->non_match)))
3192     {
3193       bin_tree_t *mbc_tree;
3194       int sbc_idx;
3195       /* Build a tree for complex bracket.  */
3196       dfa->has_mb_node = 1;
3197       br_token.type = COMPLEX_BRACKET;
3198       br_token.opr.mbcset = mbcset;
3199       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3200       if (BE (mbc_tree == NULL, 0))
3201         goto parse_bracket_exp_espace;
3202       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3203         if (sbcset[sbc_idx])
3204           break;
3205       /* If there are no bits set in sbcset, there is no point
3206          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3207       if (sbc_idx < BITSET_UINTS)
3208         {
3209           /* Build a tree for simple bracket.  */
3210           br_token.type = SIMPLE_BRACKET;
3211           br_token.opr.sbcset = sbcset;
3212           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3213           if (BE (work_tree == NULL, 0))
3214             goto parse_bracket_exp_espace;
3215
3216           /* Then join them by ALT node.  */
3217           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3218           if (BE (work_tree == NULL, 0))
3219             goto parse_bracket_exp_espace;
3220         }
3221       else
3222         {
3223           re_free (sbcset);
3224           work_tree = mbc_tree;
3225         }
3226     }
3227   else
3228 #endif /* not RE_ENABLE_I18N */
3229     {
3230 #ifdef RE_ENABLE_I18N
3231       free_charset (mbcset);
3232 #endif
3233       /* Build a tree for simple bracket.  */
3234       br_token.type = SIMPLE_BRACKET;
3235       br_token.opr.sbcset = sbcset;
3236       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237       if (BE (work_tree == NULL, 0))
3238         goto parse_bracket_exp_espace;
3239     }
3240   return work_tree;
3241
3242  parse_bracket_exp_espace:
3243   *err = REG_ESPACE;
3244  parse_bracket_exp_free_return:
3245   re_free (sbcset);
3246 #ifdef RE_ENABLE_I18N
3247   free_charset (mbcset);
3248 #endif /* RE_ENABLE_I18N */
3249   return NULL;
3250 }
3251
3252 /* Parse an element in the bracket expression.  */
3253
3254 static reg_errcode_t
3255 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3256                        re_token_t *token, int token_len, re_dfa_t *dfa,
3257                        reg_syntax_t syntax, int accept_hyphen)
3258 {
3259 #ifdef RE_ENABLE_I18N
3260   int cur_char_size;
3261   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3262   if (cur_char_size > 1)
3263     {
3264       elem->type = MB_CHAR;
3265       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3266       re_string_skip_bytes (regexp, cur_char_size);
3267       return REG_NOERROR;
3268     }
3269 #endif /* RE_ENABLE_I18N */
3270   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3271   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3272       || token->type == OP_OPEN_EQUIV_CLASS)
3273     return parse_bracket_symbol (elem, regexp, token);
3274   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3275     {
3276       /* A '-' must only appear as anything but a range indicator before
3277          the closing bracket.  Everything else is an error.  */
3278       re_token_t token2;
3279       (void) peek_token_bracket (&token2, regexp, syntax);
3280       if (token2.type != OP_CLOSE_BRACKET)
3281         /* The actual error value is not standardized since this whole
3282            case is undefined.  But ERANGE makes good sense.  */
3283         return REG_ERANGE;
3284     }
3285   elem->type = SB_CHAR;
3286   elem->opr.ch = token->opr.c;
3287   return REG_NOERROR;
3288 }
3289
3290 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3291    such as [:<character_class>:], [.<collating_element>.], and
3292    [=<equivalent_class>=].  */
3293
3294 static reg_errcode_t
3295 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3296                       re_token_t *token)
3297 {
3298   unsigned char ch, delim = token->opr.c;
3299   int i = 0;
3300   if (re_string_eoi(regexp))
3301     return REG_EBRACK;
3302   for (;; ++i)
3303     {
3304       if (i >= BRACKET_NAME_BUF_SIZE)
3305         return REG_EBRACK;
3306       if (token->type == OP_OPEN_CHAR_CLASS)
3307         ch = re_string_fetch_byte_case (regexp);
3308       else
3309         ch = re_string_fetch_byte (regexp);
3310       if (re_string_eoi(regexp))
3311         return REG_EBRACK;
3312       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3313         break;
3314       elem->opr.name[i] = ch;
3315     }
3316   re_string_skip_bytes (regexp, 1);
3317   elem->opr.name[i] = '\0';
3318   switch (token->type)
3319     {
3320     case OP_OPEN_COLL_ELEM:
3321       elem->type = COLL_SYM;
3322       break;
3323     case OP_OPEN_EQUIV_CLASS:
3324       elem->type = EQUIV_CLASS;
3325       break;
3326     case OP_OPEN_CHAR_CLASS:
3327       elem->type = CHAR_CLASS;
3328       break;
3329     default:
3330       break;
3331     }
3332   return REG_NOERROR;
3333 }
3334
3335   /* Helper function for parse_bracket_exp.
3336      Build the equivalence class which is represented by NAME.
3337      The result are written to MBCSET and SBCSET.
3338      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3339      is a pointer argument sinse we may update it.  */
3340
3341 static reg_errcode_t
3342 build_equiv_class (re_bitset_ptr_t sbcset,
3343 #ifdef RE_ENABLE_I18N
3344                    re_charset_t *mbcset, int *equiv_class_alloc,
3345 #endif
3346                    const unsigned char *name)
3347 {
3348 #if defined _LIBC
3349   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3350   if (nrules != 0)
3351     {
3352       const int32_t *table, *indirect;
3353       const unsigned char *weights, *extra, *cp;
3354       unsigned char char_buf[2];
3355       int32_t idx1, idx2;
3356       unsigned int ch;
3357       size_t len;
3358       /* This #include defines a local function!  */
3359 # include <locale/weight.h>
3360       /* Calculate the index for equivalence class.  */
3361       cp = name;
3362       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3363       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3364                                                _NL_COLLATE_WEIGHTMB);
3365       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3366                                                    _NL_COLLATE_EXTRAMB);
3367       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3368                                                 _NL_COLLATE_INDIRECTMB);
3369       idx1 = findidx (&cp);
3370       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3371         /* This isn't a valid character.  */
3372         return REG_ECOLLATE;
3373
3374       /* Build single byte matcing table for this equivalence class.  */
3375       char_buf[1] = (unsigned char) '\0';
3376       len = weights[idx1];
3377       for (ch = 0; ch < SBC_MAX; ++ch)
3378         {
3379           char_buf[0] = ch;
3380           cp = char_buf;
3381           idx2 = findidx (&cp);
3382 /*
3383           idx2 = table[ch];
3384 */
3385           if (idx2 == 0)
3386             /* This isn't a valid character.  */
3387             continue;
3388           if (len == weights[idx2])
3389             {
3390               int cnt = 0;
3391               while (cnt <= len &&
3392                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3393                 ++cnt;
3394
3395               if (cnt > len)
3396                 bitset_set (sbcset, ch);
3397             }
3398         }
3399       /* Check whether the array has enough space.  */
3400       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3401         {
3402           /* Not enough, realloc it.  */
3403           /* +1 in case of mbcset->nequiv_classes is 0.  */
3404           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3405           /* Use realloc since the array is NULL if *alloc == 0.  */
3406           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3407                                                    int32_t,
3408                                                    new_equiv_class_alloc);
3409           if (BE (new_equiv_classes == NULL, 0))
3410             return REG_ESPACE;
3411           mbcset->equiv_classes = new_equiv_classes;
3412           *equiv_class_alloc = new_equiv_class_alloc;
3413         }
3414       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3415     }
3416   else
3417 #endif /* _LIBC */
3418     {
3419       if (BE (strlen ((const char *) name) != 1, 0))
3420         return REG_ECOLLATE;
3421       bitset_set (sbcset, *name);
3422     }
3423   return REG_NOERROR;
3424 }
3425
3426   /* Helper function for parse_bracket_exp.
3427      Build the character class which is represented by NAME.
3428      The result are written to MBCSET and SBCSET.
3429      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3430      is a pointer argument sinse we may update it.  */
3431
3432 static reg_errcode_t
3433 build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3434 #ifdef RE_ENABLE_I18N
3435                  re_charset_t *mbcset, int *char_class_alloc,
3436 #endif
3437                  const unsigned char *class_name, reg_syntax_t syntax)
3438 {
3439   int i;
3440   const char *name = (const char *) class_name;
3441
3442   /* In case of REG_ICASE "upper" and "lower" match the both of
3443      upper and lower cases.  */
3444   if ((syntax & RE_ICASE)
3445       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3446     name = "alpha";
3447
3448 #ifdef RE_ENABLE_I18N
3449   /* Check the space of the arrays.  */
3450   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3451     {
3452       /* Not enough, realloc it.  */
3453       /* +1 in case of mbcset->nchar_classes is 0.  */
3454       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3455       /* Use realloc since array is NULL if *alloc == 0.  */
3456       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3457                                                new_char_class_alloc);
3458       if (BE (new_char_classes == NULL, 0))
3459         return REG_ESPACE;
3460       mbcset->char_classes = new_char_classes;
3461       *char_class_alloc = new_char_class_alloc;
3462     }
3463   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3464 #endif /* RE_ENABLE_I18N */
3465
3466 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3467     for (i = 0; i < SBC_MAX; ++i)               \
3468       {                                         \
3469         if (ctype_func (i))                     \
3470           {                                     \
3471             int ch = trans ? trans[i] : i;      \
3472             bitset_set (sbcset, ch);            \
3473           }                                     \
3474       }
3475
3476   if (strcmp (name, "alnum") == 0)
3477     BUILD_CHARCLASS_LOOP (isalnum)
3478   else if (strcmp (name, "cntrl") == 0)
3479     BUILD_CHARCLASS_LOOP (iscntrl)
3480   else if (strcmp (name, "lower") == 0)
3481     BUILD_CHARCLASS_LOOP (islower)
3482   else if (strcmp (name, "space") == 0)
3483     BUILD_CHARCLASS_LOOP (isspace)
3484   else if (strcmp (name, "alpha") == 0)
3485     BUILD_CHARCLASS_LOOP (isalpha)
3486   else if (strcmp (name, "digit") == 0)
3487     BUILD_CHARCLASS_LOOP (isdigit)
3488   else if (strcmp (name, "print") == 0)
3489     BUILD_CHARCLASS_LOOP (isprint)
3490   else if (strcmp (name, "upper") == 0)
3491     BUILD_CHARCLASS_LOOP (isupper)
3492   else if (strcmp (name, "blank") == 0)
3493     BUILD_CHARCLASS_LOOP (isblank)
3494   else if (strcmp (name, "graph") == 0)
3495     BUILD_CHARCLASS_LOOP (isgraph)
3496   else if (strcmp (name, "punct") == 0)
3497     BUILD_CHARCLASS_LOOP (ispunct)
3498   else if (strcmp (name, "xdigit") == 0)
3499     BUILD_CHARCLASS_LOOP (isxdigit)
3500   else
3501     return REG_ECTYPE;
3502
3503   return REG_NOERROR;
3504 }
3505
3506 static bin_tree_t *
3507 build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans,
3508                     const unsigned char *class_name,
3509                     const unsigned char *extra,
3510                     int non_match, reg_errcode_t *err)
3511 {
3512   re_bitset_ptr_t sbcset;
3513 #ifdef RE_ENABLE_I18N
3514   re_charset_t *mbcset;
3515   int alloc = 0;
3516 #endif /* not RE_ENABLE_I18N */
3517   reg_errcode_t ret;
3518   re_token_t br_token;
3519   bin_tree_t *tree;
3520
3521   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3522 #ifdef RE_ENABLE_I18N
3523   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3524 #endif /* RE_ENABLE_I18N */
3525
3526 #ifdef RE_ENABLE_I18N
3527   if (BE (sbcset == NULL || mbcset == NULL, 0))
3528 #else /* not RE_ENABLE_I18N */
3529   if (BE (sbcset == NULL, 0))
3530 #endif /* not RE_ENABLE_I18N */
3531     {
3532       *err = REG_ESPACE;
3533       return NULL;
3534     }
3535
3536   if (non_match)
3537     {
3538 #ifdef RE_ENABLE_I18N
3539       /*
3540       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3541         bitset_set(cset->sbcset, '\0');
3542       */
3543       mbcset->non_match = 1;
3544 #endif /* not RE_ENABLE_I18N */
3545     }
3546
3547   /* We don't care the syntax in this case.  */
3548   ret = build_charclass (trans, sbcset,
3549 #ifdef RE_ENABLE_I18N
3550                          mbcset, &alloc,
3551 #endif /* RE_ENABLE_I18N */
3552                          class_name, 0);
3553
3554   if (BE (ret != REG_NOERROR, 0))
3555     {
3556       re_free (sbcset);
3557 #ifdef RE_ENABLE_I18N
3558       free_charset (mbcset);
3559 #endif /* RE_ENABLE_I18N */
3560       *err = ret;
3561       return NULL;
3562     }
3563   /* \w match '_' also.  */
3564   for (; *extra; extra++)
3565     bitset_set (sbcset, *extra);
3566
3567   /* If it is non-matching list.  */
3568   if (non_match)
3569     bitset_not (sbcset);
3570
3571 #ifdef RE_ENABLE_I18N
3572   /* Ensure only single byte characters are set.  */
3573   if (dfa->mb_cur_max > 1)
3574     bitset_mask (sbcset, dfa->sb_char);
3575 #endif
3576
3577   /* Build a tree for simple bracket.  */
3578   br_token.type = SIMPLE_BRACKET;
3579   br_token.opr.sbcset = sbcset;
3580   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581   if (BE (tree == NULL, 0))
3582     goto build_word_op_espace;
3583
3584 #ifdef RE_ENABLE_I18N
3585   if (dfa->mb_cur_max > 1)
3586     {
3587       bin_tree_t *mbc_tree;
3588       /* Build a tree for complex bracket.  */
3589       br_token.type = COMPLEX_BRACKET;
3590       br_token.opr.mbcset = mbcset;
3591       dfa->has_mb_node = 1;
3592       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3593       if (BE (mbc_tree == NULL, 0))
3594         goto build_word_op_espace;
3595       /* Then join them by ALT node.  */
3596       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3597       if (BE (mbc_tree != NULL, 1))
3598         return tree;
3599     }
3600   else
3601     {
3602       free_charset (mbcset);
3603       return tree;
3604     }
3605 #else /* not RE_ENABLE_I18N */
3606   return tree;
3607 #endif /* not RE_ENABLE_I18N */
3608
3609  build_word_op_espace:
3610   re_free (sbcset);
3611 #ifdef RE_ENABLE_I18N
3612   free_charset (mbcset);
3613 #endif /* RE_ENABLE_I18N */
3614   *err = REG_ESPACE;
3615   return NULL;
3616 }
3617
3618 /* This is intended for the expressions like "a{1,3}".
3619    Fetch a number from `input', and return the number.
3620    Return -1, if the number field is empty like "{,1}".
3621    Return -2, If an error is occured.  */
3622
3623 static int
3624 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3625 {
3626   int num = -1;
3627   unsigned char c;
3628   while (1)
3629     {
3630       fetch_token (token, input, syntax);
3631       c = token->opr.c;
3632       if (BE (token->type == END_OF_RE, 0))
3633         return -2;
3634       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3635         break;
3636       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3637              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3638       num = (num > RE_DUP_MAX) ? -2 : num;
3639     }
3640   return num;
3641 }
3642 \f
3643 #ifdef RE_ENABLE_I18N
3644 static void
3645 free_charset (re_charset_t *cset)
3646 {
3647   re_free (cset->mbchars);
3648 # ifdef _LIBC
3649   re_free (cset->coll_syms);
3650   re_free (cset->equiv_classes);
3651   re_free (cset->range_starts);
3652   re_free (cset->range_ends);
3653 # endif
3654   re_free (cset->char_classes);
3655   re_free (cset);
3656 }
3657 #endif /* RE_ENABLE_I18N */
3658 \f
3659 /* Functions for binary tree operation.  */
3660
3661 /* Create a tree node.  */
3662
3663 static bin_tree_t *
3664 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3665              re_token_type_t type)
3666 {
3667   re_token_t t;
3668   t.type = type;
3669   return create_token_tree (dfa, left, right, &t);
3670 }
3671
3672 static bin_tree_t *
3673 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3674                    const re_token_t *token)
3675 {
3676   bin_tree_t *tree;
3677   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3678     {
3679       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3680
3681       if (storage == NULL)
3682         return NULL;
3683       storage->next = dfa->str_tree_storage;
3684       dfa->str_tree_storage = storage;
3685       dfa->str_tree_storage_idx = 0;
3686     }
3687   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3688
3689   tree->parent = NULL;
3690   tree->left = left;
3691   tree->right = right;
3692   tree->token = *token;
3693   tree->token.duplicated = 0;
3694   tree->token.opt_subexp = 0;
3695   tree->first = NULL;
3696   tree->next = NULL;
3697   tree->node_idx = -1;
3698
3699   if (left != NULL)
3700     left->parent = tree;
3701   if (right != NULL)
3702     right->parent = tree;
3703   return tree;
3704 }
3705
3706 /* Mark the tree SRC as an optional subexpression.
3707    To be called from preorder or postorder.  */
3708
3709 static reg_errcode_t
3710 mark_opt_subexp (void *extra, bin_tree_t *node)
3711 {
3712   int idx = (int) (long) extra;
3713   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3714     node->token.opt_subexp = 1;
3715
3716   return REG_NOERROR;
3717 }
3718
3719 /* Free the allocated memory inside NODE. */
3720
3721 static void
3722 free_token (re_token_t *node)
3723 {
3724 #ifdef RE_ENABLE_I18N
3725   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3726     free_charset (node->opr.mbcset);
3727   else
3728 #endif /* RE_ENABLE_I18N */
3729     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3730       re_free (node->opr.sbcset);
3731 }
3732
3733 /* Worker function for tree walking.  Free the allocated memory inside NODE
3734    and its children. */
3735
3736 static reg_errcode_t
3737 free_tree (void *extra, bin_tree_t *node)
3738 {
3739   free_token (&node->token);
3740   return REG_NOERROR;
3741 }
3742
3743
3744 /* Duplicate the node SRC, and return new node.  This is a preorder
3745    visit similar to the one implemented by the generic visitor, but
3746    we need more infrastructure to maintain two parallel trees --- so,
3747    it's easier to duplicate.  */
3748
3749 static bin_tree_t *
3750 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3751 {
3752   const bin_tree_t *node;
3753   bin_tree_t *dup_root;
3754   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3755
3756   for (node = root; ; )
3757     {
3758       /* Create a new tree and link it back to the current parent.  */
3759       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3760       if (*p_new == NULL)
3761         return NULL;
3762       (*p_new)->parent = dup_node;
3763       (*p_new)->token.duplicated = 1;
3764       dup_node = *p_new;
3765
3766       /* Go to the left node, or up and to the right.  */
3767       if (node->left)
3768         {
3769           node = node->left;
3770           p_new = &dup_node->left;
3771         }
3772       else
3773         {
3774           const bin_tree_t *prev = NULL;
3775           while (node->right == prev || node->right == NULL)
3776             {
3777               prev = node;
3778               node = node->parent;
3779               dup_node = dup_node->parent;
3780               if (!node)
3781                 return dup_root;
3782             }
3783           node = node->right;
3784           p_new = &dup_node->right;
3785         }
3786     }
3787 }