lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 static void init_word_char (re_dfa_t *dfa);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  51                                              int top_clone_node, int root_node,
  52                                              unsigned int constraint);
  53 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  54                                      unsigned int constraint);
  55 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static void fetch_token (re_token_t *result, re_string_t *input,
  64                          reg_syntax_t syntax);
  65 static int peek_token (re_token_t *token, re_string_t *input,
  66                         reg_syntax_t syntax);
  67 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  68                                reg_syntax_t syntax);
  69 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  70                           reg_syntax_t syntax, reg_errcode_t *err);
  71 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  75                                  re_token_t *token, reg_syntax_t syntax,
  76                                  int nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  78                                      re_token_t *token, reg_syntax_t syntax,
  79                                      int nest, reg_errcode_t *err);
  80 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  81                                   re_token_t *token, reg_syntax_t syntax,
  82                                   int nest, reg_errcode_t *err);
  83 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  84                                  re_dfa_t *dfa, re_token_t *token,
  85                                  reg_syntax_t syntax, reg_errcode_t *err);
  86 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  87                                       re_token_t *token, reg_syntax_t syntax,
  88                                       reg_errcode_t *err);
  89 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  90                                             re_string_t *regexp,
  91                                             re_token_t *token, int token_len,
  92                                             re_dfa_t *dfa,
  93                                             reg_syntax_t syntax,
  94                                             int accept_hyphen);
  95 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  96                                           re_string_t *regexp,
  97                                           re_token_t *token);
  98 #ifndef _LIBC
  99 # ifdef RE_ENABLE_I18N
 100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 101                                       re_charset_t *mbcset, int *range_alloc,
 102                                       bracket_elem_t *start_elem,
 103                                       bracket_elem_t *end_elem);
 104 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 105                                              re_charset_t *mbcset,
 106                                              int *coll_sym_alloc,
 107                                              const unsigned char *name);
 108 # else /* not RE_ENABLE_I18N */
 109 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 110                                       bracket_elem_t *start_elem,
 111                                       bracket_elem_t *end_elem);
 112 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 113                                              const unsigned char *name);
 114 # endif /* not RE_ENABLE_I18N */
 115 #endif /* not _LIBC */
 116 #ifdef RE_ENABLE_I18N
 117 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 118                                         re_charset_t *mbcset,
 119                                         int *equiv_class_alloc,
 120                                         const unsigned char *name);
 121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 122                                       re_bitset_ptr_t sbcset,
 123                                       re_charset_t *mbcset,
 124                                       int *char_class_alloc,
 125                                       const unsigned char *class_name,
 126                                       reg_syntax_t syntax);
 127 #else  /* not RE_ENABLE_I18N */
 128 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 129                                         const unsigned char *name);
 130 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 131                                       re_bitset_ptr_t sbcset,
 132                                       const unsigned char *class_name,
 133                                       reg_syntax_t syntax);
 134 #endif /* not RE_ENABLE_I18N */
 135 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 136                                        unsigned RE_TRANSLATE_TYPE trans,
 137                                        const unsigned char *class_name,
 138                                        const unsigned char *extra,
 139                                        int non_match, reg_errcode_t *err);
 140 static bin_tree_t *create_tree (re_dfa_t *dfa,
 141                                 bin_tree_t *left, bin_tree_t *right,
 142                                 re_token_type_t type);
 143 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 144                                       bin_tree_t *left, bin_tree_t *right,
 145                                       const re_token_t *token);
 146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 147 static void free_token (re_token_t *node);
 148 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 149 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 150 \f
 151 /* This table gives an error message for each of the error codes listed
 152    in regex.h.  Obviously the order here has to be same as there.
 153    POSIX doesn't require that we do anything for REG_NOERROR,
 154    but why not be nice?  */
 155
 156 const char __re_error_msgid[] attribute_hidden =
 157   {
 158 #define REG_NOERROR_IDX 0
 159     gettext_noop ("Success")    /* REG_NOERROR */
 160     "\0"
 161 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 162     gettext_noop ("No match")   /* REG_NOMATCH */
 163     "\0"
 164 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 165     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 166     "\0"
 167 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 168     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 169     "\0"
 170 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 171     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 172     "\0"
 173 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 174     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 175     "\0"
 176 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 177     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 178     "\0"
 179 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 180     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 181     "\0"
 182 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 183     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 184     "\0"
 185 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 186     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 187     "\0"
 188 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 189     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 190     "\0"
 191 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 192     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 193     "\0"
 194 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 195     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 196     "\0"
 197 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 198     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 199     "\0"
 200 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 201     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 202     "\0"
 203 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 204     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 205     "\0"
 206 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 207     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 208   };
 209
 210 const size_t __re_error_msgid_idx[] attribute_hidden =
 211   {
 212     REG_NOERROR_IDX,
 213     REG_NOMATCH_IDX,
 214     REG_BADPAT_IDX,
 215     REG_ECOLLATE_IDX,
 216     REG_ECTYPE_IDX,
 217     REG_EESCAPE_IDX,
 218     REG_ESUBREG_IDX,
 219     REG_EBRACK_IDX,
 220     REG_EPAREN_IDX,
 221     REG_EBRACE_IDX,
 222     REG_BADBR_IDX,
 223     REG_ERANGE_IDX,
 224     REG_ESPACE_IDX,
 225     REG_BADRPT_IDX,
 226     REG_EEND_IDX,
 227     REG_ESIZE_IDX,
 228     REG_ERPAREN_IDX
 229   };
 230 \f
 231 /* Entry points for GNU code.  */
 232
 233 /* re_compile_pattern is the GNU regular expression compiler: it
 234    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 235    Returns 0 if the pattern was valid, otherwise an error string.
 236
 237    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 238    are set in BUFP on entry.  */
 239
 240 const char *
 241 re_compile_pattern (pattern, length, bufp)
 242     const char *pattern;
 243     size_t length;
 244     struct re_pattern_buffer *bufp;
 245 {
 246   reg_errcode_t ret;
 247
 248   /* And GNU code determines whether or not to get register information
 249      by passing null for the REGS argument to re_match, etc., not by
 250      setting no_sub, unless RE_NO_SUB is set.  */
 251   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 252
 253   /* Match anchors at newline.  */
 254   bufp->newline_anchor = 1;
 255
 256   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 257
 258   if (!ret)
 259     return NULL;
 260   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 261 }
 262 #ifdef _LIBC
 263 weak_alias (__re_compile_pattern, re_compile_pattern)
 264 #endif
 265
 266 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 267    also be assigned to arbitrarily: each pattern buffer stores its own
 268    syntax, so it can be changed between regex compilations.  */
 269 /* This has no initializer because initialized variables in Emacs
 270    become read-only after dumping.  */
 271 reg_syntax_t re_syntax_options;
 272
 273
 274 /* Specify the precise syntax of regexps for compilation.  This provides
 275    for compatibility for various utilities which historically have
 276    different, incompatible syntaxes.
 277
 278    The argument SYNTAX is a bit mask comprised of the various bits
 279    defined in regex.h.  We return the old syntax.  */
 280
 281 reg_syntax_t
 282 re_set_syntax (syntax)
 283     reg_syntax_t syntax;
 284 {
 285   reg_syntax_t ret = re_syntax_options;
 286
 287   re_syntax_options = syntax;
 288   return ret;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_set_syntax, re_set_syntax)
 292 #endif
 293
 294 int
 295 re_compile_fastmap (bufp)
 296     struct re_pattern_buffer *bufp;
 297 {
 298   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 299   char *fastmap = bufp->fastmap;
 300
 301   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 302   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 303   if (dfa->init_state != dfa->init_state_word)
 304     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 305   if (dfa->init_state != dfa->init_state_nl)
 306     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 307   if (dfa->init_state != dfa->init_state_begbuf)
 308     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 309   bufp->fastmap_accurate = 1;
 310   return 0;
 311 }
 312 #ifdef _LIBC
 313 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 314 #endif
 315
 316 static inline void
 317 __attribute ((always_inline))
 318 re_set_fastmap (char *fastmap, int icase, int ch)
 319 {
 320   fastmap[ch] = 1;
 321   if (icase)
 322     fastmap[tolower (ch)] = 1;
 323 }
 324
 325 /* Helper function for re_compile_fastmap.
 326    Compile fastmap for the initial_state INIT_STATE.  */
 327
 328 static void
 329 re_compile_fastmap_iter (bufp, init_state, fastmap)
 330      regex_t *bufp;
 331      const re_dfastate_t *init_state;
 332      char *fastmap;
 333 {
 334   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 335   int node_cnt;
 336   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 337   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 338     {
 339       int node = init_state->nodes.elems[node_cnt];
 340       re_token_type_t type = dfa->nodes[node].type;
 341
 342       if (type == CHARACTER)
 343         {
 344           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 345 #ifdef RE_ENABLE_I18N
 346           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 347             {
 348               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 349               wchar_t wc;
 350               mbstate_t state;
 351
 352               p = buf;
 353               *p++ = dfa->nodes[node].opr.c;
 354               while (++node < dfa->nodes_len
 355                      && dfa->nodes[node].type == CHARACTER
 356                      && dfa->nodes[node].mb_partial)
 357                 *p++ = dfa->nodes[node].opr.c;
 358               memset (&state, 0, sizeof (state));
 359               if (mbrtowc (&wc, (const char *) buf, p - buf,
 360                            &state) == p - buf
 361                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 362                       != (size_t) -1))
 363                 re_set_fastmap (fastmap, 0, buf[0]);
 364             }
 365 #endif
 366         }
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (dfa->mb_cur_max > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 413                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                 {
 416                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 417                       != (size_t) -1)
 418                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *__restrict preg;
 476     const char *__restrict pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *preg;
 539     char *errbuf;
 540     size_t errbuf_size;
 541 {
 542   const char *msg;
 543   size_t msg_size;
 544
 545   if (BE (errcode < 0
 546           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 547                                / sizeof (__re_error_msgid_idx[0])), 0))
 548     /* Only error codes returned by the rest of the code should be passed
 549        to this routine.  If we are given anything else, or if other regex
 550        code generates an invalid error code, then the program has a bug.
 551        Dump core so we can fix it.  */
 552     abort ();
 553
 554   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 555
 556   msg_size = strlen (msg) + 1; /* Includes the null.  */
 557
 558   if (BE (errbuf_size != 0, 1))
 559     {
 560       if (BE (msg_size > errbuf_size, 0))
 561         {
 562 #if defined HAVE_MEMPCPY || defined _LIBC
 563           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 564 #else
 565           memcpy (errbuf, msg, errbuf_size - 1);
 566           errbuf[errbuf_size - 1] = 0;
 567 #endif
 568         }
 569       else
 570         memcpy (errbuf, msg, msg_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.  */
 585 static const bitset utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # if UINT_MAX == 0xffffffff
 589   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 590 # else
 591 #  error "Add case for new unsigned int size"
 592 # endif
 593 };
 594 #endif
 595
 596
 597 static void
 598 free_dfa_content (re_dfa_t *dfa)
 599 {
 600   int i, j;
 601
 602   if (dfa->nodes)
 603     for (i = 0; i < dfa->nodes_len; ++i)
 604       free_token (dfa->nodes + i);
 605   re_free (dfa->nexts);
 606   for (i = 0; i < dfa->nodes_len; ++i)
 607     {
 608       if (dfa->eclosures != NULL)
 609         re_node_set_free (dfa->eclosures + i);
 610       if (dfa->inveclosures != NULL)
 611         re_node_set_free (dfa->inveclosures + i);
 612       if (dfa->edests != NULL)
 613         re_node_set_free (dfa->edests + i);
 614     }
 615   re_free (dfa->edests);
 616   re_free (dfa->eclosures);
 617   re_free (dfa->inveclosures);
 618   re_free (dfa->nodes);
 619
 620   if (dfa->state_table)
 621     for (i = 0; i <= dfa->state_hash_mask; ++i)
 622       {
 623         struct re_state_table_entry *entry = dfa->state_table + i;
 624         for (j = 0; j < entry->num; ++j)
 625           {
 626             re_dfastate_t *state = entry->array[j];
 627             free_state (state);
 628           }
 629         re_free (entry->array);
 630       }
 631   re_free (dfa->state_table);
 632 #ifdef RE_ENABLE_I18N
 633   if (dfa->sb_char != utf8_sb_map)
 634     re_free (dfa->sb_char);
 635 #endif
 636   re_free (dfa->subexp_map);
 637 #ifdef DEBUG
 638   re_free (dfa->re_str);
 639 #endif
 640
 641   re_free (dfa);
 642 }
 643
 644
 645 /* Free dynamically allocated space used by PREG.  */
 646
 647 void
 648 regfree (preg)
 649     regex_t *preg;
 650 {
 651   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 652   if (BE (dfa != NULL, 1))
 653     free_dfa_content (dfa);
 654   preg->buffer = NULL;
 655   preg->allocated = 0;
 656
 657   re_free (preg->fastmap);
 658   preg->fastmap = NULL;
 659
 660   re_free (preg->translate);
 661   preg->translate = NULL;
 662 }
 663 #ifdef _LIBC
 664 weak_alias (__regfree, regfree)
 665 #endif
 666 \f
 667 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 668    them unless specifically requested.  */
 669
 670 #if defined _REGEX_RE_COMP || defined _LIBC
 671
 672 /* BSD has one and only one pattern buffer.  */
 673 static struct re_pattern_buffer re_comp_buf;
 674
 675 char *
 676 # ifdef _LIBC
 677 /* Make these definitions weak in libc, so POSIX programs can redefine
 678    these names if they don't use our functions, and still use
 679    regcomp/regexec above without link errors.  */
 680 weak_function
 681 # endif
 682 re_comp (s)
 683      const char *s;
 684 {
 685   reg_errcode_t ret;
 686   char *fastmap;
 687
 688   if (!s)
 689     {
 690       if (!re_comp_buf.buffer)
 691         return gettext ("No previous regular expression");
 692       return 0;
 693     }
 694
 695   if (re_comp_buf.buffer)
 696     {
 697       fastmap = re_comp_buf.fastmap;
 698       re_comp_buf.fastmap = NULL;
 699       __regfree (&re_comp_buf);
 700       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 701       re_comp_buf.fastmap = fastmap;
 702     }
 703
 704   if (re_comp_buf.fastmap == NULL)
 705     {
 706       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 707       if (re_comp_buf.fastmap == NULL)
 708         return (char *) gettext (__re_error_msgid
 709                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 710     }
 711
 712   /* Since `re_exec' always passes NULL for the `regs' argument, we
 713      don't need to initialize the pattern buffer fields which affect it.  */
 714
 715   /* Match anchors at newlines.  */
 716   re_comp_buf.newline_anchor = 1;
 717
 718   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 719
 720   if (!ret)
 721     return NULL;
 722
 723   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 724   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 725 }
 726
 727 #ifdef _LIBC
 728 libc_freeres_fn (free_mem)
 729 {
 730   __regfree (&re_comp_buf);
 731 }
 732 #endif
 733
 734 #endif /* _REGEX_RE_COMP */
 735 \f
 736 /* Internal entry point.
 737    Compile the regular expression PATTERN, whose length is LENGTH.
 738    SYNTAX indicate regular expression's syntax.  */
 739
 740 static reg_errcode_t
 741 re_compile_internal (preg, pattern, length, syntax)
 742      regex_t *preg;
 743      const char * pattern;
 744      int length;
 745      reg_syntax_t syntax;
 746 {
 747   reg_errcode_t err = REG_NOERROR;
 748   re_dfa_t *dfa;
 749   re_string_t regexp;
 750
 751   /* Initialize the pattern buffer.  */
 752   preg->fastmap_accurate = 0;
 753   preg->syntax = syntax;
 754   preg->not_bol = preg->not_eol = 0;
 755   preg->used = 0;
 756   preg->re_nsub = 0;
 757   preg->can_be_null = 0;
 758   preg->regs_allocated = REGS_UNALLOCATED;
 759
 760   /* Initialize the dfa.  */
 761   dfa = (re_dfa_t *) preg->buffer;
 762   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 763     {
 764       /* If zero allocated, but buffer is non-null, try to realloc
 765          enough space.  This loses if buffer's address is bogus, but
 766          that is the user's responsibility.  If ->buffer is NULL this
 767          is a simple allocation.  */
 768       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 769       if (dfa == NULL)
 770         return REG_ESPACE;
 771       preg->allocated = sizeof (re_dfa_t);
 772       preg->buffer = (unsigned char *) dfa;
 773     }
 774   preg->used = sizeof (re_dfa_t);
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = init_dfa (dfa, length);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781       free_dfa_content (dfa);
 782       preg->buffer = NULL;
 783       preg->allocated = 0;
 784       return err;
 785     }
 786 #ifdef DEBUG
 787   dfa->re_str = re_malloc (char, length + 1);
 788   strncpy (dfa->re_str, pattern, length + 1);
 789 #endif
 790
 791   err = re_string_construct (&regexp, pattern, length, preg->translate,
 792                              syntax & RE_ICASE, dfa);
 793   if (BE (err != REG_NOERROR, 0))
 794     {
 795     re_compile_internal_free_return:
 796       free_workarea_compile (preg);
 797       re_string_destruct (&regexp);
 798       free_dfa_content (dfa);
 799       preg->buffer = NULL;
 800       preg->allocated = 0;
 801       return err;
 802     }
 803
 804   /* Parse the regular expression, and build a structure tree.  */
 805   preg->re_nsub = 0;
 806   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 807   if (BE (dfa->str_tree == NULL, 0))
 808     goto re_compile_internal_free_return;
 809
 810   /* Analyze the tree and create the nfa.  */
 811   err = analyze (preg);
 812   if (BE (err != REG_NOERROR, 0))
 813     goto re_compile_internal_free_return;
 814
 815 #ifdef RE_ENABLE_I18N
 816   /* If possible, do searching in single byte encoding to speed things up.  */
 817   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 818     optimize_utf8 (dfa);
 819 #endif
 820
 821   /* Then create the initial state of the dfa.  */
 822   err = create_initial_state (dfa);
 823
 824   /* Release work areas.  */
 825   free_workarea_compile (preg);
 826   re_string_destruct (&regexp);
 827
 828   if (BE (err != REG_NOERROR, 0))
 829     {
 830       free_dfa_content (dfa);
 831       preg->buffer = NULL;
 832       preg->allocated = 0;
 833     }
 834
 835   return err;
 836 }
 837
 838 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 839    as the initial length of some arrays.  */
 840
 841 static reg_errcode_t
 842 init_dfa (dfa, pat_len)
 843      re_dfa_t *dfa;
 844      int pat_len;
 845 {
 846   int table_size;
 847 #ifndef _LIBC
 848   char *codeset_name;
 849 #endif
 850
 851   memset (dfa, '\0', sizeof (re_dfa_t));
 852
 853   /* Force allocation of str_tree_storage the first time.  */
 854   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 855
 856   dfa->nodes_alloc = pat_len + 1;
 857   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 858
 859   dfa->states_alloc = pat_len + 1;
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; table_size > 0; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877 # ifdef HAVE_LANGINFO_CODESET
 878   codeset_name = nl_langinfo (CODESET);
 879 # else
 880   codeset_name = getenv ("LC_ALL");
 881   if (codeset_name == NULL || codeset_name[0] == '\0')
 882     codeset_name = getenv ("LC_CTYPE");
 883   if (codeset_name == NULL || codeset_name[0] == '\0')
 884     codeset_name = getenv ("LANG");
 885   if (codeset_name == NULL)
 886     codeset_name = "";
 887   else if (strchr (codeset_name, '.') !=  NULL)
 888     codeset_name = strchr (codeset_name, '.') + 1;
 889 # endif
 890
 891   if (strcasecmp (codeset_name, "UTF-8") == 0
 892       || strcasecmp (codeset_name, "UTF8") == 0)
 893     dfa->is_utf8 = 1;
 894
 895   /* We check exhaustively in the loop below if this charset is a
 896      superset of ASCII.  */
 897   dfa->map_notascii = 0;
 898 #endif
 899
 900 #ifdef RE_ENABLE_I18N
 901   if (dfa->mb_cur_max > 1)
 902     {
 903       if (dfa->is_utf8)
 904         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 905       else
 906         {
 907           int i, j, ch;
 908
 909           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 910           if (BE (dfa->sb_char == NULL, 0))
 911             return REG_ESPACE;
 912
 913           /* Clear all bits by, then set those corresponding to single
 914              byte chars.  */
 915           bitset_empty (dfa->sb_char);
 916
 917           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 918             for (j = 0; j < UINT_BITS; ++j, ++ch)
 919               {
 920                 wchar_t wch = __btowc (ch);
 921                 if (wch != WEOF)
 922                   dfa->sb_char[i] |= 1 << j;
 923 # ifndef _LIBC
 924                 if (isascii (ch) && wch != (wchar_t) ch)
 925                   dfa->map_notascii = 1;
 926 # endif
 927               }
 928         }
 929     }
 930 #endif
 931
 932   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 933     return REG_ESPACE;
 934   return REG_NOERROR;
 935 }
 936
 937 /* Initialize WORD_CHAR table, which indicate which character is
 938    "word".  In this case "word" means that it is the word construction
 939    character used by some operators like "\<", "\>", etc.  */
 940
 941 static void
 942 init_word_char (dfa)
 943      re_dfa_t *dfa;
 944 {
 945   int i, j, ch;
 946   dfa->word_ops_used = 1;
 947   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 948     for (j = 0; j < UINT_BITS; ++j, ++ch)
 949       if (isalnum (ch) || ch == '_')
 950         dfa->word_char[i] |= 1 << j;
 951 }
 952
 953 /* Free the work area which are only used while compiling.  */
 954
 955 static void
 956 free_workarea_compile (preg)
 957      regex_t *preg;
 958 {
 959   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 960   bin_tree_storage_t *storage, *next;
 961   for (storage = dfa->str_tree_storage; storage; storage = next)
 962     {
 963       next = storage->next;
 964       re_free (storage);
 965     }
 966   dfa->str_tree_storage = NULL;
 967   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 968   dfa->str_tree = NULL;
 969   re_free (dfa->org_indices);
 970   dfa->org_indices = NULL;
 971 }
 972
 973 /* Create initial states for all contexts.  */
 974
 975 static reg_errcode_t
 976 create_initial_state (dfa)
 977      re_dfa_t *dfa;
 978 {
 979   int first, i;
 980   reg_errcode_t err;
 981   re_node_set init_nodes;
 982
 983   /* Initial states have the epsilon closure of the node which is
 984      the first node of the regular expression.  */
 985   first = dfa->str_tree->first->node_idx;
 986   dfa->init_node = first;
 987   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 988   if (BE (err != REG_NOERROR, 0))
 989     return err;
 990
 991   /* The back-references which are in initial states can epsilon transit,
 992      since in this case all of the subexpressions can be null.
 993      Then we add epsilon closures of the nodes which are the next nodes of
 994      the back-references.  */
 995   if (dfa->nbackref > 0)
 996     for (i = 0; i < init_nodes.nelem; ++i)
 997       {
 998         int node_idx = init_nodes.elems[i];
 999         re_token_type_t type = dfa->nodes[node_idx].type;
1000
1001         int clexp_idx;
1002         if (type != OP_BACK_REF)
1003           continue;
1004         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005           {
1006             re_token_t *clexp_node;
1007             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1008             if (clexp_node->type == OP_CLOSE_SUBEXP
1009                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1010               break;
1011           }
1012         if (clexp_idx == init_nodes.nelem)
1013           continue;
1014
1015         if (type == OP_BACK_REF)
1016           {
1017             int dest_idx = dfa->edests[node_idx].elems[0];
1018             if (!re_node_set_contains (&init_nodes, dest_idx))
1019               {
1020                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1021                 i = 0;
1022               }
1023           }
1024       }
1025
1026   /* It must be the first time to invoke acquire_state.  */
1027   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1028   /* We don't check ERR here, since the initial state must not be NULL.  */
1029   if (BE (dfa->init_state == NULL, 0))
1030     return err;
1031   if (dfa->init_state->has_constraint)
1032     {
1033       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1034                                                        CONTEXT_WORD);
1035       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                      CONTEXT_NEWLINE);
1037       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1038                                                          &init_nodes,
1039                                                          CONTEXT_NEWLINE
1040                                                          | CONTEXT_BEGBUF);
1041       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1042               || dfa->init_state_begbuf == NULL, 0))
1043         return err;
1044     }
1045   else
1046     dfa->init_state_word = dfa->init_state_nl
1047       = dfa->init_state_begbuf = dfa->init_state;
1048
1049   re_node_set_free (&init_nodes);
1050   return REG_NOERROR;
1051 }
1052 \f
1053 #ifdef RE_ENABLE_I18N
1054 /* If it is possible to do searching in single byte encoding instead of UTF-8
1055    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1056    DFA nodes where needed.  */
1057
1058 static void
1059 optimize_utf8 (dfa)
1060      re_dfa_t *dfa;
1061 {
1062   int node, i, mb_chars = 0, has_period = 0;
1063
1064   for (node = 0; node < dfa->nodes_len; ++node)
1065     switch (dfa->nodes[node].type)
1066       {
1067       case CHARACTER:
1068         if (dfa->nodes[node].opr.c >= 0x80)
1069           mb_chars = 1;
1070         break;
1071       case ANCHOR:
1072         switch (dfa->nodes[node].opr.idx)
1073           {
1074           case LINE_FIRST:
1075           case LINE_LAST:
1076           case BUF_FIRST:
1077           case BUF_LAST:
1078             break;
1079           default:
1080             /* Word anchors etc. cannot be handled.  */
1081             return;
1082           }
1083         break;
1084       case OP_PERIOD:
1085         has_period = 1;
1086         break;
1087       case OP_BACK_REF:
1088       case OP_ALT:
1089       case END_OF_RE:
1090       case OP_DUP_ASTERISK:
1091       case OP_OPEN_SUBEXP:
1092       case OP_CLOSE_SUBEXP:
1093         break;
1094       case COMPLEX_BRACKET:
1095         return;
1096       case SIMPLE_BRACKET:
1097         /* Just double check.  */
1098         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1099           if (dfa->nodes[node].opr.sbcset[i])
1100             return;
1101         break;
1102       default:
1103         abort ();
1104       }
1105
1106   if (mb_chars || has_period)
1107     for (node = 0; node < dfa->nodes_len; ++node)
1108       {
1109         if (dfa->nodes[node].type == CHARACTER
1110             && dfa->nodes[node].opr.c >= 0x80)
1111           dfa->nodes[node].mb_partial = 0;
1112         else if (dfa->nodes[node].type == OP_PERIOD)
1113           dfa->nodes[node].type = OP_UTF8_PERIOD;
1114       }
1115
1116   /* The search can be in single byte locale.  */
1117   dfa->mb_cur_max = 1;
1118   dfa->is_utf8 = 0;
1119   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1120 }
1121 #endif
1122 \f
1123 /* Analyze the structure tree, and calculate "first", "next", "edest",
1124    "eclosure", and "inveclosure".  */
1125
1126 static reg_errcode_t
1127 analyze (preg)
1128      regex_t *preg;
1129 {
1130   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1131   reg_errcode_t ret;
1132
1133   /* Allocate arrays.  */
1134   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1135   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1136   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1137   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1138   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1139           || dfa->eclosures == NULL, 0))
1140     return REG_ESPACE;
1141
1142   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1143   if (dfa->subexp_map != NULL)
1144     {
1145       int i;
1146       for (i = 0; i < preg->re_nsub; i++)
1147         dfa->subexp_map[i] = i;
1148       preorder (dfa->str_tree, optimize_subexps, dfa);
1149       for (i = 0; i < preg->re_nsub; i++)
1150         if (dfa->subexp_map[i] != i)
1151           break;
1152       if (i == preg->re_nsub)
1153         {
1154           free (dfa->subexp_map);
1155           dfa->subexp_map = NULL;
1156         }
1157     }
1158
1159   ret = postorder (dfa->str_tree, lower_subexps, preg);
1160   if (BE (ret != REG_NOERROR, 0))
1161     return ret;
1162   ret = postorder (dfa->str_tree, calc_first, dfa);
1163   if (BE (ret != REG_NOERROR, 0))
1164     return ret;
1165   preorder (dfa->str_tree, calc_next, dfa);
1166   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1167   if (BE (ret != REG_NOERROR, 0))
1168     return ret;
1169   ret = calc_eclosure (dfa);
1170   if (BE (ret != REG_NOERROR, 0))
1171     return ret;
1172
1173   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1174      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1175   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1176       || dfa->nbackref)
1177     {
1178       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1179       if (BE (dfa->inveclosures == NULL, 0))
1180         return REG_ESPACE;
1181       ret = calc_inveclosure (dfa);
1182     }
1183
1184   return ret;
1185 }
1186
1187 /* Our parse trees are very unbalanced, so we cannot use a stack to
1188    implement parse tree visits.  Instead, we use parent pointers and
1189    some hairy code in these two functions.  */
1190 static reg_errcode_t
1191 postorder (root, fn, extra)
1192      bin_tree_t *root;
1193      reg_errcode_t (fn (void *, bin_tree_t *));
1194      void *extra;
1195 {
1196   bin_tree_t *node, *prev;
1197
1198   for (node = root; ; )
1199     {
1200       /* Descend down the tree, preferably to the left (or to the right
1201          if that's the only child).  */
1202       while (node->left || node->right)
1203         if (node->left)
1204           node = node->left;
1205         else
1206           node = node->right;
1207
1208       do
1209         {
1210           reg_errcode_t err = fn (extra, node);
1211           if (BE (err != REG_NOERROR, 0))
1212             return err;
1213           if (node->parent == NULL)
1214             return REG_NOERROR;
1215           prev = node;
1216           node = node->parent;
1217         }
1218       /* Go up while we have a node that is reached from the right.  */
1219       while (node->right == prev || node->right == NULL);
1220       node = node->right;
1221     }
1222 }
1223
1224 static reg_errcode_t
1225 preorder (root, fn, extra)
1226      bin_tree_t *root;
1227      reg_errcode_t (fn (void *, bin_tree_t *));
1228      void *extra;
1229 {
1230   bin_tree_t *node;
1231
1232   for (node = root; ; )
1233     {
1234       reg_errcode_t err = fn (extra, node);
1235       if (BE (err != REG_NOERROR, 0))
1236         return err;
1237
1238       /* Go to the left node, or up and to the right.  */
1239       if (node->left)
1240         node = node->left;
1241       else
1242         {
1243           bin_tree_t *prev = NULL;
1244           while (node->right == prev || node->right == NULL)
1245             {
1246               prev = node;
1247               node = node->parent;
1248               if (!node)
1249                 return REG_NOERROR;
1250             }
1251           node = node->right;
1252         }
1253     }
1254 }
1255
1256 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1257    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1258    backreferences as well.  Requires a preorder visit.  */
1259 static reg_errcode_t
1260 optimize_subexps (extra, node)
1261      void *extra;
1262      bin_tree_t *node;
1263 {
1264   re_dfa_t *dfa = (re_dfa_t *) extra;
1265
1266   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1267     {
1268       int idx = node->token.opr.idx;
1269       node->token.opr.idx = dfa->subexp_map[idx];
1270       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1271     }
1272
1273   else if (node->token.type == SUBEXP
1274            && node->left && node->left->token.type == SUBEXP)
1275     {
1276       int other_idx = node->left->token.opr.idx;
1277
1278       node->left = node->left->left;
1279       if (node->left)
1280         node->left->parent = node;
1281
1282       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1283       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1284         dfa->used_bkref_map &= ~(1 << other_idx);
1285     }
1286
1287   return REG_NOERROR;
1288 }
1289
1290 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1291    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1292 static reg_errcode_t
1293 lower_subexps (extra, node)
1294      void *extra;
1295      bin_tree_t *node;
1296 {
1297   regex_t *preg = (regex_t *) extra;
1298   reg_errcode_t err = REG_NOERROR;
1299
1300   if (node->left && node->left->token.type == SUBEXP)
1301     {
1302       node->left = lower_subexp (&err, preg, node->left);
1303       if (node->left)
1304         node->left->parent = node;
1305     }
1306   if (node->right && node->right->token.type == SUBEXP)
1307     {
1308       node->right = lower_subexp (&err, preg, node->right);
1309       if (node->right)
1310         node->right->parent = node;
1311     }
1312
1313   return err;
1314 }
1315
1316 static bin_tree_t *
1317 lower_subexp (err, preg, node)
1318      reg_errcode_t *err;
1319      regex_t *preg;
1320      bin_tree_t *node;
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323   bin_tree_t *body = node->left;
1324   bin_tree_t *op, *cls, *tree1, *tree;
1325
1326   if (preg->no_sub
1327       /* We do not optimize empty subexpressions, because otherwise we may
1328          have bad CONCAT nodes with NULL children.  This is obviously not
1329          very common, so we do not lose much.  An example that triggers
1330          this case is the sed "script" /\(\)/x.  */
1331       && node->left != NULL
1332       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1333           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1334     return node->left;
1335
1336   /* Convert the SUBEXP node to the concatenation of an
1337      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1338   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341   tree = create_tree (dfa, op, tree1, CONCAT);
1342   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1343     {
1344       *err = REG_ESPACE;
1345       return NULL;
1346     }
1347
1348   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350   return tree;
1351 }
1352
1353 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354    nodes.  Requires a postorder visit.  */
1355 static reg_errcode_t
1356 calc_first (extra, node)
1357      void *extra;
1358      bin_tree_t *node;
1359 {
1360   re_dfa_t *dfa = (re_dfa_t *) extra;
1361   if (node->token.type == CONCAT)
1362     {
1363       node->first = node->left->first;
1364       node->node_idx = node->left->node_idx;
1365     }
1366   else
1367     {
1368       node->first = node;
1369       node->node_idx = re_dfa_add_node (dfa, node->token);
1370       if (BE (node->node_idx == -1, 0))
1371         return REG_ESPACE;
1372     }
1373   return REG_NOERROR;
1374 }
1375
1376 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1377 static reg_errcode_t
1378 calc_next (extra, node)
1379      void *extra;
1380      bin_tree_t *node;
1381 {
1382   switch (node->token.type)
1383     {
1384     case OP_DUP_ASTERISK:
1385       node->left->next = node;
1386       break;
1387     case CONCAT:
1388       node->left->next = node->right->first;
1389       node->right->next = node->next;
1390       break;
1391     default:
1392       if (node->left)
1393         node->left->next = node->next;
1394       if (node->right)
1395         node->right->next = node->next;
1396       break;
1397     }
1398   return REG_NOERROR;
1399 }
1400
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1402 static reg_errcode_t
1403 link_nfa_nodes (extra, node)
1404      void *extra;
1405      bin_tree_t *node;
1406 {
1407   re_dfa_t *dfa = (re_dfa_t *) extra;
1408   int idx = node->node_idx;
1409   reg_errcode_t err = REG_NOERROR;
1410
1411   switch (node->token.type)
1412     {
1413     case CONCAT:
1414       break;
1415
1416     case END_OF_RE:
1417       assert (node->next == NULL);
1418       break;
1419
1420     case OP_DUP_ASTERISK:
1421     case OP_ALT:
1422       {
1423         int left, right;
1424         dfa->has_plural_match = 1;
1425         if (node->left != NULL)
1426           left = node->left->first->node_idx;
1427         else
1428           left = node->next->node_idx;
1429         if (node->right != NULL)
1430           right = node->right->first->node_idx;
1431         else
1432           right = node->next->node_idx;
1433         assert (left > -1);
1434         assert (right > -1);
1435         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436       }
1437       break;
1438
1439     case ANCHOR:
1440     case OP_OPEN_SUBEXP:
1441     case OP_CLOSE_SUBEXP:
1442       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443       break;
1444
1445     case OP_BACK_REF:
1446       dfa->nexts[idx] = node->next->node_idx;
1447       if (node->token.type == OP_BACK_REF)
1448         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449       break;
1450
1451     default:
1452       assert (!IS_EPSILON_NODE (node->token.type));
1453       dfa->nexts[idx] = node->next->node_idx;
1454       break;
1455     }
1456
1457   return err;
1458 }
1459
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462    to their own constraint.  */
1463
1464 static reg_errcode_t
1465 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1466                         init_constraint)
1467      re_dfa_t *dfa;
1468      int top_org_node, top_clone_node, root_node;
1469      unsigned int init_constraint;
1470 {
1471   reg_errcode_t err;
1472   int org_node, clone_node, ret;
1473   unsigned int constraint = init_constraint;
1474   for (org_node = top_org_node, clone_node = top_clone_node;;)
1475     {
1476       int org_dest, clone_dest;
1477       if (dfa->nodes[org_node].type == OP_BACK_REF)
1478         {
1479           /* If the back reference epsilon-transit, its destination must
1480              also have the constraint.  Then duplicate the epsilon closure
1481              of the destination of the back reference, and store it in
1482              edests of the back reference.  */
1483           org_dest = dfa->nexts[org_node];
1484           re_node_set_empty (dfa->edests + clone_node);
1485           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1486           if (BE (err != REG_NOERROR, 0))
1487             return err;
1488           dfa->nexts[clone_node] = dfa->nexts[org_node];
1489           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1490           if (BE (ret < 0, 0))
1491             return REG_ESPACE;
1492         }
1493       else if (dfa->edests[org_node].nelem == 0)
1494         {
1495           /* In case of the node can't epsilon-transit, don't duplicate the
1496              destination and store the original destination as the
1497              destination of the node.  */
1498           dfa->nexts[clone_node] = dfa->nexts[org_node];
1499           break;
1500         }
1501       else if (dfa->edests[org_node].nelem == 1)
1502         {
1503           /* In case of the node can epsilon-transit, and it has only one
1504              destination.  */
1505           org_dest = dfa->edests[org_node].elems[0];
1506           re_node_set_empty (dfa->edests + clone_node);
1507           if (dfa->nodes[org_node].type == ANCHOR)
1508             {
1509               /* In case of the node has another constraint, append it.  */
1510               if (org_node == root_node && clone_node != org_node)
1511                 {
1512                   /* ...but if the node is root_node itself, it means the
1513                      epsilon closure have a loop, then tie it to the
1514                      destination of the root_node.  */
1515                   ret = re_node_set_insert (dfa->edests + clone_node,
1516                                             org_dest);
1517                   if (BE (ret < 0, 0))
1518                     return REG_ESPACE;
1519                   break;
1520                 }
1521               constraint |= dfa->nodes[org_node].opr.ctx_type;
1522             }
1523           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1524           if (BE (err != REG_NOERROR, 0))
1525             return err;
1526           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527           if (BE (ret < 0, 0))
1528             return REG_ESPACE;
1529         }
1530       else /* dfa->edests[org_node].nelem == 2 */
1531         {
1532           /* In case of the node can epsilon-transit, and it has two
1533              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1534           org_dest = dfa->edests[org_node].elems[0];
1535           re_node_set_empty (dfa->edests + clone_node);
1536           /* Search for a duplicated node which satisfies the constraint.  */
1537           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1538           if (clone_dest == -1)
1539             {
1540               /* There are no such a duplicated node, create a new one.  */
1541               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1542               if (BE (err != REG_NOERROR, 0))
1543                 return err;
1544               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545               if (BE (ret < 0, 0))
1546                 return REG_ESPACE;
1547               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548                                             root_node, constraint);
1549               if (BE (err != REG_NOERROR, 0))
1550                 return err;
1551             }
1552           else
1553             {
1554               /* There are a duplicated node which satisfy the constraint,
1555                  use it to avoid infinite loop.  */
1556               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557               if (BE (ret < 0, 0))
1558                 return REG_ESPACE;
1559             }
1560
1561           org_dest = dfa->edests[org_node].elems[1];
1562           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1563           if (BE (err != REG_NOERROR, 0))
1564             return err;
1565           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (ret < 0, 0))
1567             return REG_ESPACE;
1568         }
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577
1578 static int
1579 search_duplicated_node (dfa, org_node, constraint)
1580      re_dfa_t *dfa;
1581      int org_node;
1582      unsigned int constraint;
1583 {
1584   int idx;
1585   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586     {
1587       if (org_node == dfa->org_indices[idx]
1588           && constraint == dfa->nodes[idx].constraint)
1589         return idx; /* Found.  */
1590     }
1591   return -1; /* Not found.  */
1592 }
1593
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1596    otherwise return the error code.  */
1597
1598 static reg_errcode_t
1599 duplicate_node (new_idx, dfa, org_idx, constraint)
1600      re_dfa_t *dfa;
1601      int *new_idx, org_idx;
1602      unsigned int constraint;
1603 {
1604   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1605   if (BE (dup_idx == -1, 0))
1606     return REG_ESPACE;
1607   dfa->nodes[dup_idx].constraint = constraint;
1608   if (dfa->nodes[org_idx].type == ANCHOR)
1609     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1610   dfa->nodes[dup_idx].duplicated = 1;
1611
1612   /* Store the index of the original node.  */
1613   dfa->org_indices[dup_idx] = org_idx;
1614   *new_idx = dup_idx;
1615   return REG_NOERROR;
1616 }
1617
1618 static reg_errcode_t
1619 calc_inveclosure (dfa)
1620      re_dfa_t *dfa;
1621 {
1622   int src, idx, ret;
1623   for (idx = 0; idx < dfa->nodes_len; ++idx)
1624     re_node_set_init_empty (dfa->inveclosures + idx);
1625
1626   for (src = 0; src < dfa->nodes_len; ++src)
1627     {
1628       int *elems = dfa->eclosures[src].elems;
1629       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630         {
1631           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632           if (BE (ret == -1, 0))
1633             return REG_ESPACE;
1634         }
1635     }
1636
1637   return REG_NOERROR;
1638 }
1639
1640 /* Calculate "eclosure" for all the node in DFA.  */
1641
1642 static reg_errcode_t
1643 calc_eclosure (dfa)
1644      re_dfa_t *dfa;
1645 {
1646   int node_idx, incomplete;
1647 #ifdef DEBUG
1648   assert (dfa->nodes_len > 0);
1649 #endif
1650   incomplete = 0;
1651   /* For each nodes, calculate epsilon closure.  */
1652   for (node_idx = 0; ; ++node_idx)
1653     {
1654       reg_errcode_t err;
1655       re_node_set eclosure_elem;
1656       if (node_idx == dfa->nodes_len)
1657         {
1658           if (!incomplete)
1659             break;
1660           incomplete = 0;
1661           node_idx = 0;
1662         }
1663
1664 #ifdef DEBUG
1665       assert (dfa->eclosures[node_idx].nelem != -1);
1666 #endif
1667
1668       /* If we have already calculated, skip it.  */
1669       if (dfa->eclosures[node_idx].nelem != 0)
1670         continue;
1671       /* Calculate epsilon closure of `node_idx'.  */
1672       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1673       if (BE (err != REG_NOERROR, 0))
1674         return err;
1675
1676       if (dfa->eclosures[node_idx].nelem == 0)
1677         {
1678           incomplete = 1;
1679           re_node_set_free (&eclosure_elem);
1680         }
1681     }
1682   return REG_NOERROR;
1683 }
1684
1685 /* Calculate epsilon closure of NODE.  */
1686
1687 static reg_errcode_t
1688 calc_eclosure_iter (new_set, dfa, node, root)
1689      re_node_set *new_set;
1690      re_dfa_t *dfa;
1691      int node, root;
1692 {
1693   reg_errcode_t err;
1694   unsigned int constraint;
1695   int i, incomplete;
1696   re_node_set eclosure;
1697   incomplete = 0;
1698   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1699   if (BE (err != REG_NOERROR, 0))
1700     return err;
1701
1702   /* This indicates that we are calculating this node now.
1703      We reference this value to avoid infinite loop.  */
1704   dfa->eclosures[node].nelem = -1;
1705
1706   constraint = ((dfa->nodes[node].type == ANCHOR)
1707                 ? dfa->nodes[node].opr.ctx_type : 0);
1708   /* If the current node has constraints, duplicate all nodes.
1709      Since they must inherit the constraints.  */
1710   if (constraint
1711       && dfa->edests[node].nelem
1712       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1713     {
1714       int org_node, cur_node;
1715       org_node = cur_node = node;
1716       err = duplicate_node_closure (dfa, node, node, node, constraint);
1717       if (BE (err != REG_NOERROR, 0))
1718         return err;
1719     }
1720
1721   /* Expand each epsilon destination nodes.  */
1722   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1723     for (i = 0; i < dfa->edests[node].nelem; ++i)
1724       {
1725         re_node_set eclosure_elem;
1726         int edest = dfa->edests[node].elems[i];
1727         /* If calculating the epsilon closure of `edest' is in progress,
1728            return intermediate result.  */
1729         if (dfa->eclosures[edest].nelem == -1)
1730           {
1731             incomplete = 1;
1732             continue;
1733           }
1734         /* If we haven't calculated the epsilon closure of `edest' yet,
1735            calculate now. Otherwise use calculated epsilon closure.  */
1736         if (dfa->eclosures[edest].nelem == 0)
1737           {
1738             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1739             if (BE (err != REG_NOERROR, 0))
1740               return err;
1741           }
1742         else
1743           eclosure_elem = dfa->eclosures[edest];
1744         /* Merge the epsilon closure of `edest'.  */
1745         re_node_set_merge (&eclosure, &eclosure_elem);
1746         /* If the epsilon closure of `edest' is incomplete,
1747            the epsilon closure of this node is also incomplete.  */
1748         if (dfa->eclosures[edest].nelem == 0)
1749           {
1750             incomplete = 1;
1751             re_node_set_free (&eclosure_elem);
1752           }
1753       }
1754
1755   /* Epsilon closures include itself.  */
1756   re_node_set_insert (&eclosure, node);
1757   if (incomplete && !root)
1758     dfa->eclosures[node].nelem = 0;
1759   else
1760     dfa->eclosures[node] = eclosure;
1761   *new_set = eclosure;
1762   return REG_NOERROR;
1763 }
1764 \f
1765 /* Functions for token which are used in the parser.  */
1766
1767 /* Fetch a token from INPUT.
1768    We must not use this function inside bracket expressions.  */
1769
1770 static void
1771 fetch_token (result, input, syntax)
1772      re_token_t *result;
1773      re_string_t *input;
1774      reg_syntax_t syntax;
1775 {
1776   re_string_skip_bytes (input, peek_token (result, input, syntax));
1777 }
1778
1779 /* Peek a token from INPUT, and return the length of the token.
1780    We must not use this function inside bracket expressions.  */
1781
1782 static int
1783 peek_token (token, input, syntax)
1784      re_token_t *token;
1785      re_string_t *input;
1786      reg_syntax_t syntax;
1787 {
1788   unsigned char c;
1789
1790   if (re_string_eoi (input))
1791     {
1792       token->type = END_OF_RE;
1793       return 0;
1794     }
1795
1796   c = re_string_peek_byte (input, 0);
1797   token->opr.c = c;
1798
1799   token->word_char = 0;
1800 #ifdef RE_ENABLE_I18N
1801   token->mb_partial = 0;
1802   if (input->mb_cur_max > 1 &&
1803       !re_string_first_byte (input, re_string_cur_idx (input)))
1804     {
1805       token->type = CHARACTER;
1806       token->mb_partial = 1;
1807       return 1;
1808     }
1809 #endif
1810   if (c == '\\')
1811     {
1812       unsigned char c2;
1813       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1814         {
1815           token->type = BACK_SLASH;
1816           return 1;
1817         }
1818
1819       c2 = re_string_peek_byte_case (input, 1);
1820       token->opr.c = c2;
1821       token->type = CHARACTER;
1822 #ifdef RE_ENABLE_I18N
1823       if (input->mb_cur_max > 1)
1824         {
1825           wint_t wc = re_string_wchar_at (input,
1826                                           re_string_cur_idx (input) + 1);
1827           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1828         }
1829       else
1830 #endif
1831         token->word_char = IS_WORD_CHAR (c2) != 0;
1832
1833       switch (c2)
1834         {
1835         case '|':
1836           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1837             token->type = OP_ALT;
1838           break;
1839         case '1': case '2': case '3': case '4': case '5':
1840         case '6': case '7': case '8': case '9':
1841           if (!(syntax & RE_NO_BK_REFS))
1842             {
1843               token->type = OP_BACK_REF;
1844               token->opr.idx = c2 - '1';
1845             }
1846           break;
1847         case '<':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = WORD_FIRST;
1852             }
1853           break;
1854         case '>':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = WORD_LAST;
1859             }
1860           break;
1861         case 'b':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             {
1864               token->type = ANCHOR;
1865               token->opr.ctx_type = WORD_DELIM;
1866             }
1867           break;
1868         case 'B':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             {
1871               token->type = ANCHOR;
1872               token->opr.ctx_type = NOT_WORD_DELIM;
1873             }
1874           break;
1875         case 'w':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_WORD;
1878           break;
1879         case 'W':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_NOTWORD;
1882           break;
1883         case 's':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             token->type = OP_SPACE;
1886           break;
1887         case 'S':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             token->type = OP_NOTSPACE;
1890           break;
1891         case '`':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = BUF_FIRST;
1896             }
1897           break;
1898         case '\'':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = BUF_LAST;
1903             }
1904           break;
1905         case '(':
1906           if (!(syntax & RE_NO_BK_PARENS))
1907             token->type = OP_OPEN_SUBEXP;
1908           break;
1909         case ')':
1910           if (!(syntax & RE_NO_BK_PARENS))
1911             token->type = OP_CLOSE_SUBEXP;
1912           break;
1913         case '+':
1914           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915             token->type = OP_DUP_PLUS;
1916           break;
1917         case '?':
1918           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1919             token->type = OP_DUP_QUESTION;
1920           break;
1921         case '{':
1922           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923             token->type = OP_OPEN_DUP_NUM;
1924           break;
1925         case '}':
1926           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1927             token->type = OP_CLOSE_DUP_NUM;
1928           break;
1929         default:
1930           break;
1931         }
1932       return 2;
1933     }
1934
1935   token->type = CHARACTER;
1936 #ifdef RE_ENABLE_I18N
1937   if (input->mb_cur_max > 1)
1938     {
1939       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1940       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1941     }
1942   else
1943 #endif
1944     token->word_char = IS_WORD_CHAR (token->opr.c);
1945
1946   switch (c)
1947     {
1948     case '\n':
1949       if (syntax & RE_NEWLINE_ALT)
1950         token->type = OP_ALT;
1951       break;
1952     case '|':
1953       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1954         token->type = OP_ALT;
1955       break;
1956     case '*':
1957       token->type = OP_DUP_ASTERISK;
1958       break;
1959     case '+':
1960       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961         token->type = OP_DUP_PLUS;
1962       break;
1963     case '?':
1964       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1965         token->type = OP_DUP_QUESTION;
1966       break;
1967     case '{':
1968       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969         token->type = OP_OPEN_DUP_NUM;
1970       break;
1971     case '}':
1972       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1973         token->type = OP_CLOSE_DUP_NUM;
1974       break;
1975     case '(':
1976       if (syntax & RE_NO_BK_PARENS)
1977         token->type = OP_OPEN_SUBEXP;
1978       break;
1979     case ')':
1980       if (syntax & RE_NO_BK_PARENS)
1981         token->type = OP_CLOSE_SUBEXP;
1982       break;
1983     case '[':
1984       token->type = OP_OPEN_BRACKET;
1985       break;
1986     case '.':
1987       token->type = OP_PERIOD;
1988       break;
1989     case '^':
1990       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1991           re_string_cur_idx (input) != 0)
1992         {
1993           char prev = re_string_peek_byte (input, -1);
1994           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1995             break;
1996         }
1997       token->type = ANCHOR;
1998       token->opr.ctx_type = LINE_FIRST;
1999       break;
2000     case '$':
2001       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2002           re_string_cur_idx (input) + 1 != re_string_length (input))
2003         {
2004           re_token_t next;
2005           re_string_skip_bytes (input, 1);
2006           peek_token (&next, input, syntax);
2007           re_string_skip_bytes (input, -1);
2008           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2009             break;
2010         }
2011       token->type = ANCHOR;
2012       token->opr.ctx_type = LINE_LAST;
2013       break;
2014     default:
2015       break;
2016     }
2017   return 1;
2018 }
2019
2020 /* Peek a token from INPUT, and return the length of the token.
2021    We must not use this function out of bracket expressions.  */
2022
2023 static int
2024 peek_token_bracket (token, input, syntax)
2025      re_token_t *token;
2026      re_string_t *input;
2027      reg_syntax_t syntax;
2028 {
2029   unsigned char c;
2030   if (re_string_eoi (input))
2031     {
2032       token->type = END_OF_RE;
2033       return 0;
2034     }
2035   c = re_string_peek_byte (input, 0);
2036   token->opr.c = c;
2037
2038 #ifdef RE_ENABLE_I18N
2039   if (input->mb_cur_max > 1 &&
2040       !re_string_first_byte (input, re_string_cur_idx (input)))
2041     {
2042       token->type = CHARACTER;
2043       return 1;
2044     }
2045 #endif /* RE_ENABLE_I18N */
2046
2047   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2048       && re_string_cur_idx (input) + 1 < re_string_length (input))
2049     {
2050       /* In this case, '\' escape a character.  */
2051       unsigned char c2;
2052       re_string_skip_bytes (input, 1);
2053       c2 = re_string_peek_byte (input, 0);
2054       token->opr.c = c2;
2055       token->type = CHARACTER;
2056       return 1;
2057     }
2058   if (c == '[') /* '[' is a special char in a bracket exps.  */
2059     {
2060       unsigned char c2;
2061       int token_len;
2062       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2063         c2 = re_string_peek_byte (input, 1);
2064       else
2065         c2 = 0;
2066       token->opr.c = c2;
2067       token_len = 2;
2068       switch (c2)
2069         {
2070         case '.':
2071           token->type = OP_OPEN_COLL_ELEM;
2072           break;
2073         case '=':
2074           token->type = OP_OPEN_EQUIV_CLASS;
2075           break;
2076         case ':':
2077           if (syntax & RE_CHAR_CLASSES)
2078             {
2079               token->type = OP_OPEN_CHAR_CLASS;
2080               break;
2081             }
2082           /* else fall through.  */
2083         default:
2084           token->type = CHARACTER;
2085           token->opr.c = c;
2086           token_len = 1;
2087           break;
2088         }
2089       return token_len;
2090     }
2091   switch (c)
2092     {
2093     case '-':
2094       token->type = OP_CHARSET_RANGE;
2095       break;
2096     case ']':
2097       token->type = OP_CLOSE_BRACKET;
2098       break;
2099     case '^':
2100       token->type = OP_NON_MATCH_LIST;
2101       break;
2102     default:
2103       token->type = CHARACTER;
2104     }
2105   return 1;
2106 }
2107 \f
2108 /* Functions for parser.  */
2109
2110 /* Entry point of the parser.
2111    Parse the regular expression REGEXP and return the structure tree.
2112    If an error is occured, ERR is set by error code, and return NULL.
2113    This function build the following tree, from regular expression <reg_exp>:
2114            CAT
2115            / \
2116           /   \
2117    <reg_exp>  EOR
2118
2119    CAT means concatenation.
2120    EOR means end of regular expression.  */
2121
2122 static bin_tree_t *
2123 parse (regexp, preg, syntax, err)
2124      re_string_t *regexp;
2125      regex_t *preg;
2126      reg_syntax_t syntax;
2127      reg_errcode_t *err;
2128 {
2129   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130   bin_tree_t *tree, *eor, *root;
2131   re_token_t current_token;
2132   dfa->syntax = syntax;
2133   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2134   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2135   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2136     return NULL;
2137   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2138   if (tree != NULL)
2139     root = create_tree (dfa, tree, eor, CONCAT);
2140   else
2141     root = eor;
2142   if (BE (eor == NULL || root == NULL, 0))
2143     {
2144       *err = REG_ESPACE;
2145       return NULL;
2146     }
2147   return root;
2148 }
2149
2150 /* This function build the following tree, from regular expression
2151    <branch1>|<branch2>:
2152            ALT
2153            / \
2154           /   \
2155    <branch1> <branch2>
2156
2157    ALT means alternative, which represents the operator `|'.  */
2158
2159 static bin_tree_t *
2160 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2161      re_string_t *regexp;
2162      regex_t *preg;
2163      re_token_t *token;
2164      reg_syntax_t syntax;
2165      int nest;
2166      reg_errcode_t *err;
2167 {
2168   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2169   bin_tree_t *tree, *branch = NULL;
2170   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2171   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2172     return NULL;
2173
2174   while (token->type == OP_ALT)
2175     {
2176       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2177       if (token->type != OP_ALT && token->type != END_OF_RE
2178           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2179         {
2180           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2181           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2182             return NULL;
2183         }
2184       else
2185         branch = NULL;
2186       tree = create_tree (dfa, tree, branch, OP_ALT);
2187       if (BE (tree == NULL, 0))
2188         {
2189           *err = REG_ESPACE;
2190           return NULL;
2191         }
2192     }
2193   return tree;
2194 }
2195
2196 /* This function build the following tree, from regular expression
2197    <exp1><exp2>:
2198         CAT
2199         / \
2200        /   \
2201    <exp1> <exp2>
2202
2203    CAT means concatenation.  */
2204
2205 static bin_tree_t *
2206 parse_branch (regexp, preg, token, syntax, nest, err)
2207      re_string_t *regexp;
2208      regex_t *preg;
2209      re_token_t *token;
2210      reg_syntax_t syntax;
2211      int nest;
2212      reg_errcode_t *err;
2213 {
2214   bin_tree_t *tree, *exp;
2215   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2216   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2217   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2218     return NULL;
2219
2220   while (token->type != OP_ALT && token->type != END_OF_RE
2221          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2222     {
2223       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2224       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2225         {
2226           return NULL;
2227         }
2228       if (tree != NULL && exp != NULL)
2229         {
2230           tree = create_tree (dfa, tree, exp, CONCAT);
2231           if (tree == NULL)
2232             {
2233               *err = REG_ESPACE;
2234               return NULL;
2235             }
2236         }
2237       else if (tree == NULL)
2238         tree = exp;
2239       /* Otherwise exp == NULL, we don't need to create new tree.  */
2240     }
2241   return tree;
2242 }
2243
2244 /* This function build the following tree, from regular expression a*:
2245          *
2246          |
2247          a
2248 */
2249
2250 static bin_tree_t *
2251 parse_expression (regexp, preg, token, syntax, nest, err)
2252      re_string_t *regexp;
2253      regex_t *preg;
2254      re_token_t *token;
2255      reg_syntax_t syntax;
2256      int nest;
2257      reg_errcode_t *err;
2258 {
2259   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2260   bin_tree_t *tree;
2261   switch (token->type)
2262     {
2263     case CHARACTER:
2264       tree = create_token_tree (dfa, NULL, NULL, token);
2265       if (BE (tree == NULL, 0))
2266         {
2267           *err = REG_ESPACE;
2268           return NULL;
2269         }
2270 #ifdef RE_ENABLE_I18N
2271       if (dfa->mb_cur_max > 1)
2272         {
2273           while (!re_string_eoi (regexp)
2274                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2275             {
2276               bin_tree_t *mbc_remain;
2277               fetch_token (token, regexp, syntax);
2278               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2279               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2280               if (BE (mbc_remain == NULL || tree == NULL, 0))
2281                 {
2282                   *err = REG_ESPACE;
2283                   return NULL;
2284                 }
2285             }
2286         }
2287 #endif
2288       break;
2289     case OP_OPEN_SUBEXP:
2290       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2291       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2292         return NULL;
2293       break;
2294     case OP_OPEN_BRACKET:
2295       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2296       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2297         return NULL;
2298       break;
2299     case OP_BACK_REF:
2300       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2301         {
2302           *err = REG_ESUBREG;
2303           return NULL;
2304         }
2305       dfa->used_bkref_map |= 1 << token->opr.idx;
2306       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if (BE (tree == NULL, 0))
2308         {
2309           *err = REG_ESPACE;
2310           return NULL;
2311         }
2312       ++dfa->nbackref;
2313       dfa->has_mb_node = 1;
2314       break;
2315     case OP_OPEN_DUP_NUM:
2316       if (syntax & RE_CONTEXT_INVALID_DUP)
2317         {
2318           *err = REG_BADRPT;
2319           return NULL;
2320         }
2321       /* FALLTHROUGH */
2322     case OP_DUP_ASTERISK:
2323     case OP_DUP_PLUS:
2324     case OP_DUP_QUESTION:
2325       if (syntax & RE_CONTEXT_INVALID_OPS)
2326         {
2327           *err = REG_BADRPT;
2328           return NULL;
2329         }
2330       else if (syntax & RE_CONTEXT_INDEP_OPS)
2331         {
2332           fetch_token (token, regexp, syntax);
2333           return parse_expression (regexp, preg, token, syntax, nest, err);
2334         }
2335       /* else fall through  */
2336     case OP_CLOSE_SUBEXP:
2337       if ((token->type == OP_CLOSE_SUBEXP) &&
2338           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2339         {
2340           *err = REG_ERPAREN;
2341           return NULL;
2342         }
2343       /* else fall through  */
2344     case OP_CLOSE_DUP_NUM:
2345       /* We treat it as a normal character.  */
2346
2347       /* Then we can these characters as normal characters.  */
2348       token->type = CHARACTER;
2349       /* mb_partial and word_char bits should be initialized already
2350          by peek_token.  */
2351       tree = create_token_tree (dfa, NULL, NULL, token);
2352       if (BE (tree == NULL, 0))
2353         {
2354           *err = REG_ESPACE;
2355           return NULL;
2356         }
2357       break;
2358     case ANCHOR:
2359       if ((token->opr.ctx_type
2360            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2361           && dfa->word_ops_used == 0)
2362         init_word_char (dfa);
2363       if (token->opr.ctx_type == WORD_DELIM
2364           || token->opr.ctx_type == NOT_WORD_DELIM)
2365         {
2366           bin_tree_t *tree_first, *tree_last;
2367           if (token->opr.ctx_type == WORD_DELIM)
2368             {
2369               token->opr.ctx_type = WORD_FIRST;
2370               tree_first = create_token_tree (dfa, NULL, NULL, token);
2371               token->opr.ctx_type = WORD_LAST;
2372             }
2373           else
2374             {
2375               token->opr.ctx_type = INSIDE_WORD;
2376               tree_first = create_token_tree (dfa, NULL, NULL, token);
2377               token->opr.ctx_type = INSIDE_NOTWORD;
2378             }
2379           tree_last = create_token_tree (dfa, NULL, NULL, token);
2380           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2381           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2382             {
2383               *err = REG_ESPACE;
2384               return NULL;
2385             }
2386         }
2387       else
2388         {
2389           tree = create_token_tree (dfa, NULL, NULL, token);
2390           if (BE (tree == NULL, 0))
2391             {
2392               *err = REG_ESPACE;
2393               return NULL;
2394             }
2395         }
2396       /* We must return here, since ANCHORs can't be followed
2397          by repetition operators.
2398          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2399              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2400       fetch_token (token, regexp, syntax);
2401       return tree;
2402     case OP_PERIOD:
2403       tree = create_token_tree (dfa, NULL, NULL, token);
2404       if (BE (tree == NULL, 0))
2405         {
2406           *err = REG_ESPACE;
2407           return NULL;
2408         }
2409       if (dfa->mb_cur_max > 1)
2410         dfa->has_mb_node = 1;
2411       break;
2412     case OP_WORD:
2413     case OP_NOTWORD:
2414       tree = build_charclass_op (dfa, regexp->trans,
2415                                  (const unsigned char *) "alnum",
2416                                  (const unsigned char *) "_",
2417                                  token->type == OP_NOTWORD, err);
2418       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419         return NULL;
2420       break;
2421     case OP_SPACE:
2422     case OP_NOTSPACE:
2423       tree = build_charclass_op (dfa, regexp->trans,
2424                                  (const unsigned char *) "space",
2425                                  (const unsigned char *) "",
2426                                  token->type == OP_NOTSPACE, err);
2427       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2428         return NULL;
2429       break;
2430     case OP_ALT:
2431     case END_OF_RE:
2432       return NULL;
2433     case BACK_SLASH:
2434       *err = REG_EESCAPE;
2435       return NULL;
2436     default:
2437       /* Must not happen?  */
2438 #ifdef DEBUG
2439       assert (0);
2440 #endif
2441       return NULL;
2442     }
2443   fetch_token (token, regexp, syntax);
2444
2445   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2446          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2447     {
2448       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2449       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2450         return NULL;
2451       /* In BRE consecutive duplications are not allowed.  */
2452       if ((syntax & RE_CONTEXT_INVALID_DUP)
2453           && (token->type == OP_DUP_ASTERISK
2454               || token->type == OP_OPEN_DUP_NUM))
2455         {
2456           *err = REG_BADRPT;
2457           return NULL;
2458         }
2459     }
2460
2461   return tree;
2462 }
2463
2464 /* This function build the following tree, from regular expression
2465    (<reg_exp>):
2466          SUBEXP
2467             |
2468         <reg_exp>
2469 */
2470
2471 static bin_tree_t *
2472 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2473      re_string_t *regexp;
2474      regex_t *preg;
2475      re_token_t *token;
2476      reg_syntax_t syntax;
2477      int nest;
2478      reg_errcode_t *err;
2479 {
2480   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2481   bin_tree_t *tree;
2482   size_t cur_nsub;
2483   cur_nsub = preg->re_nsub++;
2484
2485   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2486
2487   /* The subexpression may be a null string.  */
2488   if (token->type == OP_CLOSE_SUBEXP)
2489     tree = NULL;
2490   else
2491     {
2492       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2493       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2494         *err = REG_EPAREN;
2495       if (BE (*err != REG_NOERROR, 0))
2496         return NULL;
2497     }
2498   dfa->completed_bkref_map |= 1 << cur_nsub;
2499
2500   tree = create_tree (dfa, tree, NULL, SUBEXP);
2501   if (BE (tree == NULL, 0))
2502     {
2503       *err = REG_ESPACE;
2504       return NULL;
2505     }
2506   tree->token.opr.idx = cur_nsub;
2507   return tree;
2508 }
2509
2510 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2511
2512 static bin_tree_t *
2513 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2514      bin_tree_t *elem;
2515      re_string_t *regexp;
2516      re_dfa_t *dfa;
2517      re_token_t *token;
2518      reg_syntax_t syntax;
2519      reg_errcode_t *err;
2520 {
2521   bin_tree_t *tree = NULL, *old_tree = NULL;
2522   int i, start, end, start_idx = re_string_cur_idx (regexp);
2523   re_token_t start_token = *token;
2524
2525   if (token->type == OP_OPEN_DUP_NUM)
2526     {
2527       end = 0;
2528       start = fetch_number (regexp, token, syntax);
2529       if (start == -1)
2530         {
2531           if (token->type == CHARACTER && token->opr.c == ',')
2532             start = 0; /* We treat "{,m}" as "{0,m}".  */
2533           else
2534             {
2535               *err = REG_BADBR; /* <re>{} is invalid.  */
2536               return NULL;
2537             }
2538         }
2539       if (BE (start != -2, 1))
2540         {
2541           /* We treat "{n}" as "{n,n}".  */
2542           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2543                  : ((token->type == CHARACTER && token->opr.c == ',')
2544                     ? fetch_number (regexp, token, syntax) : -2));
2545         }
2546       if (BE (start == -2 || end == -2, 0))
2547         {
2548           /* Invalid sequence.  */
2549           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2550             {
2551               if (token->type == END_OF_RE)
2552                 *err = REG_EBRACE;
2553               else
2554                 *err = REG_BADBR;
2555
2556               return NULL;
2557             }
2558
2559           /* If the syntax bit is set, rollback.  */
2560           re_string_set_index (regexp, start_idx);
2561           *token = start_token;
2562           token->type = CHARACTER;
2563           /* mb_partial and word_char bits should be already initialized by
2564              peek_token.  */
2565           return elem;
2566         }
2567
2568       if (BE (end != -1 && start > end, 0))
2569         {
2570           /* First number greater than second.  */
2571           *err = REG_BADBR;
2572           return NULL;
2573         }
2574     }
2575   else
2576     {
2577       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2578       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2579     }
2580
2581   fetch_token (token, regexp, syntax);
2582
2583   if (BE (elem == NULL, 0))
2584     return NULL;
2585   if (BE (start == 0 && end == 0, 0))
2586     {
2587       postorder (elem, free_tree, NULL);
2588       return NULL;
2589     }
2590
2591   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2592   if (BE (start > 0, 0))
2593     {
2594       tree = elem;
2595       for (i = 2; i <= start; ++i)
2596         {
2597           elem = duplicate_tree (elem, dfa);
2598           tree = create_tree (dfa, tree, elem, CONCAT);
2599           if (BE (elem == NULL || tree == NULL, 0))
2600             goto parse_dup_op_espace;
2601         }
2602
2603       if (start == end)
2604         return tree;
2605
2606       /* Duplicate ELEM before it is marked optional.  */
2607       elem = duplicate_tree (elem, dfa);
2608       old_tree = tree;
2609     }
2610   else
2611     old_tree = NULL;
2612
2613   if (elem->token.type == SUBEXP)
2614     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2615
2616   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2617   if (BE (tree == NULL, 0))
2618     goto parse_dup_op_espace;
2619
2620   /* This loop is actually executed only when end != -1,
2621      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2622      already created the start+1-th copy.  */
2623   for (i = start + 2; i <= end; ++i)
2624     {
2625       elem = duplicate_tree (elem, dfa);
2626       tree = create_tree (dfa, tree, elem, CONCAT);
2627       if (BE (elem == NULL || tree == NULL, 0))
2628         goto parse_dup_op_espace;
2629
2630       tree = create_tree (dfa, tree, NULL, OP_ALT);
2631       if (BE (tree == NULL, 0))
2632         goto parse_dup_op_espace;
2633     }
2634
2635   if (old_tree)
2636     tree = create_tree (dfa, old_tree, tree, CONCAT);
2637
2638   return tree;
2639
2640  parse_dup_op_espace:
2641   *err = REG_ESPACE;
2642   return NULL;
2643 }
2644
2645 /* Size of the names for collating symbol/equivalence_class/character_class.
2646    I'm not sure, but maybe enough.  */
2647 #define BRACKET_NAME_BUF_SIZE 32
2648
2649 #ifndef _LIBC
2650   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2651      Build the range expression which starts from START_ELEM, and ends
2652      at END_ELEM.  The result are written to MBCSET and SBCSET.
2653      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2654      mbcset->range_ends, is a pointer argument sinse we may
2655      update it.  */
2656
2657 static reg_errcode_t
2658 # ifdef RE_ENABLE_I18N
2659 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2660      re_charset_t *mbcset;
2661      int *range_alloc;
2662 # else /* not RE_ENABLE_I18N */
2663 build_range_exp (sbcset, start_elem, end_elem)
2664 # endif /* not RE_ENABLE_I18N */
2665      re_bitset_ptr_t sbcset;
2666      bracket_elem_t *start_elem, *end_elem;
2667 {
2668   unsigned int start_ch, end_ch;
2669   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2670   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2671           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2672           0))
2673     return REG_ERANGE;
2674
2675   /* We can handle no multi character collating elements without libc
2676      support.  */
2677   if (BE ((start_elem->type == COLL_SYM
2678            && strlen ((char *) start_elem->opr.name) > 1)
2679           || (end_elem->type == COLL_SYM
2680               && strlen ((char *) end_elem->opr.name) > 1), 0))
2681     return REG_ECOLLATE;
2682
2683 # ifdef RE_ENABLE_I18N
2684   {
2685     wchar_t wc, start_wc, end_wc;
2686     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2687
2688     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2689                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2690                    : 0));
2691     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2692               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2693                  : 0));
2694     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2695                 ? __btowc (start_ch) : start_elem->opr.wch);
2696     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2697               ? __btowc (end_ch) : end_elem->opr.wch);
2698     if (start_wc == WEOF || end_wc == WEOF)
2699       return REG_ECOLLATE;
2700     cmp_buf[0] = start_wc;
2701     cmp_buf[4] = end_wc;
2702     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2703       return REG_ERANGE;
2704
2705     /* Got valid collation sequence values, add them as a new entry.
2706        However, for !_LIBC we have no collation elements: if the
2707        character set is single byte, the single byte character set
2708        that we build below suffices.  parse_bracket_exp passes
2709        no MBCSET if dfa->mb_cur_max == 1.  */
2710     if (mbcset)
2711       {
2712         /* Check the space of the arrays.  */
2713         if (BE (*range_alloc == mbcset->nranges, 0))
2714           {
2715             /* There is not enough space, need realloc.  */
2716             wchar_t *new_array_start, *new_array_end;
2717             int new_nranges;
2718
2719             /* +1 in case of mbcset->nranges is 0.  */
2720             new_nranges = 2 * mbcset->nranges + 1;
2721             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2722                are NULL if *range_alloc == 0.  */
2723             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2724                                           new_nranges);
2725             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2726                                         new_nranges);
2727
2728             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2729               return REG_ESPACE;
2730
2731             mbcset->range_starts = new_array_start;
2732             mbcset->range_ends = new_array_end;
2733             *range_alloc = new_nranges;
2734           }
2735
2736         mbcset->range_starts[mbcset->nranges] = start_wc;
2737         mbcset->range_ends[mbcset->nranges++] = end_wc;
2738       }
2739
2740     /* Build the table for single byte characters.  */
2741     for (wc = 0; wc < SBC_MAX; ++wc)
2742       {
2743         cmp_buf[2] = wc;
2744         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2745             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2746           bitset_set (sbcset, wc);
2747       }
2748   }
2749 # else /* not RE_ENABLE_I18N */
2750   {
2751     unsigned int ch;
2752     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2753                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2754                    : 0));
2755     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2756               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2757                  : 0));
2758     if (start_ch > end_ch)
2759       return REG_ERANGE;
2760     /* Build the table for single byte characters.  */
2761     for (ch = 0; ch < SBC_MAX; ++ch)
2762       if (start_ch <= ch  && ch <= end_ch)
2763         bitset_set (sbcset, ch);
2764   }
2765 # endif /* not RE_ENABLE_I18N */
2766   return REG_NOERROR;
2767 }
2768 #endif /* not _LIBC */
2769
2770 #ifndef _LIBC
2771 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2772    Build the collating element which is represented by NAME.
2773    The result are written to MBCSET and SBCSET.
2774    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2775    pointer argument since we may update it.  */
2776
2777 static reg_errcode_t
2778 # ifdef RE_ENABLE_I18N
2779 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2780      re_charset_t *mbcset;
2781      int *coll_sym_alloc;
2782 # else /* not RE_ENABLE_I18N */
2783 build_collating_symbol (sbcset, name)
2784 # endif /* not RE_ENABLE_I18N */
2785      re_bitset_ptr_t sbcset;
2786      const unsigned char *name;
2787 {
2788   size_t name_len = strlen ((const char *) name);
2789   if (BE (name_len != 1, 0))
2790     return REG_ECOLLATE;
2791   else
2792     {
2793       bitset_set (sbcset, name[0]);
2794       return REG_NOERROR;
2795     }
2796 }
2797 #endif /* not _LIBC */
2798
2799 /* This function parse bracket expression like "[abc]", "[a-c]",
2800    "[[.a-a.]]" etc.  */
2801
2802 static bin_tree_t *
2803 parse_bracket_exp (regexp, dfa, token, syntax, err)
2804      re_string_t *regexp;
2805      re_dfa_t *dfa;
2806      re_token_t *token;
2807      reg_syntax_t syntax;
2808      reg_errcode_t *err;
2809 {
2810 #ifdef _LIBC
2811   const unsigned char *collseqmb;
2812   const char *collseqwc;
2813   uint32_t nrules;
2814   int32_t table_size;
2815   const int32_t *symb_table;
2816   const unsigned char *extra;
2817
2818   /* Local function for parse_bracket_exp used in _LIBC environement.
2819      Seek the collating symbol entry correspondings to NAME.
2820      Return the index of the symbol in the SYMB_TABLE.  */
2821
2822   auto inline int32_t
2823   __attribute ((always_inline))
2824   seek_collating_symbol_entry (name, name_len)
2825          const unsigned char *name;
2826          size_t name_len;
2827     {
2828       int32_t hash = elem_hash ((const char *) name, name_len);
2829       int32_t elem = hash % table_size;
2830       int32_t second = hash % (table_size - 2);
2831       while (symb_table[2 * elem] != 0)
2832         {
2833           /* First compare the hashing value.  */
2834           if (symb_table[2 * elem] == hash
2835               /* Compare the length of the name.  */
2836               && name_len == extra[symb_table[2 * elem + 1]]
2837               /* Compare the name.  */
2838               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2839                          name_len) == 0)
2840             {
2841               /* Yep, this is the entry.  */
2842               break;
2843             }
2844
2845           /* Next entry.  */
2846           elem += second;
2847         }
2848       return elem;
2849     }
2850
2851   /* Local function for parse_bracket_exp used in _LIBC environement.
2852      Look up the collation sequence value of BR_ELEM.
2853      Return the value if succeeded, UINT_MAX otherwise.  */
2854
2855   auto inline unsigned int
2856   __attribute ((always_inline))
2857   lookup_collation_sequence_value (br_elem)
2858          bracket_elem_t *br_elem;
2859     {
2860       if (br_elem->type == SB_CHAR)
2861         {
2862           /*
2863           if (MB_CUR_MAX == 1)
2864           */
2865           if (nrules == 0)
2866             return collseqmb[br_elem->opr.ch];
2867           else
2868             {
2869               wint_t wc = __btowc (br_elem->opr.ch);
2870               return __collseq_table_lookup (collseqwc, wc);
2871             }
2872         }
2873       else if (br_elem->type == MB_CHAR)
2874         {
2875           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2876         }
2877       else if (br_elem->type == COLL_SYM)
2878         {
2879           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2880           if (nrules != 0)
2881             {
2882               int32_t elem, idx;
2883               elem = seek_collating_symbol_entry (br_elem->opr.name,
2884                                                   sym_name_len);
2885               if (symb_table[2 * elem] != 0)
2886                 {
2887                   /* We found the entry.  */
2888                   idx = symb_table[2 * elem + 1];
2889                   /* Skip the name of collating element name.  */
2890                   idx += 1 + extra[idx];
2891                   /* Skip the byte sequence of the collating element.  */
2892                   idx += 1 + extra[idx];
2893                   /* Adjust for the alignment.  */
2894                   idx = (idx + 3) & ~3;
2895                   /* Skip the multibyte collation sequence value.  */
2896                   idx += sizeof (unsigned int);
2897                   /* Skip the wide char sequence of the collating element.  */
2898                   idx += sizeof (unsigned int) *
2899                     (1 + *(unsigned int *) (extra + idx));
2900                   /* Return the collation sequence value.  */
2901                   return *(unsigned int *) (extra + idx);
2902                 }
2903               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2904                 {
2905                   /* No valid character.  Match it as a single byte
2906                      character.  */
2907                   return collseqmb[br_elem->opr.name[0]];
2908                 }
2909             }
2910           else if (sym_name_len == 1)
2911             return collseqmb[br_elem->opr.name[0]];
2912         }
2913       return UINT_MAX;
2914     }
2915
2916   /* Local function for parse_bracket_exp used in _LIBC environement.
2917      Build the range expression which starts from START_ELEM, and ends
2918      at END_ELEM.  The result are written to MBCSET and SBCSET.
2919      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2920      mbcset->range_ends, is a pointer argument sinse we may
2921      update it.  */
2922
2923   auto inline reg_errcode_t
2924   __attribute ((always_inline))
2925   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2926          re_charset_t *mbcset;
2927          int *range_alloc;
2928          re_bitset_ptr_t sbcset;
2929          bracket_elem_t *start_elem, *end_elem;
2930     {
2931       unsigned int ch;
2932       uint32_t start_collseq;
2933       uint32_t end_collseq;
2934
2935       /* Equivalence Classes and Character Classes can't be a range
2936          start/end.  */
2937       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2938               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2939               0))
2940         return REG_ERANGE;
2941
2942       start_collseq = lookup_collation_sequence_value (start_elem);
2943       end_collseq = lookup_collation_sequence_value (end_elem);
2944       /* Check start/end collation sequence values.  */
2945       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2946         return REG_ECOLLATE;
2947       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2948         return REG_ERANGE;
2949
2950       /* Got valid collation sequence values, add them as a new entry.
2951          However, if we have no collation elements, and the character set
2952          is single byte, the single byte character set that we
2953          build below suffices. */
2954       if (nrules > 0 || dfa->mb_cur_max > 1)
2955         {
2956           /* Check the space of the arrays.  */
2957           if (BE (*range_alloc == mbcset->nranges, 0))
2958             {
2959               /* There is not enough space, need realloc.  */
2960               uint32_t *new_array_start;
2961               uint32_t *new_array_end;
2962               int new_nranges;
2963
2964               /* +1 in case of mbcset->nranges is 0.  */
2965               new_nranges = 2 * mbcset->nranges + 1;
2966               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2967                                             new_nranges);
2968               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2969                                           new_nranges);
2970
2971               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2972                 return REG_ESPACE;
2973
2974               mbcset->range_starts = new_array_start;
2975               mbcset->range_ends = new_array_end;
2976               *range_alloc = new_nranges;
2977             }
2978
2979           mbcset->range_starts[mbcset->nranges] = start_collseq;
2980           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2981         }
2982
2983       /* Build the table for single byte characters.  */
2984       for (ch = 0; ch < SBC_MAX; ch++)
2985         {
2986           uint32_t ch_collseq;
2987           /*
2988           if (MB_CUR_MAX == 1)
2989           */
2990           if (nrules == 0)
2991             ch_collseq = collseqmb[ch];
2992           else
2993             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2994           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2995             bitset_set (sbcset, ch);
2996         }
2997       return REG_NOERROR;
2998     }
2999
3000   /* Local function for parse_bracket_exp used in _LIBC environement.
3001      Build the collating element which is represented by NAME.
3002      The result are written to MBCSET and SBCSET.
3003      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3004      pointer argument sinse we may update it.  */
3005
3006   auto inline reg_errcode_t
3007   __attribute ((always_inline))
3008   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3009          re_charset_t *mbcset;
3010          int *coll_sym_alloc;
3011          re_bitset_ptr_t sbcset;
3012          const unsigned char *name;
3013     {
3014       int32_t elem, idx;
3015       size_t name_len = strlen ((const char *) name);
3016       if (nrules != 0)
3017         {
3018           elem = seek_collating_symbol_entry (name, name_len);
3019           if (symb_table[2 * elem] != 0)
3020             {
3021               /* We found the entry.  */
3022               idx = symb_table[2 * elem + 1];
3023               /* Skip the name of collating element name.  */
3024               idx += 1 + extra[idx];
3025             }
3026           else if (symb_table[2 * elem] == 0 && name_len == 1)
3027             {
3028               /* No valid character, treat it as a normal
3029                  character.  */
3030               bitset_set (sbcset, name[0]);
3031               return REG_NOERROR;
3032             }
3033           else
3034             return REG_ECOLLATE;
3035
3036           /* Got valid collation sequence, add it as a new entry.  */
3037           /* Check the space of the arrays.  */
3038           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3039             {
3040               /* Not enough, realloc it.  */
3041               /* +1 in case of mbcset->ncoll_syms is 0.  */
3042               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3043               /* Use realloc since mbcset->coll_syms is NULL
3044                  if *alloc == 0.  */
3045               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3046                                                    new_coll_sym_alloc);
3047               if (BE (new_coll_syms == NULL, 0))
3048                 return REG_ESPACE;
3049               mbcset->coll_syms = new_coll_syms;
3050               *coll_sym_alloc = new_coll_sym_alloc;
3051             }
3052           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3053           return REG_NOERROR;
3054         }
3055       else
3056         {
3057           if (BE (name_len != 1, 0))
3058             return REG_ECOLLATE;
3059           else
3060             {
3061               bitset_set (sbcset, name[0]);
3062               return REG_NOERROR;
3063             }
3064         }
3065     }
3066 #endif
3067
3068   re_token_t br_token;
3069   re_bitset_ptr_t sbcset;
3070 #ifdef RE_ENABLE_I18N
3071   re_charset_t *mbcset;
3072   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3073   int equiv_class_alloc = 0, char_class_alloc = 0;
3074 #endif /* not RE_ENABLE_I18N */
3075   int non_match = 0;
3076   bin_tree_t *work_tree;
3077   int token_len;
3078   int first_round = 1;
3079 #ifdef _LIBC
3080   collseqmb = (const unsigned char *)
3081     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3082   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3083   if (nrules)
3084     {
3085       /*
3086       if (MB_CUR_MAX > 1)
3087       */
3088         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3089       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3090       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3091                                                   _NL_COLLATE_SYMB_TABLEMB);
3092       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3093                                                    _NL_COLLATE_SYMB_EXTRAMB);
3094     }
3095 #endif
3096   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3097 #ifdef RE_ENABLE_I18N
3098   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3099 #endif /* RE_ENABLE_I18N */
3100 #ifdef RE_ENABLE_I18N
3101   if (BE (sbcset == NULL || mbcset == NULL, 0))
3102 #else
3103   if (BE (sbcset == NULL, 0))
3104 #endif /* RE_ENABLE_I18N */
3105     {
3106       *err = REG_ESPACE;
3107       return NULL;
3108     }
3109
3110   token_len = peek_token_bracket (token, regexp, syntax);
3111   if (BE (token->type == END_OF_RE, 0))
3112     {
3113       *err = REG_BADPAT;
3114       goto parse_bracket_exp_free_return;
3115     }
3116   if (token->type == OP_NON_MATCH_LIST)
3117     {
3118 #ifdef RE_ENABLE_I18N
3119       mbcset->non_match = 1;
3120 #endif /* not RE_ENABLE_I18N */
3121       non_match = 1;
3122       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3123         bitset_set (sbcset, '\0');
3124       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3125       token_len = peek_token_bracket (token, regexp, syntax);
3126       if (BE (token->type == END_OF_RE, 0))
3127         {
3128           *err = REG_BADPAT;
3129           goto parse_bracket_exp_free_return;
3130         }
3131     }
3132
3133   /* We treat the first ']' as a normal character.  */
3134   if (token->type == OP_CLOSE_BRACKET)
3135     token->type = CHARACTER;
3136
3137   while (1)
3138     {
3139       bracket_elem_t start_elem, end_elem;
3140       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3141       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3142       reg_errcode_t ret;
3143       int token_len2 = 0, is_range_exp = 0;
3144       re_token_t token2;
3145
3146       start_elem.opr.name = start_name_buf;
3147       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3148                                    syntax, first_round);
3149       if (BE (ret != REG_NOERROR, 0))
3150         {
3151           *err = ret;
3152           goto parse_bracket_exp_free_return;
3153         }
3154       first_round = 0;
3155
3156       /* Get information about the next token.  We need it in any case.  */
3157       token_len = peek_token_bracket (token, regexp, syntax);
3158
3159       /* Do not check for ranges if we know they are not allowed.  */
3160       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3161         {
3162           if (BE (token->type == END_OF_RE, 0))
3163             {
3164               *err = REG_EBRACK;
3165               goto parse_bracket_exp_free_return;
3166             }
3167           if (token->type == OP_CHARSET_RANGE)
3168             {
3169               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3170               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3171               if (BE (token2.type == END_OF_RE, 0))
3172                 {
3173                   *err = REG_EBRACK;
3174                   goto parse_bracket_exp_free_return;
3175                 }
3176               if (token2.type == OP_CLOSE_BRACKET)
3177                 {
3178                   /* We treat the last '-' as a normal character.  */
3179                   re_string_skip_bytes (regexp, -token_len);
3180                   token->type = CHARACTER;
3181                 }
3182               else
3183                 is_range_exp = 1;
3184             }
3185         }
3186
3187       if (is_range_exp == 1)
3188         {
3189           end_elem.opr.name = end_name_buf;
3190           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3191                                        dfa, syntax, 1);
3192           if (BE (ret != REG_NOERROR, 0))
3193             {
3194               *err = ret;
3195               goto parse_bracket_exp_free_return;
3196             }
3197
3198           token_len = peek_token_bracket (token, regexp, syntax);
3199
3200 #ifdef _LIBC
3201           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3202                                   &start_elem, &end_elem);
3203 #else
3204 # ifdef RE_ENABLE_I18N
3205           *err = build_range_exp (sbcset,
3206                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3207                                   &range_alloc, &start_elem, &end_elem);
3208 # else
3209           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3210 # endif
3211 #endif /* RE_ENABLE_I18N */
3212           if (BE (*err != REG_NOERROR, 0))
3213             goto parse_bracket_exp_free_return;
3214         }
3215       else
3216         {
3217           switch (start_elem.type)
3218             {
3219             case SB_CHAR:
3220               bitset_set (sbcset, start_elem.opr.ch);
3221               break;
3222 #ifdef RE_ENABLE_I18N
3223             case MB_CHAR:
3224               /* Check whether the array has enough space.  */
3225               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3226                 {
3227                   wchar_t *new_mbchars;
3228                   /* Not enough, realloc it.  */
3229                   /* +1 in case of mbcset->nmbchars is 0.  */
3230                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3231                   /* Use realloc since array is NULL if *alloc == 0.  */
3232                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3233                                             mbchar_alloc);
3234                   if (BE (new_mbchars == NULL, 0))
3235                     goto parse_bracket_exp_espace;
3236                   mbcset->mbchars = new_mbchars;
3237                 }
3238               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3239               break;
3240 #endif /* RE_ENABLE_I18N */
3241             case EQUIV_CLASS:
3242               *err = build_equiv_class (sbcset,
3243 #ifdef RE_ENABLE_I18N
3244                                         mbcset, &equiv_class_alloc,
3245 #endif /* RE_ENABLE_I18N */
3246                                         start_elem.opr.name);
3247               if (BE (*err != REG_NOERROR, 0))
3248                 goto parse_bracket_exp_free_return;
3249               break;
3250             case COLL_SYM:
3251               *err = build_collating_symbol (sbcset,
3252 #ifdef RE_ENABLE_I18N
3253                                              mbcset, &coll_sym_alloc,
3254 #endif /* RE_ENABLE_I18N */
3255                                              start_elem.opr.name);
3256               if (BE (*err != REG_NOERROR, 0))
3257                 goto parse_bracket_exp_free_return;
3258               break;
3259             case CHAR_CLASS:
3260               *err = build_charclass (regexp->trans, sbcset,
3261 #ifdef RE_ENABLE_I18N
3262                                       mbcset, &char_class_alloc,
3263 #endif /* RE_ENABLE_I18N */
3264                                       start_elem.opr.name, syntax);
3265               if (BE (*err != REG_NOERROR, 0))
3266                goto parse_bracket_exp_free_return;
3267               break;
3268             default:
3269               assert (0);
3270               break;
3271             }
3272         }
3273       if (BE (token->type == END_OF_RE, 0))
3274         {
3275           *err = REG_EBRACK;
3276           goto parse_bracket_exp_free_return;
3277         }
3278       if (token->type == OP_CLOSE_BRACKET)
3279         break;
3280     }
3281
3282   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3283
3284   /* If it is non-matching list.  */
3285   if (non_match)
3286     bitset_not (sbcset);
3287
3288 #ifdef RE_ENABLE_I18N
3289   /* Ensure only single byte characters are set.  */
3290   if (dfa->mb_cur_max > 1)
3291     bitset_mask (sbcset, dfa->sb_char);
3292
3293   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3294       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3295                                                      || mbcset->non_match)))
3296     {
3297       bin_tree_t *mbc_tree;
3298       int sbc_idx;
3299       /* Build a tree for complex bracket.  */
3300       dfa->has_mb_node = 1;
3301       br_token.type = COMPLEX_BRACKET;
3302       br_token.opr.mbcset = mbcset;
3303       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304       if (BE (mbc_tree == NULL, 0))
3305         goto parse_bracket_exp_espace;
3306       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3307         if (sbcset[sbc_idx])
3308           break;
3309       /* If there are no bits set in sbcset, there is no point
3310          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3311       if (sbc_idx < BITSET_UINTS)
3312         {
3313           /* Build a tree for simple bracket.  */
3314           br_token.type = SIMPLE_BRACKET;
3315           br_token.opr.sbcset = sbcset;
3316           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317           if (BE (work_tree == NULL, 0))
3318             goto parse_bracket_exp_espace;
3319
3320           /* Then join them by ALT node.  */
3321           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3322           if (BE (work_tree == NULL, 0))
3323             goto parse_bracket_exp_espace;
3324         }
3325       else
3326         {
3327           re_free (sbcset);
3328           work_tree = mbc_tree;
3329         }
3330     }
3331   else
3332 #endif /* not RE_ENABLE_I18N */
3333     {
3334 #ifdef RE_ENABLE_I18N
3335       free_charset (mbcset);
3336 #endif
3337       /* Build a tree for simple bracket.  */
3338       br_token.type = SIMPLE_BRACKET;
3339       br_token.opr.sbcset = sbcset;
3340       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341       if (BE (work_tree == NULL, 0))
3342         goto parse_bracket_exp_espace;
3343     }
3344   return work_tree;
3345
3346  parse_bracket_exp_espace:
3347   *err = REG_ESPACE;
3348  parse_bracket_exp_free_return:
3349   re_free (sbcset);
3350 #ifdef RE_ENABLE_I18N
3351   free_charset (mbcset);
3352 #endif /* RE_ENABLE_I18N */
3353   return NULL;
3354 }
3355
3356 /* Parse an element in the bracket expression.  */
3357
3358 static reg_errcode_t
3359 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3360                        accept_hyphen)
3361      bracket_elem_t *elem;
3362      re_string_t *regexp;
3363      re_token_t *token;
3364      int token_len;
3365      re_dfa_t *dfa;
3366      reg_syntax_t syntax;
3367      int accept_hyphen;
3368 {
3369 #ifdef RE_ENABLE_I18N
3370   int cur_char_size;
3371   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3372   if (cur_char_size > 1)
3373     {
3374       elem->type = MB_CHAR;
3375       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3376       re_string_skip_bytes (regexp, cur_char_size);
3377       return REG_NOERROR;
3378     }
3379 #endif /* RE_ENABLE_I18N */
3380   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3381   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3382       || token->type == OP_OPEN_EQUIV_CLASS)
3383     return parse_bracket_symbol (elem, regexp, token);
3384   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3385     {
3386       /* A '-' must only appear as anything but a range indicator before
3387          the closing bracket.  Everything else is an error.  */
3388       re_token_t token2;
3389       (void) peek_token_bracket (&token2, regexp, syntax);
3390       if (token2.type != OP_CLOSE_BRACKET)
3391         /* The actual error value is not standardized since this whole
3392            case is undefined.  But ERANGE makes good sense.  */
3393         return REG_ERANGE;
3394     }
3395   elem->type = SB_CHAR;
3396   elem->opr.ch = token->opr.c;
3397   return REG_NOERROR;
3398 }
3399
3400 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3401    such as [:<character_class>:], [.<collating_element>.], and
3402    [=<equivalent_class>=].  */
3403
3404 static reg_errcode_t
3405 parse_bracket_symbol (elem, regexp, token)
3406      bracket_elem_t *elem;
3407      re_string_t *regexp;
3408      re_token_t *token;
3409 {
3410   unsigned char ch, delim = token->opr.c;
3411   int i = 0;
3412   if (re_string_eoi(regexp))
3413     return REG_EBRACK;
3414   for (;; ++i)
3415     {
3416       if (i >= BRACKET_NAME_BUF_SIZE)
3417         return REG_EBRACK;
3418       if (token->type == OP_OPEN_CHAR_CLASS)
3419         ch = re_string_fetch_byte_case (regexp);
3420       else
3421         ch = re_string_fetch_byte (regexp);
3422       if (re_string_eoi(regexp))
3423         return REG_EBRACK;
3424       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3425         break;
3426       elem->opr.name[i] = ch;
3427     }
3428   re_string_skip_bytes (regexp, 1);
3429   elem->opr.name[i] = '\0';
3430   switch (token->type)
3431     {
3432     case OP_OPEN_COLL_ELEM:
3433       elem->type = COLL_SYM;
3434       break;
3435     case OP_OPEN_EQUIV_CLASS:
3436       elem->type = EQUIV_CLASS;
3437       break;
3438     case OP_OPEN_CHAR_CLASS:
3439       elem->type = CHAR_CLASS;
3440       break;
3441     default:
3442       break;
3443     }
3444   return REG_NOERROR;
3445 }
3446
3447   /* Helper function for parse_bracket_exp.
3448      Build the equivalence class which is represented by NAME.
3449      The result are written to MBCSET and SBCSET.
3450      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3451      is a pointer argument sinse we may update it.  */
3452
3453 static reg_errcode_t
3454 #ifdef RE_ENABLE_I18N
3455 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3456      re_charset_t *mbcset;
3457      int *equiv_class_alloc;
3458 #else /* not RE_ENABLE_I18N */
3459 build_equiv_class (sbcset, name)
3460 #endif /* not RE_ENABLE_I18N */
3461      re_bitset_ptr_t sbcset;
3462      const unsigned char *name;
3463 {
3464 #if defined _LIBC
3465   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3466   if (nrules != 0)
3467     {
3468       const int32_t *table, *indirect;
3469       const unsigned char *weights, *extra, *cp;
3470       unsigned char char_buf[2];
3471       int32_t idx1, idx2;
3472       unsigned int ch;
3473       size_t len;
3474       /* This #include defines a local function!  */
3475 # include <locale/weight.h>
3476       /* Calculate the index for equivalence class.  */
3477       cp = name;
3478       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3479       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3480                                                _NL_COLLATE_WEIGHTMB);
3481       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3482                                                    _NL_COLLATE_EXTRAMB);
3483       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3484                                                 _NL_COLLATE_INDIRECTMB);
3485       idx1 = findidx (&cp);
3486       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3487         /* This isn't a valid character.  */
3488         return REG_ECOLLATE;
3489
3490       /* Build single byte matcing table for this equivalence class.  */
3491       char_buf[1] = (unsigned char) '\0';
3492       len = weights[idx1];
3493       for (ch = 0; ch < SBC_MAX; ++ch)
3494         {
3495           char_buf[0] = ch;
3496           cp = char_buf;
3497           idx2 = findidx (&cp);
3498 /*
3499           idx2 = table[ch];
3500 */
3501           if (idx2 == 0)
3502             /* This isn't a valid character.  */
3503             continue;
3504           if (len == weights[idx2])
3505             {
3506               int cnt = 0;
3507               while (cnt <= len &&
3508                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3509                 ++cnt;
3510
3511               if (cnt > len)
3512                 bitset_set (sbcset, ch);
3513             }
3514         }
3515       /* Check whether the array has enough space.  */
3516       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3517         {
3518           /* Not enough, realloc it.  */
3519           /* +1 in case of mbcset->nequiv_classes is 0.  */
3520           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3521           /* Use realloc since the array is NULL if *alloc == 0.  */
3522           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3523                                                    int32_t,
3524                                                    new_equiv_class_alloc);
3525           if (BE (new_equiv_classes == NULL, 0))
3526             return REG_ESPACE;
3527           mbcset->equiv_classes = new_equiv_classes;
3528           *equiv_class_alloc = new_equiv_class_alloc;
3529         }
3530       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3531     }
3532   else
3533 #endif /* _LIBC */
3534     {
3535       if (BE (strlen ((const char *) name) != 1, 0))
3536         return REG_ECOLLATE;
3537       bitset_set (sbcset, *name);
3538     }
3539   return REG_NOERROR;
3540 }
3541
3542   /* Helper function for parse_bracket_exp.
3543      Build the character class which is represented by NAME.
3544      The result are written to MBCSET and SBCSET.
3545      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3546      is a pointer argument sinse we may update it.  */
3547
3548 static reg_errcode_t
3549 #ifdef RE_ENABLE_I18N
3550 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3551      re_charset_t *mbcset;
3552      int *char_class_alloc;
3553 #else /* not RE_ENABLE_I18N */
3554 build_charclass (trans, sbcset, class_name, syntax)
3555 #endif /* not RE_ENABLE_I18N */
3556      unsigned RE_TRANSLATE_TYPE trans;
3557      re_bitset_ptr_t sbcset;
3558      const unsigned char *class_name;
3559      reg_syntax_t syntax;
3560 {
3561   int i;
3562   const char *name = (const char *) class_name;
3563
3564   /* In case of REG_ICASE "upper" and "lower" match the both of
3565      upper and lower cases.  */
3566   if ((syntax & RE_ICASE)
3567       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3568     name = "alpha";
3569
3570 #ifdef RE_ENABLE_I18N
3571   /* Check the space of the arrays.  */
3572   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3573     {
3574       /* Not enough, realloc it.  */
3575       /* +1 in case of mbcset->nchar_classes is 0.  */
3576       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3577       /* Use realloc since array is NULL if *alloc == 0.  */
3578       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3579                                                new_char_class_alloc);
3580       if (BE (new_char_classes == NULL, 0))
3581         return REG_ESPACE;
3582       mbcset->char_classes = new_char_classes;
3583       *char_class_alloc = new_char_class_alloc;
3584     }
3585   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3586 #endif /* RE_ENABLE_I18N */
3587
3588 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3589     for (i = 0; i < SBC_MAX; ++i)               \
3590       {                                         \
3591         if (ctype_func (i))                     \
3592           {                                     \
3593             int ch = trans ? trans[i] : i;      \
3594             bitset_set (sbcset, ch);            \
3595           }                                     \
3596       }
3597
3598   if (strcmp (name, "alnum") == 0)
3599     BUILD_CHARCLASS_LOOP (isalnum)
3600   else if (strcmp (name, "cntrl") == 0)
3601     BUILD_CHARCLASS_LOOP (iscntrl)
3602   else if (strcmp (name, "lower") == 0)
3603     BUILD_CHARCLASS_LOOP (islower)
3604   else if (strcmp (name, "space") == 0)
3605     BUILD_CHARCLASS_LOOP (isspace)
3606   else if (strcmp (name, "alpha") == 0)
3607     BUILD_CHARCLASS_LOOP (isalpha)
3608   else if (strcmp (name, "digit") == 0)
3609     BUILD_CHARCLASS_LOOP (isdigit)
3610   else if (strcmp (name, "print") == 0)
3611     BUILD_CHARCLASS_LOOP (isprint)
3612   else if (strcmp (name, "upper") == 0)
3613     BUILD_CHARCLASS_LOOP (isupper)
3614   else if (strcmp (name, "blank") == 0)
3615     BUILD_CHARCLASS_LOOP (isblank)
3616   else if (strcmp (name, "graph") == 0)
3617     BUILD_CHARCLASS_LOOP (isgraph)
3618   else if (strcmp (name, "punct") == 0)
3619     BUILD_CHARCLASS_LOOP (ispunct)
3620   else if (strcmp (name, "xdigit") == 0)
3621     BUILD_CHARCLASS_LOOP (isxdigit)
3622   else
3623     return REG_ECTYPE;
3624
3625   return REG_NOERROR;
3626 }
3627
3628 static bin_tree_t *
3629 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3630      re_dfa_t *dfa;
3631      unsigned RE_TRANSLATE_TYPE trans;
3632      const unsigned char *class_name;
3633      const unsigned char *extra;
3634      int non_match;
3635      reg_errcode_t *err;
3636 {
3637   re_bitset_ptr_t sbcset;
3638 #ifdef RE_ENABLE_I18N
3639   re_charset_t *mbcset;
3640   int alloc = 0;
3641 #endif /* not RE_ENABLE_I18N */
3642   reg_errcode_t ret;
3643   re_token_t br_token;
3644   bin_tree_t *tree;
3645
3646   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3647 #ifdef RE_ENABLE_I18N
3648   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3649 #endif /* RE_ENABLE_I18N */
3650
3651 #ifdef RE_ENABLE_I18N
3652   if (BE (sbcset == NULL || mbcset == NULL, 0))
3653 #else /* not RE_ENABLE_I18N */
3654   if (BE (sbcset == NULL, 0))
3655 #endif /* not RE_ENABLE_I18N */
3656     {
3657       *err = REG_ESPACE;
3658       return NULL;
3659     }
3660
3661   if (non_match)
3662     {
3663 #ifdef RE_ENABLE_I18N
3664       /*
3665       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3666         bitset_set(cset->sbcset, '\0');
3667       */
3668       mbcset->non_match = 1;
3669 #endif /* not RE_ENABLE_I18N */
3670     }
3671
3672   /* We don't care the syntax in this case.  */
3673   ret = build_charclass (trans, sbcset,
3674 #ifdef RE_ENABLE_I18N
3675                          mbcset, &alloc,
3676 #endif /* RE_ENABLE_I18N */
3677                          class_name, 0);
3678
3679   if (BE (ret != REG_NOERROR, 0))
3680     {
3681       re_free (sbcset);
3682 #ifdef RE_ENABLE_I18N
3683       free_charset (mbcset);
3684 #endif /* RE_ENABLE_I18N */
3685       *err = ret;
3686       return NULL;
3687     }
3688   /* \w match '_' also.  */
3689   for (; *extra; extra++)
3690     bitset_set (sbcset, *extra);
3691
3692   /* If it is non-matching list.  */
3693   if (non_match)
3694     bitset_not (sbcset);
3695
3696 #ifdef RE_ENABLE_I18N
3697   /* Ensure only single byte characters are set.  */
3698   if (dfa->mb_cur_max > 1)
3699     bitset_mask (sbcset, dfa->sb_char);
3700 #endif
3701
3702   /* Build a tree for simple bracket.  */
3703   br_token.type = SIMPLE_BRACKET;
3704   br_token.opr.sbcset = sbcset;
3705   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3706   if (BE (tree == NULL, 0))
3707     goto build_word_op_espace;
3708
3709 #ifdef RE_ENABLE_I18N
3710   if (dfa->mb_cur_max > 1)
3711     {
3712       bin_tree_t *mbc_tree;
3713       /* Build a tree for complex bracket.  */
3714       br_token.type = COMPLEX_BRACKET;
3715       br_token.opr.mbcset = mbcset;
3716       dfa->has_mb_node = 1;
3717       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3718       if (BE (mbc_tree == NULL, 0))
3719         goto build_word_op_espace;
3720       /* Then join them by ALT node.  */
3721       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3722       if (BE (mbc_tree != NULL, 1))
3723         return tree;
3724     }
3725   else
3726     {
3727       free_charset (mbcset);
3728       return tree;
3729     }
3730 #else /* not RE_ENABLE_I18N */
3731   return tree;
3732 #endif /* not RE_ENABLE_I18N */
3733
3734  build_word_op_espace:
3735   re_free (sbcset);
3736 #ifdef RE_ENABLE_I18N
3737   free_charset (mbcset);
3738 #endif /* RE_ENABLE_I18N */
3739   *err = REG_ESPACE;
3740   return NULL;
3741 }
3742
3743 /* This is intended for the expressions like "a{1,3}".
3744    Fetch a number from `input', and return the number.
3745    Return -1, if the number field is empty like "{,1}".
3746    Return -2, If an error is occured.  */
3747
3748 static int
3749 fetch_number (input, token, syntax)
3750      re_string_t *input;
3751      re_token_t *token;
3752      reg_syntax_t syntax;
3753 {
3754   int num = -1;
3755   unsigned char c;
3756   while (1)
3757     {
3758       fetch_token (token, input, syntax);
3759       c = token->opr.c;
3760       if (BE (token->type == END_OF_RE, 0))
3761         return -2;
3762       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3763         break;
3764       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3765              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3766       num = (num > RE_DUP_MAX) ? -2 : num;
3767     }
3768   return num;
3769 }
3770 \f
3771 #ifdef RE_ENABLE_I18N
3772 static void
3773 free_charset (re_charset_t *cset)
3774 {
3775   re_free (cset->mbchars);
3776 # ifdef _LIBC
3777   re_free (cset->coll_syms);
3778   re_free (cset->equiv_classes);
3779   re_free (cset->range_starts);
3780   re_free (cset->range_ends);
3781 # endif
3782   re_free (cset->char_classes);
3783   re_free (cset);
3784 }
3785 #endif /* RE_ENABLE_I18N */
3786 \f
3787 /* Functions for binary tree operation.  */
3788
3789 /* Create a tree node.  */
3790
3791 static bin_tree_t *
3792 create_tree (dfa, left, right, type)
3793      re_dfa_t *dfa;
3794      bin_tree_t *left;
3795      bin_tree_t *right;
3796      re_token_type_t type;
3797 {
3798   re_token_t t;
3799   t.type = type;
3800   return create_token_tree (dfa, left, right, &t);
3801 }
3802
3803 static bin_tree_t *
3804 create_token_tree (dfa, left, right, token)
3805      re_dfa_t *dfa;
3806      bin_tree_t *left;
3807      bin_tree_t *right;
3808      const re_token_t *token;
3809 {
3810   bin_tree_t *tree;
3811   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3812     {
3813       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3814
3815       if (storage == NULL)
3816         return NULL;
3817       storage->next = dfa->str_tree_storage;
3818       dfa->str_tree_storage = storage;
3819       dfa->str_tree_storage_idx = 0;
3820     }
3821   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3822
3823   tree->parent = NULL;
3824   tree->left = left;
3825   tree->right = right;
3826   tree->token = *token;
3827   tree->token.duplicated = 0;
3828   tree->token.opt_subexp = 0;
3829   tree->first = NULL;
3830   tree->next = NULL;
3831   tree->node_idx = -1;
3832
3833   if (left != NULL)
3834     left->parent = tree;
3835   if (right != NULL)
3836     right->parent = tree;
3837   return tree;
3838 }
3839
3840 /* Mark the tree SRC as an optional subexpression.
3841    To be called from preorder or postorder.  */
3842
3843 static reg_errcode_t
3844 mark_opt_subexp (extra, node)
3845      void *extra;
3846      bin_tree_t *node;
3847 {
3848   int idx = (int) (long) extra;
3849   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3850     node->token.opt_subexp = 1;
3851
3852   return REG_NOERROR;
3853 }
3854
3855 /* Free the allocated memory inside NODE. */
3856
3857 static void
3858 free_token (re_token_t *node)
3859 {
3860 #ifdef RE_ENABLE_I18N
3861   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3862     free_charset (node->opr.mbcset);
3863   else
3864 #endif /* RE_ENABLE_I18N */
3865     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3866       re_free (node->opr.sbcset);
3867 }
3868
3869 /* Worker function for tree walking.  Free the allocated memory inside NODE
3870    and its children. */
3871
3872 static reg_errcode_t
3873 free_tree (void *extra, bin_tree_t *node)
3874 {
3875   free_token (&node->token);
3876   return REG_NOERROR;
3877 }
3878
3879
3880 /* Duplicate the node SRC, and return new node.  This is a preorder
3881    visit similar to the one implemented by the generic visitor, but
3882    we need more infrastructure to maintain two parallel trees --- so,
3883    it's easier to duplicate.  */
3884
3885 static bin_tree_t *
3886 duplicate_tree (root, dfa)
3887      const bin_tree_t *root;
3888      re_dfa_t *dfa;
3889 {
3890   const bin_tree_t *node;
3891   bin_tree_t *dup_root;
3892   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3893
3894   for (node = root; ; )
3895     {
3896       /* Create a new tree and link it back to the current parent.  */
3897       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3898       if (*p_new == NULL)
3899         return NULL;
3900       (*p_new)->parent = dup_node;
3901       (*p_new)->token.duplicated = 1;
3902       dup_node = *p_new;
3903
3904       /* Go to the left node, or up and to the right.  */
3905       if (node->left)
3906         {
3907           node = node->left;
3908           p_new = &dup_node->left;
3909         }
3910       else
3911         {
3912           const bin_tree_t *prev = NULL;
3913           while (node->right == prev || node->right == NULL)
3914             {
3915               prev = node;
3916               node = node->parent;
3917               dup_node = dup_node->parent;
3918               if (!node)
3919                 return dup_root;
3920             }
3921           node = node->right;
3922           p_new = &dup_node->right;
3923         }
3924     }
3925 }