lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 static void init_word_char (re_dfa_t *dfa);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  51                                              int top_clone_node, int root_node,
  52                                              unsigned int constraint);
  53 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  54                                      unsigned int constraint);
  55 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static void fetch_token (re_token_t *result, re_string_t *input,
  64                          reg_syntax_t syntax);
  65 static int peek_token (re_token_t *token, re_string_t *input,
  66                         reg_syntax_t syntax);
  67 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  68                                reg_syntax_t syntax);
  69 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  70                           reg_syntax_t syntax, reg_errcode_t *err);
  71 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  75                                  re_token_t *token, reg_syntax_t syntax,
  76                                  int nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  78                                      re_token_t *token, reg_syntax_t syntax,
  79                                      int nest, reg_errcode_t *err);
  80 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  81                                   re_token_t *token, reg_syntax_t syntax,
  82                                   int nest, reg_errcode_t *err);
  83 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  84                                  re_dfa_t *dfa, re_token_t *token,
  85                                  reg_syntax_t syntax, reg_errcode_t *err);
  86 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  87                                       re_token_t *token, reg_syntax_t syntax,
  88                                       reg_errcode_t *err);
  89 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  90                                             re_string_t *regexp,
  91                                             re_token_t *token, int token_len,
  92                                             re_dfa_t *dfa,
  93                                             reg_syntax_t syntax,
  94                                             int accept_hyphen);
  95 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  96                                           re_string_t *regexp,
  97                                           re_token_t *token);
  98 #ifndef _LIBC
  99 # ifdef RE_ENABLE_I18N
 100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 101                                       re_charset_t *mbcset, int *range_alloc,
 102                                       bracket_elem_t *start_elem,
 103                                       bracket_elem_t *end_elem);
 104 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 105                                              re_charset_t *mbcset,
 106                                              int *coll_sym_alloc,
 107                                              const unsigned char *name);
 108 # else /* not RE_ENABLE_I18N */
 109 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 110                                       bracket_elem_t *start_elem,
 111                                       bracket_elem_t *end_elem);
 112 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 113                                              const unsigned char *name);
 114 # endif /* not RE_ENABLE_I18N */
 115 #endif /* not _LIBC */
 116 #ifdef RE_ENABLE_I18N
 117 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 118                                         re_charset_t *mbcset,
 119                                         int *equiv_class_alloc,
 120                                         const unsigned char *name);
 121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 122                                       re_bitset_ptr_t sbcset,
 123                                       re_charset_t *mbcset,
 124                                       int *char_class_alloc,
 125                                       const unsigned char *class_name,
 126                                       reg_syntax_t syntax);
 127 #else  /* not RE_ENABLE_I18N */
 128 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 129                                         const unsigned char *name);
 130 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 131                                       re_bitset_ptr_t sbcset,
 132                                       const unsigned char *class_name,
 133                                       reg_syntax_t syntax);
 134 #endif /* not RE_ENABLE_I18N */
 135 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 136                                        unsigned RE_TRANSLATE_TYPE trans,
 137                                        const unsigned char *class_name,
 138                                        const unsigned char *extra,
 139                                        int non_match, reg_errcode_t *err);
 140 static bin_tree_t *create_tree (re_dfa_t *dfa,
 141                                 bin_tree_t *left, bin_tree_t *right,
 142                                 re_token_type_t type);
 143 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 144                                       bin_tree_t *left, bin_tree_t *right,
 145                                       const re_token_t *token);
 146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 147 static void free_token (re_token_t *node);
 148 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 149 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 150 \f
 151 /* This table gives an error message for each of the error codes listed
 152    in regex.h.  Obviously the order here has to be same as there.
 153    POSIX doesn't require that we do anything for REG_NOERROR,
 154    but why not be nice?  */
 155
 156 const char __re_error_msgid[] attribute_hidden =
 157   {
 158 #define REG_NOERROR_IDX 0
 159     gettext_noop ("Success")    /* REG_NOERROR */
 160     "\0"
 161 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 162     gettext_noop ("No match")   /* REG_NOMATCH */
 163     "\0"
 164 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 165     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 166     "\0"
 167 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 168     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 169     "\0"
 170 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 171     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 172     "\0"
 173 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 174     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 175     "\0"
 176 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 177     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 178     "\0"
 179 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 180     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 181     "\0"
 182 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 183     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 184     "\0"
 185 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 186     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 187     "\0"
 188 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 189     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 190     "\0"
 191 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 192     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 193     "\0"
 194 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 195     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 196     "\0"
 197 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 198     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 199     "\0"
 200 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 201     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 202     "\0"
 203 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 204     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 205     "\0"
 206 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 207     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 208   };
 209
 210 const size_t __re_error_msgid_idx[] attribute_hidden =
 211   {
 212     REG_NOERROR_IDX,
 213     REG_NOMATCH_IDX,
 214     REG_BADPAT_IDX,
 215     REG_ECOLLATE_IDX,
 216     REG_ECTYPE_IDX,
 217     REG_EESCAPE_IDX,
 218     REG_ESUBREG_IDX,
 219     REG_EBRACK_IDX,
 220     REG_EPAREN_IDX,
 221     REG_EBRACE_IDX,
 222     REG_BADBR_IDX,
 223     REG_ERANGE_IDX,
 224     REG_ESPACE_IDX,
 225     REG_BADRPT_IDX,
 226     REG_EEND_IDX,
 227     REG_ESIZE_IDX,
 228     REG_ERPAREN_IDX
 229   };
 230 \f
 231 /* Entry points for GNU code.  */
 232
 233 /* re_compile_pattern is the GNU regular expression compiler: it
 234    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 235    Returns 0 if the pattern was valid, otherwise an error string.
 236
 237    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 238    are set in BUFP on entry.  */
 239
 240 const char *
 241 re_compile_pattern (pattern, length, bufp)
 242     const char *pattern;
 243     size_t length;
 244     struct re_pattern_buffer *bufp;
 245 {
 246   reg_errcode_t ret;
 247
 248   /* And GNU code determines whether or not to get register information
 249      by passing null for the REGS argument to re_match, etc., not by
 250      setting no_sub, unless RE_NO_SUB is set.  */
 251   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 252
 253   /* Match anchors at newline.  */
 254   bufp->newline_anchor = 1;
 255
 256   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 257
 258   if (!ret)
 259     return NULL;
 260   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 261 }
 262 #ifdef _LIBC
 263 weak_alias (__re_compile_pattern, re_compile_pattern)
 264 #endif
 265
 266 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 267    also be assigned to arbitrarily: each pattern buffer stores its own
 268    syntax, so it can be changed between regex compilations.  */
 269 /* This has no initializer because initialized variables in Emacs
 270    become read-only after dumping.  */
 271 reg_syntax_t re_syntax_options;
 272
 273
 274 /* Specify the precise syntax of regexps for compilation.  This provides
 275    for compatibility for various utilities which historically have
 276    different, incompatible syntaxes.
 277
 278    The argument SYNTAX is a bit mask comprised of the various bits
 279    defined in regex.h.  We return the old syntax.  */
 280
 281 reg_syntax_t
 282 re_set_syntax (syntax)
 283     reg_syntax_t syntax;
 284 {
 285   reg_syntax_t ret = re_syntax_options;
 286
 287   re_syntax_options = syntax;
 288   return ret;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_set_syntax, re_set_syntax)
 292 #endif
 293
 294 int
 295 re_compile_fastmap (bufp)
 296     struct re_pattern_buffer *bufp;
 297 {
 298   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 299   char *fastmap = bufp->fastmap;
 300
 301   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 302   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 303   if (dfa->init_state != dfa->init_state_word)
 304     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 305   if (dfa->init_state != dfa->init_state_nl)
 306     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 307   if (dfa->init_state != dfa->init_state_begbuf)
 308     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 309   bufp->fastmap_accurate = 1;
 310   return 0;
 311 }
 312 #ifdef _LIBC
 313 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 314 #endif
 315
 316 static inline void
 317 __attribute ((always_inline))
 318 re_set_fastmap (char *fastmap, int icase, int ch)
 319 {
 320   fastmap[ch] = 1;
 321   if (icase)
 322     fastmap[tolower (ch)] = 1;
 323 }
 324
 325 /* Helper function for re_compile_fastmap.
 326    Compile fastmap for the initial_state INIT_STATE.  */
 327
 328 static void
 329 re_compile_fastmap_iter (bufp, init_state, fastmap)
 330      regex_t *bufp;
 331      const re_dfastate_t *init_state;
 332      char *fastmap;
 333 {
 334   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 335   int node_cnt;
 336   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 337   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 338     {
 339       int node = init_state->nodes.elems[node_cnt];
 340       re_token_type_t type = dfa->nodes[node].type;
 341
 342       if (type == CHARACTER)
 343         {
 344           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 345 #ifdef RE_ENABLE_I18N
 346           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 347             {
 348               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 349               wchar_t wc;
 350               mbstate_t state;
 351
 352               p = buf;
 353               *p++ = dfa->nodes[node].opr.c;
 354               while (++node < dfa->nodes_len
 355                      && dfa->nodes[node].type == CHARACTER
 356                      && dfa->nodes[node].mb_partial)
 357                 *p++ = dfa->nodes[node].opr.c;
 358               memset (&state, 0, sizeof (state));
 359               if (mbrtowc (&wc, (const char *) buf, p - buf,
 360                            &state) == p - buf
 361                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 362                       != (size_t) -1))
 363                 re_set_fastmap (fastmap, 0, buf[0]);
 364             }
 365 #endif
 366         }
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (dfa->mb_cur_max > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 413                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                 {
 416                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 417                       != (size_t) -1)
 418                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *__restrict preg;
 476     const char *__restrict pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *preg;
 539     char *errbuf;
 540     size_t errbuf_size;
 541 {
 542   const char *msg;
 543   size_t msg_size;
 544
 545   if (BE (errcode < 0
 546           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 547                                / sizeof (__re_error_msgid_idx[0])), 0))
 548     /* Only error codes returned by the rest of the code should be passed
 549        to this routine.  If we are given anything else, or if other regex
 550        code generates an invalid error code, then the program has a bug.
 551        Dump core so we can fix it.  */
 552     abort ();
 553
 554   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 555
 556   msg_size = strlen (msg) + 1; /* Includes the null.  */
 557
 558   if (BE (errbuf_size != 0, 1))
 559     {
 560       if (BE (msg_size > errbuf_size, 0))
 561         {
 562 #if defined HAVE_MEMPCPY || defined _LIBC
 563           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 564 #else
 565           memcpy (errbuf, msg, errbuf_size - 1);
 566           errbuf[errbuf_size - 1] = 0;
 567 #endif
 568         }
 569       else
 570         memcpy (errbuf, msg, msg_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.  */
 585 static const bitset utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # if UINT_MAX == 0xffffffff
 589   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 590 # else
 591 #  error "Add case for new unsigned int size"
 592 # endif
 593 };
 594 #endif
 595
 596
 597 static void
 598 free_dfa_content (re_dfa_t *dfa)
 599 {
 600   int i, j;
 601
 602   if (dfa->nodes)
 603     for (i = 0; i < dfa->nodes_len; ++i)
 604       free_token (dfa->nodes + i);
 605   re_free (dfa->nexts);
 606   for (i = 0; i < dfa->nodes_len; ++i)
 607     {
 608       if (dfa->eclosures != NULL)
 609         re_node_set_free (dfa->eclosures + i);
 610       if (dfa->inveclosures != NULL)
 611         re_node_set_free (dfa->inveclosures + i);
 612       if (dfa->edests != NULL)
 613         re_node_set_free (dfa->edests + i);
 614     }
 615   re_free (dfa->edests);
 616   re_free (dfa->eclosures);
 617   re_free (dfa->inveclosures);
 618   re_free (dfa->nodes);
 619
 620   if (dfa->state_table)
 621     for (i = 0; i <= dfa->state_hash_mask; ++i)
 622       {
 623         struct re_state_table_entry *entry = dfa->state_table + i;
 624         for (j = 0; j < entry->num; ++j)
 625           {
 626             re_dfastate_t *state = entry->array[j];
 627             free_state (state);
 628           }
 629         re_free (entry->array);
 630       }
 631   re_free (dfa->state_table);
 632 #ifdef RE_ENABLE_I18N
 633   if (dfa->sb_char != utf8_sb_map)
 634     re_free (dfa->sb_char);
 635 #endif
 636   re_free (dfa->subexp_map);
 637 #ifdef DEBUG
 638   re_free (dfa->re_str);
 639 #endif
 640
 641   re_free (dfa);
 642 }
 643
 644
 645 /* Free dynamically allocated space used by PREG.  */
 646
 647 void
 648 regfree (preg)
 649     regex_t *preg;
 650 {
 651   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 652   if (BE (dfa != NULL, 1))
 653     free_dfa_content (dfa);
 654   preg->buffer = NULL;
 655   preg->allocated = 0;
 656
 657   re_free (preg->fastmap);
 658   preg->fastmap = NULL;
 659
 660   re_free (preg->translate);
 661   preg->translate = NULL;
 662 }
 663 #ifdef _LIBC
 664 weak_alias (__regfree, regfree)
 665 #endif
 666 \f
 667 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 668    them unless specifically requested.  */
 669
 670 #if defined _REGEX_RE_COMP || defined _LIBC
 671
 672 /* BSD has one and only one pattern buffer.  */
 673 static struct re_pattern_buffer re_comp_buf;
 674
 675 char *
 676 # ifdef _LIBC
 677 /* Make these definitions weak in libc, so POSIX programs can redefine
 678    these names if they don't use our functions, and still use
 679    regcomp/regexec above without link errors.  */
 680 weak_function
 681 # endif
 682 re_comp (s)
 683      const char *s;
 684 {
 685   reg_errcode_t ret;
 686   char *fastmap;
 687
 688   if (!s)
 689     {
 690       if (!re_comp_buf.buffer)
 691         return gettext ("No previous regular expression");
 692       return 0;
 693     }
 694
 695   if (re_comp_buf.buffer)
 696     {
 697       fastmap = re_comp_buf.fastmap;
 698       re_comp_buf.fastmap = NULL;
 699       __regfree (&re_comp_buf);
 700       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 701       re_comp_buf.fastmap = fastmap;
 702     }
 703
 704   if (re_comp_buf.fastmap == NULL)
 705     {
 706       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 707       if (re_comp_buf.fastmap == NULL)
 708         return (char *) gettext (__re_error_msgid
 709                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 710     }
 711
 712   /* Since `re_exec' always passes NULL for the `regs' argument, we
 713      don't need to initialize the pattern buffer fields which affect it.  */
 714
 715   /* Match anchors at newlines.  */
 716   re_comp_buf.newline_anchor = 1;
 717
 718   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 719
 720   if (!ret)
 721     return NULL;
 722
 723   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 724   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 725 }
 726
 727 #ifdef _LIBC
 728 libc_freeres_fn (free_mem)
 729 {
 730   __regfree (&re_comp_buf);
 731 }
 732 #endif
 733
 734 #endif /* _REGEX_RE_COMP */
 735 \f
 736 /* Internal entry point.
 737    Compile the regular expression PATTERN, whose length is LENGTH.
 738    SYNTAX indicate regular expression's syntax.  */
 739
 740 static reg_errcode_t
 741 re_compile_internal (preg, pattern, length, syntax)
 742      regex_t *preg;
 743      const char * pattern;
 744      int length;
 745      reg_syntax_t syntax;
 746 {
 747   reg_errcode_t err = REG_NOERROR;
 748   re_dfa_t *dfa;
 749   re_string_t regexp;
 750
 751   /* Initialize the pattern buffer.  */
 752   preg->fastmap_accurate = 0;
 753   preg->syntax = syntax;
 754   preg->not_bol = preg->not_eol = 0;
 755   preg->used = 0;
 756   preg->re_nsub = 0;
 757   preg->can_be_null = 0;
 758   preg->regs_allocated = REGS_UNALLOCATED;
 759
 760   /* Initialize the dfa.  */
 761   dfa = (re_dfa_t *) preg->buffer;
 762   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 763     {
 764       /* If zero allocated, but buffer is non-null, try to realloc
 765          enough space.  This loses if buffer's address is bogus, but
 766          that is the user's responsibility.  If ->buffer is NULL this
 767          is a simple allocation.  */
 768       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 769       if (dfa == NULL)
 770         return REG_ESPACE;
 771       preg->allocated = sizeof (re_dfa_t);
 772       preg->buffer = (unsigned char *) dfa;
 773     }
 774   preg->used = sizeof (re_dfa_t);
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = init_dfa (dfa, length);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781       free_dfa_content (dfa);
 782       preg->buffer = NULL;
 783       preg->allocated = 0;
 784       return err;
 785     }
 786 #ifdef DEBUG
 787   dfa->re_str = re_malloc (char, length + 1);
 788   strncpy (dfa->re_str, pattern, length + 1);
 789 #endif
 790
 791   err = re_string_construct (&regexp, pattern, length, preg->translate,
 792                              syntax & RE_ICASE, dfa);
 793   if (BE (err != REG_NOERROR, 0))
 794     {
 795     re_compile_internal_free_return:
 796       free_workarea_compile (preg);
 797       re_string_destruct (&regexp);
 798       free_dfa_content (dfa);
 799       preg->buffer = NULL;
 800       preg->allocated = 0;
 801       return err;
 802     }
 803
 804   /* Parse the regular expression, and build a structure tree.  */
 805   preg->re_nsub = 0;
 806   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 807   if (BE (dfa->str_tree == NULL, 0))
 808     goto re_compile_internal_free_return;
 809
 810   /* Analyze the tree and create the nfa.  */
 811   err = analyze (preg);
 812   if (BE (err != REG_NOERROR, 0))
 813     goto re_compile_internal_free_return;
 814
 815 #ifdef RE_ENABLE_I18N
 816   /* If possible, do searching in single byte encoding to speed things up.  */
 817   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 818     optimize_utf8 (dfa);
 819 #endif
 820
 821   /* Then create the initial state of the dfa.  */
 822   err = create_initial_state (dfa);
 823
 824   /* Release work areas.  */
 825   free_workarea_compile (preg);
 826   re_string_destruct (&regexp);
 827
 828   if (BE (err != REG_NOERROR, 0))
 829     {
 830       free_dfa_content (dfa);
 831       preg->buffer = NULL;
 832       preg->allocated = 0;
 833     }
 834
 835   return err;
 836 }
 837
 838 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 839    as the initial length of some arrays.  */
 840
 841 static reg_errcode_t
 842 init_dfa (dfa, pat_len)
 843      re_dfa_t *dfa;
 844      int pat_len;
 845 {
 846   int table_size;
 847 #ifndef _LIBC
 848   char *codeset_name;
 849 #endif
 850
 851   memset (dfa, '\0', sizeof (re_dfa_t));
 852
 853   /* Force allocation of str_tree_storage the first time.  */
 854   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 855
 856   dfa->nodes_alloc = pat_len + 1;
 857   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 858
 859   dfa->states_alloc = pat_len + 1;
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; table_size > 0; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877 # ifdef HAVE_LANGINFO_CODESET
 878   codeset_name = nl_langinfo (CODESET);
 879 # else
 880   codeset_name = getenv ("LC_ALL");
 881   if (codeset_name == NULL || codeset_name[0] == '\0')
 882     codeset_name = getenv ("LC_CTYPE");
 883   if (codeset_name == NULL || codeset_name[0] == '\0')
 884     codeset_name = getenv ("LANG");
 885   if (codeset_name == NULL)
 886     codeset_name = "";
 887   else if (strchr (codeset_name, '.') !=  NULL)
 888     codeset_name = strchr (codeset_name, '.') + 1;
 889 # endif
 890
 891   if (strcasecmp (codeset_name, "UTF-8") == 0
 892       || strcasecmp (codeset_name, "UTF8") == 0)
 893     dfa->is_utf8 = 1;
 894
 895   /* We check exhaustively in the loop below if this charset is a
 896      superset of ASCII.  */
 897   dfa->map_notascii = 0;
 898 #endif
 899
 900 #ifdef RE_ENABLE_I18N
 901   if (dfa->mb_cur_max > 1)
 902     {
 903       if (dfa->is_utf8)
 904         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 905       else
 906         {
 907           int i, j, ch;
 908
 909           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 910           if (BE (dfa->sb_char == NULL, 0))
 911             return REG_ESPACE;
 912
 913           /* Clear all bits by, then set those corresponding to single
 914              byte chars.  */
 915           bitset_empty (dfa->sb_char);
 916
 917           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 918             for (j = 0; j < UINT_BITS; ++j, ++ch)
 919               {
 920                 wint_t wch = __btowc (ch);
 921                 if (wch != WEOF)
 922                   dfa->sb_char[i] |= 1 << j;
 923 # ifndef _LIBC
 924                 if (isascii (ch) && wch != ch)
 925                   dfa->map_notascii = 1;
 926 # endif
 927               }
 928         }
 929     }
 930 #endif
 931
 932   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 933     return REG_ESPACE;
 934   return REG_NOERROR;
 935 }
 936
 937 /* Initialize WORD_CHAR table, which indicate which character is
 938    "word".  In this case "word" means that it is the word construction
 939    character used by some operators like "\<", "\>", etc.  */
 940
 941 static void
 942 init_word_char (dfa)
 943      re_dfa_t *dfa;
 944 {
 945   int i, j, ch;
 946   dfa->word_ops_used = 1;
 947   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 948     for (j = 0; j < UINT_BITS; ++j, ++ch)
 949       if (isalnum (ch) || ch == '_')
 950         dfa->word_char[i] |= 1 << j;
 951 }
 952
 953 /* Free the work area which are only used while compiling.  */
 954
 955 static void
 956 free_workarea_compile (preg)
 957      regex_t *preg;
 958 {
 959   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 960   bin_tree_storage_t *storage, *next;
 961   for (storage = dfa->str_tree_storage; storage; storage = next)
 962     {
 963       next = storage->next;
 964       re_free (storage);
 965     }
 966   dfa->str_tree_storage = NULL;
 967   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 968   dfa->str_tree = NULL;
 969   re_free (dfa->org_indices);
 970   dfa->org_indices = NULL;
 971 }
 972
 973 /* Create initial states for all contexts.  */
 974
 975 static reg_errcode_t
 976 create_initial_state (dfa)
 977      re_dfa_t *dfa;
 978 {
 979   int first, i;
 980   reg_errcode_t err;
 981   re_node_set init_nodes;
 982
 983   /* Initial states have the epsilon closure of the node which is
 984      the first node of the regular expression.  */
 985   first = dfa->str_tree->first->node_idx;
 986   dfa->init_node = first;
 987   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 988   if (BE (err != REG_NOERROR, 0))
 989     return err;
 990
 991   /* The back-references which are in initial states can epsilon transit,
 992      since in this case all of the subexpressions can be null.
 993      Then we add epsilon closures of the nodes which are the next nodes of
 994      the back-references.  */
 995   if (dfa->nbackref > 0)
 996     for (i = 0; i < init_nodes.nelem; ++i)
 997       {
 998         int node_idx = init_nodes.elems[i];
 999         re_token_type_t type = dfa->nodes[node_idx].type;
1000
1001         int clexp_idx;
1002         if (type != OP_BACK_REF)
1003           continue;
1004         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005           {
1006             re_token_t *clexp_node;
1007             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1008             if (clexp_node->type == OP_CLOSE_SUBEXP
1009                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1010               break;
1011           }
1012         if (clexp_idx == init_nodes.nelem)
1013           continue;
1014
1015         if (type == OP_BACK_REF)
1016           {
1017             int dest_idx = dfa->edests[node_idx].elems[0];
1018             if (!re_node_set_contains (&init_nodes, dest_idx))
1019               {
1020                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1021                 i = 0;
1022               }
1023           }
1024       }
1025
1026   /* It must be the first time to invoke acquire_state.  */
1027   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1028   /* We don't check ERR here, since the initial state must not be NULL.  */
1029   if (BE (dfa->init_state == NULL, 0))
1030     return err;
1031   if (dfa->init_state->has_constraint)
1032     {
1033       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1034                                                        CONTEXT_WORD);
1035       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                      CONTEXT_NEWLINE);
1037       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1038                                                          &init_nodes,
1039                                                          CONTEXT_NEWLINE
1040                                                          | CONTEXT_BEGBUF);
1041       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1042               || dfa->init_state_begbuf == NULL, 0))
1043         return err;
1044     }
1045   else
1046     dfa->init_state_word = dfa->init_state_nl
1047       = dfa->init_state_begbuf = dfa->init_state;
1048
1049   re_node_set_free (&init_nodes);
1050   return REG_NOERROR;
1051 }
1052 \f
1053 #ifdef RE_ENABLE_I18N
1054 /* If it is possible to do searching in single byte encoding instead of UTF-8
1055    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1056    DFA nodes where needed.  */
1057
1058 static void
1059 optimize_utf8 (dfa)
1060      re_dfa_t *dfa;
1061 {
1062   int node, i, mb_chars = 0, has_period = 0;
1063
1064   for (node = 0; node < dfa->nodes_len; ++node)
1065     switch (dfa->nodes[node].type)
1066       {
1067       case CHARACTER:
1068         if (dfa->nodes[node].opr.c >= 0x80)
1069           mb_chars = 1;
1070         break;
1071       case ANCHOR:
1072         switch (dfa->nodes[node].opr.idx)
1073           {
1074           case LINE_FIRST:
1075           case LINE_LAST:
1076           case BUF_FIRST:
1077           case BUF_LAST:
1078             break;
1079           default:
1080             /* Word anchors etc. cannot be handled.  */
1081             return;
1082           }
1083         break;
1084       case OP_PERIOD:
1085         has_period = 1;
1086         break;
1087       case OP_BACK_REF:
1088       case OP_ALT:
1089       case END_OF_RE:
1090       case OP_DUP_ASTERISK:
1091       case OP_OPEN_SUBEXP:
1092       case OP_CLOSE_SUBEXP:
1093         break;
1094       case COMPLEX_BRACKET:
1095         return;
1096       case SIMPLE_BRACKET:
1097         /* Just double check.  */
1098         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1099           if (dfa->nodes[node].opr.sbcset[i])
1100             return;
1101         break;
1102       default:
1103         abort ();
1104       }
1105
1106   if (mb_chars || has_period)
1107     for (node = 0; node < dfa->nodes_len; ++node)
1108       {
1109         if (dfa->nodes[node].type == CHARACTER
1110             && dfa->nodes[node].opr.c >= 0x80)
1111           dfa->nodes[node].mb_partial = 0;
1112         else if (dfa->nodes[node].type == OP_PERIOD)
1113           dfa->nodes[node].type = OP_UTF8_PERIOD;
1114       }
1115
1116   /* The search can be in single byte locale.  */
1117   dfa->mb_cur_max = 1;
1118   dfa->is_utf8 = 0;
1119   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1120 }
1121 #endif
1122 \f
1123 /* Analyze the structure tree, and calculate "first", "next", "edest",
1124    "eclosure", and "inveclosure".  */
1125
1126 static reg_errcode_t
1127 analyze (preg)
1128      regex_t *preg;
1129 {
1130   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1131   reg_errcode_t ret;
1132
1133   /* Allocate arrays.  */
1134   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1135   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1136   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1137   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1138   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1139           || dfa->eclosures == NULL, 0))
1140     return REG_ESPACE;
1141
1142   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1143   if (dfa->subexp_map != NULL)
1144     {
1145       int i;
1146       for (i = 0; i < preg->re_nsub; i++)
1147         dfa->subexp_map[i] = i;
1148       preorder (dfa->str_tree, optimize_subexps, dfa);
1149       for (i = 0; i < preg->re_nsub; i++)
1150         if (dfa->subexp_map[i] != i)
1151           break;
1152       if (i == preg->re_nsub)
1153         {
1154           free (dfa->subexp_map);
1155           dfa->subexp_map = NULL;
1156         }
1157     }
1158
1159   ret = postorder (dfa->str_tree, lower_subexps, preg);
1160   if (BE (ret != REG_NOERROR, 0))
1161     return ret;
1162   ret = postorder (dfa->str_tree, calc_first, dfa);
1163   if (BE (ret != REG_NOERROR, 0))
1164     return ret;
1165   preorder (dfa->str_tree, calc_next, dfa);
1166   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1167   if (BE (ret != REG_NOERROR, 0))
1168     return ret;
1169   ret = calc_eclosure (dfa);
1170   if (BE (ret != REG_NOERROR, 0))
1171     return ret;
1172
1173   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1174      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1175   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1176       || dfa->nbackref)
1177     {
1178       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1179       if (BE (dfa->inveclosures == NULL, 0))
1180         return REG_ESPACE;
1181       ret = calc_inveclosure (dfa);
1182     }
1183
1184   return ret;
1185 }
1186
1187 /* Our parse trees are very unbalanced, so we cannot use a stack to
1188    implement parse tree visits.  Instead, we use parent pointers and
1189    some hairy code in these two functions.  */
1190 static reg_errcode_t
1191 postorder (root, fn, extra)
1192      bin_tree_t *root;
1193      reg_errcode_t (fn (void *, bin_tree_t *));
1194      void *extra;
1195 {
1196   bin_tree_t *node, *prev;
1197
1198   for (node = root; ; )
1199     {
1200       /* Descend down the tree, preferably to the left (or to the right
1201          if that's the only child).  */
1202       while (node->left || node->right)
1203         if (node->left)
1204           node = node->left;
1205         else
1206           node = node->right;
1207
1208       do
1209         {
1210           reg_errcode_t err = fn (extra, node);
1211           if (BE (err != REG_NOERROR, 0))
1212             return err;
1213           if (node->parent == NULL)
1214             return REG_NOERROR;
1215           prev = node;
1216           node = node->parent;
1217         }
1218       /* Go up while we have a node that is reached from the right.  */
1219       while (node->right == prev || node->right == NULL);
1220       node = node->right;
1221     }
1222 }
1223
1224 static reg_errcode_t
1225 preorder (root, fn, extra)
1226      bin_tree_t *root;
1227      reg_errcode_t (fn (void *, bin_tree_t *));
1228      void *extra;
1229 {
1230   bin_tree_t *node;
1231
1232   for (node = root; ; )
1233     {
1234       reg_errcode_t err = fn (extra, node);
1235       if (BE (err != REG_NOERROR, 0))
1236         return err;
1237
1238       /* Go to the left node, or up and to the right.  */
1239       if (node->left)
1240         node = node->left;
1241       else
1242         {
1243           bin_tree_t *prev = NULL;
1244           while (node->right == prev || node->right == NULL)
1245             {
1246               prev = node;
1247               node = node->parent;
1248               if (!node)
1249                 return REG_NOERROR;
1250             }
1251           node = node->right;
1252         }
1253     }
1254 }
1255
1256 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1257    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1258    backreferences as well.  Requires a preorder visit.  */
1259 static reg_errcode_t
1260 optimize_subexps (extra, node)
1261      void *extra;
1262      bin_tree_t *node;
1263 {
1264   re_dfa_t *dfa = (re_dfa_t *) extra;
1265
1266   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1267     {
1268       int idx = node->token.opr.idx;
1269       node->token.opr.idx = dfa->subexp_map[idx];
1270       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1271     }
1272
1273   else if (node->token.type == SUBEXP
1274            && node->left && node->left->token.type == SUBEXP)
1275     {
1276       int other_idx = node->left->token.opr.idx;
1277
1278       node->left = node->left->left;
1279       if (node->left)
1280         node->left->parent = node;
1281
1282       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1283       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1284         dfa->used_bkref_map &= ~(1 << other_idx);
1285     }
1286
1287   return REG_NOERROR;
1288 }
1289
1290 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1291    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1292 static reg_errcode_t
1293 lower_subexps (extra, node)
1294      void *extra;
1295      bin_tree_t *node;
1296 {
1297   regex_t *preg = (regex_t *) extra;
1298   reg_errcode_t err = REG_NOERROR;
1299
1300   if (node->left && node->left->token.type == SUBEXP)
1301     {
1302       node->left = lower_subexp (&err, preg, node->left);
1303       if (node->left)
1304         node->left->parent = node;
1305     }
1306   if (node->right && node->right->token.type == SUBEXP)
1307     {
1308       node->right = lower_subexp (&err, preg, node->right);
1309       if (node->right)
1310         node->right->parent = node;
1311     }
1312
1313   return err;
1314 }
1315
1316 static bin_tree_t *
1317 lower_subexp (err, preg, node)
1318      reg_errcode_t *err;
1319      regex_t *preg;
1320      bin_tree_t *node;
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323   bin_tree_t *body = node->left;
1324   bin_tree_t *op, *cls, *tree1, *tree;
1325
1326   if (preg->no_sub
1327       /* We do not optimize empty subexpressions, because otherwise we may
1328          have bad CONCAT nodes with NULL children.  This is obviously not
1329          very common, so we do not lose much.  An example that triggers
1330          this case is the sed "script" /\(\)/x.  */
1331       && node->left != NULL
1332       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1333           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1334     return node->left;
1335
1336   /* Convert the SUBEXP node to the concatenation of an
1337      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1338   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341   tree = create_tree (dfa, op, tree1, CONCAT);
1342   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1343     {
1344       *err = REG_ESPACE;
1345       return NULL;
1346     }
1347
1348   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350   return tree;
1351 }
1352
1353 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354    nodes.  Requires a postorder visit.  */
1355 static reg_errcode_t
1356 calc_first (extra, node)
1357      void *extra;
1358      bin_tree_t *node;
1359 {
1360   re_dfa_t *dfa = (re_dfa_t *) extra;
1361   if (node->token.type == CONCAT)
1362     {
1363       node->first = node->left->first;
1364       node->node_idx = node->left->node_idx;
1365     }
1366   else
1367     {
1368       node->first = node;
1369       node->node_idx = re_dfa_add_node (dfa, node->token);
1370       if (BE (node->node_idx == -1, 0))
1371         return REG_ESPACE;
1372     }
1373   return REG_NOERROR;
1374 }
1375
1376 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1377 static reg_errcode_t
1378 calc_next (extra, node)
1379      void *extra;
1380      bin_tree_t *node;
1381 {
1382   switch (node->token.type)
1383     {
1384     case OP_DUP_ASTERISK:
1385       node->left->next = node;
1386       break;
1387     case CONCAT:
1388       node->left->next = node->right->first;
1389       node->right->next = node->next;
1390       break;
1391     default:
1392       if (node->left)
1393         node->left->next = node->next;
1394       if (node->right)
1395         node->right->next = node->next;
1396       break;
1397     }
1398   return REG_NOERROR;
1399 }
1400
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1402 static reg_errcode_t
1403 link_nfa_nodes (extra, node)
1404      void *extra;
1405      bin_tree_t *node;
1406 {
1407   re_dfa_t *dfa = (re_dfa_t *) extra;
1408   int idx = node->node_idx;
1409   reg_errcode_t err = REG_NOERROR;
1410
1411   switch (node->token.type)
1412     {
1413     case CONCAT:
1414       break;
1415
1416     case END_OF_RE:
1417       assert (node->next == NULL);
1418       break;
1419
1420     case OP_DUP_ASTERISK:
1421     case OP_ALT:
1422       {
1423         int left, right;
1424         dfa->has_plural_match = 1;
1425         if (node->left != NULL)
1426           left = node->left->first->node_idx;
1427         else
1428           left = node->next->node_idx;
1429         if (node->right != NULL)
1430           right = node->right->first->node_idx;
1431         else
1432           right = node->next->node_idx;
1433         assert (left > -1);
1434         assert (right > -1);
1435         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436       }
1437       break;
1438
1439     case ANCHOR:
1440     case OP_OPEN_SUBEXP:
1441     case OP_CLOSE_SUBEXP:
1442       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443       break;
1444
1445     case OP_BACK_REF:
1446       dfa->nexts[idx] = node->next->node_idx;
1447       if (node->token.type == OP_BACK_REF)
1448         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449       break;
1450
1451     default:
1452       assert (!IS_EPSILON_NODE (node->token.type));
1453       dfa->nexts[idx] = node->next->node_idx;
1454       break;
1455     }
1456
1457   return err;
1458 }
1459
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462    to their own constraint.  */
1463
1464 static reg_errcode_t
1465 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1466                         init_constraint)
1467      re_dfa_t *dfa;
1468      int top_org_node, top_clone_node, root_node;
1469      unsigned int init_constraint;
1470 {
1471   reg_errcode_t err;
1472   int org_node, clone_node, ret;
1473   unsigned int constraint = init_constraint;
1474   for (org_node = top_org_node, clone_node = top_clone_node;;)
1475     {
1476       int org_dest, clone_dest;
1477       if (dfa->nodes[org_node].type == OP_BACK_REF)
1478         {
1479           /* If the back reference epsilon-transit, its destination must
1480              also have the constraint.  Then duplicate the epsilon closure
1481              of the destination of the back reference, and store it in
1482              edests of the back reference.  */
1483           org_dest = dfa->nexts[org_node];
1484           re_node_set_empty (dfa->edests + clone_node);
1485           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1486           if (BE (err != REG_NOERROR, 0))
1487             return err;
1488           dfa->nexts[clone_node] = dfa->nexts[org_node];
1489           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1490           if (BE (ret < 0, 0))
1491             return REG_ESPACE;
1492         }
1493       else if (dfa->edests[org_node].nelem == 0)
1494         {
1495           /* In case of the node can't epsilon-transit, don't duplicate the
1496              destination and store the original destination as the
1497              destination of the node.  */
1498           dfa->nexts[clone_node] = dfa->nexts[org_node];
1499           break;
1500         }
1501       else if (dfa->edests[org_node].nelem == 1)
1502         {
1503           /* In case of the node can epsilon-transit, and it has only one
1504              destination.  */
1505           org_dest = dfa->edests[org_node].elems[0];
1506           re_node_set_empty (dfa->edests + clone_node);
1507           if (dfa->nodes[org_node].type == ANCHOR)
1508             {
1509               /* In case of the node has another constraint, append it.  */
1510               if (org_node == root_node && clone_node != org_node)
1511                 {
1512                   /* ...but if the node is root_node itself, it means the
1513                      epsilon closure have a loop, then tie it to the
1514                      destination of the root_node.  */
1515                   ret = re_node_set_insert (dfa->edests + clone_node,
1516                                             org_dest);
1517                   if (BE (ret < 0, 0))
1518                     return REG_ESPACE;
1519                   break;
1520                 }
1521               constraint |= dfa->nodes[org_node].opr.ctx_type;
1522             }
1523           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1524           if (BE (err != REG_NOERROR, 0))
1525             return err;
1526           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527           if (BE (ret < 0, 0))
1528             return REG_ESPACE;
1529         }
1530       else /* dfa->edests[org_node].nelem == 2 */
1531         {
1532           /* In case of the node can epsilon-transit, and it has two
1533              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1534           org_dest = dfa->edests[org_node].elems[0];
1535           re_node_set_empty (dfa->edests + clone_node);
1536           /* Search for a duplicated node which satisfies the constraint.  */
1537           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1538           if (clone_dest == -1)
1539             {
1540               /* There are no such a duplicated node, create a new one.  */
1541               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1542               if (BE (err != REG_NOERROR, 0))
1543                 return err;
1544               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545               if (BE (ret < 0, 0))
1546                 return REG_ESPACE;
1547               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548                                             root_node, constraint);
1549               if (BE (err != REG_NOERROR, 0))
1550                 return err;
1551             }
1552           else
1553             {
1554               /* There are a duplicated node which satisfy the constraint,
1555                  use it to avoid infinite loop.  */
1556               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557               if (BE (ret < 0, 0))
1558                 return REG_ESPACE;
1559             }
1560
1561           org_dest = dfa->edests[org_node].elems[1];
1562           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1563           if (BE (err != REG_NOERROR, 0))
1564             return err;
1565           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (ret < 0, 0))
1567             return REG_ESPACE;
1568         }
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577
1578 static int
1579 search_duplicated_node (dfa, org_node, constraint)
1580      re_dfa_t *dfa;
1581      int org_node;
1582      unsigned int constraint;
1583 {
1584   int idx;
1585   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586     {
1587       if (org_node == dfa->org_indices[idx]
1588           && constraint == dfa->nodes[idx].constraint)
1589         return idx; /* Found.  */
1590     }
1591   return -1; /* Not found.  */
1592 }
1593
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1596    otherwise return the error code.  */
1597
1598 static reg_errcode_t
1599 duplicate_node (new_idx, dfa, org_idx, constraint)
1600      re_dfa_t *dfa;
1601      int *new_idx, org_idx;
1602      unsigned int constraint;
1603 {
1604   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1605   if (BE (dup_idx == -1, 0))
1606     return REG_ESPACE;
1607   dfa->nodes[dup_idx].constraint = constraint;
1608   if (dfa->nodes[org_idx].type == ANCHOR)
1609     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1610   dfa->nodes[dup_idx].duplicated = 1;
1611
1612   /* Store the index of the original node.  */
1613   dfa->org_indices[dup_idx] = org_idx;
1614   *new_idx = dup_idx;
1615   return REG_NOERROR;
1616 }
1617
1618 static reg_errcode_t
1619 calc_inveclosure (dfa)
1620      re_dfa_t *dfa;
1621 {
1622   int src, idx, ret;
1623   for (idx = 0; idx < dfa->nodes_len; ++idx)
1624     re_node_set_init_empty (dfa->inveclosures + idx);
1625
1626   for (src = 0; src < dfa->nodes_len; ++src)
1627     {
1628       int *elems = dfa->eclosures[src].elems;
1629       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630         {
1631           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632           if (BE (ret == -1, 0))
1633             return REG_ESPACE;
1634         }
1635     }
1636
1637   return REG_NOERROR;
1638 }
1639
1640 /* Calculate "eclosure" for all the node in DFA.  */
1641
1642 static reg_errcode_t
1643 calc_eclosure (dfa)
1644      re_dfa_t *dfa;
1645 {
1646   int node_idx, incomplete;
1647 #ifdef DEBUG
1648   assert (dfa->nodes_len > 0);
1649 #endif
1650   incomplete = 0;
1651   /* For each nodes, calculate epsilon closure.  */
1652   for (node_idx = 0; ; ++node_idx)
1653     {
1654       reg_errcode_t err;
1655       re_node_set eclosure_elem;
1656       if (node_idx == dfa->nodes_len)
1657         {
1658           if (!incomplete)
1659             break;
1660           incomplete = 0;
1661           node_idx = 0;
1662         }
1663
1664 #ifdef DEBUG
1665       assert (dfa->eclosures[node_idx].nelem != -1);
1666 #endif
1667
1668       /* If we have already calculated, skip it.  */
1669       if (dfa->eclosures[node_idx].nelem != 0)
1670         continue;
1671       /* Calculate epsilon closure of `node_idx'.  */
1672       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1673       if (BE (err != REG_NOERROR, 0))
1674         return err;
1675
1676       if (dfa->eclosures[node_idx].nelem == 0)
1677         {
1678           incomplete = 1;
1679           re_node_set_free (&eclosure_elem);
1680         }
1681     }
1682   return REG_NOERROR;
1683 }
1684
1685 /* Calculate epsilon closure of NODE.  */
1686
1687 static reg_errcode_t
1688 calc_eclosure_iter (new_set, dfa, node, root)
1689      re_node_set *new_set;
1690      re_dfa_t *dfa;
1691      int node, root;
1692 {
1693   reg_errcode_t err;
1694   unsigned int constraint;
1695   int i, incomplete;
1696   re_node_set eclosure;
1697   incomplete = 0;
1698   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1699   if (BE (err != REG_NOERROR, 0))
1700     return err;
1701
1702   /* This indicates that we are calculating this node now.
1703      We reference this value to avoid infinite loop.  */
1704   dfa->eclosures[node].nelem = -1;
1705
1706   constraint = ((dfa->nodes[node].type == ANCHOR)
1707                 ? dfa->nodes[node].opr.ctx_type : 0);
1708   /* If the current node has constraints, duplicate all nodes.
1709      Since they must inherit the constraints.  */
1710   if (constraint
1711       && dfa->edests[node].nelem
1712       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1713     {
1714       int org_node, cur_node;
1715       org_node = cur_node = node;
1716       err = duplicate_node_closure (dfa, node, node, node, constraint);
1717       if (BE (err != REG_NOERROR, 0))
1718         return err;
1719     }
1720
1721   /* Expand each epsilon destination nodes.  */
1722   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1723     for (i = 0; i < dfa->edests[node].nelem; ++i)
1724       {
1725         re_node_set eclosure_elem;
1726         int edest = dfa->edests[node].elems[i];
1727         /* If calculating the epsilon closure of `edest' is in progress,
1728            return intermediate result.  */
1729         if (dfa->eclosures[edest].nelem == -1)
1730           {
1731             incomplete = 1;
1732             continue;
1733           }
1734         /* If we haven't calculated the epsilon closure of `edest' yet,
1735            calculate now. Otherwise use calculated epsilon closure.  */
1736         if (dfa->eclosures[edest].nelem == 0)
1737           {
1738             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1739             if (BE (err != REG_NOERROR, 0))
1740               return err;
1741           }
1742         else
1743           eclosure_elem = dfa->eclosures[edest];
1744         /* Merge the epsilon closure of `edest'.  */
1745         re_node_set_merge (&eclosure, &eclosure_elem);
1746         /* If the epsilon closure of `edest' is incomplete,
1747            the epsilon closure of this node is also incomplete.  */
1748         if (dfa->eclosures[edest].nelem == 0)
1749           {
1750             incomplete = 1;
1751             re_node_set_free (&eclosure_elem);
1752           }
1753       }
1754
1755   /* Epsilon closures include itself.  */
1756   re_node_set_insert (&eclosure, node);
1757   if (incomplete && !root)
1758     dfa->eclosures[node].nelem = 0;
1759   else
1760     dfa->eclosures[node] = eclosure;
1761   *new_set = eclosure;
1762   return REG_NOERROR;
1763 }
1764 \f
1765 /* Functions for token which are used in the parser.  */
1766
1767 /* Fetch a token from INPUT.
1768    We must not use this function inside bracket expressions.  */
1769
1770 static void
1771 fetch_token (result, input, syntax)
1772      re_token_t *result;
1773      re_string_t *input;
1774      reg_syntax_t syntax;
1775 {
1776   re_string_skip_bytes (input, peek_token (result, input, syntax));
1777 }
1778
1779 /* Peek a token from INPUT, and return the length of the token.
1780    We must not use this function inside bracket expressions.  */
1781
1782 static int
1783 peek_token (token, input, syntax)
1784      re_token_t *token;
1785      re_string_t *input;
1786      reg_syntax_t syntax;
1787 {
1788   unsigned char c;
1789
1790   if (re_string_eoi (input))
1791     {
1792       token->type = END_OF_RE;
1793       return 0;
1794     }
1795
1796   c = re_string_peek_byte (input, 0);
1797   token->opr.c = c;
1798
1799   token->word_char = 0;
1800 #ifdef RE_ENABLE_I18N
1801   token->mb_partial = 0;
1802   if (input->mb_cur_max > 1 &&
1803       !re_string_first_byte (input, re_string_cur_idx (input)))
1804     {
1805       token->type = CHARACTER;
1806       token->mb_partial = 1;
1807       return 1;
1808     }
1809 #endif
1810   if (c == '\\')
1811     {
1812       unsigned char c2;
1813       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1814         {
1815           token->type = BACK_SLASH;
1816           return 1;
1817         }
1818
1819       c2 = re_string_peek_byte_case (input, 1);
1820       token->opr.c = c2;
1821       token->type = CHARACTER;
1822 #ifdef RE_ENABLE_I18N
1823       if (input->mb_cur_max > 1)
1824         {
1825           wint_t wc = re_string_wchar_at (input,
1826                                           re_string_cur_idx (input) + 1);
1827           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1828         }
1829       else
1830 #endif
1831         token->word_char = IS_WORD_CHAR (c2) != 0;
1832
1833       switch (c2)
1834         {
1835         case '|':
1836           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1837             token->type = OP_ALT;
1838           break;
1839         case '1': case '2': case '3': case '4': case '5':
1840         case '6': case '7': case '8': case '9':
1841           if (!(syntax & RE_NO_BK_REFS))
1842             {
1843               token->type = OP_BACK_REF;
1844               token->opr.idx = c2 - '1';
1845             }
1846           break;
1847         case '<':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = WORD_FIRST;
1852             }
1853           break;
1854         case '>':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = WORD_LAST;
1859             }
1860           break;
1861         case 'b':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             {
1864               token->type = ANCHOR;
1865               token->opr.ctx_type = WORD_DELIM;
1866             }
1867           break;
1868         case 'B':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             {
1871               token->type = ANCHOR;
1872               token->opr.ctx_type = NOT_WORD_DELIM;
1873             }
1874           break;
1875         case 'w':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_WORD;
1878           break;
1879         case 'W':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_NOTWORD;
1882           break;
1883         case 's':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             token->type = OP_SPACE;
1886           break;
1887         case 'S':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             token->type = OP_NOTSPACE;
1890           break;
1891         case '`':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = BUF_FIRST;
1896             }
1897           break;
1898         case '\'':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = BUF_LAST;
1903             }
1904           break;
1905         case '(':
1906           if (!(syntax & RE_NO_BK_PARENS))
1907             token->type = OP_OPEN_SUBEXP;
1908           break;
1909         case ')':
1910           if (!(syntax & RE_NO_BK_PARENS))
1911             token->type = OP_CLOSE_SUBEXP;
1912           break;
1913         case '+':
1914           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915             token->type = OP_DUP_PLUS;
1916           break;
1917         case '?':
1918           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1919             token->type = OP_DUP_QUESTION;
1920           break;
1921         case '{':
1922           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923             token->type = OP_OPEN_DUP_NUM;
1924           break;
1925         case '}':
1926           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1927             token->type = OP_CLOSE_DUP_NUM;
1928           break;
1929         default:
1930           break;
1931         }
1932       return 2;
1933     }
1934
1935   token->type = CHARACTER;
1936 #ifdef RE_ENABLE_I18N
1937   if (input->mb_cur_max > 1)
1938     {
1939       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1940       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1941     }
1942   else
1943 #endif
1944     token->word_char = IS_WORD_CHAR (token->opr.c);
1945
1946   switch (c)
1947     {
1948     case '\n':
1949       if (syntax & RE_NEWLINE_ALT)
1950         token->type = OP_ALT;
1951       break;
1952     case '|':
1953       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1954         token->type = OP_ALT;
1955       break;
1956     case '*':
1957       token->type = OP_DUP_ASTERISK;
1958       break;
1959     case '+':
1960       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961         token->type = OP_DUP_PLUS;
1962       break;
1963     case '?':
1964       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1965         token->type = OP_DUP_QUESTION;
1966       break;
1967     case '{':
1968       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969         token->type = OP_OPEN_DUP_NUM;
1970       break;
1971     case '}':
1972       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1973         token->type = OP_CLOSE_DUP_NUM;
1974       break;
1975     case '(':
1976       if (syntax & RE_NO_BK_PARENS)
1977         token->type = OP_OPEN_SUBEXP;
1978       break;
1979     case ')':
1980       if (syntax & RE_NO_BK_PARENS)
1981         token->type = OP_CLOSE_SUBEXP;
1982       break;
1983     case '[':
1984       token->type = OP_OPEN_BRACKET;
1985       break;
1986     case '.':
1987       token->type = OP_PERIOD;
1988       break;
1989     case '^':
1990       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1991           re_string_cur_idx (input) != 0)
1992         {
1993           char prev = re_string_peek_byte (input, -1);
1994           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1995             break;
1996         }
1997       token->type = ANCHOR;
1998       token->opr.ctx_type = LINE_FIRST;
1999       break;
2000     case '$':
2001       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2002           re_string_cur_idx (input) + 1 != re_string_length (input))
2003         {
2004           re_token_t next;
2005           re_string_skip_bytes (input, 1);
2006           peek_token (&next, input, syntax);
2007           re_string_skip_bytes (input, -1);
2008           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2009             break;
2010         }
2011       token->type = ANCHOR;
2012       token->opr.ctx_type = LINE_LAST;
2013       break;
2014     default:
2015       break;
2016     }
2017   return 1;
2018 }
2019
2020 /* Peek a token from INPUT, and return the length of the token.
2021    We must not use this function out of bracket expressions.  */
2022
2023 static int
2024 peek_token_bracket (token, input, syntax)
2025      re_token_t *token;
2026      re_string_t *input;
2027      reg_syntax_t syntax;
2028 {
2029   unsigned char c;
2030   if (re_string_eoi (input))
2031     {
2032       token->type = END_OF_RE;
2033       return 0;
2034     }
2035   c = re_string_peek_byte (input, 0);
2036   token->opr.c = c;
2037
2038 #ifdef RE_ENABLE_I18N
2039   if (input->mb_cur_max > 1 &&
2040       !re_string_first_byte (input, re_string_cur_idx (input)))
2041     {
2042       token->type = CHARACTER;
2043       return 1;
2044     }
2045 #endif /* RE_ENABLE_I18N */
2046
2047   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2048       && re_string_cur_idx (input) + 1 < re_string_length (input))
2049     {
2050       /* In this case, '\' escape a character.  */
2051       unsigned char c2;
2052       re_string_skip_bytes (input, 1);
2053       c2 = re_string_peek_byte (input, 0);
2054       token->opr.c = c2;
2055       token->type = CHARACTER;
2056       return 1;
2057     }
2058   if (c == '[') /* '[' is a special char in a bracket exps.  */
2059     {
2060       unsigned char c2;
2061       int token_len;
2062       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2063         c2 = re_string_peek_byte (input, 1);
2064       else
2065         c2 = 0;
2066       token->opr.c = c2;
2067       token_len = 2;
2068       switch (c2)
2069         {
2070         case '.':
2071           token->type = OP_OPEN_COLL_ELEM;
2072           break;
2073         case '=':
2074           token->type = OP_OPEN_EQUIV_CLASS;
2075           break;
2076         case ':':
2077           if (syntax & RE_CHAR_CLASSES)
2078             {
2079               token->type = OP_OPEN_CHAR_CLASS;
2080               break;
2081             }
2082           /* else fall through.  */
2083         default:
2084           token->type = CHARACTER;
2085           token->opr.c = c;
2086           token_len = 1;
2087           break;
2088         }
2089       return token_len;
2090     }
2091   switch (c)
2092     {
2093     case '-':
2094       token->type = OP_CHARSET_RANGE;
2095       break;
2096     case ']':
2097       token->type = OP_CLOSE_BRACKET;
2098       break;
2099     case '^':
2100       token->type = OP_NON_MATCH_LIST;
2101       break;
2102     default:
2103       token->type = CHARACTER;
2104     }
2105   return 1;
2106 }
2107 \f
2108 /* Functions for parser.  */
2109
2110 /* Entry point of the parser.
2111    Parse the regular expression REGEXP and return the structure tree.
2112    If an error is occured, ERR is set by error code, and return NULL.
2113    This function build the following tree, from regular expression <reg_exp>:
2114            CAT
2115            / \
2116           /   \
2117    <reg_exp>  EOR
2118
2119    CAT means concatenation.
2120    EOR means end of regular expression.  */
2121
2122 static bin_tree_t *
2123 parse (regexp, preg, syntax, err)
2124      re_string_t *regexp;
2125      regex_t *preg;
2126      reg_syntax_t syntax;
2127      reg_errcode_t *err;
2128 {
2129   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130   bin_tree_t *tree, *eor, *root;
2131   re_token_t current_token;
2132   dfa->syntax = syntax;
2133   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2134   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2135   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2136     return NULL;
2137   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2138   if (tree != NULL)
2139     root = create_tree (dfa, tree, eor, CONCAT);
2140   else
2141     root = eor;
2142   if (BE (eor == NULL || root == NULL, 0))
2143     {
2144       *err = REG_ESPACE;
2145       return NULL;
2146     }
2147   return root;
2148 }
2149
2150 /* This function build the following tree, from regular expression
2151    <branch1>|<branch2>:
2152            ALT
2153            / \
2154           /   \
2155    <branch1> <branch2>
2156
2157    ALT means alternative, which represents the operator `|'.  */
2158
2159 static bin_tree_t *
2160 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2161      re_string_t *regexp;
2162      regex_t *preg;
2163      re_token_t *token;
2164      reg_syntax_t syntax;
2165      int nest;
2166      reg_errcode_t *err;
2167 {
2168   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2169   bin_tree_t *tree, *branch = NULL;
2170   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2171   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2172     return NULL;
2173
2174   while (token->type == OP_ALT)
2175     {
2176       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2177       if (token->type != OP_ALT && token->type != END_OF_RE
2178           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2179         {
2180           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2181           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2182             return NULL;
2183         }
2184       else
2185         branch = NULL;
2186       tree = create_tree (dfa, tree, branch, OP_ALT);
2187       if (BE (tree == NULL, 0))
2188         {
2189           *err = REG_ESPACE;
2190           return NULL;
2191         }
2192     }
2193   return tree;
2194 }
2195
2196 /* This function build the following tree, from regular expression
2197    <exp1><exp2>:
2198         CAT
2199         / \
2200        /   \
2201    <exp1> <exp2>
2202
2203    CAT means concatenation.  */
2204
2205 static bin_tree_t *
2206 parse_branch (regexp, preg, token, syntax, nest, err)
2207      re_string_t *regexp;
2208      regex_t *preg;
2209      re_token_t *token;
2210      reg_syntax_t syntax;
2211      int nest;
2212      reg_errcode_t *err;
2213 {
2214   bin_tree_t *tree, *exp;
2215   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2216   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2217   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2218     return NULL;
2219
2220   while (token->type != OP_ALT && token->type != END_OF_RE
2221          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2222     {
2223       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2224       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2225         {
2226           return NULL;
2227         }
2228       if (tree != NULL && exp != NULL)
2229         {
2230           tree = create_tree (dfa, tree, exp, CONCAT);
2231           if (tree == NULL)
2232             {
2233               *err = REG_ESPACE;
2234               return NULL;
2235             }
2236         }
2237       else if (tree == NULL)
2238         tree = exp;
2239       /* Otherwise exp == NULL, we don't need to create new tree.  */
2240     }
2241   return tree;
2242 }
2243
2244 /* This function build the following tree, from regular expression a*:
2245          *
2246          |
2247          a
2248 */
2249
2250 static bin_tree_t *
2251 parse_expression (regexp, preg, token, syntax, nest, err)
2252      re_string_t *regexp;
2253      regex_t *preg;
2254      re_token_t *token;
2255      reg_syntax_t syntax;
2256      int nest;
2257      reg_errcode_t *err;
2258 {
2259   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2260   bin_tree_t *tree;
2261   switch (token->type)
2262     {
2263     case CHARACTER:
2264       tree = create_token_tree (dfa, NULL, NULL, token);
2265       if (BE (tree == NULL, 0))
2266         {
2267           *err = REG_ESPACE;
2268           return NULL;
2269         }
2270 #ifdef RE_ENABLE_I18N
2271       if (dfa->mb_cur_max > 1)
2272         {
2273           while (!re_string_eoi (regexp)
2274                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2275             {
2276               bin_tree_t *mbc_remain;
2277               fetch_token (token, regexp, syntax);
2278               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2279               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2280               if (BE (mbc_remain == NULL || tree == NULL, 0))
2281                 {
2282                   *err = REG_ESPACE;
2283                   return NULL;
2284                 }
2285             }
2286         }
2287 #endif
2288       break;
2289     case OP_OPEN_SUBEXP:
2290       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2291       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2292         return NULL;
2293       break;
2294     case OP_OPEN_BRACKET:
2295       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2296       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2297         return NULL;
2298       break;
2299     case OP_BACK_REF:
2300       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2301         {
2302           *err = REG_ESUBREG;
2303           return NULL;
2304         }
2305       dfa->used_bkref_map |= 1 << token->opr.idx;
2306       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if (BE (tree == NULL, 0))
2308         {
2309           *err = REG_ESPACE;
2310           return NULL;
2311         }
2312       ++dfa->nbackref;
2313       dfa->has_mb_node = 1;
2314       break;
2315     case OP_OPEN_DUP_NUM:
2316       if (syntax & RE_CONTEXT_INVALID_DUP)
2317         {
2318           *err = REG_BADRPT;
2319           return NULL;
2320         }
2321       /* FALLTHROUGH */
2322     case OP_DUP_ASTERISK:
2323     case OP_DUP_PLUS:
2324     case OP_DUP_QUESTION:
2325       if (syntax & RE_CONTEXT_INVALID_OPS)
2326         {
2327           *err = REG_BADRPT;
2328           return NULL;
2329         }
2330       else if (syntax & RE_CONTEXT_INDEP_OPS)
2331         {
2332           fetch_token (token, regexp, syntax);
2333           return parse_expression (regexp, preg, token, syntax, nest, err);
2334         }
2335       /* else fall through  */
2336     case OP_CLOSE_SUBEXP:
2337       if ((token->type == OP_CLOSE_SUBEXP) &&
2338           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2339         {
2340           *err = REG_ERPAREN;
2341           return NULL;
2342         }
2343       /* else fall through  */
2344     case OP_CLOSE_DUP_NUM:
2345       /* We treat it as a normal character.  */
2346
2347       /* Then we can these characters as normal characters.  */
2348       token->type = CHARACTER;
2349       /* mb_partial and word_char bits should be initialized already
2350          by peek_token.  */
2351       tree = create_token_tree (dfa, NULL, NULL, token);
2352       if (BE (tree == NULL, 0))
2353         {
2354           *err = REG_ESPACE;
2355           return NULL;
2356         }
2357       break;
2358     case ANCHOR:
2359       if ((token->opr.ctx_type
2360            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2361           && dfa->word_ops_used == 0)
2362         init_word_char (dfa);
2363       if (token->opr.ctx_type == WORD_DELIM
2364           || token->opr.ctx_type == NOT_WORD_DELIM)
2365         {
2366           bin_tree_t *tree_first, *tree_last;
2367           if (token->opr.ctx_type == WORD_DELIM)
2368             {
2369               token->opr.ctx_type = WORD_FIRST;
2370               tree_first = create_token_tree (dfa, NULL, NULL, token);
2371               token->opr.ctx_type = WORD_LAST;
2372             }
2373           else
2374             {
2375               token->opr.ctx_type = INSIDE_WORD;
2376               tree_first = create_token_tree (dfa, NULL, NULL, token);
2377               token->opr.ctx_type = INSIDE_NOTWORD;
2378             }
2379           tree_last = create_token_tree (dfa, NULL, NULL, token);
2380           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2381           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2382             {
2383               *err = REG_ESPACE;
2384               return NULL;
2385             }
2386         }
2387       else
2388         {
2389           tree = create_token_tree (dfa, NULL, NULL, token);
2390           if (BE (tree == NULL, 0))
2391             {
2392               *err = REG_ESPACE;
2393               return NULL;
2394             }
2395         }
2396       /* We must return here, since ANCHORs can't be followed
2397          by repetition operators.
2398          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2399              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2400       fetch_token (token, regexp, syntax);
2401       return tree;
2402     case OP_PERIOD:
2403       tree = create_token_tree (dfa, NULL, NULL, token);
2404       if (BE (tree == NULL, 0))
2405         {
2406           *err = REG_ESPACE;
2407           return NULL;
2408         }
2409       if (dfa->mb_cur_max > 1)
2410         dfa->has_mb_node = 1;
2411       break;
2412     case OP_WORD:
2413     case OP_NOTWORD:
2414       tree = build_charclass_op (dfa, regexp->trans,
2415                                  (const unsigned char *) "alnum",
2416                                  (const unsigned char *) "_",
2417                                  token->type == OP_NOTWORD, err);
2418       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419         return NULL;
2420       break;
2421     case OP_SPACE:
2422     case OP_NOTSPACE:
2423       tree = build_charclass_op (dfa, regexp->trans,
2424                                  (const unsigned char *) "space",
2425                                  (const unsigned char *) "",
2426                                  token->type == OP_NOTSPACE, err);
2427       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2428         return NULL;
2429       break;
2430     case OP_ALT:
2431     case END_OF_RE:
2432       return NULL;
2433     case BACK_SLASH:
2434       *err = REG_EESCAPE;
2435       return NULL;
2436     default:
2437       /* Must not happen?  */
2438 #ifdef DEBUG
2439       assert (0);
2440 #endif
2441       return NULL;
2442     }
2443   fetch_token (token, regexp, syntax);
2444
2445   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2446          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2447     {
2448       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2449       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2450         return NULL;
2451       /* In BRE consecutive duplications are not allowed.  */
2452       if ((syntax & RE_CONTEXT_INVALID_DUP)
2453           && (token->type == OP_DUP_ASTERISK
2454               || token->type == OP_OPEN_DUP_NUM))
2455         {
2456           *err = REG_BADRPT;
2457           return NULL;
2458         }
2459     }
2460
2461   return tree;
2462 }
2463
2464 /* This function build the following tree, from regular expression
2465    (<reg_exp>):
2466          SUBEXP
2467             |
2468         <reg_exp>
2469 */
2470
2471 static bin_tree_t *
2472 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2473      re_string_t *regexp;
2474      regex_t *preg;
2475      re_token_t *token;
2476      reg_syntax_t syntax;
2477      int nest;
2478      reg_errcode_t *err;
2479 {
2480   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2481   bin_tree_t *tree;
2482   size_t cur_nsub;
2483   cur_nsub = preg->re_nsub++;
2484
2485   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2486
2487   /* The subexpression may be a null string.  */
2488   if (token->type == OP_CLOSE_SUBEXP)
2489     tree = NULL;
2490   else
2491     {
2492       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2493       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2494         *err = REG_EPAREN;
2495       if (BE (*err != REG_NOERROR, 0))
2496         return NULL;
2497     }
2498   dfa->completed_bkref_map |= 1 << cur_nsub;
2499
2500   tree = create_tree (dfa, tree, NULL, SUBEXP);
2501   if (BE (tree == NULL, 0))
2502     {
2503       *err = REG_ESPACE;
2504       return NULL;
2505     }
2506   tree->token.opr.idx = cur_nsub;
2507   return tree;
2508 }
2509
2510 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2511
2512 static bin_tree_t *
2513 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2514      bin_tree_t *elem;
2515      re_string_t *regexp;
2516      re_dfa_t *dfa;
2517      re_token_t *token;
2518      reg_syntax_t syntax;
2519      reg_errcode_t *err;
2520 {
2521   bin_tree_t *tree = NULL, *old_tree = NULL;
2522   int i, start, end, start_idx = re_string_cur_idx (regexp);
2523   re_token_t start_token = *token;
2524
2525   if (token->type == OP_OPEN_DUP_NUM)
2526     {
2527       end = 0;
2528       start = fetch_number (regexp, token, syntax);
2529       if (start == -1)
2530         {
2531           if (token->type == CHARACTER && token->opr.c == ',')
2532             start = 0; /* We treat "{,m}" as "{0,m}".  */
2533           else
2534             {
2535               *err = REG_BADBR; /* <re>{} is invalid.  */
2536               return NULL;
2537             }
2538         }
2539       if (BE (start != -2, 1))
2540         {
2541           /* We treat "{n}" as "{n,n}".  */
2542           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2543                  : ((token->type == CHARACTER && token->opr.c == ',')
2544                     ? fetch_number (regexp, token, syntax) : -2));
2545         }
2546       if (BE (start == -2 || end == -2, 0))
2547         {
2548           /* Invalid sequence.  */
2549           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2550             {
2551               if (token->type == END_OF_RE)
2552                 *err = REG_EBRACE;
2553               else
2554                 *err = REG_BADBR;
2555
2556               return NULL;
2557             }
2558
2559           /* If the syntax bit is set, rollback.  */
2560           re_string_set_index (regexp, start_idx);
2561           *token = start_token;
2562           token->type = CHARACTER;
2563           /* mb_partial and word_char bits should be already initialized by
2564              peek_token.  */
2565           return elem;
2566         }
2567
2568       if (BE (end != -1 && start > end, 0))
2569         {
2570           /* First number greater than second.  */
2571           *err = REG_BADBR;
2572           return NULL;
2573         }
2574     }
2575   else
2576     {
2577       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2578       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2579     }
2580
2581   fetch_token (token, regexp, syntax);
2582
2583   if (BE (elem == NULL, 0))
2584     return NULL;
2585   if (BE (start == 0 && end == 0, 0))
2586     {
2587       postorder (elem, free_tree, NULL);
2588       return NULL;
2589     }
2590
2591   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2592   if (BE (start > 0, 0))
2593     {
2594       tree = elem;
2595       for (i = 2; i <= start; ++i)
2596         {
2597           elem = duplicate_tree (elem, dfa);
2598           tree = create_tree (dfa, tree, elem, CONCAT);
2599           if (BE (elem == NULL || tree == NULL, 0))
2600             goto parse_dup_op_espace;
2601         }
2602
2603       if (start == end)
2604         return tree;
2605
2606       /* Duplicate ELEM before it is marked optional.  */
2607       elem = duplicate_tree (elem, dfa);
2608       old_tree = tree;
2609     }
2610   else
2611     old_tree = NULL;
2612
2613   if (elem->token.type == SUBEXP)
2614     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2615
2616   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2617   if (BE (tree == NULL, 0))
2618     goto parse_dup_op_espace;
2619
2620   /* This loop is actually executed only when end != -1,
2621      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2622      already created the start+1-th copy.  */
2623   for (i = start + 2; i <= end; ++i)
2624     {
2625       elem = duplicate_tree (elem, dfa);
2626       tree = create_tree (dfa, tree, elem, CONCAT);
2627       if (BE (elem == NULL || tree == NULL, 0))
2628         goto parse_dup_op_espace;
2629
2630       tree = create_tree (dfa, tree, NULL, OP_ALT);
2631       if (BE (tree == NULL, 0))
2632         goto parse_dup_op_espace;
2633     }
2634
2635   if (old_tree)
2636     tree = create_tree (dfa, old_tree, tree, CONCAT);
2637
2638   return tree;
2639
2640  parse_dup_op_espace:
2641   *err = REG_ESPACE;
2642   return NULL;
2643 }
2644
2645 /* Size of the names for collating symbol/equivalence_class/character_class.
2646    I'm not sure, but maybe enough.  */
2647 #define BRACKET_NAME_BUF_SIZE 32
2648
2649 #ifndef _LIBC
2650   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2651      Build the range expression which starts from START_ELEM, and ends
2652      at END_ELEM.  The result are written to MBCSET and SBCSET.
2653      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2654      mbcset->range_ends, is a pointer argument sinse we may
2655      update it.  */
2656
2657 static reg_errcode_t
2658 # ifdef RE_ENABLE_I18N
2659 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2660      re_charset_t *mbcset;
2661      int *range_alloc;
2662 # else /* not RE_ENABLE_I18N */
2663 build_range_exp (sbcset, start_elem, end_elem)
2664 # endif /* not RE_ENABLE_I18N */
2665      re_bitset_ptr_t sbcset;
2666      bracket_elem_t *start_elem, *end_elem;
2667 {
2668   unsigned int start_ch, end_ch;
2669   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2670   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2671           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2672           0))
2673     return REG_ERANGE;
2674
2675   /* We can handle no multi character collating elements without libc
2676      support.  */
2677   if (BE ((start_elem->type == COLL_SYM
2678            && strlen ((char *) start_elem->opr.name) > 1)
2679           || (end_elem->type == COLL_SYM
2680               && strlen ((char *) end_elem->opr.name) > 1), 0))
2681     return REG_ECOLLATE;
2682
2683 # ifdef RE_ENABLE_I18N
2684   {
2685     wchar_t wc;
2686     wint_t start_wc, end_wc;
2687     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2688
2689     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2690                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2691                    : 0));
2692     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2693               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2694                  : 0));
2695     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2696                 ? __btowc (start_ch) : start_elem->opr.wch);
2697     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2698               ? __btowc (end_ch) : end_elem->opr.wch);
2699     if (start_wc == WEOF || end_wc == WEOF)
2700       return REG_ECOLLATE;
2701     cmp_buf[0] = start_wc;
2702     cmp_buf[4] = end_wc;
2703     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2704       return REG_ERANGE;
2705
2706     /* Got valid collation sequence values, add them as a new entry.
2707        However, for !_LIBC we have no collation elements: if the
2708        character set is single byte, the single byte character set
2709        that we build below suffices.  parse_bracket_exp passes
2710        no MBCSET if dfa->mb_cur_max == 1.  */
2711     if (mbcset)
2712       {
2713         /* Check the space of the arrays.  */
2714         if (BE (*range_alloc == mbcset->nranges, 0))
2715           {
2716             /* There is not enough space, need realloc.  */
2717             wchar_t *new_array_start, *new_array_end;
2718             int new_nranges;
2719
2720             /* +1 in case of mbcset->nranges is 0.  */
2721             new_nranges = 2 * mbcset->nranges + 1;
2722             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2723                are NULL if *range_alloc == 0.  */
2724             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2725                                           new_nranges);
2726             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2727                                         new_nranges);
2728
2729             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2730               return REG_ESPACE;
2731
2732             mbcset->range_starts = new_array_start;
2733             mbcset->range_ends = new_array_end;
2734             *range_alloc = new_nranges;
2735           }
2736
2737         mbcset->range_starts[mbcset->nranges] = start_wc;
2738         mbcset->range_ends[mbcset->nranges++] = end_wc;
2739       }
2740
2741     /* Build the table for single byte characters.  */
2742     for (wc = 0; wc < SBC_MAX; ++wc)
2743       {
2744         cmp_buf[2] = wc;
2745         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2746             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2747           bitset_set (sbcset, wc);
2748       }
2749   }
2750 # else /* not RE_ENABLE_I18N */
2751   {
2752     unsigned int ch;
2753     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2754                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2755                    : 0));
2756     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2757               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2758                  : 0));
2759     if (start_ch > end_ch)
2760       return REG_ERANGE;
2761     /* Build the table for single byte characters.  */
2762     for (ch = 0; ch < SBC_MAX; ++ch)
2763       if (start_ch <= ch  && ch <= end_ch)
2764         bitset_set (sbcset, ch);
2765   }
2766 # endif /* not RE_ENABLE_I18N */
2767   return REG_NOERROR;
2768 }
2769 #endif /* not _LIBC */
2770
2771 #ifndef _LIBC
2772 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2773    Build the collating element which is represented by NAME.
2774    The result are written to MBCSET and SBCSET.
2775    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2776    pointer argument since we may update it.  */
2777
2778 static reg_errcode_t
2779 # ifdef RE_ENABLE_I18N
2780 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2781      re_charset_t *mbcset;
2782      int *coll_sym_alloc;
2783 # else /* not RE_ENABLE_I18N */
2784 build_collating_symbol (sbcset, name)
2785 # endif /* not RE_ENABLE_I18N */
2786      re_bitset_ptr_t sbcset;
2787      const unsigned char *name;
2788 {
2789   size_t name_len = strlen ((const char *) name);
2790   if (BE (name_len != 1, 0))
2791     return REG_ECOLLATE;
2792   else
2793     {
2794       bitset_set (sbcset, name[0]);
2795       return REG_NOERROR;
2796     }
2797 }
2798 #endif /* not _LIBC */
2799
2800 /* This function parse bracket expression like "[abc]", "[a-c]",
2801    "[[.a-a.]]" etc.  */
2802
2803 static bin_tree_t *
2804 parse_bracket_exp (regexp, dfa, token, syntax, err)
2805      re_string_t *regexp;
2806      re_dfa_t *dfa;
2807      re_token_t *token;
2808      reg_syntax_t syntax;
2809      reg_errcode_t *err;
2810 {
2811 #ifdef _LIBC
2812   const unsigned char *collseqmb;
2813   const char *collseqwc;
2814   uint32_t nrules;
2815   int32_t table_size;
2816   const int32_t *symb_table;
2817   const unsigned char *extra;
2818
2819   /* Local function for parse_bracket_exp used in _LIBC environement.
2820      Seek the collating symbol entry correspondings to NAME.
2821      Return the index of the symbol in the SYMB_TABLE.  */
2822
2823   auto inline int32_t
2824   __attribute ((always_inline))
2825   seek_collating_symbol_entry (name, name_len)
2826          const unsigned char *name;
2827          size_t name_len;
2828     {
2829       int32_t hash = elem_hash ((const char *) name, name_len);
2830       int32_t elem = hash % table_size;
2831       int32_t second = hash % (table_size - 2);
2832       while (symb_table[2 * elem] != 0)
2833         {
2834           /* First compare the hashing value.  */
2835           if (symb_table[2 * elem] == hash
2836               /* Compare the length of the name.  */
2837               && name_len == extra[symb_table[2 * elem + 1]]
2838               /* Compare the name.  */
2839               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2840                          name_len) == 0)
2841             {
2842               /* Yep, this is the entry.  */
2843               break;
2844             }
2845
2846           /* Next entry.  */
2847           elem += second;
2848         }
2849       return elem;
2850     }
2851
2852   /* Local function for parse_bracket_exp used in _LIBC environement.
2853      Look up the collation sequence value of BR_ELEM.
2854      Return the value if succeeded, UINT_MAX otherwise.  */
2855
2856   auto inline unsigned int
2857   __attribute ((always_inline))
2858   lookup_collation_sequence_value (br_elem)
2859          bracket_elem_t *br_elem;
2860     {
2861       if (br_elem->type == SB_CHAR)
2862         {
2863           /*
2864           if (MB_CUR_MAX == 1)
2865           */
2866           if (nrules == 0)
2867             return collseqmb[br_elem->opr.ch];
2868           else
2869             {
2870               wint_t wc = __btowc (br_elem->opr.ch);
2871               return __collseq_table_lookup (collseqwc, wc);
2872             }
2873         }
2874       else if (br_elem->type == MB_CHAR)
2875         {
2876           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2877         }
2878       else if (br_elem->type == COLL_SYM)
2879         {
2880           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2881           if (nrules != 0)
2882             {
2883               int32_t elem, idx;
2884               elem = seek_collating_symbol_entry (br_elem->opr.name,
2885                                                   sym_name_len);
2886               if (symb_table[2 * elem] != 0)
2887                 {
2888                   /* We found the entry.  */
2889                   idx = symb_table[2 * elem + 1];
2890                   /* Skip the name of collating element name.  */
2891                   idx += 1 + extra[idx];
2892                   /* Skip the byte sequence of the collating element.  */
2893                   idx += 1 + extra[idx];
2894                   /* Adjust for the alignment.  */
2895                   idx = (idx + 3) & ~3;
2896                   /* Skip the multibyte collation sequence value.  */
2897                   idx += sizeof (unsigned int);
2898                   /* Skip the wide char sequence of the collating element.  */
2899                   idx += sizeof (unsigned int) *
2900                     (1 + *(unsigned int *) (extra + idx));
2901                   /* Return the collation sequence value.  */
2902                   return *(unsigned int *) (extra + idx);
2903                 }
2904               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2905                 {
2906                   /* No valid character.  Match it as a single byte
2907                      character.  */
2908                   return collseqmb[br_elem->opr.name[0]];
2909                 }
2910             }
2911           else if (sym_name_len == 1)
2912             return collseqmb[br_elem->opr.name[0]];
2913         }
2914       return UINT_MAX;
2915     }
2916
2917   /* Local function for parse_bracket_exp used in _LIBC environement.
2918      Build the range expression which starts from START_ELEM, and ends
2919      at END_ELEM.  The result are written to MBCSET and SBCSET.
2920      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2921      mbcset->range_ends, is a pointer argument sinse we may
2922      update it.  */
2923
2924   auto inline reg_errcode_t
2925   __attribute ((always_inline))
2926   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2927          re_charset_t *mbcset;
2928          int *range_alloc;
2929          re_bitset_ptr_t sbcset;
2930          bracket_elem_t *start_elem, *end_elem;
2931     {
2932       unsigned int ch;
2933       uint32_t start_collseq;
2934       uint32_t end_collseq;
2935
2936       /* Equivalence Classes and Character Classes can't be a range
2937          start/end.  */
2938       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2939               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2940               0))
2941         return REG_ERANGE;
2942
2943       start_collseq = lookup_collation_sequence_value (start_elem);
2944       end_collseq = lookup_collation_sequence_value (end_elem);
2945       /* Check start/end collation sequence values.  */
2946       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2947         return REG_ECOLLATE;
2948       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2949         return REG_ERANGE;
2950
2951       /* Got valid collation sequence values, add them as a new entry.
2952          However, if we have no collation elements, and the character set
2953          is single byte, the single byte character set that we
2954          build below suffices. */
2955       if (nrules > 0 || dfa->mb_cur_max > 1)
2956         {
2957           /* Check the space of the arrays.  */
2958           if (BE (*range_alloc == mbcset->nranges, 0))
2959             {
2960               /* There is not enough space, need realloc.  */
2961               uint32_t *new_array_start;
2962               uint32_t *new_array_end;
2963               int new_nranges;
2964
2965               /* +1 in case of mbcset->nranges is 0.  */
2966               new_nranges = 2 * mbcset->nranges + 1;
2967               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2968                                             new_nranges);
2969               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2970                                           new_nranges);
2971
2972               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2973                 return REG_ESPACE;
2974
2975               mbcset->range_starts = new_array_start;
2976               mbcset->range_ends = new_array_end;
2977               *range_alloc = new_nranges;
2978             }
2979
2980           mbcset->range_starts[mbcset->nranges] = start_collseq;
2981           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2982         }
2983
2984       /* Build the table for single byte characters.  */
2985       for (ch = 0; ch < SBC_MAX; ch++)
2986         {
2987           uint32_t ch_collseq;
2988           /*
2989           if (MB_CUR_MAX == 1)
2990           */
2991           if (nrules == 0)
2992             ch_collseq = collseqmb[ch];
2993           else
2994             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2995           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2996             bitset_set (sbcset, ch);
2997         }
2998       return REG_NOERROR;
2999     }
3000
3001   /* Local function for parse_bracket_exp used in _LIBC environement.
3002      Build the collating element which is represented by NAME.
3003      The result are written to MBCSET and SBCSET.
3004      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3005      pointer argument sinse we may update it.  */
3006
3007   auto inline reg_errcode_t
3008   __attribute ((always_inline))
3009   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3010          re_charset_t *mbcset;
3011          int *coll_sym_alloc;
3012          re_bitset_ptr_t sbcset;
3013          const unsigned char *name;
3014     {
3015       int32_t elem, idx;
3016       size_t name_len = strlen ((const char *) name);
3017       if (nrules != 0)
3018         {
3019           elem = seek_collating_symbol_entry (name, name_len);
3020           if (symb_table[2 * elem] != 0)
3021             {
3022               /* We found the entry.  */
3023               idx = symb_table[2 * elem + 1];
3024               /* Skip the name of collating element name.  */
3025               idx += 1 + extra[idx];
3026             }
3027           else if (symb_table[2 * elem] == 0 && name_len == 1)
3028             {
3029               /* No valid character, treat it as a normal
3030                  character.  */
3031               bitset_set (sbcset, name[0]);
3032               return REG_NOERROR;
3033             }
3034           else
3035             return REG_ECOLLATE;
3036
3037           /* Got valid collation sequence, add it as a new entry.  */
3038           /* Check the space of the arrays.  */
3039           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3040             {
3041               /* Not enough, realloc it.  */
3042               /* +1 in case of mbcset->ncoll_syms is 0.  */
3043               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3044               /* Use realloc since mbcset->coll_syms is NULL
3045                  if *alloc == 0.  */
3046               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3047                                                    new_coll_sym_alloc);
3048               if (BE (new_coll_syms == NULL, 0))
3049                 return REG_ESPACE;
3050               mbcset->coll_syms = new_coll_syms;
3051               *coll_sym_alloc = new_coll_sym_alloc;
3052             }
3053           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3054           return REG_NOERROR;
3055         }
3056       else
3057         {
3058           if (BE (name_len != 1, 0))
3059             return REG_ECOLLATE;
3060           else
3061             {
3062               bitset_set (sbcset, name[0]);
3063               return REG_NOERROR;
3064             }
3065         }
3066     }
3067 #endif
3068
3069   re_token_t br_token;
3070   re_bitset_ptr_t sbcset;
3071 #ifdef RE_ENABLE_I18N
3072   re_charset_t *mbcset;
3073   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3074   int equiv_class_alloc = 0, char_class_alloc = 0;
3075 #endif /* not RE_ENABLE_I18N */
3076   int non_match = 0;
3077   bin_tree_t *work_tree;
3078   int token_len;
3079   int first_round = 1;
3080 #ifdef _LIBC
3081   collseqmb = (const unsigned char *)
3082     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3083   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3084   if (nrules)
3085     {
3086       /*
3087       if (MB_CUR_MAX > 1)
3088       */
3089         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3090       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3091       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3092                                                   _NL_COLLATE_SYMB_TABLEMB);
3093       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3094                                                    _NL_COLLATE_SYMB_EXTRAMB);
3095     }
3096 #endif
3097   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3098 #ifdef RE_ENABLE_I18N
3099   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3100 #endif /* RE_ENABLE_I18N */
3101 #ifdef RE_ENABLE_I18N
3102   if (BE (sbcset == NULL || mbcset == NULL, 0))
3103 #else
3104   if (BE (sbcset == NULL, 0))
3105 #endif /* RE_ENABLE_I18N */
3106     {
3107       *err = REG_ESPACE;
3108       return NULL;
3109     }
3110
3111   token_len = peek_token_bracket (token, regexp, syntax);
3112   if (BE (token->type == END_OF_RE, 0))
3113     {
3114       *err = REG_BADPAT;
3115       goto parse_bracket_exp_free_return;
3116     }
3117   if (token->type == OP_NON_MATCH_LIST)
3118     {
3119 #ifdef RE_ENABLE_I18N
3120       mbcset->non_match = 1;
3121 #endif /* not RE_ENABLE_I18N */
3122       non_match = 1;
3123       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3124         bitset_set (sbcset, '\0');
3125       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3126       token_len = peek_token_bracket (token, regexp, syntax);
3127       if (BE (token->type == END_OF_RE, 0))
3128         {
3129           *err = REG_BADPAT;
3130           goto parse_bracket_exp_free_return;
3131         }
3132     }
3133
3134   /* We treat the first ']' as a normal character.  */
3135   if (token->type == OP_CLOSE_BRACKET)
3136     token->type = CHARACTER;
3137
3138   while (1)
3139     {
3140       bracket_elem_t start_elem, end_elem;
3141       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3142       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3143       reg_errcode_t ret;
3144       int token_len2 = 0, is_range_exp = 0;
3145       re_token_t token2;
3146
3147       start_elem.opr.name = start_name_buf;
3148       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3149                                    syntax, first_round);
3150       if (BE (ret != REG_NOERROR, 0))
3151         {
3152           *err = ret;
3153           goto parse_bracket_exp_free_return;
3154         }
3155       first_round = 0;
3156
3157       /* Get information about the next token.  We need it in any case.  */
3158       token_len = peek_token_bracket (token, regexp, syntax);
3159
3160       /* Do not check for ranges if we know they are not allowed.  */
3161       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3162         {
3163           if (BE (token->type == END_OF_RE, 0))
3164             {
3165               *err = REG_EBRACK;
3166               goto parse_bracket_exp_free_return;
3167             }
3168           if (token->type == OP_CHARSET_RANGE)
3169             {
3170               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3171               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3172               if (BE (token2.type == END_OF_RE, 0))
3173                 {
3174                   *err = REG_EBRACK;
3175                   goto parse_bracket_exp_free_return;
3176                 }
3177               if (token2.type == OP_CLOSE_BRACKET)
3178                 {
3179                   /* We treat the last '-' as a normal character.  */
3180                   re_string_skip_bytes (regexp, -token_len);
3181                   token->type = CHARACTER;
3182                 }
3183               else
3184                 is_range_exp = 1;
3185             }
3186         }
3187
3188       if (is_range_exp == 1)
3189         {
3190           end_elem.opr.name = end_name_buf;
3191           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3192                                        dfa, syntax, 1);
3193           if (BE (ret != REG_NOERROR, 0))
3194             {
3195               *err = ret;
3196               goto parse_bracket_exp_free_return;
3197             }
3198
3199           token_len = peek_token_bracket (token, regexp, syntax);
3200
3201 #ifdef _LIBC
3202           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3203                                   &start_elem, &end_elem);
3204 #else
3205 # ifdef RE_ENABLE_I18N
3206           *err = build_range_exp (sbcset,
3207                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3208                                   &range_alloc, &start_elem, &end_elem);
3209 # else
3210           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3211 # endif
3212 #endif /* RE_ENABLE_I18N */
3213           if (BE (*err != REG_NOERROR, 0))
3214             goto parse_bracket_exp_free_return;
3215         }
3216       else
3217         {
3218           switch (start_elem.type)
3219             {
3220             case SB_CHAR:
3221               bitset_set (sbcset, start_elem.opr.ch);
3222               break;
3223 #ifdef RE_ENABLE_I18N
3224             case MB_CHAR:
3225               /* Check whether the array has enough space.  */
3226               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3227                 {
3228                   wchar_t *new_mbchars;
3229                   /* Not enough, realloc it.  */
3230                   /* +1 in case of mbcset->nmbchars is 0.  */
3231                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3232                   /* Use realloc since array is NULL if *alloc == 0.  */
3233                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3234                                             mbchar_alloc);
3235                   if (BE (new_mbchars == NULL, 0))
3236                     goto parse_bracket_exp_espace;
3237                   mbcset->mbchars = new_mbchars;
3238                 }
3239               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3240               break;
3241 #endif /* RE_ENABLE_I18N */
3242             case EQUIV_CLASS:
3243               *err = build_equiv_class (sbcset,
3244 #ifdef RE_ENABLE_I18N
3245                                         mbcset, &equiv_class_alloc,
3246 #endif /* RE_ENABLE_I18N */
3247                                         start_elem.opr.name);
3248               if (BE (*err != REG_NOERROR, 0))
3249                 goto parse_bracket_exp_free_return;
3250               break;
3251             case COLL_SYM:
3252               *err = build_collating_symbol (sbcset,
3253 #ifdef RE_ENABLE_I18N
3254                                              mbcset, &coll_sym_alloc,
3255 #endif /* RE_ENABLE_I18N */
3256                                              start_elem.opr.name);
3257               if (BE (*err != REG_NOERROR, 0))
3258                 goto parse_bracket_exp_free_return;
3259               break;
3260             case CHAR_CLASS:
3261               *err = build_charclass (regexp->trans, sbcset,
3262 #ifdef RE_ENABLE_I18N
3263                                       mbcset, &char_class_alloc,
3264 #endif /* RE_ENABLE_I18N */
3265                                       start_elem.opr.name, syntax);
3266               if (BE (*err != REG_NOERROR, 0))
3267                goto parse_bracket_exp_free_return;
3268               break;
3269             default:
3270               assert (0);
3271               break;
3272             }
3273         }
3274       if (BE (token->type == END_OF_RE, 0))
3275         {
3276           *err = REG_EBRACK;
3277           goto parse_bracket_exp_free_return;
3278         }
3279       if (token->type == OP_CLOSE_BRACKET)
3280         break;
3281     }
3282
3283   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3284
3285   /* If it is non-matching list.  */
3286   if (non_match)
3287     bitset_not (sbcset);
3288
3289 #ifdef RE_ENABLE_I18N
3290   /* Ensure only single byte characters are set.  */
3291   if (dfa->mb_cur_max > 1)
3292     bitset_mask (sbcset, dfa->sb_char);
3293
3294   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3295       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3296                                                      || mbcset->non_match)))
3297     {
3298       bin_tree_t *mbc_tree;
3299       int sbc_idx;
3300       /* Build a tree for complex bracket.  */
3301       dfa->has_mb_node = 1;
3302       br_token.type = COMPLEX_BRACKET;
3303       br_token.opr.mbcset = mbcset;
3304       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305       if (BE (mbc_tree == NULL, 0))
3306         goto parse_bracket_exp_espace;
3307       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3308         if (sbcset[sbc_idx])
3309           break;
3310       /* If there are no bits set in sbcset, there is no point
3311          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3312       if (sbc_idx < BITSET_UINTS)
3313         {
3314           /* Build a tree for simple bracket.  */
3315           br_token.type = SIMPLE_BRACKET;
3316           br_token.opr.sbcset = sbcset;
3317           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3318           if (BE (work_tree == NULL, 0))
3319             goto parse_bracket_exp_espace;
3320
3321           /* Then join them by ALT node.  */
3322           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3323           if (BE (work_tree == NULL, 0))
3324             goto parse_bracket_exp_espace;
3325         }
3326       else
3327         {
3328           re_free (sbcset);
3329           work_tree = mbc_tree;
3330         }
3331     }
3332   else
3333 #endif /* not RE_ENABLE_I18N */
3334     {
3335 #ifdef RE_ENABLE_I18N
3336       free_charset (mbcset);
3337 #endif
3338       /* Build a tree for simple bracket.  */
3339       br_token.type = SIMPLE_BRACKET;
3340       br_token.opr.sbcset = sbcset;
3341       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3342       if (BE (work_tree == NULL, 0))
3343         goto parse_bracket_exp_espace;
3344     }
3345   return work_tree;
3346
3347  parse_bracket_exp_espace:
3348   *err = REG_ESPACE;
3349  parse_bracket_exp_free_return:
3350   re_free (sbcset);
3351 #ifdef RE_ENABLE_I18N
3352   free_charset (mbcset);
3353 #endif /* RE_ENABLE_I18N */
3354   return NULL;
3355 }
3356
3357 /* Parse an element in the bracket expression.  */
3358
3359 static reg_errcode_t
3360 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3361                        accept_hyphen)
3362      bracket_elem_t *elem;
3363      re_string_t *regexp;
3364      re_token_t *token;
3365      int token_len;
3366      re_dfa_t *dfa;
3367      reg_syntax_t syntax;
3368      int accept_hyphen;
3369 {
3370 #ifdef RE_ENABLE_I18N
3371   int cur_char_size;
3372   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3373   if (cur_char_size > 1)
3374     {
3375       elem->type = MB_CHAR;
3376       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3377       re_string_skip_bytes (regexp, cur_char_size);
3378       return REG_NOERROR;
3379     }
3380 #endif /* RE_ENABLE_I18N */
3381   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3382   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3383       || token->type == OP_OPEN_EQUIV_CLASS)
3384     return parse_bracket_symbol (elem, regexp, token);
3385   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3386     {
3387       /* A '-' must only appear as anything but a range indicator before
3388          the closing bracket.  Everything else is an error.  */
3389       re_token_t token2;
3390       (void) peek_token_bracket (&token2, regexp, syntax);
3391       if (token2.type != OP_CLOSE_BRACKET)
3392         /* The actual error value is not standardized since this whole
3393            case is undefined.  But ERANGE makes good sense.  */
3394         return REG_ERANGE;
3395     }
3396   elem->type = SB_CHAR;
3397   elem->opr.ch = token->opr.c;
3398   return REG_NOERROR;
3399 }
3400
3401 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3402    such as [:<character_class>:], [.<collating_element>.], and
3403    [=<equivalent_class>=].  */
3404
3405 static reg_errcode_t
3406 parse_bracket_symbol (elem, regexp, token)
3407      bracket_elem_t *elem;
3408      re_string_t *regexp;
3409      re_token_t *token;
3410 {
3411   unsigned char ch, delim = token->opr.c;
3412   int i = 0;
3413   if (re_string_eoi(regexp))
3414     return REG_EBRACK;
3415   for (;; ++i)
3416     {
3417       if (i >= BRACKET_NAME_BUF_SIZE)
3418         return REG_EBRACK;
3419       if (token->type == OP_OPEN_CHAR_CLASS)
3420         ch = re_string_fetch_byte_case (regexp);
3421       else
3422         ch = re_string_fetch_byte (regexp);
3423       if (re_string_eoi(regexp))
3424         return REG_EBRACK;
3425       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3426         break;
3427       elem->opr.name[i] = ch;
3428     }
3429   re_string_skip_bytes (regexp, 1);
3430   elem->opr.name[i] = '\0';
3431   switch (token->type)
3432     {
3433     case OP_OPEN_COLL_ELEM:
3434       elem->type = COLL_SYM;
3435       break;
3436     case OP_OPEN_EQUIV_CLASS:
3437       elem->type = EQUIV_CLASS;
3438       break;
3439     case OP_OPEN_CHAR_CLASS:
3440       elem->type = CHAR_CLASS;
3441       break;
3442     default:
3443       break;
3444     }
3445   return REG_NOERROR;
3446 }
3447
3448   /* Helper function for parse_bracket_exp.
3449      Build the equivalence class which is represented by NAME.
3450      The result are written to MBCSET and SBCSET.
3451      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3452      is a pointer argument sinse we may update it.  */
3453
3454 static reg_errcode_t
3455 #ifdef RE_ENABLE_I18N
3456 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3457      re_charset_t *mbcset;
3458      int *equiv_class_alloc;
3459 #else /* not RE_ENABLE_I18N */
3460 build_equiv_class (sbcset, name)
3461 #endif /* not RE_ENABLE_I18N */
3462      re_bitset_ptr_t sbcset;
3463      const unsigned char *name;
3464 {
3465 #if defined _LIBC
3466   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3467   if (nrules != 0)
3468     {
3469       const int32_t *table, *indirect;
3470       const unsigned char *weights, *extra, *cp;
3471       unsigned char char_buf[2];
3472       int32_t idx1, idx2;
3473       unsigned int ch;
3474       size_t len;
3475       /* This #include defines a local function!  */
3476 # include <locale/weight.h>
3477       /* Calculate the index for equivalence class.  */
3478       cp = name;
3479       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3480       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3481                                                _NL_COLLATE_WEIGHTMB);
3482       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483                                                    _NL_COLLATE_EXTRAMB);
3484       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3485                                                 _NL_COLLATE_INDIRECTMB);
3486       idx1 = findidx (&cp);
3487       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3488         /* This isn't a valid character.  */
3489         return REG_ECOLLATE;
3490
3491       /* Build single byte matcing table for this equivalence class.  */
3492       char_buf[1] = (unsigned char) '\0';
3493       len = weights[idx1];
3494       for (ch = 0; ch < SBC_MAX; ++ch)
3495         {
3496           char_buf[0] = ch;
3497           cp = char_buf;
3498           idx2 = findidx (&cp);
3499 /*
3500           idx2 = table[ch];
3501 */
3502           if (idx2 == 0)
3503             /* This isn't a valid character.  */
3504             continue;
3505           if (len == weights[idx2])
3506             {
3507               int cnt = 0;
3508               while (cnt <= len &&
3509                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3510                 ++cnt;
3511
3512               if (cnt > len)
3513                 bitset_set (sbcset, ch);
3514             }
3515         }
3516       /* Check whether the array has enough space.  */
3517       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3518         {
3519           /* Not enough, realloc it.  */
3520           /* +1 in case of mbcset->nequiv_classes is 0.  */
3521           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3522           /* Use realloc since the array is NULL if *alloc == 0.  */
3523           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3524                                                    int32_t,
3525                                                    new_equiv_class_alloc);
3526           if (BE (new_equiv_classes == NULL, 0))
3527             return REG_ESPACE;
3528           mbcset->equiv_classes = new_equiv_classes;
3529           *equiv_class_alloc = new_equiv_class_alloc;
3530         }
3531       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3532     }
3533   else
3534 #endif /* _LIBC */
3535     {
3536       if (BE (strlen ((const char *) name) != 1, 0))
3537         return REG_ECOLLATE;
3538       bitset_set (sbcset, *name);
3539     }
3540   return REG_NOERROR;
3541 }
3542
3543   /* Helper function for parse_bracket_exp.
3544      Build the character class which is represented by NAME.
3545      The result are written to MBCSET and SBCSET.
3546      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3547      is a pointer argument sinse we may update it.  */
3548
3549 static reg_errcode_t
3550 #ifdef RE_ENABLE_I18N
3551 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3552      re_charset_t *mbcset;
3553      int *char_class_alloc;
3554 #else /* not RE_ENABLE_I18N */
3555 build_charclass (trans, sbcset, class_name, syntax)
3556 #endif /* not RE_ENABLE_I18N */
3557      unsigned RE_TRANSLATE_TYPE trans;
3558      re_bitset_ptr_t sbcset;
3559      const unsigned char *class_name;
3560      reg_syntax_t syntax;
3561 {
3562   int i;
3563   const char *name = (const char *) class_name;
3564
3565   /* In case of REG_ICASE "upper" and "lower" match the both of
3566      upper and lower cases.  */
3567   if ((syntax & RE_ICASE)
3568       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3569     name = "alpha";
3570
3571 #ifdef RE_ENABLE_I18N
3572   /* Check the space of the arrays.  */
3573   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3574     {
3575       /* Not enough, realloc it.  */
3576       /* +1 in case of mbcset->nchar_classes is 0.  */
3577       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3578       /* Use realloc since array is NULL if *alloc == 0.  */
3579       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3580                                                new_char_class_alloc);
3581       if (BE (new_char_classes == NULL, 0))
3582         return REG_ESPACE;
3583       mbcset->char_classes = new_char_classes;
3584       *char_class_alloc = new_char_class_alloc;
3585     }
3586   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3587 #endif /* RE_ENABLE_I18N */
3588
3589 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3590     for (i = 0; i < SBC_MAX; ++i)               \
3591       {                                         \
3592         if (ctype_func (i))                     \
3593           {                                     \
3594             int ch = trans ? trans[i] : i;      \
3595             bitset_set (sbcset, ch);            \
3596           }                                     \
3597       }
3598
3599   if (strcmp (name, "alnum") == 0)
3600     BUILD_CHARCLASS_LOOP (isalnum)
3601   else if (strcmp (name, "cntrl") == 0)
3602     BUILD_CHARCLASS_LOOP (iscntrl)
3603   else if (strcmp (name, "lower") == 0)
3604     BUILD_CHARCLASS_LOOP (islower)
3605   else if (strcmp (name, "space") == 0)
3606     BUILD_CHARCLASS_LOOP (isspace)
3607   else if (strcmp (name, "alpha") == 0)
3608     BUILD_CHARCLASS_LOOP (isalpha)
3609   else if (strcmp (name, "digit") == 0)
3610     BUILD_CHARCLASS_LOOP (isdigit)
3611   else if (strcmp (name, "print") == 0)
3612     BUILD_CHARCLASS_LOOP (isprint)
3613   else if (strcmp (name, "upper") == 0)
3614     BUILD_CHARCLASS_LOOP (isupper)
3615   else if (strcmp (name, "blank") == 0)
3616     BUILD_CHARCLASS_LOOP (isblank)
3617   else if (strcmp (name, "graph") == 0)
3618     BUILD_CHARCLASS_LOOP (isgraph)
3619   else if (strcmp (name, "punct") == 0)
3620     BUILD_CHARCLASS_LOOP (ispunct)
3621   else if (strcmp (name, "xdigit") == 0)
3622     BUILD_CHARCLASS_LOOP (isxdigit)
3623   else
3624     return REG_ECTYPE;
3625
3626   return REG_NOERROR;
3627 }
3628
3629 static bin_tree_t *
3630 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3631      re_dfa_t *dfa;
3632      unsigned RE_TRANSLATE_TYPE trans;
3633      const unsigned char *class_name;
3634      const unsigned char *extra;
3635      int non_match;
3636      reg_errcode_t *err;
3637 {
3638   re_bitset_ptr_t sbcset;
3639 #ifdef RE_ENABLE_I18N
3640   re_charset_t *mbcset;
3641   int alloc = 0;
3642 #endif /* not RE_ENABLE_I18N */
3643   reg_errcode_t ret;
3644   re_token_t br_token;
3645   bin_tree_t *tree;
3646
3647   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3648 #ifdef RE_ENABLE_I18N
3649   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3650 #endif /* RE_ENABLE_I18N */
3651
3652 #ifdef RE_ENABLE_I18N
3653   if (BE (sbcset == NULL || mbcset == NULL, 0))
3654 #else /* not RE_ENABLE_I18N */
3655   if (BE (sbcset == NULL, 0))
3656 #endif /* not RE_ENABLE_I18N */
3657     {
3658       *err = REG_ESPACE;
3659       return NULL;
3660     }
3661
3662   if (non_match)
3663     {
3664 #ifdef RE_ENABLE_I18N
3665       /*
3666       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3667         bitset_set(cset->sbcset, '\0');
3668       */
3669       mbcset->non_match = 1;
3670 #endif /* not RE_ENABLE_I18N */
3671     }
3672
3673   /* We don't care the syntax in this case.  */
3674   ret = build_charclass (trans, sbcset,
3675 #ifdef RE_ENABLE_I18N
3676                          mbcset, &alloc,
3677 #endif /* RE_ENABLE_I18N */
3678                          class_name, 0);
3679
3680   if (BE (ret != REG_NOERROR, 0))
3681     {
3682       re_free (sbcset);
3683 #ifdef RE_ENABLE_I18N
3684       free_charset (mbcset);
3685 #endif /* RE_ENABLE_I18N */
3686       *err = ret;
3687       return NULL;
3688     }
3689   /* \w match '_' also.  */
3690   for (; *extra; extra++)
3691     bitset_set (sbcset, *extra);
3692
3693   /* If it is non-matching list.  */
3694   if (non_match)
3695     bitset_not (sbcset);
3696
3697 #ifdef RE_ENABLE_I18N
3698   /* Ensure only single byte characters are set.  */
3699   if (dfa->mb_cur_max > 1)
3700     bitset_mask (sbcset, dfa->sb_char);
3701 #endif
3702
3703   /* Build a tree for simple bracket.  */
3704   br_token.type = SIMPLE_BRACKET;
3705   br_token.opr.sbcset = sbcset;
3706   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3707   if (BE (tree == NULL, 0))
3708     goto build_word_op_espace;
3709
3710 #ifdef RE_ENABLE_I18N
3711   if (dfa->mb_cur_max > 1)
3712     {
3713       bin_tree_t *mbc_tree;
3714       /* Build a tree for complex bracket.  */
3715       br_token.type = COMPLEX_BRACKET;
3716       br_token.opr.mbcset = mbcset;
3717       dfa->has_mb_node = 1;
3718       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3719       if (BE (mbc_tree == NULL, 0))
3720         goto build_word_op_espace;
3721       /* Then join them by ALT node.  */
3722       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3723       if (BE (mbc_tree != NULL, 1))
3724         return tree;
3725     }
3726   else
3727     {
3728       free_charset (mbcset);
3729       return tree;
3730     }
3731 #else /* not RE_ENABLE_I18N */
3732   return tree;
3733 #endif /* not RE_ENABLE_I18N */
3734
3735  build_word_op_espace:
3736   re_free (sbcset);
3737 #ifdef RE_ENABLE_I18N
3738   free_charset (mbcset);
3739 #endif /* RE_ENABLE_I18N */
3740   *err = REG_ESPACE;
3741   return NULL;
3742 }
3743
3744 /* This is intended for the expressions like "a{1,3}".
3745    Fetch a number from `input', and return the number.
3746    Return -1, if the number field is empty like "{,1}".
3747    Return -2, If an error is occured.  */
3748
3749 static int
3750 fetch_number (input, token, syntax)
3751      re_string_t *input;
3752      re_token_t *token;
3753      reg_syntax_t syntax;
3754 {
3755   int num = -1;
3756   unsigned char c;
3757   while (1)
3758     {
3759       fetch_token (token, input, syntax);
3760       c = token->opr.c;
3761       if (BE (token->type == END_OF_RE, 0))
3762         return -2;
3763       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3764         break;
3765       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3766              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3767       num = (num > RE_DUP_MAX) ? -2 : num;
3768     }
3769   return num;
3770 }
3771 \f
3772 #ifdef RE_ENABLE_I18N
3773 static void
3774 free_charset (re_charset_t *cset)
3775 {
3776   re_free (cset->mbchars);
3777 # ifdef _LIBC
3778   re_free (cset->coll_syms);
3779   re_free (cset->equiv_classes);
3780   re_free (cset->range_starts);
3781   re_free (cset->range_ends);
3782 # endif
3783   re_free (cset->char_classes);
3784   re_free (cset);
3785 }
3786 #endif /* RE_ENABLE_I18N */
3787 \f
3788 /* Functions for binary tree operation.  */
3789
3790 /* Create a tree node.  */
3791
3792 static bin_tree_t *
3793 create_tree (dfa, left, right, type)
3794      re_dfa_t *dfa;
3795      bin_tree_t *left;
3796      bin_tree_t *right;
3797      re_token_type_t type;
3798 {
3799   re_token_t t;
3800   t.type = type;
3801   return create_token_tree (dfa, left, right, &t);
3802 }
3803
3804 static bin_tree_t *
3805 create_token_tree (dfa, left, right, token)
3806      re_dfa_t *dfa;
3807      bin_tree_t *left;
3808      bin_tree_t *right;
3809      const re_token_t *token;
3810 {
3811   bin_tree_t *tree;
3812   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813     {
3814       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816       if (storage == NULL)
3817         return NULL;
3818       storage->next = dfa->str_tree_storage;
3819       dfa->str_tree_storage = storage;
3820       dfa->str_tree_storage_idx = 0;
3821     }
3822   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824   tree->parent = NULL;
3825   tree->left = left;
3826   tree->right = right;
3827   tree->token = *token;
3828   tree->token.duplicated = 0;
3829   tree->token.opt_subexp = 0;
3830   tree->first = NULL;
3831   tree->next = NULL;
3832   tree->node_idx = -1;
3833
3834   if (left != NULL)
3835     left->parent = tree;
3836   if (right != NULL)
3837     right->parent = tree;
3838   return tree;
3839 }
3840
3841 /* Mark the tree SRC as an optional subexpression.
3842    To be called from preorder or postorder.  */
3843
3844 static reg_errcode_t
3845 mark_opt_subexp (extra, node)
3846      void *extra;
3847      bin_tree_t *node;
3848 {
3849   int idx = (int) (long) extra;
3850   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3851     node->token.opt_subexp = 1;
3852
3853   return REG_NOERROR;
3854 }
3855
3856 /* Free the allocated memory inside NODE. */
3857
3858 static void
3859 free_token (re_token_t *node)
3860 {
3861 #ifdef RE_ENABLE_I18N
3862   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3863     free_charset (node->opr.mbcset);
3864   else
3865 #endif /* RE_ENABLE_I18N */
3866     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3867       re_free (node->opr.sbcset);
3868 }
3869
3870 /* Worker function for tree walking.  Free the allocated memory inside NODE
3871    and its children. */
3872
3873 static reg_errcode_t
3874 free_tree (void *extra, bin_tree_t *node)
3875 {
3876   free_token (&node->token);
3877   return REG_NOERROR;
3878 }
3879
3880
3881 /* Duplicate the node SRC, and return new node.  This is a preorder
3882    visit similar to the one implemented by the generic visitor, but
3883    we need more infrastructure to maintain two parallel trees --- so,
3884    it's easier to duplicate.  */
3885
3886 static bin_tree_t *
3887 duplicate_tree (root, dfa)
3888      const bin_tree_t *root;
3889      re_dfa_t *dfa;
3890 {
3891   const bin_tree_t *node;
3892   bin_tree_t *dup_root;
3893   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3894
3895   for (node = root; ; )
3896     {
3897       /* Create a new tree and link it back to the current parent.  */
3898       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3899       if (*p_new == NULL)
3900         return NULL;
3901       (*p_new)->parent = dup_node;
3902       (*p_new)->token.duplicated = 1;
3903       dup_node = *p_new;
3904
3905       /* Go to the left node, or up and to the right.  */
3906       if (node->left)
3907         {
3908           node = node->left;
3909           p_new = &dup_node->left;
3910         }
3911       else
3912         {
3913           const bin_tree_t *prev = NULL;
3914           while (node->right == prev || node->right == NULL)
3915             {
3916               prev = node;
3917               node = node->parent;
3918               dup_node = dup_node->parent;
3919               if (!node)
3920                 return dup_root;
3921             }
3922           node = node->right;
3923           p_new = &dup_node->right;
3924         }
3925     }
3926 }