lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          Idx node, bool root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static Idx fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   Idx nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             bool accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         Idx *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       Idx *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        bool non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 static const char __re_error_msgid[] =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 static const size_t __re_error_msgid_idx[] =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 #ifdef _LIBC
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 #else /* size_t might promote */
 220 const char *
 221 re_compile_pattern (const char *pattern, size_t length,
 222                     struct re_pattern_buffer *bufp)
 223 #endif
 224 {
 225   reg_errcode_t ret;
 226
 227   /* And GNU code determines whether or not to get register information
 228      by passing null for the REGS argument to re_match, etc., not by
 229      setting no_sub, unless RE_NO_SUB is set.  */
 230   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 231
 232   /* Match anchors at newline.  */
 233   bufp->newline_anchor = 1;
 234
 235   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 236
 237   if (!ret)
 238     return NULL;
 239   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 240 }
 241 #ifdef _LIBC
 242 weak_alias (__re_compile_pattern, re_compile_pattern)
 243 #endif
 244
 245 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 246    also be assigned to arbitrarily: each pattern buffer stores its own
 247    syntax, so it can be changed between regex compilations.  */
 248 /* This has no initializer because initialized variables in Emacs
 249    become read-only after dumping.  */
 250 reg_syntax_t re_syntax_options;
 251
 252
 253 /* Specify the precise syntax of regexps for compilation.  This provides
 254    for compatibility for various utilities which historically have
 255    different, incompatible syntaxes.
 256
 257    The argument SYNTAX is a bit mask comprised of the various bits
 258    defined in regex.h.  We return the old syntax.  */
 259
 260 reg_syntax_t
 261 re_set_syntax (syntax)
 262     reg_syntax_t syntax;
 263 {
 264   reg_syntax_t ret = re_syntax_options;
 265
 266   re_syntax_options = syntax;
 267   return ret;
 268 }
 269 #ifdef _LIBC
 270 weak_alias (__re_set_syntax, re_set_syntax)
 271 #endif
 272
 273 int
 274 re_compile_fastmap (bufp)
 275     struct re_pattern_buffer *bufp;
 276 {
 277   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 278   char *fastmap = bufp->fastmap;
 279
 280   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 281   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 282   if (dfa->init_state != dfa->init_state_word)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 284   if (dfa->init_state != dfa->init_state_nl)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 286   if (dfa->init_state != dfa->init_state_begbuf)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 288   bufp->fastmap_accurate = 1;
 289   return 0;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 293 #endif
 294
 295 static inline void
 296 __attribute ((always_inline))
 297 re_set_fastmap (char *fastmap, bool icase, int ch)
 298 {
 299   fastmap[ch] = 1;
 300   if (icase)
 301     fastmap[tolower (ch)] = 1;
 302 }
 303
 304 /* Helper function for re_compile_fastmap.
 305    Compile fastmap for the initial_state INIT_STATE.  */
 306
 307 static void
 308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 309                          char *fastmap)
 310 {
 311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 312   Idx node_cnt;
 313   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 315     {
 316       Idx node = init_state->nodes.elems[node_cnt];
 317       re_token_type_t type = dfa->nodes[node].type;
 318
 319       if (type == CHARACTER)
 320         {
 321           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 322 #ifdef RE_ENABLE_I18N
 323           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 324             {
 325               unsigned char buf[MB_LEN_MAX];
 326               unsigned char *p;
 327               wchar_t wc;
 328               mbstate_t state;
 329
 330               p = buf;
 331               *p++ = dfa->nodes[node].opr.c;
 332               while (++node < dfa->nodes_len
 333                      && dfa->nodes[node].type == CHARACTER
 334                      && dfa->nodes[node].mb_partial)
 335                 *p++ = dfa->nodes[node].opr.c;
 336               memset (&state, '\0', sizeof (state));
 337               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 338                              &state) == p - buf
 339                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 340                       != (size_t) -1))
 341                 re_set_fastmap (fastmap, false, buf[0]);
 342             }
 343 #endif
 344         }
 345       else if (type == SIMPLE_BRACKET)
 346         {
 347           int i, ch;
 348           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 349             {
 350               int j;
 351               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 352               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 353                 if (w & ((bitset_word_t) 1 << j))
 354                   re_set_fastmap (fastmap, icase, ch);
 355             }
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           Idx i;
 362
 363 # ifdef _LIBC
 364           /* See if we have to try all bytes which start multiple collation
 365              elements.
 366              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 367                   collation element, and don't catch 'b' since 'b' is
 368                   the only collation element which starts from 'b' (and
 369                   it is caught by SIMPLE_BRACKET).  */
 370               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 371                   && (cset->ncoll_syms || cset->nranges))
 372                 {
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # endif /* _LIBC */
 380
 381           /* See if we have to start the match at all multibyte characters,
 382              i.e. where we would not find an invalid sequence.  This only
 383              applies to multibyte character sets; for single byte character
 384              sets, the SIMPLE_BRACKET again suffices.  */
 385           if (dfa->mb_cur_max > 1
 386               && (cset->nchar_classes || cset->non_match || cset->nranges
 387 # ifdef _LIBC
 388                   || cset->nequiv_classes
 389 # endif /* _LIBC */
 390                  ))
 391             {
 392               unsigned char c = 0;
 393               do
 394                 {
 395                   mbstate_t mbs;
 396                   memset (&mbs, 0, sizeof (mbs));
 397                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 398                     re_set_fastmap (fastmap, false, (int) c);
 399                 }
 400               while (++c != 0);
 401             }
 402
 403           else
 404             {
 405               /* ... Else catch all bytes which can start the mbchars.  */
 406               for (i = 0; i < cset->nmbchars; ++i)
 407                 {
 408                   char buf[256];
 409                   mbstate_t state;
 410                   memset (&state, '\0', sizeof (state));
 411                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 412                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 413                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 414                     {
 415                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 416                           != (size_t) -1)
 417                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 418                     }
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *_Restrict_ preg;
 476     const char *_Restrict_ pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 #ifdef _LIBC
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *_Restrict_ preg;
 540     char *_Restrict_ errbuf;
 541     size_t errbuf_size;
 542 #else /* size_t might promote */
 543 size_t
 544 regerror (int errcode, const regex_t *_Restrict_ preg,
 545           char *_Restrict_ errbuf, size_t errbuf_size)
 546 #endif
 547 {
 548   const char *msg;
 549   size_t msg_size;
 550
 551   if (BE (errcode < 0
 552           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 553                                / sizeof (__re_error_msgid_idx[0])), 0))
 554     /* Only error codes returned by the rest of the code should be passed
 555        to this routine.  If we are given anything else, or if other regex
 556        code generates an invalid error code, then the program has a bug.
 557        Dump core so we can fix it.  */
 558     abort ();
 559
 560   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 561
 562   msg_size = strlen (msg) + 1; /* Includes the null.  */
 563
 564   if (BE (errbuf_size != 0, 1))
 565     {
 566       size_t cpy_size = msg_size;
 567       if (BE (msg_size > errbuf_size, 0))
 568         {
 569           cpy_size = errbuf_size - 1;
 570           errbuf[cpy_size] = '\0';
 571         }
 572       memcpy (errbuf, msg, cpy_size);
 573     }
 574
 575   return msg_size;
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__regerror, regerror)
 579 #endif
 580
 581
 582 #ifdef RE_ENABLE_I18N
 583 /* This static array is used for the map to single-byte characters when
 584    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 585    it the same all the time.  UTF-8 is the preferred encoding so this is
 586    a worthwhile optimization.  */
 587 static const bitset_t utf8_sb_map =
 588 {
 589   /* Set the first 128 bits.  */
 590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 591 #  error "bitset_word_t is narrower than 32 bits"
 592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 593   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX,
 598 # endif
 599   (BITSET_WORD_MAX
 600    >> (SBC_MAX % BITSET_WORD_BITS == 0
 601        ? 0
 602        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 603 };
 604 #endif
 605
 606
 607 static void
 608 free_dfa_content (re_dfa_t *dfa)
 609 {
 610   Idx i, j;
 611
 612   if (dfa->nodes)
 613     for (i = 0; i < dfa->nodes_len; ++i)
 614       free_token (dfa->nodes + i);
 615   re_free (dfa->nexts);
 616   for (i = 0; i < dfa->nodes_len; ++i)
 617     {
 618       if (dfa->eclosures != NULL)
 619         re_node_set_free (dfa->eclosures + i);
 620       if (dfa->inveclosures != NULL)
 621         re_node_set_free (dfa->inveclosures + i);
 622       if (dfa->edests != NULL)
 623         re_node_set_free (dfa->edests + i);
 624     }
 625   re_free (dfa->edests);
 626   re_free (dfa->eclosures);
 627   re_free (dfa->inveclosures);
 628   re_free (dfa->nodes);
 629
 630   if (dfa->state_table)
 631     for (i = 0; i <= dfa->state_hash_mask; ++i)
 632       {
 633         struct re_state_table_entry *entry = dfa->state_table + i;
 634         for (j = 0; j < entry->num; ++j)
 635           {
 636             re_dfastate_t *state = entry->array[j];
 637             free_state (state);
 638           }
 639         re_free (entry->array);
 640       }
 641   re_free (dfa->state_table);
 642 #ifdef RE_ENABLE_I18N
 643   if (dfa->sb_char != utf8_sb_map)
 644     re_free (dfa->sb_char);
 645 #endif
 646   re_free (dfa->subexp_map);
 647 #ifdef DEBUG
 648   re_free (dfa->re_str);
 649 #endif
 650
 651   re_free (dfa);
 652 }
 653
 654
 655 /* Free dynamically allocated space used by PREG.  */
 656
 657 void
 658 regfree (preg)
 659     regex_t *preg;
 660 {
 661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 662   if (BE (dfa != NULL, 1))
 663     free_dfa_content (dfa);
 664   preg->buffer = NULL;
 665   preg->allocated = 0;
 666
 667   re_free (preg->fastmap);
 668   preg->fastmap = NULL;
 669
 670   re_free (preg->translate);
 671   preg->translate = NULL;
 672 }
 673 #ifdef _LIBC
 674 weak_alias (__regfree, regfree)
 675 #endif
 676 \f
 677 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 678    them unless specifically requested.  */
 679
 680 #if defined _REGEX_RE_COMP || defined _LIBC
 681
 682 /* BSD has one and only one pattern buffer.  */
 683 static struct re_pattern_buffer re_comp_buf;
 684
 685 char *
 686 # ifdef _LIBC
 687 /* Make these definitions weak in libc, so POSIX programs can redefine
 688    these names if they don't use our functions, and still use
 689    regcomp/regexec above without link errors.  */
 690 weak_function
 691 # endif
 692 re_comp (s)
 693      const char *s;
 694 {
 695   reg_errcode_t ret;
 696   char *fastmap;
 697
 698   if (!s)
 699     {
 700       if (!re_comp_buf.buffer)
 701         return gettext ("No previous regular expression");
 702       return 0;
 703     }
 704
 705   if (re_comp_buf.buffer)
 706     {
 707       fastmap = re_comp_buf.fastmap;
 708       re_comp_buf.fastmap = NULL;
 709       __regfree (&re_comp_buf);
 710       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 711       re_comp_buf.fastmap = fastmap;
 712     }
 713
 714   if (re_comp_buf.fastmap == NULL)
 715     {
 716       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 717       if (re_comp_buf.fastmap == NULL)
 718         return (char *) gettext (__re_error_msgid
 719                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 720     }
 721
 722   /* Since `re_exec' always passes NULL for the `regs' argument, we
 723      don't need to initialize the pattern buffer fields which affect it.  */
 724
 725   /* Match anchors at newlines.  */
 726   re_comp_buf.newline_anchor = 1;
 727
 728   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 729
 730   if (!ret)
 731     return NULL;
 732
 733   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 734   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 735 }
 736
 737 #ifdef _LIBC
 738 libc_freeres_fn (free_mem)
 739 {
 740   __regfree (&re_comp_buf);
 741 }
 742 #endif
 743
 744 #endif /* _REGEX_RE_COMP */
 745 \f
 746 /* Internal entry point.
 747    Compile the regular expression PATTERN, whose length is LENGTH.
 748    SYNTAX indicate regular expression's syntax.  */
 749
 750 static reg_errcode_t
 751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 752                      reg_syntax_t syntax)
 753 {
 754   reg_errcode_t err = REG_NOERROR;
 755   re_dfa_t *dfa;
 756   re_string_t regexp;
 757
 758   /* Initialize the pattern buffer.  */
 759   preg->fastmap_accurate = 0;
 760   preg->syntax = syntax;
 761   preg->not_bol = preg->not_eol = 0;
 762   preg->used = 0;
 763   preg->re_nsub = 0;
 764   preg->can_be_null = 0;
 765   preg->regs_allocated = REGS_UNALLOCATED;
 766
 767   /* Initialize the dfa.  */
 768   dfa = (re_dfa_t *) preg->buffer;
 769   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 770     {
 771       /* If zero allocated, but buffer is non-null, try to realloc
 772          enough space.  This loses if buffer's address is bogus, but
 773          that is the user's responsibility.  If ->buffer is NULL this
 774          is a simple allocation.  */
 775       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 776       if (dfa == NULL)
 777         return REG_ESPACE;
 778       preg->allocated = sizeof (re_dfa_t);
 779       preg->buffer = (unsigned char *) dfa;
 780     }
 781   preg->used = sizeof (re_dfa_t);
 782
 783   err = init_dfa (dfa, length);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791 #ifdef DEBUG
 792   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 793   dfa->re_str = re_malloc (char, length + 1);
 794   strncpy (dfa->re_str, pattern, length + 1);
 795 #endif
 796
 797   __libc_lock_init (dfa->lock);
 798
 799   err = re_string_construct (&regexp, pattern, length, preg->translate,
 800                              (syntax & RE_ICASE) != 0, dfa);
 801   if (BE (err != REG_NOERROR, 0))
 802     {
 803     re_compile_internal_free_return:
 804       free_workarea_compile (preg);
 805       re_string_destruct (&regexp);
 806       free_dfa_content (dfa);
 807       preg->buffer = NULL;
 808       preg->allocated = 0;
 809       return err;
 810     }
 811
 812   /* Parse the regular expression, and build a structure tree.  */
 813   preg->re_nsub = 0;
 814   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 815   if (BE (dfa->str_tree == NULL, 0))
 816     goto re_compile_internal_free_return;
 817
 818   /* Analyze the tree and create the nfa.  */
 819   err = analyze (preg);
 820   if (BE (err != REG_NOERROR, 0))
 821     goto re_compile_internal_free_return;
 822
 823 #ifdef RE_ENABLE_I18N
 824   /* If possible, do searching in single byte encoding to speed things up.  */
 825   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 826     optimize_utf8 (dfa);
 827 #endif
 828
 829   /* Then create the initial state of the dfa.  */
 830   err = create_initial_state (dfa);
 831
 832   /* Release work areas.  */
 833   free_workarea_compile (preg);
 834   re_string_destruct (&regexp);
 835
 836   if (BE (err != REG_NOERROR, 0))
 837     {
 838       free_dfa_content (dfa);
 839       preg->buffer = NULL;
 840       preg->allocated = 0;
 841     }
 842
 843   return err;
 844 }
 845
 846 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 847    as the initial length of some arrays.  */
 848
 849 static reg_errcode_t
 850 init_dfa (re_dfa_t *dfa, size_t pat_len)
 851 {
 852   __re_size_t table_size;
 853 #ifndef _LIBC
 854   char *codeset_name;
 855 #endif
 856 #ifdef RE_ENABLE_I18N
 857   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 858 #else
 859   size_t max_i18n_object_size = 0;
 860 #endif
 861   size_t max_object_size =
 862     MAX (sizeof (struct re_state_table_entry),
 863          MAX (sizeof (re_token_t),
 864               MAX (sizeof (re_node_set),
 865                    MAX (sizeof (regmatch_t),
 866                         max_i18n_object_size))));
 867
 868   memset (dfa, '\0', sizeof (re_dfa_t));
 869
 870   /* Force allocation of str_tree_storage the first time.  */
 871   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 872
 873   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 874      calculation below, and for similar doubling calculations
 875      elsewhere.  And it's <= rather than <, because some of the
 876      doubling calculations add 1 afterwards.  */
 877   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 878     return REG_ESPACE;
 879
 880   dfa->nodes_alloc = pat_len + 1;
 881   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 882
 883   /*  table_size = 2 ^ ceil(log pat_len) */
 884   for (table_size = 1; ; table_size <<= 1)
 885     if (table_size > pat_len)
 886       break;
 887
 888   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 889   dfa->state_hash_mask = table_size - 1;
 890
 891   dfa->mb_cur_max = MB_CUR_MAX;
 892 #ifdef _LIBC
 893   if (dfa->mb_cur_max == 6
 894       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 895     dfa->is_utf8 = 1;
 896   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 897                        != 0);
 898 #else
 899   codeset_name = nl_langinfo (CODESET);
 900   if (strcasecmp (codeset_name, "UTF-8") == 0
 901       || strcasecmp (codeset_name, "UTF8") == 0)
 902     dfa->is_utf8 = 1;
 903
 904   /* We check exhaustively in the loop below if this charset is a
 905      superset of ASCII.  */
 906   dfa->map_notascii = 0;
 907 #endif
 908
 909 #ifdef RE_ENABLE_I18N
 910   if (dfa->mb_cur_max > 1)
 911     {
 912       if (dfa->is_utf8)
 913         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 914       else
 915         {
 916           int i, j, ch;
 917
 918           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 919           if (BE (dfa->sb_char == NULL, 0))
 920             return REG_ESPACE;
 921
 922           /* Set the bits corresponding to single byte chars.  */
 923           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 924             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 925               {
 926                 wint_t wch = __btowc (ch);
 927                 if (wch != WEOF)
 928                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 929 # ifndef _LIBC
 930                 if (isascii (ch) && wch != ch)
 931                   dfa->map_notascii = 1;
 932 # endif
 933               }
 934         }
 935     }
 936 #endif
 937
 938   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 939     return REG_ESPACE;
 940   return REG_NOERROR;
 941 }
 942
 943 /* Initialize WORD_CHAR table, which indicate which character is
 944    "word".  In this case "word" means that it is the word construction
 945    character used by some operators like "\<", "\>", etc.  */
 946
 947 static void
 948 internal_function
 949 init_word_char (re_dfa_t *dfa)
 950 {
 951   int i, j, ch;
 952   dfa->word_ops_used = 1;
 953   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 954     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 955       if (isalnum (ch) || ch == '_')
 956         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 957 }
 958
 959 /* Free the work area which are only used while compiling.  */
 960
 961 static void
 962 free_workarea_compile (regex_t *preg)
 963 {
 964   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 965   bin_tree_storage_t *storage, *next;
 966   for (storage = dfa->str_tree_storage; storage; storage = next)
 967     {
 968       next = storage->next;
 969       re_free (storage);
 970     }
 971   dfa->str_tree_storage = NULL;
 972   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 973   dfa->str_tree = NULL;
 974   re_free (dfa->org_indices);
 975   dfa->org_indices = NULL;
 976 }
 977
 978 /* Create initial states for all contexts.  */
 979
 980 static reg_errcode_t
 981 create_initial_state (re_dfa_t *dfa)
 982 {
 983   Idx first, i;
 984   reg_errcode_t err;
 985   re_node_set init_nodes;
 986
 987   /* Initial states have the epsilon closure of the node which is
 988      the first node of the regular expression.  */
 989   first = dfa->str_tree->first->node_idx;
 990   dfa->init_node = first;
 991   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 992   if (BE (err != REG_NOERROR, 0))
 993     return err;
 994
 995   /* The back-references which are in initial states can epsilon transit,
 996      since in this case all of the subexpressions can be null.
 997      Then we add epsilon closures of the nodes which are the next nodes of
 998      the back-references.  */
 999   if (dfa->nbackref > 0)
1000     for (i = 0; i < init_nodes.nelem; ++i)
1001       {
1002         Idx node_idx = init_nodes.elems[i];
1003         re_token_type_t type = dfa->nodes[node_idx].type;
1004
1005         Idx clexp_idx;
1006         if (type != OP_BACK_REF)
1007           continue;
1008         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1009           {
1010             re_token_t *clexp_node;
1011             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012             if (clexp_node->type == OP_CLOSE_SUBEXP
1013                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1014               break;
1015           }
1016         if (clexp_idx == init_nodes.nelem)
1017           continue;
1018
1019         if (type == OP_BACK_REF)
1020           {
1021             Idx dest_idx = dfa->edests[node_idx].elems[0];
1022             if (!re_node_set_contains (&init_nodes, dest_idx))
1023               {
1024                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1025                                                        dfa->eclosures
1026                                                        + dest_idx);
1027                 if (err != REG_NOERROR)
1028                   return err;
1029                 i = 0;
1030               }
1031           }
1032       }
1033
1034   /* It must be the first time to invoke acquire_state.  */
1035   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036   /* We don't check ERR here, since the initial state must not be NULL.  */
1037   if (BE (dfa->init_state == NULL, 0))
1038     return err;
1039   if (dfa->init_state->has_constraint)
1040     {
1041       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042                                                        CONTEXT_WORD);
1043       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044                                                      CONTEXT_NEWLINE);
1045       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1046                                                          &init_nodes,
1047                                                          CONTEXT_NEWLINE
1048                                                          | CONTEXT_BEGBUF);
1049       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050               || dfa->init_state_begbuf == NULL, 0))
1051         return err;
1052     }
1053   else
1054     dfa->init_state_word = dfa->init_state_nl
1055       = dfa->init_state_begbuf = dfa->init_state;
1056
1057   re_node_set_free (&init_nodes);
1058   return REG_NOERROR;
1059 }
1060 \f
1061 #ifdef RE_ENABLE_I18N
1062 /* If it is possible to do searching in single byte encoding instead of UTF-8
1063    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1064    DFA nodes where needed.  */
1065
1066 static void
1067 optimize_utf8 (re_dfa_t *dfa)
1068 {
1069   Idx node;
1070   int i;
1071   bool mb_chars = false;
1072   bool has_period = false;
1073
1074   for (node = 0; node < dfa->nodes_len; ++node)
1075     switch (dfa->nodes[node].type)
1076       {
1077       case CHARACTER:
1078         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1079           mb_chars = true;
1080         break;
1081       case ANCHOR:
1082         switch (dfa->nodes[node].opr.ctx_type)
1083           {
1084           case LINE_FIRST:
1085           case LINE_LAST:
1086           case BUF_FIRST:
1087           case BUF_LAST:
1088             break;
1089           default:
1090             /* Word anchors etc. cannot be handled.  It's okay to test
1091                opr.ctx_type since constraints (for all DFA nodes) are
1092                created by ORing one or more opr.ctx_type values.  */
1093             return;
1094           }
1095         break;
1096       case OP_PERIOD:
1097         has_period = true;
1098         break;
1099       case OP_BACK_REF:
1100       case OP_ALT:
1101       case END_OF_RE:
1102       case OP_DUP_ASTERISK:
1103       case OP_OPEN_SUBEXP:
1104       case OP_CLOSE_SUBEXP:
1105         break;
1106       case COMPLEX_BRACKET:
1107         return;
1108       case SIMPLE_BRACKET:
1109         /* Just double check.  */
1110         {
1111           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112                         ? 0
1113                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1114           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115             {
1116               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1117                 return;
1118               rshift = 0;
1119             }
1120         }
1121         break;
1122       default:
1123         abort ();
1124       }
1125
1126   if (mb_chars || has_period)
1127     for (node = 0; node < dfa->nodes_len; ++node)
1128       {
1129         if (dfa->nodes[node].type == CHARACTER
1130             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1131           dfa->nodes[node].mb_partial = 0;
1132         else if (dfa->nodes[node].type == OP_PERIOD)
1133           dfa->nodes[node].type = OP_UTF8_PERIOD;
1134       }
1135
1136   /* The search can be in single byte locale.  */
1137   dfa->mb_cur_max = 1;
1138   dfa->is_utf8 = 0;
1139   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1140 }
1141 #endif
1142 \f
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144    "eclosure", and "inveclosure".  */
1145
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1148 {
1149   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1150   reg_errcode_t ret;
1151
1152   /* Allocate arrays.  */
1153   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158           || dfa->eclosures == NULL, 0))
1159     return REG_ESPACE;
1160
1161   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162   if (dfa->subexp_map != NULL)
1163     {
1164       Idx i;
1165       for (i = 0; i < preg->re_nsub; i++)
1166         dfa->subexp_map[i] = i;
1167       preorder (dfa->str_tree, optimize_subexps, dfa);
1168       for (i = 0; i < preg->re_nsub; i++)
1169         if (dfa->subexp_map[i] != i)
1170           break;
1171       if (i == preg->re_nsub)
1172         {
1173           free (dfa->subexp_map);
1174           dfa->subexp_map = NULL;
1175         }
1176     }
1177
1178   ret = postorder (dfa->str_tree, lower_subexps, preg);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = postorder (dfa->str_tree, calc_first, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   preorder (dfa->str_tree, calc_next, dfa);
1185   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186   if (BE (ret != REG_NOERROR, 0))
1187     return ret;
1188   ret = calc_eclosure (dfa);
1189   if (BE (ret != REG_NOERROR, 0))
1190     return ret;
1191
1192   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1194   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1195       || dfa->nbackref)
1196     {
1197       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198       if (BE (dfa->inveclosures == NULL, 0))
1199         return REG_ESPACE;
1200       ret = calc_inveclosure (dfa);
1201     }
1202
1203   return ret;
1204 }
1205
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207    implement parse tree visits.  Instead, we use parent pointers and
1208    some hairy code in these two functions.  */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211            void *extra)
1212 {
1213   bin_tree_t *node, *prev;
1214
1215   for (node = root; ; )
1216     {
1217       /* Descend down the tree, preferably to the left (or to the right
1218          if that's the only child).  */
1219       while (node->left || node->right)
1220         if (node->left)
1221           node = node->left;
1222         else
1223           node = node->right;
1224
1225       do
1226         {
1227           reg_errcode_t err = fn (extra, node);
1228           if (BE (err != REG_NOERROR, 0))
1229             return err;
1230           if (node->parent == NULL)
1231             return REG_NOERROR;
1232           prev = node;
1233           node = node->parent;
1234         }
1235       /* Go up while we have a node that is reached from the right.  */
1236       while (node->right == prev || node->right == NULL);
1237       node = node->right;
1238     }
1239 }
1240
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1243           void *extra)
1244 {
1245   bin_tree_t *node;
1246
1247   for (node = root; ; )
1248     {
1249       reg_errcode_t err = fn (extra, node);
1250       if (BE (err != REG_NOERROR, 0))
1251         return err;
1252
1253       /* Go to the left node, or up and to the right.  */
1254       if (node->left)
1255         node = node->left;
1256       else
1257         {
1258           bin_tree_t *prev = NULL;
1259           while (node->right == prev || node->right == NULL)
1260             {
1261               prev = node;
1262               node = node->parent;
1263               if (!node)
1264                 return REG_NOERROR;
1265             }
1266           node = node->right;
1267         }
1268     }
1269 }
1270
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1273    backreferences as well.  Requires a preorder visit.  */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1276 {
1277   re_dfa_t *dfa = (re_dfa_t *) extra;
1278
1279   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280     {
1281       int idx = node->token.opr.idx;
1282       node->token.opr.idx = dfa->subexp_map[idx];
1283       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284     }
1285
1286   else if (node->token.type == SUBEXP
1287            && node->left && node->left->token.type == SUBEXP)
1288     {
1289       Idx other_idx = node->left->token.opr.idx;
1290
1291       node->left = node->left->left;
1292       if (node->left)
1293         node->left->parent = node;
1294
1295       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296       if (other_idx < BITSET_WORD_BITS)
1297         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1298     }
1299
1300   return REG_NOERROR;
1301 }
1302
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1307 {
1308   regex_t *preg = (regex_t *) extra;
1309   reg_errcode_t err = REG_NOERROR;
1310
1311   if (node->left && node->left->token.type == SUBEXP)
1312     {
1313       node->left = lower_subexp (&err, preg, node->left);
1314       if (node->left)
1315         node->left->parent = node;
1316     }
1317   if (node->right && node->right->token.type == SUBEXP)
1318     {
1319       node->right = lower_subexp (&err, preg, node->right);
1320       if (node->right)
1321         node->right->parent = node;
1322     }
1323
1324   return err;
1325 }
1326
1327 static bin_tree_t *
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 {
1330   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331   bin_tree_t *body = node->left;
1332   bin_tree_t *op, *cls, *tree1, *tree;
1333
1334   if (preg->no_sub
1335       /* We do not optimize empty subexpressions, because otherwise we may
1336          have bad CONCAT nodes with NULL children.  This is obviously not
1337          very common, so we do not lose much.  An example that triggers
1338          this case is the sed "script" /\(\)/x.  */
1339       && node->left != NULL
1340       && (node->token.opr.idx >= BITSET_WORD_BITS
1341           || !(dfa->used_bkref_map
1342                & ((bitset_word_t) 1 << node->token.opr.idx))))
1343     return node->left;
1344
1345   /* Convert the SUBEXP node to the concatenation of an
1346      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1347   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350   tree = create_tree (dfa, op, tree1, CONCAT);
1351   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352     {
1353       *err = REG_ESPACE;
1354       return NULL;
1355     }
1356
1357   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1359   return tree;
1360 }
1361
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363    nodes.  Requires a postorder visit.  */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = (re_dfa_t *) extra;
1368   if (node->token.type == CONCAT)
1369     {
1370       node->first = node->left->first;
1371       node->node_idx = node->left->node_idx;
1372     }
1373   else
1374     {
1375       node->first = node;
1376       node->node_idx = re_dfa_add_node (dfa, node->token);
1377       if (BE (node->node_idx == REG_MISSING, 0))
1378         return REG_ESPACE;
1379       if (node->token.type == ANCHOR)
1380         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381     }
1382   return REG_NOERROR;
1383 }
1384
1385 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1386 static reg_errcode_t
1387 calc_next (void *extra, bin_tree_t *node)
1388 {
1389   switch (node->token.type)
1390     {
1391     case OP_DUP_ASTERISK:
1392       node->left->next = node;
1393       break;
1394     case CONCAT:
1395       node->left->next = node->right->first;
1396       node->right->next = node->next;
1397       break;
1398     default:
1399       if (node->left)
1400         node->left->next = node->next;
1401       if (node->right)
1402         node->right->next = node->next;
1403       break;
1404     }
1405   return REG_NOERROR;
1406 }
1407
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 {
1412   re_dfa_t *dfa = (re_dfa_t *) extra;
1413   Idx idx = node->node_idx;
1414   reg_errcode_t err = REG_NOERROR;
1415
1416   switch (node->token.type)
1417     {
1418     case CONCAT:
1419       break;
1420
1421     case END_OF_RE:
1422       assert (node->next == NULL);
1423       break;
1424
1425     case OP_DUP_ASTERISK:
1426     case OP_ALT:
1427       {
1428         Idx left, right;
1429         dfa->has_plural_match = 1;
1430         if (node->left != NULL)
1431           left = node->left->first->node_idx;
1432         else
1433           left = node->next->node_idx;
1434         if (node->right != NULL)
1435           right = node->right->first->node_idx;
1436         else
1437           right = node->next->node_idx;
1438         assert (REG_VALID_INDEX (left));
1439         assert (REG_VALID_INDEX (right));
1440         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441       }
1442       break;
1443
1444     case ANCHOR:
1445     case OP_OPEN_SUBEXP:
1446     case OP_CLOSE_SUBEXP:
1447       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1448       break;
1449
1450     case OP_BACK_REF:
1451       dfa->nexts[idx] = node->next->node_idx;
1452       if (node->token.type == OP_BACK_REF)
1453         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1454       break;
1455
1456     default:
1457       assert (!IS_EPSILON_NODE (node->token.type));
1458       dfa->nexts[idx] = node->next->node_idx;
1459       break;
1460     }
1461
1462   return err;
1463 }
1464
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467    to their own constraint.  */
1468
1469 static reg_errcode_t
1470 internal_function
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472                         Idx root_node, unsigned int init_constraint)
1473 {
1474   Idx org_node, clone_node;
1475   bool ok;
1476   unsigned int constraint = init_constraint;
1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
1478     {
1479       Idx org_dest, clone_dest;
1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
1481         {
1482           /* If the back reference epsilon-transit, its destination must
1483              also have the constraint.  Then duplicate the epsilon closure
1484              of the destination of the back reference, and store it in
1485              edests of the back reference.  */
1486           org_dest = dfa->nexts[org_node];
1487           re_node_set_empty (dfa->edests + clone_node);
1488           clone_dest = duplicate_node (dfa, org_dest, constraint);
1489           if (BE (clone_dest == REG_MISSING, 0))
1490             return REG_ESPACE;
1491           dfa->nexts[clone_node] = dfa->nexts[org_node];
1492           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493           if (BE (! ok, 0))
1494             return REG_ESPACE;
1495         }
1496       else if (dfa->edests[org_node].nelem == 0)
1497         {
1498           /* In case of the node can't epsilon-transit, don't duplicate the
1499              destination and store the original destination as the
1500              destination of the node.  */
1501           dfa->nexts[clone_node] = dfa->nexts[org_node];
1502           break;
1503         }
1504       else if (dfa->edests[org_node].nelem == 1)
1505         {
1506           /* In case of the node can epsilon-transit, and it has only one
1507              destination.  */
1508           org_dest = dfa->edests[org_node].elems[0];
1509           re_node_set_empty (dfa->edests + clone_node);
1510           /* If the node is root_node itself, it means the epsilon closure
1511              has a loop.  Then tie it to the destination of the root_node.  */
1512           if (org_node == root_node && clone_node != org_node)
1513             {
1514               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515               if (BE (! ok, 0))
1516                 return REG_ESPACE;
1517               break;
1518             }
1519           /* In case the node has another constraint, append it.  */
1520           constraint |= dfa->nodes[org_node].constraint;
1521           clone_dest = duplicate_node (dfa, org_dest, constraint);
1522           if (BE (clone_dest == REG_MISSING, 0))
1523             return REG_ESPACE;
1524           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525           if (BE (! ok, 0))
1526             return REG_ESPACE;
1527         }
1528       else /* dfa->edests[org_node].nelem == 2 */
1529         {
1530           /* In case of the node can epsilon-transit, and it has two
1531              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532           org_dest = dfa->edests[org_node].elems[0];
1533           re_node_set_empty (dfa->edests + clone_node);
1534           /* Search for a duplicated node which satisfies the constraint.  */
1535           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536           if (clone_dest == REG_MISSING)
1537             {
1538               /* There is no such duplicated node, create a new one.  */
1539               reg_errcode_t err;
1540               clone_dest = duplicate_node (dfa, org_dest, constraint);
1541               if (BE (clone_dest == REG_MISSING, 0))
1542                 return REG_ESPACE;
1543               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (! ok, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There is a duplicated node which satisfies the constraint,
1554                  use it to avoid infinite loop.  */
1555               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (! ok, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           clone_dest = duplicate_node (dfa, org_dest, constraint);
1562           if (BE (clone_dest == REG_MISSING, 0))
1563             return REG_ESPACE;
1564           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (! ok, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static Idx
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579                         unsigned int constraint)
1580 {
1581   Idx idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585           && constraint == dfa->nodes[idx].constraint)
1586         return idx; /* Found.  */
1587     }
1588   return REG_MISSING; /* Not found.  */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    Return the index of the new node, or REG_MISSING if insufficient storage is
1593    available.  */
1594
1595 static Idx
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 {
1598   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599   if (BE (dup_idx != REG_MISSING, 1))
1600     {
1601       dfa->nodes[dup_idx].constraint = constraint;
1602       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603       dfa->nodes[dup_idx].duplicated = 1;
1604
1605       /* Store the index of the original node.  */
1606       dfa->org_indices[dup_idx] = org_idx;
1607     }
1608   return dup_idx;
1609 }
1610
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614   Idx src, idx;
1615   bool ok;
1616   for (idx = 0; idx < dfa->nodes_len; ++idx)
1617     re_node_set_init_empty (dfa->inveclosures + idx);
1618
1619   for (src = 0; src < dfa->nodes_len; ++src)
1620     {
1621       Idx *elems = dfa->eclosures[src].elems;
1622       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623         {
1624           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625           if (BE (! ok, 0))
1626             return REG_ESPACE;
1627         }
1628     }
1629
1630   return REG_NOERROR;
1631 }
1632
1633 /* Calculate "eclosure" for all the node in DFA.  */
1634
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1637 {
1638   Idx node_idx;
1639   bool incomplete;
1640 #ifdef DEBUG
1641   assert (dfa->nodes_len > 0);
1642 #endif
1643   incomplete = false;
1644   /* For each nodes, calculate epsilon closure.  */
1645   for (node_idx = 0; ; ++node_idx)
1646     {
1647       reg_errcode_t err;
1648       re_node_set eclosure_elem;
1649       if (node_idx == dfa->nodes_len)
1650         {
1651           if (!incomplete)
1652             break;
1653           incomplete = false;
1654           node_idx = 0;
1655         }
1656
1657 #ifdef DEBUG
1658       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659 #endif
1660
1661       /* If we have already calculated, skip it.  */
1662       if (dfa->eclosures[node_idx].nelem != 0)
1663         continue;
1664       /* Calculate epsilon closure of `node_idx'.  */
1665       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666       if (BE (err != REG_NOERROR, 0))
1667         return err;
1668
1669       if (dfa->eclosures[node_idx].nelem == 0)
1670         {
1671           incomplete = true;
1672           re_node_set_free (&eclosure_elem);
1673         }
1674     }
1675   return REG_NOERROR;
1676 }
1677
1678 /* Calculate epsilon closure of NODE.  */
1679
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682 {
1683   reg_errcode_t err;
1684   Idx i;
1685   re_node_set eclosure;
1686   bool ok;
1687   bool incomplete = false;
1688   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689   if (BE (err != REG_NOERROR, 0))
1690     return err;
1691
1692   /* This indicates that we are calculating this node now.
1693      We reference this value to avoid infinite loop.  */
1694   dfa->eclosures[node].nelem = REG_MISSING;
1695
1696   /* If the current node has constraints, duplicate all nodes
1697      since they must inherit the constraints.  */
1698   if (dfa->nodes[node].constraint
1699       && dfa->edests[node].nelem
1700       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701     {
1702       err = duplicate_node_closure (dfa, node, node, node,
1703                                     dfa->nodes[node].constraint);
1704       if (BE (err != REG_NOERROR, 0))
1705         return err;
1706     }
1707
1708   /* Expand each epsilon destination nodes.  */
1709   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710     for (i = 0; i < dfa->edests[node].nelem; ++i)
1711       {
1712         re_node_set eclosure_elem;
1713         Idx edest = dfa->edests[node].elems[i];
1714         /* If calculating the epsilon closure of `edest' is in progress,
1715            return intermediate result.  */
1716         if (dfa->eclosures[edest].nelem == REG_MISSING)
1717           {
1718             incomplete = true;
1719             continue;
1720           }
1721         /* If we haven't calculated the epsilon closure of `edest' yet,
1722            calculate now. Otherwise use calculated epsilon closure.  */
1723         if (dfa->eclosures[edest].nelem == 0)
1724           {
1725             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1726             if (BE (err != REG_NOERROR, 0))
1727               return err;
1728           }
1729         else
1730           eclosure_elem = dfa->eclosures[edest];
1731         /* Merge the epsilon closure of `edest'.  */
1732         err = re_node_set_merge (&eclosure, &eclosure_elem);
1733         if (BE (err != REG_NOERROR, 0))
1734           return err;
1735         /* If the epsilon closure of `edest' is incomplete,
1736            the epsilon closure of this node is also incomplete.  */
1737         if (dfa->eclosures[edest].nelem == 0)
1738           {
1739             incomplete = true;
1740             re_node_set_free (&eclosure_elem);
1741           }
1742       }
1743
1744   /* An epsilon closure includes itself.  */
1745   ok = re_node_set_insert (&eclosure, node);
1746   if (BE (! ok, 0))
1747     return REG_ESPACE;
1748   if (incomplete && !root)
1749     dfa->eclosures[node].nelem = 0;
1750   else
1751     dfa->eclosures[node] = eclosure;
1752   *new_set = eclosure;
1753   return REG_NOERROR;
1754 }
1755 \f
1756 /* Functions for token which are used in the parser.  */
1757
1758 /* Fetch a token from INPUT.
1759    We must not use this function inside bracket expressions.  */
1760
1761 static void
1762 internal_function
1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   re_string_skip_bytes (input, peek_token (result, input, syntax));
1766 }
1767
1768 /* Peek a token from INPUT, and return the length of the token.
1769    We must not use this function inside bracket expressions.  */
1770
1771 static int
1772 internal_function
1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774 {
1775   unsigned char c;
1776
1777   if (re_string_eoi (input))
1778     {
1779       token->type = END_OF_RE;
1780       return 0;
1781     }
1782
1783   c = re_string_peek_byte (input, 0);
1784   token->opr.c = c;
1785
1786   token->word_char = 0;
1787 #ifdef RE_ENABLE_I18N
1788   token->mb_partial = 0;
1789   if (input->mb_cur_max > 1 &&
1790       !re_string_first_byte (input, re_string_cur_idx (input)))
1791     {
1792       token->type = CHARACTER;
1793       token->mb_partial = 1;
1794       return 1;
1795     }
1796 #endif
1797   if (c == '\\')
1798     {
1799       unsigned char c2;
1800       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801         {
1802           token->type = BACK_SLASH;
1803           return 1;
1804         }
1805
1806       c2 = re_string_peek_byte_case (input, 1);
1807       token->opr.c = c2;
1808       token->type = CHARACTER;
1809 #ifdef RE_ENABLE_I18N
1810       if (input->mb_cur_max > 1)
1811         {
1812           wint_t wc = re_string_wchar_at (input,
1813                                           re_string_cur_idx (input) + 1);
1814           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815         }
1816       else
1817 #endif
1818         token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820       switch (c2)
1821         {
1822         case '|':
1823           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824             token->type = OP_ALT;
1825           break;
1826         case '1': case '2': case '3': case '4': case '5':
1827         case '6': case '7': case '8': case '9':
1828           if (!(syntax & RE_NO_BK_REFS))
1829             {
1830               token->type = OP_BACK_REF;
1831               token->opr.idx = c2 - '1';
1832             }
1833           break;
1834         case '<':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             {
1837               token->type = ANCHOR;
1838               token->opr.ctx_type = WORD_FIRST;
1839             }
1840           break;
1841         case '>':
1842           if (!(syntax & RE_NO_GNU_OPS))
1843             {
1844               token->type = ANCHOR;
1845               token->opr.ctx_type = WORD_LAST;
1846             }
1847           break;
1848         case 'b':
1849           if (!(syntax & RE_NO_GNU_OPS))
1850             {
1851               token->type = ANCHOR;
1852               token->opr.ctx_type = WORD_DELIM;
1853             }
1854           break;
1855         case 'B':
1856           if (!(syntax & RE_NO_GNU_OPS))
1857             {
1858               token->type = ANCHOR;
1859               token->opr.ctx_type = NOT_WORD_DELIM;
1860             }
1861           break;
1862         case 'w':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_WORD;
1865           break;
1866         case 'W':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTWORD;
1869           break;
1870         case 's':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_SPACE;
1873           break;
1874         case 'S':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_NOTSPACE;
1877           break;
1878         case '`':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = BUF_FIRST;
1883             }
1884           break;
1885         case '\'':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             {
1888               token->type = ANCHOR;
1889               token->opr.ctx_type = BUF_LAST;
1890             }
1891           break;
1892         case '(':
1893           if (!(syntax & RE_NO_BK_PARENS))
1894             token->type = OP_OPEN_SUBEXP;
1895           break;
1896         case ')':
1897           if (!(syntax & RE_NO_BK_PARENS))
1898             token->type = OP_CLOSE_SUBEXP;
1899           break;
1900         case '+':
1901           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902             token->type = OP_DUP_PLUS;
1903           break;
1904         case '?':
1905           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906             token->type = OP_DUP_QUESTION;
1907           break;
1908         case '{':
1909           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910             token->type = OP_OPEN_DUP_NUM;
1911           break;
1912         case '}':
1913           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914             token->type = OP_CLOSE_DUP_NUM;
1915           break;
1916         default:
1917           break;
1918         }
1919       return 2;
1920     }
1921
1922   token->type = CHARACTER;
1923 #ifdef RE_ENABLE_I18N
1924   if (input->mb_cur_max > 1)
1925     {
1926       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928     }
1929   else
1930 #endif
1931     token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933   switch (c)
1934     {
1935     case '\n':
1936       if (syntax & RE_NEWLINE_ALT)
1937         token->type = OP_ALT;
1938       break;
1939     case '|':
1940       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941         token->type = OP_ALT;
1942       break;
1943     case '*':
1944       token->type = OP_DUP_ASTERISK;
1945       break;
1946     case '+':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948         token->type = OP_DUP_PLUS;
1949       break;
1950     case '?':
1951       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952         token->type = OP_DUP_QUESTION;
1953       break;
1954     case '{':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956         token->type = OP_OPEN_DUP_NUM;
1957       break;
1958     case '}':
1959       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960         token->type = OP_CLOSE_DUP_NUM;
1961       break;
1962     case '(':
1963       if (syntax & RE_NO_BK_PARENS)
1964         token->type = OP_OPEN_SUBEXP;
1965       break;
1966     case ')':
1967       if (syntax & RE_NO_BK_PARENS)
1968         token->type = OP_CLOSE_SUBEXP;
1969       break;
1970     case '[':
1971       token->type = OP_OPEN_BRACKET;
1972       break;
1973     case '.':
1974       token->type = OP_PERIOD;
1975       break;
1976     case '^':
1977       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978           re_string_cur_idx (input) != 0)
1979         {
1980           char prev = re_string_peek_byte (input, -1);
1981           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982             break;
1983         }
1984       token->type = ANCHOR;
1985       token->opr.ctx_type = LINE_FIRST;
1986       break;
1987     case '$':
1988       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989           re_string_cur_idx (input) + 1 != re_string_length (input))
1990         {
1991           re_token_t next;
1992           re_string_skip_bytes (input, 1);
1993           peek_token (&next, input, syntax);
1994           re_string_skip_bytes (input, -1);
1995           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996             break;
1997         }
1998       token->type = ANCHOR;
1999       token->opr.ctx_type = LINE_LAST;
2000       break;
2001     default:
2002       break;
2003     }
2004   return 1;
2005 }
2006
2007 /* Peek a token from INPUT, and return the length of the token.
2008    We must not use this function out of bracket expressions.  */
2009
2010 static int
2011 internal_function
2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 {
2014   unsigned char c;
2015   if (re_string_eoi (input))
2016     {
2017       token->type = END_OF_RE;
2018       return 0;
2019     }
2020   c = re_string_peek_byte (input, 0);
2021   token->opr.c = c;
2022
2023 #ifdef RE_ENABLE_I18N
2024   if (input->mb_cur_max > 1 &&
2025       !re_string_first_byte (input, re_string_cur_idx (input)))
2026     {
2027       token->type = CHARACTER;
2028       return 1;
2029     }
2030 #endif /* RE_ENABLE_I18N */
2031
2032   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033       && re_string_cur_idx (input) + 1 < re_string_length (input))
2034     {
2035       /* In this case, '\' escape a character.  */
2036       unsigned char c2;
2037       re_string_skip_bytes (input, 1);
2038       c2 = re_string_peek_byte (input, 0);
2039       token->opr.c = c2;
2040       token->type = CHARACTER;
2041       return 1;
2042     }
2043   if (c == '[') /* '[' is a special char in a bracket exps.  */
2044     {
2045       unsigned char c2;
2046       int token_len;
2047       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048         c2 = re_string_peek_byte (input, 1);
2049       else
2050         c2 = 0;
2051       token->opr.c = c2;
2052       token_len = 2;
2053       switch (c2)
2054         {
2055         case '.':
2056           token->type = OP_OPEN_COLL_ELEM;
2057           break;
2058         case '=':
2059           token->type = OP_OPEN_EQUIV_CLASS;
2060           break;
2061         case ':':
2062           if (syntax & RE_CHAR_CLASSES)
2063             {
2064               token->type = OP_OPEN_CHAR_CLASS;
2065               break;
2066             }
2067           /* else fall through.  */
2068         default:
2069           token->type = CHARACTER;
2070           token->opr.c = c;
2071           token_len = 1;
2072           break;
2073         }
2074       return token_len;
2075     }
2076   switch (c)
2077     {
2078     case '-':
2079       token->type = OP_CHARSET_RANGE;
2080       break;
2081     case ']':
2082       token->type = OP_CLOSE_BRACKET;
2083       break;
2084     case '^':
2085       token->type = OP_NON_MATCH_LIST;
2086       break;
2087     default:
2088       token->type = CHARACTER;
2089     }
2090   return 1;
2091 }
2092 \f
2093 /* Functions for parser.  */
2094
2095 /* Entry point of the parser.
2096    Parse the regular expression REGEXP and return the structure tree.
2097    If an error is occured, ERR is set by error code, and return NULL.
2098    This function build the following tree, from regular expression <reg_exp>:
2099            CAT
2100            / \
2101           /   \
2102    <reg_exp>  EOR
2103
2104    CAT means concatenation.
2105    EOR means end of regular expression.  */
2106
2107 static bin_tree_t *
2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109        reg_errcode_t *err)
2110 {
2111   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112   bin_tree_t *tree, *eor, *root;
2113   re_token_t current_token;
2114   dfa->syntax = syntax;
2115   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118     return NULL;
2119   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120   if (tree != NULL)
2121     root = create_tree (dfa, tree, eor, CONCAT);
2122   else
2123     root = eor;
2124   if (BE (eor == NULL || root == NULL, 0))
2125     {
2126       *err = REG_ESPACE;
2127       return NULL;
2128     }
2129   return root;
2130 }
2131
2132 /* This function build the following tree, from regular expression
2133    <branch1>|<branch2>:
2134            ALT
2135            / \
2136           /   \
2137    <branch1> <branch2>
2138
2139    ALT means alternative, which represents the operator `|'.  */
2140
2141 static bin_tree_t *
2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 {
2145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146   bin_tree_t *tree, *branch = NULL;
2147   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150
2151   while (token->type == OP_ALT)
2152     {
2153       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154       if (token->type != OP_ALT && token->type != END_OF_RE
2155           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156         {
2157           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159             return NULL;
2160         }
2161       else
2162         branch = NULL;
2163       tree = create_tree (dfa, tree, branch, OP_ALT);
2164       if (BE (tree == NULL, 0))
2165         {
2166           *err = REG_ESPACE;
2167           return NULL;
2168         }
2169     }
2170   return tree;
2171 }
2172
2173 /* This function build the following tree, from regular expression
2174    <exp1><exp2>:
2175         CAT
2176         / \
2177        /   \
2178    <exp1> <exp2>
2179
2180    CAT means concatenation.  */
2181
2182 static bin_tree_t *
2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 {
2186   bin_tree_t *tree, *expr;
2187   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190     return NULL;
2191
2192   while (token->type != OP_ALT && token->type != END_OF_RE
2193          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194     {
2195       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197         {
2198           return NULL;
2199         }
2200       if (tree != NULL && expr != NULL)
2201         {
2202           tree = create_tree (dfa, tree, expr, CONCAT);
2203           if (tree == NULL)
2204             {
2205               *err = REG_ESPACE;
2206               return NULL;
2207             }
2208         }
2209       else if (tree == NULL)
2210         tree = expr;
2211       /* Otherwise expr == NULL, we don't need to create new tree.  */
2212     }
2213   return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression a*:
2217          *
2218          |
2219          a
2220 */
2221
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 {
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   bin_tree_t *tree;
2228   switch (token->type)
2229     {
2230     case CHARACTER:
2231       tree = create_token_tree (dfa, NULL, NULL, token);
2232       if (BE (tree == NULL, 0))
2233         {
2234           *err = REG_ESPACE;
2235           return NULL;
2236         }
2237 #ifdef RE_ENABLE_I18N
2238       if (dfa->mb_cur_max > 1)
2239         {
2240           while (!re_string_eoi (regexp)
2241                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242             {
2243               bin_tree_t *mbc_remain;
2244               fetch_token (token, regexp, syntax);
2245               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247               if (BE (mbc_remain == NULL || tree == NULL, 0))
2248                 {
2249                   *err = REG_ESPACE;
2250                   return NULL;
2251                 }
2252             }
2253         }
2254 #endif
2255       break;
2256     case OP_OPEN_SUBEXP:
2257       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259         return NULL;
2260       break;
2261     case OP_OPEN_BRACKET:
2262       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264         return NULL;
2265       break;
2266     case OP_BACK_REF:
2267       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268         {
2269           *err = REG_ESUBREG;
2270           return NULL;
2271         }
2272       dfa->used_bkref_map |= 1 << token->opr.idx;
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275         {
2276           *err = REG_ESPACE;
2277           return NULL;
2278         }
2279       ++dfa->nbackref;
2280       dfa->has_mb_node = 1;
2281       break;
2282     case OP_OPEN_DUP_NUM:
2283       if (syntax & RE_CONTEXT_INVALID_DUP)
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       /* FALLTHROUGH */
2289     case OP_DUP_ASTERISK:
2290     case OP_DUP_PLUS:
2291     case OP_DUP_QUESTION:
2292       if (syntax & RE_CONTEXT_INVALID_OPS)
2293         {
2294           *err = REG_BADRPT;
2295           return NULL;
2296         }
2297       else if (syntax & RE_CONTEXT_INDEP_OPS)
2298         {
2299           fetch_token (token, regexp, syntax);
2300           return parse_expression (regexp, preg, token, syntax, nest, err);
2301         }
2302       /* else fall through  */
2303     case OP_CLOSE_SUBEXP:
2304       if ((token->type == OP_CLOSE_SUBEXP) &&
2305           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306         {
2307           *err = REG_ERPAREN;
2308           return NULL;
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_DUP_NUM:
2312       /* We treat it as a normal character.  */
2313
2314       /* Then we can these characters as normal characters.  */
2315       token->type = CHARACTER;
2316       /* mb_partial and word_char bits should be initialized already
2317          by peek_token.  */
2318       tree = create_token_tree (dfa, NULL, NULL, token);
2319       if (BE (tree == NULL, 0))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324       break;
2325     case ANCHOR:
2326       if ((token->opr.ctx_type
2327            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328           && dfa->word_ops_used == 0)
2329         init_word_char (dfa);
2330       if (token->opr.ctx_type == WORD_DELIM
2331           || token->opr.ctx_type == NOT_WORD_DELIM)
2332         {
2333           bin_tree_t *tree_first, *tree_last;
2334           if (token->opr.ctx_type == WORD_DELIM)
2335             {
2336               token->opr.ctx_type = WORD_FIRST;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = WORD_LAST;
2339             }
2340           else
2341             {
2342               token->opr.ctx_type = INSIDE_WORD;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = INSIDE_NOTWORD;
2345             }
2346           tree_last = create_token_tree (dfa, NULL, NULL, token);
2347           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349             {
2350               *err = REG_ESPACE;
2351               return NULL;
2352             }
2353         }
2354       else
2355         {
2356           tree = create_token_tree (dfa, NULL, NULL, token);
2357           if (BE (tree == NULL, 0))
2358             {
2359               *err = REG_ESPACE;
2360               return NULL;
2361             }
2362         }
2363       /* We must return here, since ANCHORs can't be followed
2364          by repetition operators.
2365          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367       fetch_token (token, regexp, syntax);
2368       return tree;
2369     case OP_PERIOD:
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372         {
2373           *err = REG_ESPACE;
2374           return NULL;
2375         }
2376       if (dfa->mb_cur_max > 1)
2377         dfa->has_mb_node = 1;
2378       break;
2379     case OP_WORD:
2380     case OP_NOTWORD:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382                                  (const unsigned char *) "alnum",
2383                                  (const unsigned char *) "_",
2384                                  token->type == OP_NOTWORD, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386         return NULL;
2387       break;
2388     case OP_SPACE:
2389     case OP_NOTSPACE:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391                                  (const unsigned char *) "space",
2392                                  (const unsigned char *) "",
2393                                  token->type == OP_NOTSPACE, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395         return NULL;
2396       break;
2397     case OP_ALT:
2398     case END_OF_RE:
2399       return NULL;
2400     case BACK_SLASH:
2401       *err = REG_EESCAPE;
2402       return NULL;
2403     default:
2404       /* Must not happen?  */
2405 #ifdef DEBUG
2406       assert (0);
2407 #endif
2408       return NULL;
2409     }
2410   fetch_token (token, regexp, syntax);
2411
2412   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414     {
2415       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417         return NULL;
2418       /* In BRE consecutive duplications are not allowed.  */
2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
2420           && (token->type == OP_DUP_ASTERISK
2421               || token->type == OP_OPEN_DUP_NUM))
2422         {
2423           *err = REG_BADRPT;
2424           return NULL;
2425         }
2426     }
2427
2428   return tree;
2429 }
2430
2431 /* This function build the following tree, from regular expression
2432    (<reg_exp>):
2433          SUBEXP
2434             |
2435         <reg_exp>
2436 */
2437
2438 static bin_tree_t *
2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 {
2442   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443   bin_tree_t *tree;
2444   size_t cur_nsub;
2445   cur_nsub = preg->re_nsub++;
2446
2447   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449   /* The subexpression may be a null string.  */
2450   if (token->type == OP_CLOSE_SUBEXP)
2451     tree = NULL;
2452   else
2453     {
2454       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456         *err = REG_EPAREN;
2457       if (BE (*err != REG_NOERROR, 0))
2458         return NULL;
2459     }
2460
2461   if (cur_nsub <= '9' - '1')
2462     dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464   tree = create_tree (dfa, tree, NULL, SUBEXP);
2465   if (BE (tree == NULL, 0))
2466     {
2467       *err = REG_ESPACE;
2468       return NULL;
2469     }
2470   tree->token.opr.idx = cur_nsub;
2471   return tree;
2472 }
2473
2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475
2476 static bin_tree_t *
2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 {
2480   bin_tree_t *tree = NULL, *old_tree = NULL;
2481   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482   re_token_t start_token = *token;
2483
2484   if (token->type == OP_OPEN_DUP_NUM)
2485     {
2486       end = 0;
2487       start = fetch_number (regexp, token, syntax);
2488       if (start == REG_MISSING)
2489         {
2490           if (token->type == CHARACTER && token->opr.c == ',')
2491             start = 0; /* We treat "{,m}" as "{0,m}".  */
2492           else
2493             {
2494               *err = REG_BADBR; /* <re>{} is invalid.  */
2495               return NULL;
2496             }
2497         }
2498       if (BE (start != REG_ERROR, 1))
2499         {
2500           /* We treat "{n}" as "{n,n}".  */
2501           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502                  : ((token->type == CHARACTER && token->opr.c == ',')
2503                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504         }
2505       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506         {
2507           /* Invalid sequence.  */
2508           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509             {
2510               if (token->type == END_OF_RE)
2511                 *err = REG_EBRACE;
2512               else
2513                 *err = REG_BADBR;
2514
2515               return NULL;
2516             }
2517
2518           /* If the syntax bit is set, rollback.  */
2519           re_string_set_index (regexp, start_idx);
2520           *token = start_token;
2521           token->type = CHARACTER;
2522           /* mb_partial and word_char bits should be already initialized by
2523              peek_token.  */
2524           return elem;
2525         }
2526
2527       if (BE ((end != REG_MISSING && start > end)
2528               || token->type != OP_CLOSE_DUP_NUM, 0))
2529         {
2530           /* First number greater than second.  */
2531           *err = REG_BADBR;
2532           return NULL;
2533         }
2534     }
2535   else
2536     {
2537       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539     }
2540
2541   fetch_token (token, regexp, syntax);
2542
2543   if (BE (elem == NULL, 0))
2544     return NULL;
2545   if (BE (start == 0 && end == 0, 0))
2546     {
2547       postorder (elem, free_tree, NULL);
2548       return NULL;
2549     }
2550
2551   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552   if (BE (start > 0, 0))
2553     {
2554       tree = elem;
2555       for (i = 2; i <= start; ++i)
2556         {
2557           elem = duplicate_tree (elem, dfa);
2558           tree = create_tree (dfa, tree, elem, CONCAT);
2559           if (BE (elem == NULL || tree == NULL, 0))
2560             goto parse_dup_op_espace;
2561         }
2562
2563       if (start == end)
2564         return tree;
2565
2566       /* Duplicate ELEM before it is marked optional.  */
2567       elem = duplicate_tree (elem, dfa);
2568       old_tree = tree;
2569     }
2570   else
2571     old_tree = NULL;
2572
2573   if (elem->token.type == SUBEXP)
2574     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576   tree = create_tree (dfa, elem, NULL,
2577                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578   if (BE (tree == NULL, 0))
2579     goto parse_dup_op_espace;
2580
2581   /* This loop is actually executed only when end != REG_MISSING,
2582      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2583      already created the start+1-th copy.  */
2584   if ((Idx) -1 < 0 || end != REG_MISSING)
2585     for (i = start + 2; i <= end; ++i)
2586       {
2587         elem = duplicate_tree (elem, dfa);
2588         tree = create_tree (dfa, tree, elem, CONCAT);
2589         if (BE (elem == NULL || tree == NULL, 0))
2590           goto parse_dup_op_espace;
2591
2592         tree = create_tree (dfa, tree, NULL, OP_ALT);
2593         if (BE (tree == NULL, 0))
2594           goto parse_dup_op_espace;
2595       }
2596
2597   if (old_tree)
2598     tree = create_tree (dfa, old_tree, tree, CONCAT);
2599
2600   return tree;
2601
2602  parse_dup_op_espace:
2603   *err = REG_ESPACE;
2604   return NULL;
2605 }
2606
2607 /* Size of the names for collating symbol/equivalence_class/character_class.
2608    I'm not sure, but maybe enough.  */
2609 #define BRACKET_NAME_BUF_SIZE 32
2610
2611 #ifndef _LIBC
2612   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2613      Build the range expression which starts from START_ELEM, and ends
2614      at END_ELEM.  The result are written to MBCSET and SBCSET.
2615      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2616      mbcset->range_ends, is a pointer argument sinse we may
2617      update it.  */
2618
2619 static reg_errcode_t
2620 internal_function
2621 # ifdef RE_ENABLE_I18N
2622 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2623                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2624 # else /* not RE_ENABLE_I18N */
2625 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2626                  bracket_elem_t *end_elem)
2627 # endif /* not RE_ENABLE_I18N */
2628 {
2629   unsigned int start_ch, end_ch;
2630   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2631   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2632           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2633           0))
2634     return REG_ERANGE;
2635
2636   /* We can handle no multi character collating elements without libc
2637      support.  */
2638   if (BE ((start_elem->type == COLL_SYM
2639            && strlen ((char *) start_elem->opr.name) > 1)
2640           || (end_elem->type == COLL_SYM
2641               && strlen ((char *) end_elem->opr.name) > 1), 0))
2642     return REG_ECOLLATE;
2643
2644 # ifdef RE_ENABLE_I18N
2645   {
2646     wchar_t wc;
2647     wint_t start_wc;
2648     wint_t end_wc;
2649     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2650
2651     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2652                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2653                    : 0));
2654     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2655               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2656                  : 0));
2657     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2658                 ? __btowc (start_ch) : start_elem->opr.wch);
2659     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2660               ? __btowc (end_ch) : end_elem->opr.wch);
2661     if (start_wc == WEOF || end_wc == WEOF)
2662       return REG_ECOLLATE;
2663     cmp_buf[0] = start_wc;
2664     cmp_buf[4] = end_wc;
2665     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2666       return REG_ERANGE;
2667
2668     /* Got valid collation sequence values, add them as a new entry.
2669        However, for !_LIBC we have no collation elements: if the
2670        character set is single byte, the single byte character set
2671        that we build below suffices.  parse_bracket_exp passes
2672        no MBCSET if dfa->mb_cur_max == 1.  */
2673     if (mbcset)
2674       {
2675         /* Check the space of the arrays.  */
2676         if (BE (*range_alloc == mbcset->nranges, 0))
2677           {
2678             /* There is not enough space, need realloc.  */
2679             wchar_t *new_array_start, *new_array_end;
2680             Idx new_nranges;
2681
2682             /* +1 in case of mbcset->nranges is 0.  */
2683             new_nranges = 2 * mbcset->nranges + 1;
2684             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2685                are NULL if *range_alloc == 0.  */
2686             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2687                                           new_nranges);
2688             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2689                                         new_nranges);
2690
2691             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2692               return REG_ESPACE;
2693
2694             mbcset->range_starts = new_array_start;
2695             mbcset->range_ends = new_array_end;
2696             *range_alloc = new_nranges;
2697           }
2698
2699         mbcset->range_starts[mbcset->nranges] = start_wc;
2700         mbcset->range_ends[mbcset->nranges++] = end_wc;
2701       }
2702
2703     /* Build the table for single byte characters.  */
2704     for (wc = 0; wc < SBC_MAX; ++wc)
2705       {
2706         cmp_buf[2] = wc;
2707         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2708             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2709           bitset_set (sbcset, wc);
2710       }
2711   }
2712 # else /* not RE_ENABLE_I18N */
2713   {
2714     unsigned int ch;
2715     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2716                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2717                    : 0));
2718     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2719               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2720                  : 0));
2721     if (start_ch > end_ch)
2722       return REG_ERANGE;
2723     /* Build the table for single byte characters.  */
2724     for (ch = 0; ch < SBC_MAX; ++ch)
2725       if (start_ch <= ch  && ch <= end_ch)
2726         bitset_set (sbcset, ch);
2727   }
2728 # endif /* not RE_ENABLE_I18N */
2729   return REG_NOERROR;
2730 }
2731 #endif /* not _LIBC */
2732
2733 #ifndef _LIBC
2734 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2735    Build the collating element which is represented by NAME.
2736    The result are written to MBCSET and SBCSET.
2737    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2738    pointer argument since we may update it.  */
2739
2740 static reg_errcode_t
2741 internal_function
2742 build_collating_symbol (bitset_t sbcset,
2743 # ifdef RE_ENABLE_I18N
2744                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2745 # endif
2746                         const unsigned char *name)
2747 {
2748   size_t name_len = strlen ((const char *) name);
2749   if (BE (name_len != 1, 0))
2750     return REG_ECOLLATE;
2751   else
2752     {
2753       bitset_set (sbcset, name[0]);
2754       return REG_NOERROR;
2755     }
2756 }
2757 #endif /* not _LIBC */
2758
2759 /* This function parse bracket expression like "[abc]", "[a-c]",
2760    "[[.a-a.]]" etc.  */
2761
2762 static bin_tree_t *
2763 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2764                    reg_syntax_t syntax, reg_errcode_t *err)
2765 {
2766 #ifdef _LIBC
2767   const unsigned char *collseqmb;
2768   const char *collseqwc;
2769   uint32_t nrules;
2770   int32_t table_size;
2771   const int32_t *symb_table;
2772   const unsigned char *extra;
2773
2774   /* Local function for parse_bracket_exp used in _LIBC environement.
2775      Seek the collating symbol entry correspondings to NAME.
2776      Return the index of the symbol in the SYMB_TABLE.  */
2777
2778   auto inline int32_t
2779   __attribute ((always_inline))
2780   seek_collating_symbol_entry (name, name_len)
2781          const unsigned char *name;
2782          size_t name_len;
2783     {
2784       int32_t hash = elem_hash ((const char *) name, name_len);
2785       int32_t elem = hash % table_size;
2786       if (symb_table[2 * elem] != 0)
2787         {
2788           int32_t second = hash % (table_size - 2) + 1;
2789
2790           do
2791             {
2792               /* First compare the hashing value.  */
2793               if (symb_table[2 * elem] == hash
2794                   /* Compare the length of the name.  */
2795                   && name_len == extra[symb_table[2 * elem + 1]]
2796                   /* Compare the name.  */
2797                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2798                              name_len) == 0)
2799                 {
2800                   /* Yep, this is the entry.  */
2801                   break;
2802                 }
2803
2804               /* Next entry.  */
2805               elem += second;
2806             }
2807           while (symb_table[2 * elem] != 0);
2808         }
2809       return elem;
2810     }
2811
2812   /* Local function for parse_bracket_exp used in _LIBC environment.
2813      Look up the collation sequence value of BR_ELEM.
2814      Return the value if succeeded, UINT_MAX otherwise.  */
2815
2816   auto inline unsigned int
2817   __attribute ((always_inline))
2818   lookup_collation_sequence_value (br_elem)
2819          bracket_elem_t *br_elem;
2820     {
2821       if (br_elem->type == SB_CHAR)
2822         {
2823           /*
2824           if (MB_CUR_MAX == 1)
2825           */
2826           if (nrules == 0)
2827             return collseqmb[br_elem->opr.ch];
2828           else
2829             {
2830               wint_t wc = __btowc (br_elem->opr.ch);
2831               return __collseq_table_lookup (collseqwc, wc);
2832             }
2833         }
2834       else if (br_elem->type == MB_CHAR)
2835         {
2836           if (nrules != 0)
2837             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2838         }
2839       else if (br_elem->type == COLL_SYM)
2840         {
2841           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2842           if (nrules != 0)
2843             {
2844               int32_t elem, idx;
2845               elem = seek_collating_symbol_entry (br_elem->opr.name,
2846                                                   sym_name_len);
2847               if (symb_table[2 * elem] != 0)
2848                 {
2849                   /* We found the entry.  */
2850                   idx = symb_table[2 * elem + 1];
2851                   /* Skip the name of collating element name.  */
2852                   idx += 1 + extra[idx];
2853                   /* Skip the byte sequence of the collating element.  */
2854                   idx += 1 + extra[idx];
2855                   /* Adjust for the alignment.  */
2856                   idx = (idx + 3) & ~3;
2857                   /* Skip the multibyte collation sequence value.  */
2858                   idx += sizeof (unsigned int);
2859                   /* Skip the wide char sequence of the collating element.  */
2860                   idx += sizeof (unsigned int) *
2861                     (1 + *(unsigned int *) (extra + idx));
2862                   /* Return the collation sequence value.  */
2863                   return *(unsigned int *) (extra + idx);
2864                 }
2865               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2866                 {
2867                   /* No valid character.  Match it as a single byte
2868                      character.  */
2869                   return collseqmb[br_elem->opr.name[0]];
2870                 }
2871             }
2872           else if (sym_name_len == 1)
2873             return collseqmb[br_elem->opr.name[0]];
2874         }
2875       return UINT_MAX;
2876     }
2877
2878   /* Local function for parse_bracket_exp used in _LIBC environement.
2879      Build the range expression which starts from START_ELEM, and ends
2880      at END_ELEM.  The result are written to MBCSET and SBCSET.
2881      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2882      mbcset->range_ends, is a pointer argument sinse we may
2883      update it.  */
2884
2885   auto inline reg_errcode_t
2886   __attribute ((always_inline))
2887   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2888          re_charset_t *mbcset;
2889          Idx *range_alloc;
2890          bitset_t sbcset;
2891          bracket_elem_t *start_elem, *end_elem;
2892     {
2893       unsigned int ch;
2894       uint32_t start_collseq;
2895       uint32_t end_collseq;
2896
2897       /* Equivalence Classes and Character Classes can't be a range
2898          start/end.  */
2899       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2900               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2901               0))
2902         return REG_ERANGE;
2903
2904       start_collseq = lookup_collation_sequence_value (start_elem);
2905       end_collseq = lookup_collation_sequence_value (end_elem);
2906       /* Check start/end collation sequence values.  */
2907       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2908         return REG_ECOLLATE;
2909       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2910         return REG_ERANGE;
2911
2912       /* Got valid collation sequence values, add them as a new entry.
2913          However, if we have no collation elements, and the character set
2914          is single byte, the single byte character set that we
2915          build below suffices. */
2916       if (nrules > 0 || dfa->mb_cur_max > 1)
2917         {
2918           /* Check the space of the arrays.  */
2919           if (BE (*range_alloc == mbcset->nranges, 0))
2920             {
2921               /* There is not enough space, need realloc.  */
2922               uint32_t *new_array_start;
2923               uint32_t *new_array_end;
2924               Idx new_nranges;
2925
2926               /* +1 in case of mbcset->nranges is 0.  */
2927               new_nranges = 2 * mbcset->nranges + 1;
2928               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2929                                             new_nranges);
2930               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2931                                           new_nranges);
2932
2933               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2934                 return REG_ESPACE;
2935
2936               mbcset->range_starts = new_array_start;
2937               mbcset->range_ends = new_array_end;
2938               *range_alloc = new_nranges;
2939             }
2940
2941           mbcset->range_starts[mbcset->nranges] = start_collseq;
2942           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2943         }
2944
2945       /* Build the table for single byte characters.  */
2946       for (ch = 0; ch < SBC_MAX; ch++)
2947         {
2948           uint32_t ch_collseq;
2949           /*
2950           if (MB_CUR_MAX == 1)
2951           */
2952           if (nrules == 0)
2953             ch_collseq = collseqmb[ch];
2954           else
2955             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2956           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2957             bitset_set (sbcset, ch);
2958         }
2959       return REG_NOERROR;
2960     }
2961
2962   /* Local function for parse_bracket_exp used in _LIBC environement.
2963      Build the collating element which is represented by NAME.
2964      The result are written to MBCSET and SBCSET.
2965      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2966      pointer argument sinse we may update it.  */
2967
2968   auto inline reg_errcode_t
2969   __attribute ((always_inline))
2970   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2971          re_charset_t *mbcset;
2972          Idx *coll_sym_alloc;
2973          bitset_t sbcset;
2974          const unsigned char *name;
2975     {
2976       int32_t elem, idx;
2977       size_t name_len = strlen ((const char *) name);
2978       if (nrules != 0)
2979         {
2980           elem = seek_collating_symbol_entry (name, name_len);
2981           if (symb_table[2 * elem] != 0)
2982             {
2983               /* We found the entry.  */
2984               idx = symb_table[2 * elem + 1];
2985               /* Skip the name of collating element name.  */
2986               idx += 1 + extra[idx];
2987             }
2988           else if (symb_table[2 * elem] == 0 && name_len == 1)
2989             {
2990               /* No valid character, treat it as a normal
2991                  character.  */
2992               bitset_set (sbcset, name[0]);
2993               return REG_NOERROR;
2994             }
2995           else
2996             return REG_ECOLLATE;
2997
2998           /* Got valid collation sequence, add it as a new entry.  */
2999           /* Check the space of the arrays.  */
3000           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3001             {
3002               /* Not enough, realloc it.  */
3003               /* +1 in case of mbcset->ncoll_syms is 0.  */
3004               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3005               /* Use realloc since mbcset->coll_syms is NULL
3006                  if *alloc == 0.  */
3007               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3008                                                    new_coll_sym_alloc);
3009               if (BE (new_coll_syms == NULL, 0))
3010                 return REG_ESPACE;
3011               mbcset->coll_syms = new_coll_syms;
3012               *coll_sym_alloc = new_coll_sym_alloc;
3013             }
3014           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3015           return REG_NOERROR;
3016         }
3017       else
3018         {
3019           if (BE (name_len != 1, 0))
3020             return REG_ECOLLATE;
3021           else
3022             {
3023               bitset_set (sbcset, name[0]);
3024               return REG_NOERROR;
3025             }
3026         }
3027     }
3028 #endif
3029
3030   re_token_t br_token;
3031   re_bitset_ptr_t sbcset;
3032 #ifdef RE_ENABLE_I18N
3033   re_charset_t *mbcset;
3034   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3035   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3036 #endif /* not RE_ENABLE_I18N */
3037   bool non_match = false;
3038   bin_tree_t *work_tree;
3039   int token_len;
3040   bool first_round = true;
3041 #ifdef _LIBC
3042   collseqmb = (const unsigned char *)
3043     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3044   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3045   if (nrules)
3046     {
3047       /*
3048       if (MB_CUR_MAX > 1)
3049       */
3050       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3051       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3052       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3053                                                   _NL_COLLATE_SYMB_TABLEMB);
3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055                                                    _NL_COLLATE_SYMB_EXTRAMB);
3056     }
3057 #endif
3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 #ifdef RE_ENABLE_I18N
3060   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3061 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3063   if (BE (sbcset == NULL || mbcset == NULL, 0))
3064 #else
3065   if (BE (sbcset == NULL, 0))
3066 #endif /* RE_ENABLE_I18N */
3067     {
3068       *err = REG_ESPACE;
3069       return NULL;
3070     }
3071
3072   token_len = peek_token_bracket (token, regexp, syntax);
3073   if (BE (token->type == END_OF_RE, 0))
3074     {
3075       *err = REG_BADPAT;
3076       goto parse_bracket_exp_free_return;
3077     }
3078   if (token->type == OP_NON_MATCH_LIST)
3079     {
3080 #ifdef RE_ENABLE_I18N
3081       mbcset->non_match = 1;
3082 #endif /* not RE_ENABLE_I18N */
3083       non_match = true;
3084       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3085         bitset_set (sbcset, '\n');
3086       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3087       token_len = peek_token_bracket (token, regexp, syntax);
3088       if (BE (token->type == END_OF_RE, 0))
3089         {
3090           *err = REG_BADPAT;
3091           goto parse_bracket_exp_free_return;
3092         }
3093     }
3094
3095   /* We treat the first ']' as a normal character.  */
3096   if (token->type == OP_CLOSE_BRACKET)
3097     token->type = CHARACTER;
3098
3099   while (1)
3100     {
3101       bracket_elem_t start_elem, end_elem;
3102       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3103       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3104       reg_errcode_t ret;
3105       int token_len2 = 0;
3106       bool is_range_exp = false;
3107       re_token_t token2;
3108
3109       start_elem.opr.name = start_name_buf;
3110       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3111                                    syntax, first_round);
3112       if (BE (ret != REG_NOERROR, 0))
3113         {
3114           *err = ret;
3115           goto parse_bracket_exp_free_return;
3116         }
3117       first_round = false;
3118
3119       /* Get information about the next token.  We need it in any case.  */
3120       token_len = peek_token_bracket (token, regexp, syntax);
3121
3122       /* Do not check for ranges if we know they are not allowed.  */
3123       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3124         {
3125           if (BE (token->type == END_OF_RE, 0))
3126             {
3127               *err = REG_EBRACK;
3128               goto parse_bracket_exp_free_return;
3129             }
3130           if (token->type == OP_CHARSET_RANGE)
3131             {
3132               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3133               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3134               if (BE (token2.type == END_OF_RE, 0))
3135                 {
3136                   *err = REG_EBRACK;
3137                   goto parse_bracket_exp_free_return;
3138                 }
3139               if (token2.type == OP_CLOSE_BRACKET)
3140                 {
3141                   /* We treat the last '-' as a normal character.  */
3142                   re_string_skip_bytes (regexp, -token_len);
3143                   token->type = CHARACTER;
3144                 }
3145               else
3146                 is_range_exp = true;
3147             }
3148         }
3149
3150       if (is_range_exp == true)
3151         {
3152           end_elem.opr.name = end_name_buf;
3153           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3154                                        dfa, syntax, true);
3155           if (BE (ret != REG_NOERROR, 0))
3156             {
3157               *err = ret;
3158               goto parse_bracket_exp_free_return;
3159             }
3160
3161           token_len = peek_token_bracket (token, regexp, syntax);
3162
3163 #ifdef _LIBC
3164           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3165                                   &start_elem, &end_elem);
3166 #else
3167 # ifdef RE_ENABLE_I18N
3168           *err = build_range_exp (sbcset,
3169                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3170                                   &range_alloc, &start_elem, &end_elem);
3171 # else
3172           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3173 # endif
3174 #endif /* RE_ENABLE_I18N */
3175           if (BE (*err != REG_NOERROR, 0))
3176             goto parse_bracket_exp_free_return;
3177         }
3178       else
3179         {
3180           switch (start_elem.type)
3181             {
3182             case SB_CHAR:
3183               bitset_set (sbcset, start_elem.opr.ch);
3184               break;
3185 #ifdef RE_ENABLE_I18N
3186             case MB_CHAR:
3187               /* Check whether the array has enough space.  */
3188               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3189                 {
3190                   wchar_t *new_mbchars;
3191                   /* Not enough, realloc it.  */
3192                   /* +1 in case of mbcset->nmbchars is 0.  */
3193                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3194                   /* Use realloc since array is NULL if *alloc == 0.  */
3195                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3196                                             mbchar_alloc);
3197                   if (BE (new_mbchars == NULL, 0))
3198                     goto parse_bracket_exp_espace;
3199                   mbcset->mbchars = new_mbchars;
3200                 }
3201               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3202               break;
3203 #endif /* RE_ENABLE_I18N */
3204             case EQUIV_CLASS:
3205               *err = build_equiv_class (sbcset,
3206 #ifdef RE_ENABLE_I18N
3207                                         mbcset, &equiv_class_alloc,
3208 #endif /* RE_ENABLE_I18N */
3209                                         start_elem.opr.name);
3210               if (BE (*err != REG_NOERROR, 0))
3211                 goto parse_bracket_exp_free_return;
3212               break;
3213             case COLL_SYM:
3214               *err = build_collating_symbol (sbcset,
3215 #ifdef RE_ENABLE_I18N
3216                                              mbcset, &coll_sym_alloc,
3217 #endif /* RE_ENABLE_I18N */
3218                                              start_elem.opr.name);
3219               if (BE (*err != REG_NOERROR, 0))
3220                 goto parse_bracket_exp_free_return;
3221               break;
3222             case CHAR_CLASS:
3223               *err = build_charclass (regexp->trans, sbcset,
3224 #ifdef RE_ENABLE_I18N
3225                                       mbcset, &char_class_alloc,
3226 #endif /* RE_ENABLE_I18N */
3227                                       start_elem.opr.name, syntax);
3228               if (BE (*err != REG_NOERROR, 0))
3229                goto parse_bracket_exp_free_return;
3230               break;
3231             default:
3232               assert (0);
3233               break;
3234             }
3235         }
3236       if (BE (token->type == END_OF_RE, 0))
3237         {
3238           *err = REG_EBRACK;
3239           goto parse_bracket_exp_free_return;
3240         }
3241       if (token->type == OP_CLOSE_BRACKET)
3242         break;
3243     }
3244
3245   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3246
3247   /* If it is non-matching list.  */
3248   if (non_match)
3249     bitset_not (sbcset);
3250
3251 #ifdef RE_ENABLE_I18N
3252   /* Ensure only single byte characters are set.  */
3253   if (dfa->mb_cur_max > 1)
3254     bitset_mask (sbcset, dfa->sb_char);
3255
3256   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3257       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3258                                                      || mbcset->non_match)))
3259     {
3260       bin_tree_t *mbc_tree;
3261       int sbc_idx;
3262       /* Build a tree for complex bracket.  */
3263       dfa->has_mb_node = 1;
3264       br_token.type = COMPLEX_BRACKET;
3265       br_token.opr.mbcset = mbcset;
3266       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3267       if (BE (mbc_tree == NULL, 0))
3268         goto parse_bracket_exp_espace;
3269       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3270         if (sbcset[sbc_idx])
3271           break;
3272       /* If there are no bits set in sbcset, there is no point
3273          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3274       if (sbc_idx < BITSET_WORDS)
3275         {
3276           /* Build a tree for simple bracket.  */
3277           br_token.type = SIMPLE_BRACKET;
3278           br_token.opr.sbcset = sbcset;
3279           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3280           if (BE (work_tree == NULL, 0))
3281             goto parse_bracket_exp_espace;
3282
3283           /* Then join them by ALT node.  */
3284           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3285           if (BE (work_tree == NULL, 0))
3286             goto parse_bracket_exp_espace;
3287         }
3288       else
3289         {
3290           re_free (sbcset);
3291           work_tree = mbc_tree;
3292         }
3293     }
3294   else
3295 #endif /* not RE_ENABLE_I18N */
3296     {
3297 #ifdef RE_ENABLE_I18N
3298       free_charset (mbcset);
3299 #endif
3300       /* Build a tree for simple bracket.  */
3301       br_token.type = SIMPLE_BRACKET;
3302       br_token.opr.sbcset = sbcset;
3303       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304       if (BE (work_tree == NULL, 0))
3305         goto parse_bracket_exp_espace;
3306     }
3307   return work_tree;
3308
3309  parse_bracket_exp_espace:
3310   *err = REG_ESPACE;
3311  parse_bracket_exp_free_return:
3312   re_free (sbcset);
3313 #ifdef RE_ENABLE_I18N
3314   free_charset (mbcset);
3315 #endif /* RE_ENABLE_I18N */
3316   return NULL;
3317 }
3318
3319 /* Parse an element in the bracket expression.  */
3320
3321 static reg_errcode_t
3322 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3323                        re_token_t *token, int token_len, re_dfa_t *dfa,
3324                        reg_syntax_t syntax, bool accept_hyphen)
3325 {
3326 #ifdef RE_ENABLE_I18N
3327   int cur_char_size;
3328   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3329   if (cur_char_size > 1)
3330     {
3331       elem->type = MB_CHAR;
3332       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3333       re_string_skip_bytes (regexp, cur_char_size);
3334       return REG_NOERROR;
3335     }
3336 #endif /* RE_ENABLE_I18N */
3337   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3338   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3339       || token->type == OP_OPEN_EQUIV_CLASS)
3340     return parse_bracket_symbol (elem, regexp, token);
3341   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3342     {
3343       /* A '-' must only appear as anything but a range indicator before
3344          the closing bracket.  Everything else is an error.  */
3345       re_token_t token2;
3346       (void) peek_token_bracket (&token2, regexp, syntax);
3347       if (token2.type != OP_CLOSE_BRACKET)
3348         /* The actual error value is not standardized since this whole
3349            case is undefined.  But ERANGE makes good sense.  */
3350         return REG_ERANGE;
3351     }
3352   elem->type = SB_CHAR;
3353   elem->opr.ch = token->opr.c;
3354   return REG_NOERROR;
3355 }
3356
3357 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3358    such as [:<character_class>:], [.<collating_element>.], and
3359    [=<equivalent_class>=].  */
3360
3361 static reg_errcode_t
3362 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3363                       re_token_t *token)
3364 {
3365   unsigned char ch, delim = token->opr.c;
3366   int i = 0;
3367   if (re_string_eoi(regexp))
3368     return REG_EBRACK;
3369   for (;; ++i)
3370     {
3371       if (i >= BRACKET_NAME_BUF_SIZE)
3372         return REG_EBRACK;
3373       if (token->type == OP_OPEN_CHAR_CLASS)
3374         ch = re_string_fetch_byte_case (regexp);
3375       else
3376         ch = re_string_fetch_byte (regexp);
3377       if (re_string_eoi(regexp))
3378         return REG_EBRACK;
3379       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3380         break;
3381       elem->opr.name[i] = ch;
3382     }
3383   re_string_skip_bytes (regexp, 1);
3384   elem->opr.name[i] = '\0';
3385   switch (token->type)
3386     {
3387     case OP_OPEN_COLL_ELEM:
3388       elem->type = COLL_SYM;
3389       break;
3390     case OP_OPEN_EQUIV_CLASS:
3391       elem->type = EQUIV_CLASS;
3392       break;
3393     case OP_OPEN_CHAR_CLASS:
3394       elem->type = CHAR_CLASS;
3395       break;
3396     default:
3397       break;
3398     }
3399   return REG_NOERROR;
3400 }
3401
3402   /* Helper function for parse_bracket_exp.
3403      Build the equivalence class which is represented by NAME.
3404      The result are written to MBCSET and SBCSET.
3405      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3406      is a pointer argument sinse we may update it.  */
3407
3408 static reg_errcode_t
3409 #ifdef RE_ENABLE_I18N
3410 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3411                    Idx *equiv_class_alloc, const unsigned char *name)
3412 #else /* not RE_ENABLE_I18N */
3413 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3414 #endif /* not RE_ENABLE_I18N */
3415 {
3416 #ifdef _LIBC
3417   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3418   if (nrules != 0)
3419     {
3420       const int32_t *table, *indirect;
3421       const unsigned char *weights, *extra, *cp;
3422       unsigned char char_buf[2];
3423       int32_t idx1, idx2;
3424       unsigned int ch;
3425       size_t len;
3426       /* This #include defines a local function!  */
3427 # include <locale/weight.h>
3428       /* Calculate the index for equivalence class.  */
3429       cp = name;
3430       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3431       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3432                                                _NL_COLLATE_WEIGHTMB);
3433       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3434                                                    _NL_COLLATE_EXTRAMB);
3435       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3436                                                 _NL_COLLATE_INDIRECTMB);
3437       idx1 = findidx (&cp);
3438       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3439         /* This isn't a valid character.  */
3440         return REG_ECOLLATE;
3441
3442       /* Build single byte matcing table for this equivalence class.  */
3443       char_buf[1] = (unsigned char) '\0';
3444       len = weights[idx1 & 0xffffff];
3445       for (ch = 0; ch < SBC_MAX; ++ch)
3446         {
3447           char_buf[0] = ch;
3448           cp = char_buf;
3449           idx2 = findidx (&cp);
3450 /*
3451           idx2 = table[ch];
3452 */
3453           if (idx2 == 0)
3454             /* This isn't a valid character.  */
3455             continue;
3456           /* Compare only if the length matches and the collation rule
3457              index is the same.  */
3458           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3459             {
3460               int cnt = 0;
3461
3462               while (cnt <= len &&
3463                      weights[(idx1 & 0xffffff) + 1 + cnt]
3464                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3465                 ++cnt;
3466
3467               if (cnt > len)
3468                 bitset_set (sbcset, ch);
3469             }
3470         }
3471       /* Check whether the array has enough space.  */
3472       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3473         {
3474           /* Not enough, realloc it.  */
3475           /* +1 in case of mbcset->nequiv_classes is 0.  */
3476           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3477           /* Use realloc since the array is NULL if *alloc == 0.  */
3478           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3479                                                    int32_t,
3480                                                    new_equiv_class_alloc);
3481           if (BE (new_equiv_classes == NULL, 0))
3482             return REG_ESPACE;
3483           mbcset->equiv_classes = new_equiv_classes;
3484           *equiv_class_alloc = new_equiv_class_alloc;
3485         }
3486       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3487     }
3488   else
3489 #endif /* _LIBC */
3490     {
3491       if (BE (strlen ((const char *) name) != 1, 0))
3492         return REG_ECOLLATE;
3493       bitset_set (sbcset, *name);
3494     }
3495   return REG_NOERROR;
3496 }
3497
3498   /* Helper function for parse_bracket_exp.
3499      Build the character class which is represented by NAME.
3500      The result are written to MBCSET and SBCSET.
3501      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3502      is a pointer argument sinse we may update it.  */
3503
3504 static reg_errcode_t
3505 #ifdef RE_ENABLE_I18N
3506 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3507                  re_charset_t *mbcset, Idx *char_class_alloc,
3508                  const unsigned char *class_name, reg_syntax_t syntax)
3509 #else /* not RE_ENABLE_I18N */
3510 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3511                  const unsigned char *class_name, reg_syntax_t syntax)
3512 #endif /* not RE_ENABLE_I18N */
3513 {
3514   int i;
3515   const char *name = (const char *) class_name;
3516
3517   /* In case of REG_ICASE "upper" and "lower" match the both of
3518      upper and lower cases.  */
3519   if ((syntax & RE_ICASE)
3520       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3521     name = "alpha";
3522
3523 #ifdef RE_ENABLE_I18N
3524   /* Check the space of the arrays.  */
3525   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3526     {
3527       /* Not enough, realloc it.  */
3528       /* +1 in case of mbcset->nchar_classes is 0.  */
3529       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3530       /* Use realloc since array is NULL if *alloc == 0.  */
3531       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3532                                                new_char_class_alloc);
3533       if (BE (new_char_classes == NULL, 0))
3534         return REG_ESPACE;
3535       mbcset->char_classes = new_char_classes;
3536       *char_class_alloc = new_char_class_alloc;
3537     }
3538   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3539 #endif /* RE_ENABLE_I18N */
3540
3541 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3542   do {                                          \
3543     if (BE (trans != NULL, 0))                  \
3544       {                                         \
3545         for (i = 0; i < SBC_MAX; ++i)           \
3546           if (ctype_func (i))                   \
3547             bitset_set (sbcset, trans[i]);      \
3548       }                                         \
3549     else                                        \
3550       {                                         \
3551         for (i = 0; i < SBC_MAX; ++i)           \
3552           if (ctype_func (i))                   \
3553             bitset_set (sbcset, i);             \
3554       }                                         \
3555   } while (0)
3556
3557   if (strcmp (name, "alnum") == 0)
3558     BUILD_CHARCLASS_LOOP (isalnum);
3559   else if (strcmp (name, "cntrl") == 0)
3560     BUILD_CHARCLASS_LOOP (iscntrl);
3561   else if (strcmp (name, "lower") == 0)
3562     BUILD_CHARCLASS_LOOP (islower);
3563   else if (strcmp (name, "space") == 0)
3564     BUILD_CHARCLASS_LOOP (isspace);
3565   else if (strcmp (name, "alpha") == 0)
3566     BUILD_CHARCLASS_LOOP (isalpha);
3567   else if (strcmp (name, "digit") == 0)
3568     BUILD_CHARCLASS_LOOP (isdigit);
3569   else if (strcmp (name, "print") == 0)
3570     BUILD_CHARCLASS_LOOP (isprint);
3571   else if (strcmp (name, "upper") == 0)
3572     BUILD_CHARCLASS_LOOP (isupper);
3573   else if (strcmp (name, "blank") == 0)
3574     BUILD_CHARCLASS_LOOP (isblank);
3575   else if (strcmp (name, "graph") == 0)
3576     BUILD_CHARCLASS_LOOP (isgraph);
3577   else if (strcmp (name, "punct") == 0)
3578     BUILD_CHARCLASS_LOOP (ispunct);
3579   else if (strcmp (name, "xdigit") == 0)
3580     BUILD_CHARCLASS_LOOP (isxdigit);
3581   else
3582     return REG_ECTYPE;
3583
3584   return REG_NOERROR;
3585 }
3586
3587 static bin_tree_t *
3588 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3589                     const unsigned char *class_name,
3590                     const unsigned char *extra, bool non_match,
3591                     reg_errcode_t *err)
3592 {
3593   re_bitset_ptr_t sbcset;
3594 #ifdef RE_ENABLE_I18N
3595   re_charset_t *mbcset;
3596   Idx alloc = 0;
3597 #endif /* not RE_ENABLE_I18N */
3598   reg_errcode_t ret;
3599   re_token_t br_token;
3600   bin_tree_t *tree;
3601
3602   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3603 #ifdef RE_ENABLE_I18N
3604   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3605 #endif /* RE_ENABLE_I18N */
3606
3607 #ifdef RE_ENABLE_I18N
3608   if (BE (sbcset == NULL || mbcset == NULL, 0))
3609 #else /* not RE_ENABLE_I18N */
3610   if (BE (sbcset == NULL, 0))
3611 #endif /* not RE_ENABLE_I18N */
3612     {
3613       *err = REG_ESPACE;
3614       return NULL;
3615     }
3616
3617   if (non_match)
3618     {
3619 #ifdef RE_ENABLE_I18N
3620       mbcset->non_match = 1;
3621 #endif /* not RE_ENABLE_I18N */
3622     }
3623
3624   /* We don't care the syntax in this case.  */
3625   ret = build_charclass (trans, sbcset,
3626 #ifdef RE_ENABLE_I18N
3627                          mbcset, &alloc,
3628 #endif /* RE_ENABLE_I18N */
3629                          class_name, 0);
3630
3631   if (BE (ret != REG_NOERROR, 0))
3632     {
3633       re_free (sbcset);
3634 #ifdef RE_ENABLE_I18N
3635       free_charset (mbcset);
3636 #endif /* RE_ENABLE_I18N */
3637       *err = ret;
3638       return NULL;
3639     }
3640   /* \w match '_' also.  */
3641   for (; *extra; extra++)
3642     bitset_set (sbcset, *extra);
3643
3644   /* If it is non-matching list.  */
3645   if (non_match)
3646     bitset_not (sbcset);
3647
3648 #ifdef RE_ENABLE_I18N
3649   /* Ensure only single byte characters are set.  */
3650   if (dfa->mb_cur_max > 1)
3651     bitset_mask (sbcset, dfa->sb_char);
3652 #endif
3653
3654   /* Build a tree for simple bracket.  */
3655   br_token.type = SIMPLE_BRACKET;
3656   br_token.opr.sbcset = sbcset;
3657   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3658   if (BE (tree == NULL, 0))
3659     goto build_word_op_espace;
3660
3661 #ifdef RE_ENABLE_I18N
3662   if (dfa->mb_cur_max > 1)
3663     {
3664       bin_tree_t *mbc_tree;
3665       /* Build a tree for complex bracket.  */
3666       br_token.type = COMPLEX_BRACKET;
3667       br_token.opr.mbcset = mbcset;
3668       dfa->has_mb_node = 1;
3669       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3670       if (BE (mbc_tree == NULL, 0))
3671         goto build_word_op_espace;
3672       /* Then join them by ALT node.  */
3673       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3674       if (BE (mbc_tree != NULL, 1))
3675         return tree;
3676     }
3677   else
3678     {
3679       free_charset (mbcset);
3680       return tree;
3681     }
3682 #else /* not RE_ENABLE_I18N */
3683   return tree;
3684 #endif /* not RE_ENABLE_I18N */
3685
3686  build_word_op_espace:
3687   re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689   free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691   *err = REG_ESPACE;
3692   return NULL;
3693 }
3694
3695 /* This is intended for the expressions like "a{1,3}".
3696    Fetch a number from `input', and return the number.
3697    Return REG_MISSING if the number field is empty like "{,1}".
3698    Return REG_ERROR if an error occurred.  */
3699
3700 static Idx
3701 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3702 {
3703   Idx num = REG_MISSING;
3704   unsigned char c;
3705   while (1)
3706     {
3707       fetch_token (token, input, syntax);
3708       c = token->opr.c;
3709       if (BE (token->type == END_OF_RE, 0))
3710         return REG_ERROR;
3711       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3712         break;
3713       num = ((token->type != CHARACTER || c < '0' || '9' < c
3714               || num == REG_ERROR)
3715              ? REG_ERROR
3716              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3717       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3718     }
3719   return num;
3720 }
3721 \f
3722 #ifdef RE_ENABLE_I18N
3723 static void
3724 free_charset (re_charset_t *cset)
3725 {
3726   re_free (cset->mbchars);
3727 # ifdef _LIBC
3728   re_free (cset->coll_syms);
3729   re_free (cset->equiv_classes);
3730   re_free (cset->range_starts);
3731   re_free (cset->range_ends);
3732 # endif
3733   re_free (cset->char_classes);
3734   re_free (cset);
3735 }
3736 #endif /* RE_ENABLE_I18N */
3737 \f
3738 /* Functions for binary tree operation.  */
3739
3740 /* Create a tree node.  */
3741
3742 static bin_tree_t *
3743 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744              re_token_type_t type)
3745 {
3746   re_token_t t;
3747   t.type = type;
3748   return create_token_tree (dfa, left, right, &t);
3749 }
3750
3751 static bin_tree_t *
3752 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3753                    const re_token_t *token)
3754 {
3755   bin_tree_t *tree;
3756   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3757     {
3758       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3759
3760       if (storage == NULL)
3761         return NULL;
3762       storage->next = dfa->str_tree_storage;
3763       dfa->str_tree_storage = storage;
3764       dfa->str_tree_storage_idx = 0;
3765     }
3766   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3767
3768   tree->parent = NULL;
3769   tree->left = left;
3770   tree->right = right;
3771   tree->token = *token;
3772   tree->token.duplicated = 0;
3773   tree->token.opt_subexp = 0;
3774   tree->first = NULL;
3775   tree->next = NULL;
3776   tree->node_idx = REG_MISSING;
3777
3778   if (left != NULL)
3779     left->parent = tree;
3780   if (right != NULL)
3781     right->parent = tree;
3782   return tree;
3783 }
3784
3785 /* Mark the tree SRC as an optional subexpression.
3786    To be called from preorder or postorder.  */
3787
3788 static reg_errcode_t
3789 mark_opt_subexp (void *extra, bin_tree_t *node)
3790 {
3791   Idx idx = (Idx) (long) extra;
3792   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3793     node->token.opt_subexp = 1;
3794
3795   return REG_NOERROR;
3796 }
3797
3798 /* Free the allocated memory inside NODE. */
3799
3800 static void
3801 free_token (re_token_t *node)
3802 {
3803 #ifdef RE_ENABLE_I18N
3804   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3805     free_charset (node->opr.mbcset);
3806   else
3807 #endif /* RE_ENABLE_I18N */
3808     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3809       re_free (node->opr.sbcset);
3810 }
3811
3812 /* Worker function for tree walking.  Free the allocated memory inside NODE
3813    and its children. */
3814
3815 static reg_errcode_t
3816 free_tree (void *extra, bin_tree_t *node)
3817 {
3818   free_token (&node->token);
3819   return REG_NOERROR;
3820 }
3821
3822
3823 /* Duplicate the node SRC, and return new node.  This is a preorder
3824    visit similar to the one implemented by the generic visitor, but
3825    we need more infrastructure to maintain two parallel trees --- so,
3826    it's easier to duplicate.  */
3827
3828 static bin_tree_t *
3829 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3830 {
3831   const bin_tree_t *node;
3832   bin_tree_t *dup_root;
3833   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3834
3835   for (node = root; ; )
3836     {
3837       /* Create a new tree and link it back to the current parent.  */
3838       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3839       if (*p_new == NULL)
3840         return NULL;
3841       (*p_new)->parent = dup_node;
3842       (*p_new)->token.duplicated = 1;
3843       dup_node = *p_new;
3844
3845       /* Go to the left node, or up and to the right.  */
3846       if (node->left)
3847         {
3848           node = node->left;
3849           p_new = &dup_node->left;
3850         }
3851       else
3852         {
3853           const bin_tree_t *prev = NULL;
3854           while (node->right == prev || node->right == NULL)
3855             {
3856               prev = node;
3857               node = node->parent;
3858               dup_node = dup_node->parent;
3859               if (!node)
3860                 return dup_root;
3861             }
3862           node = node->right;
3863           p_new = &dup_node->right;
3864         }
3865     }
3866 }