lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          Idx node, bool root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static Idx fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   Idx nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             bool accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         Idx *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       Idx *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        bool non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 static const char __re_error_msgid[] =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 static const size_t __re_error_msgid_idx[] =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 #ifdef _LIBC
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 #else /* size_t might promote */
 220 const char *
 221 re_compile_pattern (const char *pattern, size_t length,
 222                     struct re_pattern_buffer *bufp)
 223 #endif
 224 {
 225   reg_errcode_t ret;
 226
 227   /* And GNU code determines whether or not to get register information
 228      by passing null for the REGS argument to re_match, etc., not by
 229      setting no_sub, unless RE_NO_SUB is set.  */
 230   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 231
 232   /* Match anchors at newline.  */
 233   bufp->newline_anchor = 1;
 234
 235   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 236
 237   if (!ret)
 238     return NULL;
 239   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 240 }
 241 #ifdef _LIBC
 242 weak_alias (__re_compile_pattern, re_compile_pattern)
 243 #endif
 244
 245 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 246    also be assigned to arbitrarily: each pattern buffer stores its own
 247    syntax, so it can be changed between regex compilations.  */
 248 /* This has no initializer because initialized variables in Emacs
 249    become read-only after dumping.  */
 250 reg_syntax_t re_syntax_options;
 251
 252
 253 /* Specify the precise syntax of regexps for compilation.  This provides
 254    for compatibility for various utilities which historically have
 255    different, incompatible syntaxes.
 256
 257    The argument SYNTAX is a bit mask comprised of the various bits
 258    defined in regex.h.  We return the old syntax.  */
 259
 260 reg_syntax_t
 261 re_set_syntax (syntax)
 262     reg_syntax_t syntax;
 263 {
 264   reg_syntax_t ret = re_syntax_options;
 265
 266   re_syntax_options = syntax;
 267   return ret;
 268 }
 269 #ifdef _LIBC
 270 weak_alias (__re_set_syntax, re_set_syntax)
 271 #endif
 272
 273 int
 274 re_compile_fastmap (bufp)
 275     struct re_pattern_buffer *bufp;
 276 {
 277   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 278   char *fastmap = bufp->fastmap;
 279
 280   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 281   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 282   if (dfa->init_state != dfa->init_state_word)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 284   if (dfa->init_state != dfa->init_state_nl)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 286   if (dfa->init_state != dfa->init_state_begbuf)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 288   bufp->fastmap_accurate = 1;
 289   return 0;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 293 #endif
 294
 295 static inline void
 296 __attribute ((always_inline))
 297 re_set_fastmap (char *fastmap, bool icase, int ch)
 298 {
 299   fastmap[ch] = 1;
 300   if (icase)
 301     fastmap[tolower (ch)] = 1;
 302 }
 303
 304 /* Helper function for re_compile_fastmap.
 305    Compile fastmap for the initial_state INIT_STATE.  */
 306
 307 static void
 308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 309                          char *fastmap)
 310 {
 311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 312   Idx node_cnt;
 313   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 315     {
 316       Idx node = init_state->nodes.elems[node_cnt];
 317       re_token_type_t type = dfa->nodes[node].type;
 318
 319       if (type == CHARACTER)
 320         {
 321           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 322 #ifdef RE_ENABLE_I18N
 323           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 324             {
 325               unsigned char buf[MB_LEN_MAX];
 326               unsigned char *p;
 327               wchar_t wc;
 328               mbstate_t state;
 329
 330               p = buf;
 331               *p++ = dfa->nodes[node].opr.c;
 332               while (++node < dfa->nodes_len
 333                      && dfa->nodes[node].type == CHARACTER
 334                      && dfa->nodes[node].mb_partial)
 335                 *p++ = dfa->nodes[node].opr.c;
 336               memset (&state, '\0', sizeof (state));
 337               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 338                              &state) == p - buf
 339                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 340                       != (size_t) -1))
 341                 re_set_fastmap (fastmap, false, buf[0]);
 342             }
 343 #endif
 344         }
 345       else if (type == SIMPLE_BRACKET)
 346         {
 347           int i, ch;
 348           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 349             {
 350               int j;
 351               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 352               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 353                 if (w & ((bitset_word_t) 1 << j))
 354                   re_set_fastmap (fastmap, icase, ch);
 355             }
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           Idx i;
 362
 363 # ifdef _LIBC
 364           /* See if we have to try all bytes which start multiple collation
 365              elements.
 366              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 367                   collation element, and don't catch 'b' since 'b' is
 368                   the only collation element which starts from 'b' (and
 369                   it is caught by SIMPLE_BRACKET).  */
 370               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 371                   && (cset->ncoll_syms || cset->nranges))
 372                 {
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # endif /* _LIBC */
 380
 381           /* See if we have to start the match at all multibyte characters,
 382              i.e. where we would not find an invalid sequence.  This only
 383              applies to multibyte character sets; for single byte character
 384              sets, the SIMPLE_BRACKET again suffices.  */
 385           if (dfa->mb_cur_max > 1
 386               && (cset->nchar_classes || cset->non_match || cset->nranges
 387 # ifdef _LIBC
 388                   || cset->nequiv_classes
 389 # endif /* _LIBC */
 390                  ))
 391             {
 392               unsigned char c = 0;
 393               do
 394                 {
 395                   mbstate_t mbs;
 396                   memset (&mbs, 0, sizeof (mbs));
 397                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 398                     re_set_fastmap (fastmap, false, (int) c);
 399                 }
 400               while (++c != 0);
 401             }
 402
 403           else
 404             {
 405               /* ... Else catch all bytes which can start the mbchars.  */
 406               for (i = 0; i < cset->nmbchars; ++i)
 407                 {
 408                   char buf[256];
 409                   mbstate_t state;
 410                   memset (&state, '\0', sizeof (state));
 411                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 412                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 413                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 414                     {
 415                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 416                           != (size_t) -1)
 417                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 418                     }
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *_Restrict_ preg;
 476     const char *_Restrict_ pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 #ifdef _LIBC
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *_Restrict_ preg;
 540     char *_Restrict_ errbuf;
 541     size_t errbuf_size;
 542 #else /* size_t might promote */
 543 size_t
 544 regerror (int errcode, const regex_t *_Restrict_ preg,
 545           char *_Restrict_ errbuf, size_t errbuf_size)
 546 #endif
 547 {
 548   const char *msg;
 549   size_t msg_size;
 550
 551   if (BE (errcode < 0
 552           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 553                                / sizeof (__re_error_msgid_idx[0])), 0))
 554     /* Only error codes returned by the rest of the code should be passed
 555        to this routine.  If we are given anything else, or if other regex
 556        code generates an invalid error code, then the program has a bug.
 557        Dump core so we can fix it.  */
 558     abort ();
 559
 560   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 561
 562   msg_size = strlen (msg) + 1; /* Includes the null.  */
 563
 564   if (BE (errbuf_size != 0, 1))
 565     {
 566       size_t cpy_size = msg_size;
 567       if (BE (msg_size > errbuf_size, 0))
 568         {
 569           cpy_size = errbuf_size - 1;
 570           errbuf[cpy_size] = '\0';
 571         }
 572       memcpy (errbuf, msg, cpy_size);
 573     }
 574
 575   return msg_size;
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__regerror, regerror)
 579 #endif
 580
 581
 582 #ifdef RE_ENABLE_I18N
 583 /* This static array is used for the map to single-byte characters when
 584    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 585    it the same all the time.  UTF-8 is the preferred encoding so this is
 586    a worthwhile optimization.  */
 587 static const bitset_t utf8_sb_map =
 588 {
 589   /* Set the first 128 bits.  */
 590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 591 #  error "bitset_word_t is narrower than 32 bits"
 592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 593   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX,
 598 # endif
 599   (BITSET_WORD_MAX
 600    >> (SBC_MAX % BITSET_WORD_BITS == 0
 601        ? 0
 602        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 603 };
 604 #endif
 605
 606
 607 static void
 608 free_dfa_content (re_dfa_t *dfa)
 609 {
 610   Idx i, j;
 611
 612   if (dfa->nodes)
 613     for (i = 0; i < dfa->nodes_len; ++i)
 614       free_token (dfa->nodes + i);
 615   re_free (dfa->nexts);
 616   for (i = 0; i < dfa->nodes_len; ++i)
 617     {
 618       if (dfa->eclosures != NULL)
 619         re_node_set_free (dfa->eclosures + i);
 620       if (dfa->inveclosures != NULL)
 621         re_node_set_free (dfa->inveclosures + i);
 622       if (dfa->edests != NULL)
 623         re_node_set_free (dfa->edests + i);
 624     }
 625   re_free (dfa->edests);
 626   re_free (dfa->eclosures);
 627   re_free (dfa->inveclosures);
 628   re_free (dfa->nodes);
 629
 630   if (dfa->state_table)
 631     for (i = 0; i <= dfa->state_hash_mask; ++i)
 632       {
 633         struct re_state_table_entry *entry = dfa->state_table + i;
 634         for (j = 0; j < entry->num; ++j)
 635           {
 636             re_dfastate_t *state = entry->array[j];
 637             free_state (state);
 638           }
 639         re_free (entry->array);
 640       }
 641   re_free (dfa->state_table);
 642 #ifdef RE_ENABLE_I18N
 643   if (dfa->sb_char != utf8_sb_map)
 644     re_free (dfa->sb_char);
 645 #endif
 646   re_free (dfa->subexp_map);
 647 #ifdef DEBUG
 648   re_free (dfa->re_str);
 649 #endif
 650
 651   re_free (dfa);
 652 }
 653
 654
 655 /* Free dynamically allocated space used by PREG.  */
 656
 657 void
 658 regfree (preg)
 659     regex_t *preg;
 660 {
 661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 662   if (BE (dfa != NULL, 1))
 663     free_dfa_content (dfa);
 664   preg->buffer = NULL;
 665   preg->allocated = 0;
 666
 667   re_free (preg->fastmap);
 668   preg->fastmap = NULL;
 669
 670   re_free (preg->translate);
 671   preg->translate = NULL;
 672 }
 673 #ifdef _LIBC
 674 weak_alias (__regfree, regfree)
 675 #endif
 676 \f
 677 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 678    them unless specifically requested.  */
 679
 680 #if defined _REGEX_RE_COMP || defined _LIBC
 681
 682 /* BSD has one and only one pattern buffer.  */
 683 static struct re_pattern_buffer re_comp_buf;
 684
 685 char *
 686 # ifdef _LIBC
 687 /* Make these definitions weak in libc, so POSIX programs can redefine
 688    these names if they don't use our functions, and still use
 689    regcomp/regexec above without link errors.  */
 690 weak_function
 691 # endif
 692 re_comp (s)
 693      const char *s;
 694 {
 695   reg_errcode_t ret;
 696   char *fastmap;
 697
 698   if (!s)
 699     {
 700       if (!re_comp_buf.buffer)
 701         return gettext ("No previous regular expression");
 702       return 0;
 703     }
 704
 705   if (re_comp_buf.buffer)
 706     {
 707       fastmap = re_comp_buf.fastmap;
 708       re_comp_buf.fastmap = NULL;
 709       __regfree (&re_comp_buf);
 710       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 711       re_comp_buf.fastmap = fastmap;
 712     }
 713
 714   if (re_comp_buf.fastmap == NULL)
 715     {
 716       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 717       if (re_comp_buf.fastmap == NULL)
 718         return (char *) gettext (__re_error_msgid
 719                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 720     }
 721
 722   /* Since `re_exec' always passes NULL for the `regs' argument, we
 723      don't need to initialize the pattern buffer fields which affect it.  */
 724
 725   /* Match anchors at newlines.  */
 726   re_comp_buf.newline_anchor = 1;
 727
 728   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 729
 730   if (!ret)
 731     return NULL;
 732
 733   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 734   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 735 }
 736
 737 #ifdef _LIBC
 738 libc_freeres_fn (free_mem)
 739 {
 740   __regfree (&re_comp_buf);
 741 }
 742 #endif
 743
 744 #endif /* _REGEX_RE_COMP */
 745 \f
 746 /* Internal entry point.
 747    Compile the regular expression PATTERN, whose length is LENGTH.
 748    SYNTAX indicate regular expression's syntax.  */
 749
 750 static reg_errcode_t
 751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 752                      reg_syntax_t syntax)
 753 {
 754   reg_errcode_t err = REG_NOERROR;
 755   re_dfa_t *dfa;
 756   re_string_t regexp;
 757
 758   /* Initialize the pattern buffer.  */
 759   preg->fastmap_accurate = 0;
 760   preg->syntax = syntax;
 761   preg->not_bol = preg->not_eol = 0;
 762   preg->used = 0;
 763   preg->re_nsub = 0;
 764   preg->can_be_null = 0;
 765   preg->regs_allocated = REGS_UNALLOCATED;
 766
 767   /* Initialize the dfa.  */
 768   dfa = (re_dfa_t *) preg->buffer;
 769   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 770     {
 771       /* If zero allocated, but buffer is non-null, try to realloc
 772          enough space.  This loses if buffer's address is bogus, but
 773          that is the user's responsibility.  If ->buffer is NULL this
 774          is a simple allocation.  */
 775       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 776       if (dfa == NULL)
 777         return REG_ESPACE;
 778       preg->allocated = sizeof (re_dfa_t);
 779       preg->buffer = (unsigned char *) dfa;
 780     }
 781   preg->used = sizeof (re_dfa_t);
 782
 783   err = init_dfa (dfa, length);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791 #ifdef DEBUG
 792   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 793   dfa->re_str = re_malloc (char, length + 1);
 794   strncpy (dfa->re_str, pattern, length + 1);
 795 #endif
 796
 797   __libc_lock_init (dfa->lock);
 798
 799   err = re_string_construct (&regexp, pattern, length, preg->translate,
 800                              (syntax & RE_ICASE) != 0, dfa);
 801   if (BE (err != REG_NOERROR, 0))
 802     {
 803     re_compile_internal_free_return:
 804       free_workarea_compile (preg);
 805       re_string_destruct (&regexp);
 806       free_dfa_content (dfa);
 807       preg->buffer = NULL;
 808       preg->allocated = 0;
 809       return err;
 810     }
 811
 812   /* Parse the regular expression, and build a structure tree.  */
 813   preg->re_nsub = 0;
 814   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 815   if (BE (dfa->str_tree == NULL, 0))
 816     goto re_compile_internal_free_return;
 817
 818   /* Analyze the tree and create the nfa.  */
 819   err = analyze (preg);
 820   if (BE (err != REG_NOERROR, 0))
 821     goto re_compile_internal_free_return;
 822
 823 #ifdef RE_ENABLE_I18N
 824   /* If possible, do searching in single byte encoding to speed things up.  */
 825   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 826     optimize_utf8 (dfa);
 827 #endif
 828
 829   /* Then create the initial state of the dfa.  */
 830   err = create_initial_state (dfa);
 831
 832   /* Release work areas.  */
 833   free_workarea_compile (preg);
 834   re_string_destruct (&regexp);
 835
 836   if (BE (err != REG_NOERROR, 0))
 837     {
 838       free_dfa_content (dfa);
 839       preg->buffer = NULL;
 840       preg->allocated = 0;
 841     }
 842
 843   return err;
 844 }
 845
 846 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 847    as the initial length of some arrays.  */
 848
 849 static reg_errcode_t
 850 init_dfa (re_dfa_t *dfa, size_t pat_len)
 851 {
 852   __re_size_t table_size;
 853 #ifndef _LIBC
 854   char *codeset_name;
 855 #endif
 856 #ifdef RE_ENABLE_I18N
 857   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 858 #else
 859   size_t max_i18n_object_size = 0;
 860 #endif
 861   size_t max_object_size =
 862     MAX (sizeof (struct re_state_table_entry),
 863          MAX (sizeof (re_token_t),
 864               MAX (sizeof (re_node_set),
 865                    MAX (sizeof (regmatch_t),
 866                         max_i18n_object_size))));
 867
 868   memset (dfa, '\0', sizeof (re_dfa_t));
 869
 870   /* Force allocation of str_tree_storage the first time.  */
 871   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 872
 873   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 874      calculation below, and for similar doubling calculations
 875      elsewhere.  And it's <= rather than <, because some of the
 876      doubling calculations add 1 afterwards.  */
 877   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 878     return REG_ESPACE;
 879
 880   dfa->nodes_alloc = pat_len + 1;
 881   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 882
 883   /*  table_size = 2 ^ ceil(log pat_len) */
 884   for (table_size = 1; ; table_size <<= 1)
 885     if (table_size > pat_len)
 886       break;
 887
 888   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 889   dfa->state_hash_mask = table_size - 1;
 890
 891   dfa->mb_cur_max = MB_CUR_MAX;
 892 #ifdef _LIBC
 893   if (dfa->mb_cur_max == 6
 894       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 895     dfa->is_utf8 = 1;
 896   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 897                        != 0);
 898 #else
 899   codeset_name = nl_langinfo (CODESET);
 900   if (strcasecmp (codeset_name, "UTF-8") == 0
 901       || strcasecmp (codeset_name, "UTF8") == 0)
 902     dfa->is_utf8 = 1;
 903
 904   /* We check exhaustively in the loop below if this charset is a
 905      superset of ASCII.  */
 906   dfa->map_notascii = 0;
 907 #endif
 908
 909 #ifdef RE_ENABLE_I18N
 910   if (dfa->mb_cur_max > 1)
 911     {
 912       if (dfa->is_utf8)
 913         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 914       else
 915         {
 916           int i, j, ch;
 917
 918           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 919           if (BE (dfa->sb_char == NULL, 0))
 920             return REG_ESPACE;
 921
 922           /* Set the bits corresponding to single byte chars.  */
 923           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 924             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 925               {
 926                 wint_t wch = __btowc (ch);
 927                 if (wch != WEOF)
 928                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 929 # ifndef _LIBC
 930                 if (isascii (ch) && wch != ch)
 931                   dfa->map_notascii = 1;
 932 # endif
 933               }
 934         }
 935     }
 936 #endif
 937
 938   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 939     return REG_ESPACE;
 940   return REG_NOERROR;
 941 }
 942
 943 /* Initialize WORD_CHAR table, which indicate which character is
 944    "word".  In this case "word" means that it is the word construction
 945    character used by some operators like "\<", "\>", etc.  */
 946
 947 static void
 948 internal_function
 949 init_word_char (re_dfa_t *dfa)
 950 {
 951   int i, j, ch;
 952   dfa->word_ops_used = 1;
 953   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 954     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 955       if (isalnum (ch) || ch == '_')
 956         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 957 }
 958
 959 /* Free the work area which are only used while compiling.  */
 960
 961 static void
 962 free_workarea_compile (regex_t *preg)
 963 {
 964   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 965   bin_tree_storage_t *storage, *next;
 966   for (storage = dfa->str_tree_storage; storage; storage = next)
 967     {
 968       next = storage->next;
 969       re_free (storage);
 970     }
 971   dfa->str_tree_storage = NULL;
 972   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 973   dfa->str_tree = NULL;
 974   re_free (dfa->org_indices);
 975   dfa->org_indices = NULL;
 976 }
 977
 978 /* Create initial states for all contexts.  */
 979
 980 static reg_errcode_t
 981 create_initial_state (re_dfa_t *dfa)
 982 {
 983   Idx first, i;
 984   reg_errcode_t err;
 985   re_node_set init_nodes;
 986
 987   /* Initial states have the epsilon closure of the node which is
 988      the first node of the regular expression.  */
 989   first = dfa->str_tree->first->node_idx;
 990   dfa->init_node = first;
 991   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 992   if (BE (err != REG_NOERROR, 0))
 993     return err;
 994
 995   /* The back-references which are in initial states can epsilon transit,
 996      since in this case all of the subexpressions can be null.
 997      Then we add epsilon closures of the nodes which are the next nodes of
 998      the back-references.  */
 999   if (dfa->nbackref > 0)
1000     for (i = 0; i < init_nodes.nelem; ++i)
1001       {
1002         Idx node_idx = init_nodes.elems[i];
1003         re_token_type_t type = dfa->nodes[node_idx].type;
1004
1005         Idx clexp_idx;
1006         if (type != OP_BACK_REF)
1007           continue;
1008         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1009           {
1010             re_token_t *clexp_node;
1011             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012             if (clexp_node->type == OP_CLOSE_SUBEXP
1013                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1014               break;
1015           }
1016         if (clexp_idx == init_nodes.nelem)
1017           continue;
1018
1019         if (type == OP_BACK_REF)
1020           {
1021             Idx dest_idx = dfa->edests[node_idx].elems[0];
1022             if (!re_node_set_contains (&init_nodes, dest_idx))
1023               {
1024                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025                 i = 0;
1026               }
1027           }
1028       }
1029
1030   /* It must be the first time to invoke acquire_state.  */
1031   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1032   /* We don't check ERR here, since the initial state must not be NULL.  */
1033   if (BE (dfa->init_state == NULL, 0))
1034     return err;
1035   if (dfa->init_state->has_constraint)
1036     {
1037       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1038                                                        CONTEXT_WORD);
1039       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1040                                                      CONTEXT_NEWLINE);
1041       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1042                                                          &init_nodes,
1043                                                          CONTEXT_NEWLINE
1044                                                          | CONTEXT_BEGBUF);
1045       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1046               || dfa->init_state_begbuf == NULL, 0))
1047         return err;
1048     }
1049   else
1050     dfa->init_state_word = dfa->init_state_nl
1051       = dfa->init_state_begbuf = dfa->init_state;
1052
1053   re_node_set_free (&init_nodes);
1054   return REG_NOERROR;
1055 }
1056 \f
1057 #ifdef RE_ENABLE_I18N
1058 /* If it is possible to do searching in single byte encoding instead of UTF-8
1059    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1060    DFA nodes where needed.  */
1061
1062 static void
1063 optimize_utf8 (re_dfa_t *dfa)
1064 {
1065   Idx node;
1066   int i;
1067   bool mb_chars = false;
1068   bool has_period = false;
1069
1070   for (node = 0; node < dfa->nodes_len; ++node)
1071     switch (dfa->nodes[node].type)
1072       {
1073       case CHARACTER:
1074         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1075           mb_chars = true;
1076         break;
1077       case ANCHOR:
1078         switch (dfa->nodes[node].opr.ctx_type)
1079           {
1080           case LINE_FIRST:
1081           case LINE_LAST:
1082           case BUF_FIRST:
1083           case BUF_LAST:
1084             break;
1085           default:
1086             /* Word anchors etc. cannot be handled.  It's okay to test
1087                opr.ctx_type since constraints (for all DFA nodes) are
1088                created by ORing one or more opr.ctx_type values.  */
1089             return;
1090           }
1091         break;
1092       case OP_PERIOD:
1093         has_period = true;
1094         break;
1095       case OP_BACK_REF:
1096       case OP_ALT:
1097       case END_OF_RE:
1098       case OP_DUP_ASTERISK:
1099       case OP_OPEN_SUBEXP:
1100       case OP_CLOSE_SUBEXP:
1101         break;
1102       case COMPLEX_BRACKET:
1103         return;
1104       case SIMPLE_BRACKET:
1105         /* Just double check.  */
1106         {
1107           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1108                         ? 0
1109                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1110           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1111             {
1112               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1113                 return;
1114               rshift = 0;
1115             }
1116         }
1117         break;
1118       default:
1119         abort ();
1120       }
1121
1122   if (mb_chars || has_period)
1123     for (node = 0; node < dfa->nodes_len; ++node)
1124       {
1125         if (dfa->nodes[node].type == CHARACTER
1126             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1127           dfa->nodes[node].mb_partial = 0;
1128         else if (dfa->nodes[node].type == OP_PERIOD)
1129           dfa->nodes[node].type = OP_UTF8_PERIOD;
1130       }
1131
1132   /* The search can be in single byte locale.  */
1133   dfa->mb_cur_max = 1;
1134   dfa->is_utf8 = 0;
1135   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136 }
1137 #endif
1138 \f
1139 /* Analyze the structure tree, and calculate "first", "next", "edest",
1140    "eclosure", and "inveclosure".  */
1141
1142 static reg_errcode_t
1143 analyze (regex_t *preg)
1144 {
1145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1146   reg_errcode_t ret;
1147
1148   /* Allocate arrays.  */
1149   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1150   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1151   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1152   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1153   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1154           || dfa->eclosures == NULL, 0))
1155     return REG_ESPACE;
1156
1157   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1158   if (dfa->subexp_map != NULL)
1159     {
1160       Idx i;
1161       for (i = 0; i < preg->re_nsub; i++)
1162         dfa->subexp_map[i] = i;
1163       preorder (dfa->str_tree, optimize_subexps, dfa);
1164       for (i = 0; i < preg->re_nsub; i++)
1165         if (dfa->subexp_map[i] != i)
1166           break;
1167       if (i == preg->re_nsub)
1168         {
1169           free (dfa->subexp_map);
1170           dfa->subexp_map = NULL;
1171         }
1172     }
1173
1174   ret = postorder (dfa->str_tree, lower_subexps, preg);
1175   if (BE (ret != REG_NOERROR, 0))
1176     return ret;
1177   ret = postorder (dfa->str_tree, calc_first, dfa);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   preorder (dfa->str_tree, calc_next, dfa);
1181   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   ret = calc_eclosure (dfa);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187
1188   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1189      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1190   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1191       || dfa->nbackref)
1192     {
1193       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1194       if (BE (dfa->inveclosures == NULL, 0))
1195         return REG_ESPACE;
1196       ret = calc_inveclosure (dfa);
1197     }
1198
1199   return ret;
1200 }
1201
1202 /* Our parse trees are very unbalanced, so we cannot use a stack to
1203    implement parse tree visits.  Instead, we use parent pointers and
1204    some hairy code in these two functions.  */
1205 static reg_errcode_t
1206 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1207            void *extra)
1208 {
1209   bin_tree_t *node, *prev;
1210
1211   for (node = root; ; )
1212     {
1213       /* Descend down the tree, preferably to the left (or to the right
1214          if that's the only child).  */
1215       while (node->left || node->right)
1216         if (node->left)
1217           node = node->left;
1218         else
1219           node = node->right;
1220
1221       do
1222         {
1223           reg_errcode_t err = fn (extra, node);
1224           if (BE (err != REG_NOERROR, 0))
1225             return err;
1226           if (node->parent == NULL)
1227             return REG_NOERROR;
1228           prev = node;
1229           node = node->parent;
1230         }
1231       /* Go up while we have a node that is reached from the right.  */
1232       while (node->right == prev || node->right == NULL);
1233       node = node->right;
1234     }
1235 }
1236
1237 static reg_errcode_t
1238 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239           void *extra)
1240 {
1241   bin_tree_t *node;
1242
1243   for (node = root; ; )
1244     {
1245       reg_errcode_t err = fn (extra, node);
1246       if (BE (err != REG_NOERROR, 0))
1247         return err;
1248
1249       /* Go to the left node, or up and to the right.  */
1250       if (node->left)
1251         node = node->left;
1252       else
1253         {
1254           bin_tree_t *prev = NULL;
1255           while (node->right == prev || node->right == NULL)
1256             {
1257               prev = node;
1258               node = node->parent;
1259               if (!node)
1260                 return REG_NOERROR;
1261             }
1262           node = node->right;
1263         }
1264     }
1265 }
1266
1267 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1268    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1269    backreferences as well.  Requires a preorder visit.  */
1270 static reg_errcode_t
1271 optimize_subexps (void *extra, bin_tree_t *node)
1272 {
1273   re_dfa_t *dfa = (re_dfa_t *) extra;
1274
1275   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1276     {
1277       int idx = node->token.opr.idx;
1278       node->token.opr.idx = dfa->subexp_map[idx];
1279       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1280     }
1281
1282   else if (node->token.type == SUBEXP
1283            && node->left && node->left->token.type == SUBEXP)
1284     {
1285       Idx other_idx = node->left->token.opr.idx;
1286
1287       node->left = node->left->left;
1288       if (node->left)
1289         node->left->parent = node;
1290
1291       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1292       if (other_idx < BITSET_WORD_BITS)
1293         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1294     }
1295
1296   return REG_NOERROR;
1297 }
1298
1299 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1300    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1301 static reg_errcode_t
1302 lower_subexps (void *extra, bin_tree_t *node)
1303 {
1304   regex_t *preg = (regex_t *) extra;
1305   reg_errcode_t err = REG_NOERROR;
1306
1307   if (node->left && node->left->token.type == SUBEXP)
1308     {
1309       node->left = lower_subexp (&err, preg, node->left);
1310       if (node->left)
1311         node->left->parent = node;
1312     }
1313   if (node->right && node->right->token.type == SUBEXP)
1314     {
1315       node->right = lower_subexp (&err, preg, node->right);
1316       if (node->right)
1317         node->right->parent = node;
1318     }
1319
1320   return err;
1321 }
1322
1323 static bin_tree_t *
1324 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1325 {
1326   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1327   bin_tree_t *body = node->left;
1328   bin_tree_t *op, *cls, *tree1, *tree;
1329
1330   if (preg->no_sub
1331       /* We do not optimize empty subexpressions, because otherwise we may
1332          have bad CONCAT nodes with NULL children.  This is obviously not
1333          very common, so we do not lose much.  An example that triggers
1334          this case is the sed "script" /\(\)/x.  */
1335       && node->left != NULL
1336       && (node->token.opr.idx >= BITSET_WORD_BITS
1337           || !(dfa->used_bkref_map
1338                & ((bitset_word_t) 1 << node->token.opr.idx))))
1339     return node->left;
1340
1341   /* Convert the SUBEXP node to the concatenation of an
1342      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1343   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1344   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1345   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1346   tree = create_tree (dfa, op, tree1, CONCAT);
1347   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348     {
1349       *err = REG_ESPACE;
1350       return NULL;
1351     }
1352
1353   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1354   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1355   return tree;
1356 }
1357
1358 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1359    nodes.  Requires a postorder visit.  */
1360 static reg_errcode_t
1361 calc_first (void *extra, bin_tree_t *node)
1362 {
1363   re_dfa_t *dfa = (re_dfa_t *) extra;
1364   if (node->token.type == CONCAT)
1365     {
1366       node->first = node->left->first;
1367       node->node_idx = node->left->node_idx;
1368     }
1369   else
1370     {
1371       node->first = node;
1372       node->node_idx = re_dfa_add_node (dfa, node->token);
1373       if (BE (node->node_idx == REG_MISSING, 0))
1374         return REG_ESPACE;
1375       if (node->token.type == ANCHOR)
1376         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1377     }
1378   return REG_NOERROR;
1379 }
1380
1381 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1382 static reg_errcode_t
1383 calc_next (void *extra, bin_tree_t *node)
1384 {
1385   switch (node->token.type)
1386     {
1387     case OP_DUP_ASTERISK:
1388       node->left->next = node;
1389       break;
1390     case CONCAT:
1391       node->left->next = node->right->first;
1392       node->right->next = node->next;
1393       break;
1394     default:
1395       if (node->left)
1396         node->left->next = node->next;
1397       if (node->right)
1398         node->right->next = node->next;
1399       break;
1400     }
1401   return REG_NOERROR;
1402 }
1403
1404 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1405 static reg_errcode_t
1406 link_nfa_nodes (void *extra, bin_tree_t *node)
1407 {
1408   re_dfa_t *dfa = (re_dfa_t *) extra;
1409   Idx idx = node->node_idx;
1410   reg_errcode_t err = REG_NOERROR;
1411
1412   switch (node->token.type)
1413     {
1414     case CONCAT:
1415       break;
1416
1417     case END_OF_RE:
1418       assert (node->next == NULL);
1419       break;
1420
1421     case OP_DUP_ASTERISK:
1422     case OP_ALT:
1423       {
1424         Idx left, right;
1425         dfa->has_plural_match = 1;
1426         if (node->left != NULL)
1427           left = node->left->first->node_idx;
1428         else
1429           left = node->next->node_idx;
1430         if (node->right != NULL)
1431           right = node->right->first->node_idx;
1432         else
1433           right = node->next->node_idx;
1434         assert (REG_VALID_INDEX (left));
1435         assert (REG_VALID_INDEX (right));
1436         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1437       }
1438       break;
1439
1440     case ANCHOR:
1441     case OP_OPEN_SUBEXP:
1442     case OP_CLOSE_SUBEXP:
1443       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1444       break;
1445
1446     case OP_BACK_REF:
1447       dfa->nexts[idx] = node->next->node_idx;
1448       if (node->token.type == OP_BACK_REF)
1449         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1450       break;
1451
1452     default:
1453       assert (!IS_EPSILON_NODE (node->token.type));
1454       dfa->nexts[idx] = node->next->node_idx;
1455       break;
1456     }
1457
1458   return err;
1459 }
1460
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463    to their own constraint.  */
1464
1465 static reg_errcode_t
1466 internal_function
1467 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1468                         Idx root_node, unsigned int init_constraint)
1469 {
1470   Idx org_node, clone_node;
1471   bool ok;
1472   unsigned int constraint = init_constraint;
1473   for (org_node = top_org_node, clone_node = top_clone_node;;)
1474     {
1475       Idx org_dest, clone_dest;
1476       if (dfa->nodes[org_node].type == OP_BACK_REF)
1477         {
1478           /* If the back reference epsilon-transit, its destination must
1479              also have the constraint.  Then duplicate the epsilon closure
1480              of the destination of the back reference, and store it in
1481              edests of the back reference.  */
1482           org_dest = dfa->nexts[org_node];
1483           re_node_set_empty (dfa->edests + clone_node);
1484           clone_dest = duplicate_node (dfa, org_dest, constraint);
1485           if (BE (clone_dest == REG_MISSING, 0))
1486             return REG_ESPACE;
1487           dfa->nexts[clone_node] = dfa->nexts[org_node];
1488           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489           if (BE (! ok, 0))
1490             return REG_ESPACE;
1491         }
1492       else if (dfa->edests[org_node].nelem == 0)
1493         {
1494           /* In case of the node can't epsilon-transit, don't duplicate the
1495              destination and store the original destination as the
1496              destination of the node.  */
1497           dfa->nexts[clone_node] = dfa->nexts[org_node];
1498           break;
1499         }
1500       else if (dfa->edests[org_node].nelem == 1)
1501         {
1502           /* In case of the node can epsilon-transit, and it has only one
1503              destination.  */
1504           org_dest = dfa->edests[org_node].elems[0];
1505           re_node_set_empty (dfa->edests + clone_node);
1506           /* If the node is root_node itself, it means the epsilon closure
1507              has a loop.  Then tie it to the destination of the root_node.  */
1508           if (org_node == root_node && clone_node != org_node)
1509             {
1510               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1511               if (BE (! ok, 0))
1512                 return REG_ESPACE;
1513               break;
1514             }
1515           /* In case the node has another constraint, append it.  */
1516           constraint |= dfa->nodes[org_node].constraint;
1517           clone_dest = duplicate_node (dfa, org_dest, constraint);
1518           if (BE (clone_dest == REG_MISSING, 0))
1519             return REG_ESPACE;
1520           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521           if (BE (! ok, 0))
1522             return REG_ESPACE;
1523         }
1524       else /* dfa->edests[org_node].nelem == 2 */
1525         {
1526           /* In case of the node can epsilon-transit, and it has two
1527              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1528           org_dest = dfa->edests[org_node].elems[0];
1529           re_node_set_empty (dfa->edests + clone_node);
1530           /* Search for a duplicated node which satisfies the constraint.  */
1531           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1532           if (clone_dest == REG_MISSING)
1533             {
1534               /* There is no such duplicated node, create a new one.  */
1535               reg_errcode_t err;
1536               clone_dest = duplicate_node (dfa, org_dest, constraint);
1537               if (BE (clone_dest == REG_MISSING, 0))
1538                 return REG_ESPACE;
1539               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540               if (BE (! ok, 0))
1541                 return REG_ESPACE;
1542               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1543                                             root_node, constraint);
1544               if (BE (err != REG_NOERROR, 0))
1545                 return err;
1546             }
1547           else
1548             {
1549               /* There is a duplicated node which satisfies the constraint,
1550                  use it to avoid infinite loop.  */
1551               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552               if (BE (! ok, 0))
1553                 return REG_ESPACE;
1554             }
1555
1556           org_dest = dfa->edests[org_node].elems[1];
1557           clone_dest = duplicate_node (dfa, org_dest, constraint);
1558           if (BE (clone_dest == REG_MISSING, 0))
1559             return REG_ESPACE;
1560           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561           if (BE (! ok, 0))
1562             return REG_ESPACE;
1563         }
1564       org_node = org_dest;
1565       clone_node = clone_dest;
1566     }
1567   return REG_NOERROR;
1568 }
1569
1570 /* Search for a node which is duplicated from the node ORG_NODE, and
1571    satisfies the constraint CONSTRAINT.  */
1572
1573 static Idx
1574 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1575                         unsigned int constraint)
1576 {
1577   Idx idx;
1578   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1579     {
1580       if (org_node == dfa->org_indices[idx]
1581           && constraint == dfa->nodes[idx].constraint)
1582         return idx; /* Found.  */
1583     }
1584   return REG_MISSING; /* Not found.  */
1585 }
1586
1587 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1588    Return the index of the new node, or REG_MISSING if insufficient storage is
1589    available.  */
1590
1591 static Idx
1592 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1593 {
1594   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1595   if (BE (dup_idx != REG_MISSING, 1))
1596     {
1597       dfa->nodes[dup_idx].constraint = constraint;
1598       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1599       dfa->nodes[dup_idx].duplicated = 1;
1600
1601       /* Store the index of the original node.  */
1602       dfa->org_indices[dup_idx] = org_idx;
1603     }
1604   return dup_idx;
1605 }
1606
1607 static reg_errcode_t
1608 calc_inveclosure (re_dfa_t *dfa)
1609 {
1610   Idx src, idx;
1611   bool ok;
1612   for (idx = 0; idx < dfa->nodes_len; ++idx)
1613     re_node_set_init_empty (dfa->inveclosures + idx);
1614
1615   for (src = 0; src < dfa->nodes_len; ++src)
1616     {
1617       Idx *elems = dfa->eclosures[src].elems;
1618       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1619         {
1620           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1621           if (BE (! ok, 0))
1622             return REG_ESPACE;
1623         }
1624     }
1625
1626   return REG_NOERROR;
1627 }
1628
1629 /* Calculate "eclosure" for all the node in DFA.  */
1630
1631 static reg_errcode_t
1632 calc_eclosure (re_dfa_t *dfa)
1633 {
1634   Idx node_idx;
1635   bool incomplete;
1636 #ifdef DEBUG
1637   assert (dfa->nodes_len > 0);
1638 #endif
1639   incomplete = false;
1640   /* For each nodes, calculate epsilon closure.  */
1641   for (node_idx = 0; ; ++node_idx)
1642     {
1643       reg_errcode_t err;
1644       re_node_set eclosure_elem;
1645       if (node_idx == dfa->nodes_len)
1646         {
1647           if (!incomplete)
1648             break;
1649           incomplete = false;
1650           node_idx = 0;
1651         }
1652
1653 #ifdef DEBUG
1654       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1655 #endif
1656
1657       /* If we have already calculated, skip it.  */
1658       if (dfa->eclosures[node_idx].nelem != 0)
1659         continue;
1660       /* Calculate epsilon closure of `node_idx'.  */
1661       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1662       if (BE (err != REG_NOERROR, 0))
1663         return err;
1664
1665       if (dfa->eclosures[node_idx].nelem == 0)
1666         {
1667           incomplete = true;
1668           re_node_set_free (&eclosure_elem);
1669         }
1670     }
1671   return REG_NOERROR;
1672 }
1673
1674 /* Calculate epsilon closure of NODE.  */
1675
1676 static reg_errcode_t
1677 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1678 {
1679   reg_errcode_t err;
1680   Idx i;
1681   bool incomplete;
1682   bool ok;
1683   re_node_set eclosure;
1684   incomplete = false;
1685   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1686   if (BE (err != REG_NOERROR, 0))
1687     return err;
1688
1689   /* This indicates that we are calculating this node now.
1690      We reference this value to avoid infinite loop.  */
1691   dfa->eclosures[node].nelem = REG_MISSING;
1692
1693   /* If the current node has constraints, duplicate all nodes
1694      since they must inherit the constraints.  */
1695   if (dfa->nodes[node].constraint
1696       && dfa->edests[node].nelem
1697       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1698     {
1699       err = duplicate_node_closure (dfa, node, node, node,
1700                                     dfa->nodes[node].constraint);
1701       if (BE (err != REG_NOERROR, 0))
1702         return err;
1703     }
1704
1705   /* Expand each epsilon destination nodes.  */
1706   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1707     for (i = 0; i < dfa->edests[node].nelem; ++i)
1708       {
1709         re_node_set eclosure_elem;
1710         Idx edest = dfa->edests[node].elems[i];
1711         /* If calculating the epsilon closure of `edest' is in progress,
1712            return intermediate result.  */
1713         if (dfa->eclosures[edest].nelem == REG_MISSING)
1714           {
1715             incomplete = true;
1716             continue;
1717           }
1718         /* If we haven't calculated the epsilon closure of `edest' yet,
1719            calculate now. Otherwise use calculated epsilon closure.  */
1720         if (dfa->eclosures[edest].nelem == 0)
1721           {
1722             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1723             if (BE (err != REG_NOERROR, 0))
1724               return err;
1725           }
1726         else
1727           eclosure_elem = dfa->eclosures[edest];
1728         /* Merge the epsilon closure of `edest'.  */
1729         re_node_set_merge (&eclosure, &eclosure_elem);
1730         /* If the epsilon closure of `edest' is incomplete,
1731            the epsilon closure of this node is also incomplete.  */
1732         if (dfa->eclosures[edest].nelem == 0)
1733           {
1734             incomplete = true;
1735             re_node_set_free (&eclosure_elem);
1736           }
1737       }
1738
1739   /* Epsilon closures include itself.  */
1740   ok = re_node_set_insert (&eclosure, node);
1741   if (BE (! ok, 0))
1742     return REG_ESPACE;
1743   if (incomplete && !root)
1744     dfa->eclosures[node].nelem = 0;
1745   else
1746     dfa->eclosures[node] = eclosure;
1747   *new_set = eclosure;
1748   return REG_NOERROR;
1749 }
1750 \f
1751 /* Functions for token which are used in the parser.  */
1752
1753 /* Fetch a token from INPUT.
1754    We must not use this function inside bracket expressions.  */
1755
1756 static void
1757 internal_function
1758 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1759 {
1760   re_string_skip_bytes (input, peek_token (result, input, syntax));
1761 }
1762
1763 /* Peek a token from INPUT, and return the length of the token.
1764    We must not use this function inside bracket expressions.  */
1765
1766 static int
1767 internal_function
1768 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1769 {
1770   unsigned char c;
1771
1772   if (re_string_eoi (input))
1773     {
1774       token->type = END_OF_RE;
1775       return 0;
1776     }
1777
1778   c = re_string_peek_byte (input, 0);
1779   token->opr.c = c;
1780
1781   token->word_char = 0;
1782 #ifdef RE_ENABLE_I18N
1783   token->mb_partial = 0;
1784   if (input->mb_cur_max > 1 &&
1785       !re_string_first_byte (input, re_string_cur_idx (input)))
1786     {
1787       token->type = CHARACTER;
1788       token->mb_partial = 1;
1789       return 1;
1790     }
1791 #endif
1792   if (c == '\\')
1793     {
1794       unsigned char c2;
1795       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1796         {
1797           token->type = BACK_SLASH;
1798           return 1;
1799         }
1800
1801       c2 = re_string_peek_byte_case (input, 1);
1802       token->opr.c = c2;
1803       token->type = CHARACTER;
1804 #ifdef RE_ENABLE_I18N
1805       if (input->mb_cur_max > 1)
1806         {
1807           wint_t wc = re_string_wchar_at (input,
1808                                           re_string_cur_idx (input) + 1);
1809           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1810         }
1811       else
1812 #endif
1813         token->word_char = IS_WORD_CHAR (c2) != 0;
1814
1815       switch (c2)
1816         {
1817         case '|':
1818           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1819             token->type = OP_ALT;
1820           break;
1821         case '1': case '2': case '3': case '4': case '5':
1822         case '6': case '7': case '8': case '9':
1823           if (!(syntax & RE_NO_BK_REFS))
1824             {
1825               token->type = OP_BACK_REF;
1826               token->opr.idx = c2 - '1';
1827             }
1828           break;
1829         case '<':
1830           if (!(syntax & RE_NO_GNU_OPS))
1831             {
1832               token->type = ANCHOR;
1833               token->opr.ctx_type = WORD_FIRST;
1834             }
1835           break;
1836         case '>':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             {
1839               token->type = ANCHOR;
1840               token->opr.ctx_type = WORD_LAST;
1841             }
1842           break;
1843         case 'b':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = WORD_DELIM;
1848             }
1849           break;
1850         case 'B':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = NOT_WORD_DELIM;
1855             }
1856           break;
1857         case 'w':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             token->type = OP_WORD;
1860           break;
1861         case 'W':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             token->type = OP_NOTWORD;
1864           break;
1865         case 's':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             token->type = OP_SPACE;
1868           break;
1869         case 'S':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             token->type = OP_NOTSPACE;
1872           break;
1873         case '`':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             {
1876               token->type = ANCHOR;
1877               token->opr.ctx_type = BUF_FIRST;
1878             }
1879           break;
1880         case '\'':
1881           if (!(syntax & RE_NO_GNU_OPS))
1882             {
1883               token->type = ANCHOR;
1884               token->opr.ctx_type = BUF_LAST;
1885             }
1886           break;
1887         case '(':
1888           if (!(syntax & RE_NO_BK_PARENS))
1889             token->type = OP_OPEN_SUBEXP;
1890           break;
1891         case ')':
1892           if (!(syntax & RE_NO_BK_PARENS))
1893             token->type = OP_CLOSE_SUBEXP;
1894           break;
1895         case '+':
1896           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1897             token->type = OP_DUP_PLUS;
1898           break;
1899         case '?':
1900           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901             token->type = OP_DUP_QUESTION;
1902           break;
1903         case '{':
1904           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1905             token->type = OP_OPEN_DUP_NUM;
1906           break;
1907         case '}':
1908           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909             token->type = OP_CLOSE_DUP_NUM;
1910           break;
1911         default:
1912           break;
1913         }
1914       return 2;
1915     }
1916
1917   token->type = CHARACTER;
1918 #ifdef RE_ENABLE_I18N
1919   if (input->mb_cur_max > 1)
1920     {
1921       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1922       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1923     }
1924   else
1925 #endif
1926     token->word_char = IS_WORD_CHAR (token->opr.c);
1927
1928   switch (c)
1929     {
1930     case '\n':
1931       if (syntax & RE_NEWLINE_ALT)
1932         token->type = OP_ALT;
1933       break;
1934     case '|':
1935       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1936         token->type = OP_ALT;
1937       break;
1938     case '*':
1939       token->type = OP_DUP_ASTERISK;
1940       break;
1941     case '+':
1942       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1943         token->type = OP_DUP_PLUS;
1944       break;
1945     case '?':
1946       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947         token->type = OP_DUP_QUESTION;
1948       break;
1949     case '{':
1950       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1951         token->type = OP_OPEN_DUP_NUM;
1952       break;
1953     case '}':
1954       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955         token->type = OP_CLOSE_DUP_NUM;
1956       break;
1957     case '(':
1958       if (syntax & RE_NO_BK_PARENS)
1959         token->type = OP_OPEN_SUBEXP;
1960       break;
1961     case ')':
1962       if (syntax & RE_NO_BK_PARENS)
1963         token->type = OP_CLOSE_SUBEXP;
1964       break;
1965     case '[':
1966       token->type = OP_OPEN_BRACKET;
1967       break;
1968     case '.':
1969       token->type = OP_PERIOD;
1970       break;
1971     case '^':
1972       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1973           re_string_cur_idx (input) != 0)
1974         {
1975           char prev = re_string_peek_byte (input, -1);
1976           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1977             break;
1978         }
1979       token->type = ANCHOR;
1980       token->opr.ctx_type = LINE_FIRST;
1981       break;
1982     case '$':
1983       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1984           re_string_cur_idx (input) + 1 != re_string_length (input))
1985         {
1986           re_token_t next;
1987           re_string_skip_bytes (input, 1);
1988           peek_token (&next, input, syntax);
1989           re_string_skip_bytes (input, -1);
1990           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1991             break;
1992         }
1993       token->type = ANCHOR;
1994       token->opr.ctx_type = LINE_LAST;
1995       break;
1996     default:
1997       break;
1998     }
1999   return 1;
2000 }
2001
2002 /* Peek a token from INPUT, and return the length of the token.
2003    We must not use this function out of bracket expressions.  */
2004
2005 static int
2006 internal_function
2007 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2008 {
2009   unsigned char c;
2010   if (re_string_eoi (input))
2011     {
2012       token->type = END_OF_RE;
2013       return 0;
2014     }
2015   c = re_string_peek_byte (input, 0);
2016   token->opr.c = c;
2017
2018 #ifdef RE_ENABLE_I18N
2019   if (input->mb_cur_max > 1 &&
2020       !re_string_first_byte (input, re_string_cur_idx (input)))
2021     {
2022       token->type = CHARACTER;
2023       return 1;
2024     }
2025 #endif /* RE_ENABLE_I18N */
2026
2027   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2028       && re_string_cur_idx (input) + 1 < re_string_length (input))
2029     {
2030       /* In this case, '\' escape a character.  */
2031       unsigned char c2;
2032       re_string_skip_bytes (input, 1);
2033       c2 = re_string_peek_byte (input, 0);
2034       token->opr.c = c2;
2035       token->type = CHARACTER;
2036       return 1;
2037     }
2038   if (c == '[') /* '[' is a special char in a bracket exps.  */
2039     {
2040       unsigned char c2;
2041       int token_len;
2042       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2043         c2 = re_string_peek_byte (input, 1);
2044       else
2045         c2 = 0;
2046       token->opr.c = c2;
2047       token_len = 2;
2048       switch (c2)
2049         {
2050         case '.':
2051           token->type = OP_OPEN_COLL_ELEM;
2052           break;
2053         case '=':
2054           token->type = OP_OPEN_EQUIV_CLASS;
2055           break;
2056         case ':':
2057           if (syntax & RE_CHAR_CLASSES)
2058             {
2059               token->type = OP_OPEN_CHAR_CLASS;
2060               break;
2061             }
2062           /* else fall through.  */
2063         default:
2064           token->type = CHARACTER;
2065           token->opr.c = c;
2066           token_len = 1;
2067           break;
2068         }
2069       return token_len;
2070     }
2071   switch (c)
2072     {
2073     case '-':
2074       token->type = OP_CHARSET_RANGE;
2075       break;
2076     case ']':
2077       token->type = OP_CLOSE_BRACKET;
2078       break;
2079     case '^':
2080       token->type = OP_NON_MATCH_LIST;
2081       break;
2082     default:
2083       token->type = CHARACTER;
2084     }
2085   return 1;
2086 }
2087 \f
2088 /* Functions for parser.  */
2089
2090 /* Entry point of the parser.
2091    Parse the regular expression REGEXP and return the structure tree.
2092    If an error is occured, ERR is set by error code, and return NULL.
2093    This function build the following tree, from regular expression <reg_exp>:
2094            CAT
2095            / \
2096           /   \
2097    <reg_exp>  EOR
2098
2099    CAT means concatenation.
2100    EOR means end of regular expression.  */
2101
2102 static bin_tree_t *
2103 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2104        reg_errcode_t *err)
2105 {
2106   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2107   bin_tree_t *tree, *eor, *root;
2108   re_token_t current_token;
2109   dfa->syntax = syntax;
2110   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2112   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2113     return NULL;
2114   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2115   if (tree != NULL)
2116     root = create_tree (dfa, tree, eor, CONCAT);
2117   else
2118     root = eor;
2119   if (BE (eor == NULL || root == NULL, 0))
2120     {
2121       *err = REG_ESPACE;
2122       return NULL;
2123     }
2124   return root;
2125 }
2126
2127 /* This function build the following tree, from regular expression
2128    <branch1>|<branch2>:
2129            ALT
2130            / \
2131           /   \
2132    <branch1> <branch2>
2133
2134    ALT means alternative, which represents the operator `|'.  */
2135
2136 static bin_tree_t *
2137 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2138                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2139 {
2140   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2141   bin_tree_t *tree, *branch = NULL;
2142   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2143   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2144     return NULL;
2145
2146   while (token->type == OP_ALT)
2147     {
2148       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2149       if (token->type != OP_ALT && token->type != END_OF_RE
2150           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2151         {
2152           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2153           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2154             return NULL;
2155         }
2156       else
2157         branch = NULL;
2158       tree = create_tree (dfa, tree, branch, OP_ALT);
2159       if (BE (tree == NULL, 0))
2160         {
2161           *err = REG_ESPACE;
2162           return NULL;
2163         }
2164     }
2165   return tree;
2166 }
2167
2168 /* This function build the following tree, from regular expression
2169    <exp1><exp2>:
2170         CAT
2171         / \
2172        /   \
2173    <exp1> <exp2>
2174
2175    CAT means concatenation.  */
2176
2177 static bin_tree_t *
2178 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2180 {
2181   bin_tree_t *tree, *expr;
2182   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2183   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2184   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2185     return NULL;
2186
2187   while (token->type != OP_ALT && token->type != END_OF_RE
2188          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2189     {
2190       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2191       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2192         {
2193           return NULL;
2194         }
2195       if (tree != NULL && expr != NULL)
2196         {
2197           tree = create_tree (dfa, tree, expr, CONCAT);
2198           if (tree == NULL)
2199             {
2200               *err = REG_ESPACE;
2201               return NULL;
2202             }
2203         }
2204       else if (tree == NULL)
2205         tree = expr;
2206       /* Otherwise expr == NULL, we don't need to create new tree.  */
2207     }
2208   return tree;
2209 }
2210
2211 /* This function build the following tree, from regular expression a*:
2212          *
2213          |
2214          a
2215 */
2216
2217 static bin_tree_t *
2218 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2222   bin_tree_t *tree;
2223   switch (token->type)
2224     {
2225     case CHARACTER:
2226       tree = create_token_tree (dfa, NULL, NULL, token);
2227       if (BE (tree == NULL, 0))
2228         {
2229           *err = REG_ESPACE;
2230           return NULL;
2231         }
2232 #ifdef RE_ENABLE_I18N
2233       if (dfa->mb_cur_max > 1)
2234         {
2235           while (!re_string_eoi (regexp)
2236                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2237             {
2238               bin_tree_t *mbc_remain;
2239               fetch_token (token, regexp, syntax);
2240               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2241               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2242               if (BE (mbc_remain == NULL || tree == NULL, 0))
2243                 {
2244                   *err = REG_ESPACE;
2245                   return NULL;
2246                 }
2247             }
2248         }
2249 #endif
2250       break;
2251     case OP_OPEN_SUBEXP:
2252       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2253       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2254         return NULL;
2255       break;
2256     case OP_OPEN_BRACKET:
2257       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259         return NULL;
2260       break;
2261     case OP_BACK_REF:
2262       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2263         {
2264           *err = REG_ESUBREG;
2265           return NULL;
2266         }
2267       dfa->used_bkref_map |= 1 << token->opr.idx;
2268       tree = create_token_tree (dfa, NULL, NULL, token);
2269       if (BE (tree == NULL, 0))
2270         {
2271           *err = REG_ESPACE;
2272           return NULL;
2273         }
2274       ++dfa->nbackref;
2275       dfa->has_mb_node = 1;
2276       break;
2277     case OP_OPEN_DUP_NUM:
2278       if (syntax & RE_CONTEXT_INVALID_DUP)
2279         {
2280           *err = REG_BADRPT;
2281           return NULL;
2282         }
2283       /* FALLTHROUGH */
2284     case OP_DUP_ASTERISK:
2285     case OP_DUP_PLUS:
2286     case OP_DUP_QUESTION:
2287       if (syntax & RE_CONTEXT_INVALID_OPS)
2288         {
2289           *err = REG_BADRPT;
2290           return NULL;
2291         }
2292       else if (syntax & RE_CONTEXT_INDEP_OPS)
2293         {
2294           fetch_token (token, regexp, syntax);
2295           return parse_expression (regexp, preg, token, syntax, nest, err);
2296         }
2297       /* else fall through  */
2298     case OP_CLOSE_SUBEXP:
2299       if ((token->type == OP_CLOSE_SUBEXP) &&
2300           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2301         {
2302           *err = REG_ERPAREN;
2303           return NULL;
2304         }
2305       /* else fall through  */
2306     case OP_CLOSE_DUP_NUM:
2307       /* We treat it as a normal character.  */
2308
2309       /* Then we can these characters as normal characters.  */
2310       token->type = CHARACTER;
2311       /* mb_partial and word_char bits should be initialized already
2312          by peek_token.  */
2313       tree = create_token_tree (dfa, NULL, NULL, token);
2314       if (BE (tree == NULL, 0))
2315         {
2316           *err = REG_ESPACE;
2317           return NULL;
2318         }
2319       break;
2320     case ANCHOR:
2321       if ((token->opr.ctx_type
2322            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2323           && dfa->word_ops_used == 0)
2324         init_word_char (dfa);
2325       if (token->opr.ctx_type == WORD_DELIM
2326           || token->opr.ctx_type == NOT_WORD_DELIM)
2327         {
2328           bin_tree_t *tree_first, *tree_last;
2329           if (token->opr.ctx_type == WORD_DELIM)
2330             {
2331               token->opr.ctx_type = WORD_FIRST;
2332               tree_first = create_token_tree (dfa, NULL, NULL, token);
2333               token->opr.ctx_type = WORD_LAST;
2334             }
2335           else
2336             {
2337               token->opr.ctx_type = INSIDE_WORD;
2338               tree_first = create_token_tree (dfa, NULL, NULL, token);
2339               token->opr.ctx_type = INSIDE_NOTWORD;
2340             }
2341           tree_last = create_token_tree (dfa, NULL, NULL, token);
2342           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2343           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2344             {
2345               *err = REG_ESPACE;
2346               return NULL;
2347             }
2348         }
2349       else
2350         {
2351           tree = create_token_tree (dfa, NULL, NULL, token);
2352           if (BE (tree == NULL, 0))
2353             {
2354               *err = REG_ESPACE;
2355               return NULL;
2356             }
2357         }
2358       /* We must return here, since ANCHORs can't be followed
2359          by repetition operators.
2360          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2361              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2362       fetch_token (token, regexp, syntax);
2363       return tree;
2364     case OP_PERIOD:
2365       tree = create_token_tree (dfa, NULL, NULL, token);
2366       if (BE (tree == NULL, 0))
2367         {
2368           *err = REG_ESPACE;
2369           return NULL;
2370         }
2371       if (dfa->mb_cur_max > 1)
2372         dfa->has_mb_node = 1;
2373       break;
2374     case OP_WORD:
2375     case OP_NOTWORD:
2376       tree = build_charclass_op (dfa, regexp->trans,
2377                                  (const unsigned char *) "alnum",
2378                                  (const unsigned char *) "_",
2379                                  token->type == OP_NOTWORD, err);
2380       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2381         return NULL;
2382       break;
2383     case OP_SPACE:
2384     case OP_NOTSPACE:
2385       tree = build_charclass_op (dfa, regexp->trans,
2386                                  (const unsigned char *) "space",
2387                                  (const unsigned char *) "",
2388                                  token->type == OP_NOTSPACE, err);
2389       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2390         return NULL;
2391       break;
2392     case OP_ALT:
2393     case END_OF_RE:
2394       return NULL;
2395     case BACK_SLASH:
2396       *err = REG_EESCAPE;
2397       return NULL;
2398     default:
2399       /* Must not happen?  */
2400 #ifdef DEBUG
2401       assert (0);
2402 #endif
2403       return NULL;
2404     }
2405   fetch_token (token, regexp, syntax);
2406
2407   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2408          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2409     {
2410       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2411       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2412         return NULL;
2413       /* In BRE consecutive duplications are not allowed.  */
2414       if ((syntax & RE_CONTEXT_INVALID_DUP)
2415           && (token->type == OP_DUP_ASTERISK
2416               || token->type == OP_OPEN_DUP_NUM))
2417         {
2418           *err = REG_BADRPT;
2419           return NULL;
2420         }
2421     }
2422
2423   return tree;
2424 }
2425
2426 /* This function build the following tree, from regular expression
2427    (<reg_exp>):
2428          SUBEXP
2429             |
2430         <reg_exp>
2431 */
2432
2433 static bin_tree_t *
2434 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2435                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2436 {
2437   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2438   bin_tree_t *tree;
2439   size_t cur_nsub;
2440   cur_nsub = preg->re_nsub++;
2441
2442   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2443
2444   /* The subexpression may be a null string.  */
2445   if (token->type == OP_CLOSE_SUBEXP)
2446     tree = NULL;
2447   else
2448     {
2449       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2450       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2451         *err = REG_EPAREN;
2452       if (BE (*err != REG_NOERROR, 0))
2453         return NULL;
2454     }
2455
2456   if (cur_nsub <= '9' - '1')
2457     dfa->completed_bkref_map |= 1 << cur_nsub;
2458
2459   tree = create_tree (dfa, tree, NULL, SUBEXP);
2460   if (BE (tree == NULL, 0))
2461     {
2462       *err = REG_ESPACE;
2463       return NULL;
2464     }
2465   tree->token.opr.idx = cur_nsub;
2466   return tree;
2467 }
2468
2469 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2470
2471 static bin_tree_t *
2472 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2473               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2474 {
2475   bin_tree_t *tree = NULL, *old_tree = NULL;
2476   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2477   re_token_t start_token = *token;
2478
2479   if (token->type == OP_OPEN_DUP_NUM)
2480     {
2481       end = 0;
2482       start = fetch_number (regexp, token, syntax);
2483       if (start == REG_MISSING)
2484         {
2485           if (token->type == CHARACTER && token->opr.c == ',')
2486             start = 0; /* We treat "{,m}" as "{0,m}".  */
2487           else
2488             {
2489               *err = REG_BADBR; /* <re>{} is invalid.  */
2490               return NULL;
2491             }
2492         }
2493       if (BE (start != REG_ERROR, 1))
2494         {
2495           /* We treat "{n}" as "{n,n}".  */
2496           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2497                  : ((token->type == CHARACTER && token->opr.c == ',')
2498                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2499         }
2500       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2501         {
2502           /* Invalid sequence.  */
2503           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2504             {
2505               if (token->type == END_OF_RE)
2506                 *err = REG_EBRACE;
2507               else
2508                 *err = REG_BADBR;
2509
2510               return NULL;
2511             }
2512
2513           /* If the syntax bit is set, rollback.  */
2514           re_string_set_index (regexp, start_idx);
2515           *token = start_token;
2516           token->type = CHARACTER;
2517           /* mb_partial and word_char bits should be already initialized by
2518              peek_token.  */
2519           return elem;
2520         }
2521
2522       if (BE ((end != REG_MISSING && start > end)
2523               || token->type != OP_CLOSE_DUP_NUM, 0))
2524         {
2525           /* First number greater than second.  */
2526           *err = REG_BADBR;
2527           return NULL;
2528         }
2529     }
2530   else
2531     {
2532       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2533       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2534     }
2535
2536   fetch_token (token, regexp, syntax);
2537
2538   if (BE (elem == NULL, 0))
2539     return NULL;
2540   if (BE (start == 0 && end == 0, 0))
2541     {
2542       postorder (elem, free_tree, NULL);
2543       return NULL;
2544     }
2545
2546   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2547   if (BE (start > 0, 0))
2548     {
2549       tree = elem;
2550       for (i = 2; i <= start; ++i)
2551         {
2552           elem = duplicate_tree (elem, dfa);
2553           tree = create_tree (dfa, tree, elem, CONCAT);
2554           if (BE (elem == NULL || tree == NULL, 0))
2555             goto parse_dup_op_espace;
2556         }
2557
2558       if (start == end)
2559         return tree;
2560
2561       /* Duplicate ELEM before it is marked optional.  */
2562       elem = duplicate_tree (elem, dfa);
2563       old_tree = tree;
2564     }
2565   else
2566     old_tree = NULL;
2567
2568   if (elem->token.type == SUBEXP)
2569     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2570
2571   tree = create_tree (dfa, elem, NULL,
2572                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2573   if (BE (tree == NULL, 0))
2574     goto parse_dup_op_espace;
2575
2576   /* This loop is actually executed only when end != REG_MISSING,
2577      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2578      already created the start+1-th copy.  */
2579   if ((Idx) -1 < 0 || end != REG_MISSING)
2580     for (i = start + 2; i <= end; ++i)
2581       {
2582         elem = duplicate_tree (elem, dfa);
2583         tree = create_tree (dfa, tree, elem, CONCAT);
2584         if (BE (elem == NULL || tree == NULL, 0))
2585           goto parse_dup_op_espace;
2586
2587         tree = create_tree (dfa, tree, NULL, OP_ALT);
2588         if (BE (tree == NULL, 0))
2589           goto parse_dup_op_espace;
2590       }
2591
2592   if (old_tree)
2593     tree = create_tree (dfa, old_tree, tree, CONCAT);
2594
2595   return tree;
2596
2597  parse_dup_op_espace:
2598   *err = REG_ESPACE;
2599   return NULL;
2600 }
2601
2602 /* Size of the names for collating symbol/equivalence_class/character_class.
2603    I'm not sure, but maybe enough.  */
2604 #define BRACKET_NAME_BUF_SIZE 32
2605
2606 #ifndef _LIBC
2607   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2608      Build the range expression which starts from START_ELEM, and ends
2609      at END_ELEM.  The result are written to MBCSET and SBCSET.
2610      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2611      mbcset->range_ends, is a pointer argument sinse we may
2612      update it.  */
2613
2614 static reg_errcode_t
2615 internal_function
2616 # ifdef RE_ENABLE_I18N
2617 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2618                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2619 # else /* not RE_ENABLE_I18N */
2620 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2621                  bracket_elem_t *end_elem)
2622 # endif /* not RE_ENABLE_I18N */
2623 {
2624   unsigned int start_ch, end_ch;
2625   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2626   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2627           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2628           0))
2629     return REG_ERANGE;
2630
2631   /* We can handle no multi character collating elements without libc
2632      support.  */
2633   if (BE ((start_elem->type == COLL_SYM
2634            && strlen ((char *) start_elem->opr.name) > 1)
2635           || (end_elem->type == COLL_SYM
2636               && strlen ((char *) end_elem->opr.name) > 1), 0))
2637     return REG_ECOLLATE;
2638
2639 # ifdef RE_ENABLE_I18N
2640   {
2641     wchar_t wc;
2642     wint_t start_wc;
2643     wint_t end_wc;
2644     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2645
2646     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2647                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2648                    : 0));
2649     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2650               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2651                  : 0));
2652     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2653                 ? __btowc (start_ch) : start_elem->opr.wch);
2654     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2655               ? __btowc (end_ch) : end_elem->opr.wch);
2656     if (start_wc == WEOF || end_wc == WEOF)
2657       return REG_ECOLLATE;
2658     cmp_buf[0] = start_wc;
2659     cmp_buf[4] = end_wc;
2660     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2661       return REG_ERANGE;
2662
2663     /* Got valid collation sequence values, add them as a new entry.
2664        However, for !_LIBC we have no collation elements: if the
2665        character set is single byte, the single byte character set
2666        that we build below suffices.  parse_bracket_exp passes
2667        no MBCSET if dfa->mb_cur_max == 1.  */
2668     if (mbcset)
2669       {
2670         /* Check the space of the arrays.  */
2671         if (BE (*range_alloc == mbcset->nranges, 0))
2672           {
2673             /* There is not enough space, need realloc.  */
2674             wchar_t *new_array_start, *new_array_end;
2675             Idx new_nranges;
2676
2677             /* +1 in case of mbcset->nranges is 0.  */
2678             new_nranges = 2 * mbcset->nranges + 1;
2679             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2680                are NULL if *range_alloc == 0.  */
2681             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2682                                           new_nranges);
2683             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2684                                         new_nranges);
2685
2686             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2687               return REG_ESPACE;
2688
2689             mbcset->range_starts = new_array_start;
2690             mbcset->range_ends = new_array_end;
2691             *range_alloc = new_nranges;
2692           }
2693
2694         mbcset->range_starts[mbcset->nranges] = start_wc;
2695         mbcset->range_ends[mbcset->nranges++] = end_wc;
2696       }
2697
2698     /* Build the table for single byte characters.  */
2699     for (wc = 0; wc < SBC_MAX; ++wc)
2700       {
2701         cmp_buf[2] = wc;
2702         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2703             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2704           bitset_set (sbcset, wc);
2705       }
2706   }
2707 # else /* not RE_ENABLE_I18N */
2708   {
2709     unsigned int ch;
2710     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2711                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2712                    : 0));
2713     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2714               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2715                  : 0));
2716     if (start_ch > end_ch)
2717       return REG_ERANGE;
2718     /* Build the table for single byte characters.  */
2719     for (ch = 0; ch < SBC_MAX; ++ch)
2720       if (start_ch <= ch  && ch <= end_ch)
2721         bitset_set (sbcset, ch);
2722   }
2723 # endif /* not RE_ENABLE_I18N */
2724   return REG_NOERROR;
2725 }
2726 #endif /* not _LIBC */
2727
2728 #ifndef _LIBC
2729 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2730    Build the collating element which is represented by NAME.
2731    The result are written to MBCSET and SBCSET.
2732    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2733    pointer argument since we may update it.  */
2734
2735 static reg_errcode_t
2736 internal_function
2737 build_collating_symbol (bitset_t sbcset,
2738 # ifdef RE_ENABLE_I18N
2739                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2740 # endif
2741                         const unsigned char *name)
2742 {
2743   size_t name_len = strlen ((const char *) name);
2744   if (BE (name_len != 1, 0))
2745     return REG_ECOLLATE;
2746   else
2747     {
2748       bitset_set (sbcset, name[0]);
2749       return REG_NOERROR;
2750     }
2751 }
2752 #endif /* not _LIBC */
2753
2754 /* This function parse bracket expression like "[abc]", "[a-c]",
2755    "[[.a-a.]]" etc.  */
2756
2757 static bin_tree_t *
2758 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2759                    reg_syntax_t syntax, reg_errcode_t *err)
2760 {
2761 #ifdef _LIBC
2762   const unsigned char *collseqmb;
2763   const char *collseqwc;
2764   uint32_t nrules;
2765   int32_t table_size;
2766   const int32_t *symb_table;
2767   const unsigned char *extra;
2768
2769   /* Local function for parse_bracket_exp used in _LIBC environement.
2770      Seek the collating symbol entry correspondings to NAME.
2771      Return the index of the symbol in the SYMB_TABLE.  */
2772
2773   auto inline int32_t
2774   __attribute ((always_inline))
2775   seek_collating_symbol_entry (name, name_len)
2776          const unsigned char *name;
2777          size_t name_len;
2778     {
2779       int32_t hash = elem_hash ((const char *) name, name_len);
2780       int32_t elem = hash % table_size;
2781       if (symb_table[2 * elem] != 0)
2782         {
2783           int32_t second = hash % (table_size - 2) + 1;
2784
2785           do
2786             {
2787               /* First compare the hashing value.  */
2788               if (symb_table[2 * elem] == hash
2789                   /* Compare the length of the name.  */
2790                   && name_len == extra[symb_table[2 * elem + 1]]
2791                   /* Compare the name.  */
2792                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2793                              name_len) == 0)
2794                 {
2795                   /* Yep, this is the entry.  */
2796                   break;
2797                 }
2798
2799               /* Next entry.  */
2800               elem += second;
2801             }
2802           while (symb_table[2 * elem] != 0);
2803         }
2804       return elem;
2805     }
2806
2807   /* Local function for parse_bracket_exp used in _LIBC environment.
2808      Look up the collation sequence value of BR_ELEM.
2809      Return the value if succeeded, UINT_MAX otherwise.  */
2810
2811   auto inline unsigned int
2812   __attribute ((always_inline))
2813   lookup_collation_sequence_value (br_elem)
2814          bracket_elem_t *br_elem;
2815     {
2816       if (br_elem->type == SB_CHAR)
2817         {
2818           /*
2819           if (MB_CUR_MAX == 1)
2820           */
2821           if (nrules == 0)
2822             return collseqmb[br_elem->opr.ch];
2823           else
2824             {
2825               wint_t wc = __btowc (br_elem->opr.ch);
2826               return __collseq_table_lookup (collseqwc, wc);
2827             }
2828         }
2829       else if (br_elem->type == MB_CHAR)
2830         {
2831           if (nrules != 0)
2832             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2833         }
2834       else if (br_elem->type == COLL_SYM)
2835         {
2836           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2837           if (nrules != 0)
2838             {
2839               int32_t elem, idx;
2840               elem = seek_collating_symbol_entry (br_elem->opr.name,
2841                                                   sym_name_len);
2842               if (symb_table[2 * elem] != 0)
2843                 {
2844                   /* We found the entry.  */
2845                   idx = symb_table[2 * elem + 1];
2846                   /* Skip the name of collating element name.  */
2847                   idx += 1 + extra[idx];
2848                   /* Skip the byte sequence of the collating element.  */
2849                   idx += 1 + extra[idx];
2850                   /* Adjust for the alignment.  */
2851                   idx = (idx + 3) & ~3;
2852                   /* Skip the multibyte collation sequence value.  */
2853                   idx += sizeof (unsigned int);
2854                   /* Skip the wide char sequence of the collating element.  */
2855                   idx += sizeof (unsigned int) *
2856                     (1 + *(unsigned int *) (extra + idx));
2857                   /* Return the collation sequence value.  */
2858                   return *(unsigned int *) (extra + idx);
2859                 }
2860               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2861                 {
2862                   /* No valid character.  Match it as a single byte
2863                      character.  */
2864                   return collseqmb[br_elem->opr.name[0]];
2865                 }
2866             }
2867           else if (sym_name_len == 1)
2868             return collseqmb[br_elem->opr.name[0]];
2869         }
2870       return UINT_MAX;
2871     }
2872
2873   /* Local function for parse_bracket_exp used in _LIBC environement.
2874      Build the range expression which starts from START_ELEM, and ends
2875      at END_ELEM.  The result are written to MBCSET and SBCSET.
2876      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2877      mbcset->range_ends, is a pointer argument sinse we may
2878      update it.  */
2879
2880   auto inline reg_errcode_t
2881   __attribute ((always_inline))
2882   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2883          re_charset_t *mbcset;
2884          Idx *range_alloc;
2885          bitset_t sbcset;
2886          bracket_elem_t *start_elem, *end_elem;
2887     {
2888       unsigned int ch;
2889       uint32_t start_collseq;
2890       uint32_t end_collseq;
2891
2892       /* Equivalence Classes and Character Classes can't be a range
2893          start/end.  */
2894       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2895               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2896               0))
2897         return REG_ERANGE;
2898
2899       start_collseq = lookup_collation_sequence_value (start_elem);
2900       end_collseq = lookup_collation_sequence_value (end_elem);
2901       /* Check start/end collation sequence values.  */
2902       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2903         return REG_ECOLLATE;
2904       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2905         return REG_ERANGE;
2906
2907       /* Got valid collation sequence values, add them as a new entry.
2908          However, if we have no collation elements, and the character set
2909          is single byte, the single byte character set that we
2910          build below suffices. */
2911       if (nrules > 0 || dfa->mb_cur_max > 1)
2912         {
2913           /* Check the space of the arrays.  */
2914           if (BE (*range_alloc == mbcset->nranges, 0))
2915             {
2916               /* There is not enough space, need realloc.  */
2917               uint32_t *new_array_start;
2918               uint32_t *new_array_end;
2919               Idx new_nranges;
2920
2921               /* +1 in case of mbcset->nranges is 0.  */
2922               new_nranges = 2 * mbcset->nranges + 1;
2923               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2924                                             new_nranges);
2925               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2926                                           new_nranges);
2927
2928               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2929                 return REG_ESPACE;
2930
2931               mbcset->range_starts = new_array_start;
2932               mbcset->range_ends = new_array_end;
2933               *range_alloc = new_nranges;
2934             }
2935
2936           mbcset->range_starts[mbcset->nranges] = start_collseq;
2937           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2938         }
2939
2940       /* Build the table for single byte characters.  */
2941       for (ch = 0; ch < SBC_MAX; ch++)
2942         {
2943           uint32_t ch_collseq;
2944           /*
2945           if (MB_CUR_MAX == 1)
2946           */
2947           if (nrules == 0)
2948             ch_collseq = collseqmb[ch];
2949           else
2950             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2951           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2952             bitset_set (sbcset, ch);
2953         }
2954       return REG_NOERROR;
2955     }
2956
2957   /* Local function for parse_bracket_exp used in _LIBC environement.
2958      Build the collating element which is represented by NAME.
2959      The result are written to MBCSET and SBCSET.
2960      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2961      pointer argument sinse we may update it.  */
2962
2963   auto inline reg_errcode_t
2964   __attribute ((always_inline))
2965   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2966          re_charset_t *mbcset;
2967          Idx *coll_sym_alloc;
2968          bitset_t sbcset;
2969          const unsigned char *name;
2970     {
2971       int32_t elem, idx;
2972       size_t name_len = strlen ((const char *) name);
2973       if (nrules != 0)
2974         {
2975           elem = seek_collating_symbol_entry (name, name_len);
2976           if (symb_table[2 * elem] != 0)
2977             {
2978               /* We found the entry.  */
2979               idx = symb_table[2 * elem + 1];
2980               /* Skip the name of collating element name.  */
2981               idx += 1 + extra[idx];
2982             }
2983           else if (symb_table[2 * elem] == 0 && name_len == 1)
2984             {
2985               /* No valid character, treat it as a normal
2986                  character.  */
2987               bitset_set (sbcset, name[0]);
2988               return REG_NOERROR;
2989             }
2990           else
2991             return REG_ECOLLATE;
2992
2993           /* Got valid collation sequence, add it as a new entry.  */
2994           /* Check the space of the arrays.  */
2995           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2996             {
2997               /* Not enough, realloc it.  */
2998               /* +1 in case of mbcset->ncoll_syms is 0.  */
2999               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3000               /* Use realloc since mbcset->coll_syms is NULL
3001                  if *alloc == 0.  */
3002               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3003                                                    new_coll_sym_alloc);
3004               if (BE (new_coll_syms == NULL, 0))
3005                 return REG_ESPACE;
3006               mbcset->coll_syms = new_coll_syms;
3007               *coll_sym_alloc = new_coll_sym_alloc;
3008             }
3009           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3010           return REG_NOERROR;
3011         }
3012       else
3013         {
3014           if (BE (name_len != 1, 0))
3015             return REG_ECOLLATE;
3016           else
3017             {
3018               bitset_set (sbcset, name[0]);
3019               return REG_NOERROR;
3020             }
3021         }
3022     }
3023 #endif
3024
3025   re_token_t br_token;
3026   re_bitset_ptr_t sbcset;
3027 #ifdef RE_ENABLE_I18N
3028   re_charset_t *mbcset;
3029   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3030   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3031 #endif /* not RE_ENABLE_I18N */
3032   bool non_match = false;
3033   bin_tree_t *work_tree;
3034   int token_len;
3035   bool first_round = true;
3036 #ifdef _LIBC
3037   collseqmb = (const unsigned char *)
3038     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3039   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3040   if (nrules)
3041     {
3042       /*
3043       if (MB_CUR_MAX > 1)
3044       */
3045       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3046       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3047       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3048                                                   _NL_COLLATE_SYMB_TABLEMB);
3049       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3050                                                    _NL_COLLATE_SYMB_EXTRAMB);
3051     }
3052 #endif
3053   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3054 #ifdef RE_ENABLE_I18N
3055   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3056 #endif /* RE_ENABLE_I18N */
3057 #ifdef RE_ENABLE_I18N
3058   if (BE (sbcset == NULL || mbcset == NULL, 0))
3059 #else
3060   if (BE (sbcset == NULL, 0))
3061 #endif /* RE_ENABLE_I18N */
3062     {
3063       *err = REG_ESPACE;
3064       return NULL;
3065     }
3066
3067   token_len = peek_token_bracket (token, regexp, syntax);
3068   if (BE (token->type == END_OF_RE, 0))
3069     {
3070       *err = REG_BADPAT;
3071       goto parse_bracket_exp_free_return;
3072     }
3073   if (token->type == OP_NON_MATCH_LIST)
3074     {
3075 #ifdef RE_ENABLE_I18N
3076       mbcset->non_match = 1;
3077 #endif /* not RE_ENABLE_I18N */
3078       non_match = true;
3079       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3080         bitset_set (sbcset, '\n');
3081       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3082       token_len = peek_token_bracket (token, regexp, syntax);
3083       if (BE (token->type == END_OF_RE, 0))
3084         {
3085           *err = REG_BADPAT;
3086           goto parse_bracket_exp_free_return;
3087         }
3088     }
3089
3090   /* We treat the first ']' as a normal character.  */
3091   if (token->type == OP_CLOSE_BRACKET)
3092     token->type = CHARACTER;
3093
3094   while (1)
3095     {
3096       bracket_elem_t start_elem, end_elem;
3097       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3098       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3099       reg_errcode_t ret;
3100       int token_len2 = 0;
3101       bool is_range_exp = false;
3102       re_token_t token2;
3103
3104       start_elem.opr.name = start_name_buf;
3105       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106                                    syntax, first_round);
3107       if (BE (ret != REG_NOERROR, 0))
3108         {
3109           *err = ret;
3110           goto parse_bracket_exp_free_return;
3111         }
3112       first_round = false;
3113
3114       /* Get information about the next token.  We need it in any case.  */
3115       token_len = peek_token_bracket (token, regexp, syntax);
3116
3117       /* Do not check for ranges if we know they are not allowed.  */
3118       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119         {
3120           if (BE (token->type == END_OF_RE, 0))
3121             {
3122               *err = REG_EBRACK;
3123               goto parse_bracket_exp_free_return;
3124             }
3125           if (token->type == OP_CHARSET_RANGE)
3126             {
3127               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3128               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129               if (BE (token2.type == END_OF_RE, 0))
3130                 {
3131                   *err = REG_EBRACK;
3132                   goto parse_bracket_exp_free_return;
3133                 }
3134               if (token2.type == OP_CLOSE_BRACKET)
3135                 {
3136                   /* We treat the last '-' as a normal character.  */
3137                   re_string_skip_bytes (regexp, -token_len);
3138                   token->type = CHARACTER;
3139                 }
3140               else
3141                 is_range_exp = true;
3142             }
3143         }
3144
3145       if (is_range_exp == true)
3146         {
3147           end_elem.opr.name = end_name_buf;
3148           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3149                                        dfa, syntax, true);
3150           if (BE (ret != REG_NOERROR, 0))
3151             {
3152               *err = ret;
3153               goto parse_bracket_exp_free_return;
3154             }
3155
3156           token_len = peek_token_bracket (token, regexp, syntax);
3157
3158 #ifdef _LIBC
3159           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160                                   &start_elem, &end_elem);
3161 #else
3162 # ifdef RE_ENABLE_I18N
3163           *err = build_range_exp (sbcset,
3164                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3165                                   &range_alloc, &start_elem, &end_elem);
3166 # else
3167           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3168 # endif
3169 #endif /* RE_ENABLE_I18N */
3170           if (BE (*err != REG_NOERROR, 0))
3171             goto parse_bracket_exp_free_return;
3172         }
3173       else
3174         {
3175           switch (start_elem.type)
3176             {
3177             case SB_CHAR:
3178               bitset_set (sbcset, start_elem.opr.ch);
3179               break;
3180 #ifdef RE_ENABLE_I18N
3181             case MB_CHAR:
3182               /* Check whether the array has enough space.  */
3183               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3184                 {
3185                   wchar_t *new_mbchars;
3186                   /* Not enough, realloc it.  */
3187                   /* +1 in case of mbcset->nmbchars is 0.  */
3188                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3189                   /* Use realloc since array is NULL if *alloc == 0.  */
3190                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3191                                             mbchar_alloc);
3192                   if (BE (new_mbchars == NULL, 0))
3193                     goto parse_bracket_exp_espace;
3194                   mbcset->mbchars = new_mbchars;
3195                 }
3196               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3197               break;
3198 #endif /* RE_ENABLE_I18N */
3199             case EQUIV_CLASS:
3200               *err = build_equiv_class (sbcset,
3201 #ifdef RE_ENABLE_I18N
3202                                         mbcset, &equiv_class_alloc,
3203 #endif /* RE_ENABLE_I18N */
3204                                         start_elem.opr.name);
3205               if (BE (*err != REG_NOERROR, 0))
3206                 goto parse_bracket_exp_free_return;
3207               break;
3208             case COLL_SYM:
3209               *err = build_collating_symbol (sbcset,
3210 #ifdef RE_ENABLE_I18N
3211                                              mbcset, &coll_sym_alloc,
3212 #endif /* RE_ENABLE_I18N */
3213                                              start_elem.opr.name);
3214               if (BE (*err != REG_NOERROR, 0))
3215                 goto parse_bracket_exp_free_return;
3216               break;
3217             case CHAR_CLASS:
3218               *err = build_charclass (regexp->trans, sbcset,
3219 #ifdef RE_ENABLE_I18N
3220                                       mbcset, &char_class_alloc,
3221 #endif /* RE_ENABLE_I18N */
3222                                       start_elem.opr.name, syntax);
3223               if (BE (*err != REG_NOERROR, 0))
3224                goto parse_bracket_exp_free_return;
3225               break;
3226             default:
3227               assert (0);
3228               break;
3229             }
3230         }
3231       if (BE (token->type == END_OF_RE, 0))
3232         {
3233           *err = REG_EBRACK;
3234           goto parse_bracket_exp_free_return;
3235         }
3236       if (token->type == OP_CLOSE_BRACKET)
3237         break;
3238     }
3239
3240   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3241
3242   /* If it is non-matching list.  */
3243   if (non_match)
3244     bitset_not (sbcset);
3245
3246 #ifdef RE_ENABLE_I18N
3247   /* Ensure only single byte characters are set.  */
3248   if (dfa->mb_cur_max > 1)
3249     bitset_mask (sbcset, dfa->sb_char);
3250
3251   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3252       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3253                                                      || mbcset->non_match)))
3254     {
3255       bin_tree_t *mbc_tree;
3256       int sbc_idx;
3257       /* Build a tree for complex bracket.  */
3258       dfa->has_mb_node = 1;
3259       br_token.type = COMPLEX_BRACKET;
3260       br_token.opr.mbcset = mbcset;
3261       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3262       if (BE (mbc_tree == NULL, 0))
3263         goto parse_bracket_exp_espace;
3264       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3265         if (sbcset[sbc_idx])
3266           break;
3267       /* If there are no bits set in sbcset, there is no point
3268          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3269       if (sbc_idx < BITSET_WORDS)
3270         {
3271           /* Build a tree for simple bracket.  */
3272           br_token.type = SIMPLE_BRACKET;
3273           br_token.opr.sbcset = sbcset;
3274           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3275           if (BE (work_tree == NULL, 0))
3276             goto parse_bracket_exp_espace;
3277
3278           /* Then join them by ALT node.  */
3279           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3280           if (BE (work_tree == NULL, 0))
3281             goto parse_bracket_exp_espace;
3282         }
3283       else
3284         {
3285           re_free (sbcset);
3286           work_tree = mbc_tree;
3287         }
3288     }
3289   else
3290 #endif /* not RE_ENABLE_I18N */
3291     {
3292 #ifdef RE_ENABLE_I18N
3293       free_charset (mbcset);
3294 #endif
3295       /* Build a tree for simple bracket.  */
3296       br_token.type = SIMPLE_BRACKET;
3297       br_token.opr.sbcset = sbcset;
3298       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3299       if (BE (work_tree == NULL, 0))
3300         goto parse_bracket_exp_espace;
3301     }
3302   return work_tree;
3303
3304  parse_bracket_exp_espace:
3305   *err = REG_ESPACE;
3306  parse_bracket_exp_free_return:
3307   re_free (sbcset);
3308 #ifdef RE_ENABLE_I18N
3309   free_charset (mbcset);
3310 #endif /* RE_ENABLE_I18N */
3311   return NULL;
3312 }
3313
3314 /* Parse an element in the bracket expression.  */
3315
3316 static reg_errcode_t
3317 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3318                        re_token_t *token, int token_len, re_dfa_t *dfa,
3319                        reg_syntax_t syntax, bool accept_hyphen)
3320 {
3321 #ifdef RE_ENABLE_I18N
3322   int cur_char_size;
3323   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3324   if (cur_char_size > 1)
3325     {
3326       elem->type = MB_CHAR;
3327       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3328       re_string_skip_bytes (regexp, cur_char_size);
3329       return REG_NOERROR;
3330     }
3331 #endif /* RE_ENABLE_I18N */
3332   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3333   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3334       || token->type == OP_OPEN_EQUIV_CLASS)
3335     return parse_bracket_symbol (elem, regexp, token);
3336   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3337     {
3338       /* A '-' must only appear as anything but a range indicator before
3339          the closing bracket.  Everything else is an error.  */
3340       re_token_t token2;
3341       (void) peek_token_bracket (&token2, regexp, syntax);
3342       if (token2.type != OP_CLOSE_BRACKET)
3343         /* The actual error value is not standardized since this whole
3344            case is undefined.  But ERANGE makes good sense.  */
3345         return REG_ERANGE;
3346     }
3347   elem->type = SB_CHAR;
3348   elem->opr.ch = token->opr.c;
3349   return REG_NOERROR;
3350 }
3351
3352 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3353    such as [:<character_class>:], [.<collating_element>.], and
3354    [=<equivalent_class>=].  */
3355
3356 static reg_errcode_t
3357 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3358                       re_token_t *token)
3359 {
3360   unsigned char ch, delim = token->opr.c;
3361   int i = 0;
3362   if (re_string_eoi(regexp))
3363     return REG_EBRACK;
3364   for (;; ++i)
3365     {
3366       if (i >= BRACKET_NAME_BUF_SIZE)
3367         return REG_EBRACK;
3368       if (token->type == OP_OPEN_CHAR_CLASS)
3369         ch = re_string_fetch_byte_case (regexp);
3370       else
3371         ch = re_string_fetch_byte (regexp);
3372       if (re_string_eoi(regexp))
3373         return REG_EBRACK;
3374       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3375         break;
3376       elem->opr.name[i] = ch;
3377     }
3378   re_string_skip_bytes (regexp, 1);
3379   elem->opr.name[i] = '\0';
3380   switch (token->type)
3381     {
3382     case OP_OPEN_COLL_ELEM:
3383       elem->type = COLL_SYM;
3384       break;
3385     case OP_OPEN_EQUIV_CLASS:
3386       elem->type = EQUIV_CLASS;
3387       break;
3388     case OP_OPEN_CHAR_CLASS:
3389       elem->type = CHAR_CLASS;
3390       break;
3391     default:
3392       break;
3393     }
3394   return REG_NOERROR;
3395 }
3396
3397   /* Helper function for parse_bracket_exp.
3398      Build the equivalence class which is represented by NAME.
3399      The result are written to MBCSET and SBCSET.
3400      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3401      is a pointer argument sinse we may update it.  */
3402
3403 static reg_errcode_t
3404 #ifdef RE_ENABLE_I18N
3405 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3406                    Idx *equiv_class_alloc, const unsigned char *name)
3407 #else /* not RE_ENABLE_I18N */
3408 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3409 #endif /* not RE_ENABLE_I18N */
3410 {
3411 #ifdef _LIBC
3412   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3413   if (nrules != 0)
3414     {
3415       const int32_t *table, *indirect;
3416       const unsigned char *weights, *extra, *cp;
3417       unsigned char char_buf[2];
3418       int32_t idx1, idx2;
3419       unsigned int ch;
3420       size_t len;
3421       /* This #include defines a local function!  */
3422 # include <locale/weight.h>
3423       /* Calculate the index for equivalence class.  */
3424       cp = name;
3425       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3426       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3427                                                _NL_COLLATE_WEIGHTMB);
3428       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3429                                                    _NL_COLLATE_EXTRAMB);
3430       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3431                                                 _NL_COLLATE_INDIRECTMB);
3432       idx1 = findidx (&cp);
3433       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3434         /* This isn't a valid character.  */
3435         return REG_ECOLLATE;
3436
3437       /* Build single byte matcing table for this equivalence class.  */
3438       char_buf[1] = (unsigned char) '\0';
3439       len = weights[idx1 & 0xffffff];
3440       for (ch = 0; ch < SBC_MAX; ++ch)
3441         {
3442           char_buf[0] = ch;
3443           cp = char_buf;
3444           idx2 = findidx (&cp);
3445 /*
3446           idx2 = table[ch];
3447 */
3448           if (idx2 == 0)
3449             /* This isn't a valid character.  */
3450             continue;
3451           /* Compare only if the length matches and the collation rule
3452              index is the same.  */
3453           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3454             {
3455               int cnt = 0;
3456
3457               while (cnt <= len &&
3458                      weights[(idx1 & 0xffffff) + 1 + cnt]
3459                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3460                 ++cnt;
3461
3462               if (cnt > len)
3463                 bitset_set (sbcset, ch);
3464             }
3465         }
3466       /* Check whether the array has enough space.  */
3467       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3468         {
3469           /* Not enough, realloc it.  */
3470           /* +1 in case of mbcset->nequiv_classes is 0.  */
3471           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3472           /* Use realloc since the array is NULL if *alloc == 0.  */
3473           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3474                                                    int32_t,
3475                                                    new_equiv_class_alloc);
3476           if (BE (new_equiv_classes == NULL, 0))
3477             return REG_ESPACE;
3478           mbcset->equiv_classes = new_equiv_classes;
3479           *equiv_class_alloc = new_equiv_class_alloc;
3480         }
3481       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3482     }
3483   else
3484 #endif /* _LIBC */
3485     {
3486       if (BE (strlen ((const char *) name) != 1, 0))
3487         return REG_ECOLLATE;
3488       bitset_set (sbcset, *name);
3489     }
3490   return REG_NOERROR;
3491 }
3492
3493   /* Helper function for parse_bracket_exp.
3494      Build the character class which is represented by NAME.
3495      The result are written to MBCSET and SBCSET.
3496      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3497      is a pointer argument sinse we may update it.  */
3498
3499 static reg_errcode_t
3500 #ifdef RE_ENABLE_I18N
3501 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3502                  re_charset_t *mbcset, Idx *char_class_alloc,
3503                  const unsigned char *class_name, reg_syntax_t syntax)
3504 #else /* not RE_ENABLE_I18N */
3505 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3506                  const unsigned char *class_name, reg_syntax_t syntax)
3507 #endif /* not RE_ENABLE_I18N */
3508 {
3509   int i;
3510   const char *name = (const char *) class_name;
3511
3512   /* In case of REG_ICASE "upper" and "lower" match the both of
3513      upper and lower cases.  */
3514   if ((syntax & RE_ICASE)
3515       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3516     name = "alpha";
3517
3518 #ifdef RE_ENABLE_I18N
3519   /* Check the space of the arrays.  */
3520   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3521     {
3522       /* Not enough, realloc it.  */
3523       /* +1 in case of mbcset->nchar_classes is 0.  */
3524       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3525       /* Use realloc since array is NULL if *alloc == 0.  */
3526       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3527                                                new_char_class_alloc);
3528       if (BE (new_char_classes == NULL, 0))
3529         return REG_ESPACE;
3530       mbcset->char_classes = new_char_classes;
3531       *char_class_alloc = new_char_class_alloc;
3532     }
3533   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3534 #endif /* RE_ENABLE_I18N */
3535
3536 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3537   do {                                          \
3538     if (BE (trans != NULL, 0))                  \
3539       {                                         \
3540         for (i = 0; i < SBC_MAX; ++i)           \
3541           if (ctype_func (i))                   \
3542             bitset_set (sbcset, trans[i]);      \
3543       }                                         \
3544     else                                        \
3545       {                                         \
3546         for (i = 0; i < SBC_MAX; ++i)           \
3547           if (ctype_func (i))                   \
3548             bitset_set (sbcset, i);             \
3549       }                                         \
3550   } while (0)
3551
3552   if (strcmp (name, "alnum") == 0)
3553     BUILD_CHARCLASS_LOOP (isalnum);
3554   else if (strcmp (name, "cntrl") == 0)
3555     BUILD_CHARCLASS_LOOP (iscntrl);
3556   else if (strcmp (name, "lower") == 0)
3557     BUILD_CHARCLASS_LOOP (islower);
3558   else if (strcmp (name, "space") == 0)
3559     BUILD_CHARCLASS_LOOP (isspace);
3560   else if (strcmp (name, "alpha") == 0)
3561     BUILD_CHARCLASS_LOOP (isalpha);
3562   else if (strcmp (name, "digit") == 0)
3563     BUILD_CHARCLASS_LOOP (isdigit);
3564   else if (strcmp (name, "print") == 0)
3565     BUILD_CHARCLASS_LOOP (isprint);
3566   else if (strcmp (name, "upper") == 0)
3567     BUILD_CHARCLASS_LOOP (isupper);
3568   else if (strcmp (name, "blank") == 0)
3569     BUILD_CHARCLASS_LOOP (isblank);
3570   else if (strcmp (name, "graph") == 0)
3571     BUILD_CHARCLASS_LOOP (isgraph);
3572   else if (strcmp (name, "punct") == 0)
3573     BUILD_CHARCLASS_LOOP (ispunct);
3574   else if (strcmp (name, "xdigit") == 0)
3575     BUILD_CHARCLASS_LOOP (isxdigit);
3576   else
3577     return REG_ECTYPE;
3578
3579   return REG_NOERROR;
3580 }
3581
3582 static bin_tree_t *
3583 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3584                     const unsigned char *class_name,
3585                     const unsigned char *extra, bool non_match,
3586                     reg_errcode_t *err)
3587 {
3588   re_bitset_ptr_t sbcset;
3589 #ifdef RE_ENABLE_I18N
3590   re_charset_t *mbcset;
3591   Idx alloc = 0;
3592 #endif /* not RE_ENABLE_I18N */
3593   reg_errcode_t ret;
3594   re_token_t br_token;
3595   bin_tree_t *tree;
3596
3597   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3598 #ifdef RE_ENABLE_I18N
3599   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3600 #endif /* RE_ENABLE_I18N */
3601
3602 #ifdef RE_ENABLE_I18N
3603   if (BE (sbcset == NULL || mbcset == NULL, 0))
3604 #else /* not RE_ENABLE_I18N */
3605   if (BE (sbcset == NULL, 0))
3606 #endif /* not RE_ENABLE_I18N */
3607     {
3608       *err = REG_ESPACE;
3609       return NULL;
3610     }
3611
3612   if (non_match)
3613     {
3614 #ifdef RE_ENABLE_I18N
3615       mbcset->non_match = 1;
3616 #endif /* not RE_ENABLE_I18N */
3617     }
3618
3619   /* We don't care the syntax in this case.  */
3620   ret = build_charclass (trans, sbcset,
3621 #ifdef RE_ENABLE_I18N
3622                          mbcset, &alloc,
3623 #endif /* RE_ENABLE_I18N */
3624                          class_name, 0);
3625
3626   if (BE (ret != REG_NOERROR, 0))
3627     {
3628       re_free (sbcset);
3629 #ifdef RE_ENABLE_I18N
3630       free_charset (mbcset);
3631 #endif /* RE_ENABLE_I18N */
3632       *err = ret;
3633       return NULL;
3634     }
3635   /* \w match '_' also.  */
3636   for (; *extra; extra++)
3637     bitset_set (sbcset, *extra);
3638
3639   /* If it is non-matching list.  */
3640   if (non_match)
3641     bitset_not (sbcset);
3642
3643 #ifdef RE_ENABLE_I18N
3644   /* Ensure only single byte characters are set.  */
3645   if (dfa->mb_cur_max > 1)
3646     bitset_mask (sbcset, dfa->sb_char);
3647 #endif
3648
3649   /* Build a tree for simple bracket.  */
3650   br_token.type = SIMPLE_BRACKET;
3651   br_token.opr.sbcset = sbcset;
3652   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3653   if (BE (tree == NULL, 0))
3654     goto build_word_op_espace;
3655
3656 #ifdef RE_ENABLE_I18N
3657   if (dfa->mb_cur_max > 1)
3658     {
3659       bin_tree_t *mbc_tree;
3660       /* Build a tree for complex bracket.  */
3661       br_token.type = COMPLEX_BRACKET;
3662       br_token.opr.mbcset = mbcset;
3663       dfa->has_mb_node = 1;
3664       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3665       if (BE (mbc_tree == NULL, 0))
3666         goto build_word_op_espace;
3667       /* Then join them by ALT node.  */
3668       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3669       if (BE (mbc_tree != NULL, 1))
3670         return tree;
3671     }
3672   else
3673     {
3674       free_charset (mbcset);
3675       return tree;
3676     }
3677 #else /* not RE_ENABLE_I18N */
3678   return tree;
3679 #endif /* not RE_ENABLE_I18N */
3680
3681  build_word_op_espace:
3682   re_free (sbcset);
3683 #ifdef RE_ENABLE_I18N
3684   free_charset (mbcset);
3685 #endif /* RE_ENABLE_I18N */
3686   *err = REG_ESPACE;
3687   return NULL;
3688 }
3689
3690 /* This is intended for the expressions like "a{1,3}".
3691    Fetch a number from `input', and return the number.
3692    Return REG_MISSING if the number field is empty like "{,1}".
3693    Return REG_ERROR if an error occurred.  */
3694
3695 static Idx
3696 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3697 {
3698   Idx num = REG_MISSING;
3699   unsigned char c;
3700   while (1)
3701     {
3702       fetch_token (token, input, syntax);
3703       c = token->opr.c;
3704       if (BE (token->type == END_OF_RE, 0))
3705         return REG_ERROR;
3706       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3707         break;
3708       num = ((token->type != CHARACTER || c < '0' || '9' < c
3709               || num == REG_ERROR)
3710              ? REG_ERROR
3711              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3712       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3713     }
3714   return num;
3715 }
3716 \f
3717 #ifdef RE_ENABLE_I18N
3718 static void
3719 free_charset (re_charset_t *cset)
3720 {
3721   re_free (cset->mbchars);
3722 # ifdef _LIBC
3723   re_free (cset->coll_syms);
3724   re_free (cset->equiv_classes);
3725   re_free (cset->range_starts);
3726   re_free (cset->range_ends);
3727 # endif
3728   re_free (cset->char_classes);
3729   re_free (cset);
3730 }
3731 #endif /* RE_ENABLE_I18N */
3732 \f
3733 /* Functions for binary tree operation.  */
3734
3735 /* Create a tree node.  */
3736
3737 static bin_tree_t *
3738 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3739              re_token_type_t type)
3740 {
3741   re_token_t t;
3742   t.type = type;
3743   return create_token_tree (dfa, left, right, &t);
3744 }
3745
3746 static bin_tree_t *
3747 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3748                    const re_token_t *token)
3749 {
3750   bin_tree_t *tree;
3751   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3752     {
3753       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3754
3755       if (storage == NULL)
3756         return NULL;
3757       storage->next = dfa->str_tree_storage;
3758       dfa->str_tree_storage = storage;
3759       dfa->str_tree_storage_idx = 0;
3760     }
3761   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3762
3763   tree->parent = NULL;
3764   tree->left = left;
3765   tree->right = right;
3766   tree->token = *token;
3767   tree->token.duplicated = 0;
3768   tree->token.opt_subexp = 0;
3769   tree->first = NULL;
3770   tree->next = NULL;
3771   tree->node_idx = REG_MISSING;
3772
3773   if (left != NULL)
3774     left->parent = tree;
3775   if (right != NULL)
3776     right->parent = tree;
3777   return tree;
3778 }
3779
3780 /* Mark the tree SRC as an optional subexpression.
3781    To be called from preorder or postorder.  */
3782
3783 static reg_errcode_t
3784 mark_opt_subexp (void *extra, bin_tree_t *node)
3785 {
3786   Idx idx = (Idx) (long) extra;
3787   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3788     node->token.opt_subexp = 1;
3789
3790   return REG_NOERROR;
3791 }
3792
3793 /* Free the allocated memory inside NODE. */
3794
3795 static void
3796 free_token (re_token_t *node)
3797 {
3798 #ifdef RE_ENABLE_I18N
3799   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3800     free_charset (node->opr.mbcset);
3801   else
3802 #endif /* RE_ENABLE_I18N */
3803     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3804       re_free (node->opr.sbcset);
3805 }
3806
3807 /* Worker function for tree walking.  Free the allocated memory inside NODE
3808    and its children. */
3809
3810 static reg_errcode_t
3811 free_tree (void *extra, bin_tree_t *node)
3812 {
3813   free_token (&node->token);
3814   return REG_NOERROR;
3815 }
3816
3817
3818 /* Duplicate the node SRC, and return new node.  This is a preorder
3819    visit similar to the one implemented by the generic visitor, but
3820    we need more infrastructure to maintain two parallel trees --- so,
3821    it's easier to duplicate.  */
3822
3823 static bin_tree_t *
3824 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3825 {
3826   const bin_tree_t *node;
3827   bin_tree_t *dup_root;
3828   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3829
3830   for (node = root; ; )
3831     {
3832       /* Create a new tree and link it back to the current parent.  */
3833       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3834       if (*p_new == NULL)
3835         return NULL;
3836       (*p_new)->parent = dup_node;
3837       (*p_new)->token.duplicated = 1;
3838       dup_node = *p_new;
3839
3840       /* Go to the left node, or up and to the right.  */
3841       if (node->left)
3842         {
3843           node = node->left;
3844           p_new = &dup_node->left;
3845         }
3846       else
3847         {
3848           const bin_tree_t *prev = NULL;
3849           while (node->right == prev || node->right == NULL)
3850             {
3851               prev = node;
3852               node = node->parent;
3853               dup_node = dup_node->parent;
3854               if (!node)
3855                 return dup_root;
3856             }
3857           node = node->right;
3858           p_new = &dup_node->right;
3859         }
3860     }
3861 }