lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (mbrtowc (&wc, (const char *) buf, p - buf,
 337                            &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           Idx i;
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 362               || cset->nranges || cset->nchar_classes)
 363             {
 364 # ifdef _LIBC
 365               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 366                 {
 367                   /* In this case we want to catch the bytes which are
 368                      the first byte of any collation elements.
 369                      e.g. In da_DK, we want to catch 'a' since "aa"
 370                           is a valid collation element, and don't catch
 371                           'b' since 'b' is the only collation element
 372                           which starts from 'b'.  */
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # else
 380               if (dfa->mb_cur_max > 1)
 381                 for (i = 0; i < SBC_MAX; ++i)
 382                   if (__btowc (i) == WEOF)
 383                     re_set_fastmap (fastmap, icase, i);
 384 # endif /* not _LIBC */
 385             }
 386           for (i = 0; i < cset->nmbchars; ++i)
 387             {
 388               char buf[256];
 389               mbstate_t state;
 390               memset (&state, '\0', sizeof (state));
 391               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 392                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 393               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 394                 {
 395                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 396                       != (size_t) -1)
 397                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 398                 }
 399             }
 400         }
 401 #endif /* RE_ENABLE_I18N */
 402       else if (type == OP_PERIOD
 403 #ifdef RE_ENABLE_I18N
 404                || type == OP_UTF8_PERIOD
 405 #endif /* RE_ENABLE_I18N */
 406                || type == END_OF_RE)
 407         {
 408           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 409           if (type == END_OF_RE)
 410             bufp->can_be_null = 1;
 411           return;
 412         }
 413     }
 414 }
 415 \f
 416 /* Entry point for POSIX code.  */
 417 /* regcomp takes a regular expression as a string and compiles it.
 418
 419    PREG is a regex_t *.  We do not expect any fields to be initialized,
 420    since POSIX says we shouldn't.  Thus, we set
 421
 422      `buffer' to the compiled pattern;
 423      `used' to the length of the compiled pattern;
 424      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 425        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 426        RE_SYNTAX_POSIX_BASIC;
 427      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 428      `fastmap' to an allocated space for the fastmap;
 429      `fastmap_accurate' to zero;
 430      `re_nsub' to the number of subexpressions in PATTERN.
 431
 432    PATTERN is the address of the pattern string.
 433
 434    CFLAGS is a series of bits which affect compilation.
 435
 436      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 437      use POSIX basic syntax.
 438
 439      If REG_NEWLINE is set, then . and [^...] don't match newline.
 440      Also, regexec will try a match beginning after every newline.
 441
 442      If REG_ICASE is set, then we considers upper- and lowercase
 443      versions of letters to be equivalent when matching.
 444
 445      If REG_NOSUB is set, then when PREG is passed to regexec, that
 446      routine will report only success or failure, and nothing about the
 447      registers.
 448
 449    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 450    the return codes and their meanings.)  */
 451
 452 int
 453 regcomp (preg, pattern, cflags)
 454     regex_t *__restrict preg;
 455     const char *__restrict pattern;
 456     int cflags;
 457 {
 458   reg_errcode_t ret;
 459   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 460                          : RE_SYNTAX_POSIX_BASIC);
 461
 462   preg->buffer = NULL;
 463   preg->allocated = 0;
 464   preg->used = 0;
 465
 466   /* Try to allocate space for the fastmap.  */
 467   preg->fastmap = re_malloc (char, SBC_MAX);
 468   if (BE (preg->fastmap == NULL, 0))
 469     return REG_ESPACE;
 470
 471   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 472
 473   /* If REG_NEWLINE is set, newlines are treated differently.  */
 474   if (cflags & REG_NEWLINE)
 475     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 476       syntax &= ~RE_DOT_NEWLINE;
 477       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 478       /* It also changes the matching behavior.  */
 479       preg->newline_anchor = 1;
 480     }
 481   else
 482     preg->newline_anchor = 0;
 483   preg->no_sub = !!(cflags & REG_NOSUB);
 484   preg->translate = NULL;
 485
 486   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 487
 488   /* POSIX doesn't distinguish between an unmatched open-group and an
 489      unmatched close-group: both are REG_EPAREN.  */
 490   if (ret == REG_ERPAREN)
 491     ret = REG_EPAREN;
 492
 493   /* We have already checked preg->fastmap != NULL.  */
 494   if (BE (ret == REG_NOERROR, 1))
 495     /* Compute the fastmap now, since regexec cannot modify the pattern
 496        buffer.  This function never fails in this implementation.  */
 497     (void) re_compile_fastmap (preg);
 498   else
 499     {
 500       /* Some error occurred while compiling the expression.  */
 501       re_free (preg->fastmap);
 502       preg->fastmap = NULL;
 503     }
 504
 505   return (int) ret;
 506 }
 507 #ifdef _LIBC
 508 weak_alias (__regcomp, regcomp)
 509 #endif
 510
 511 /* Returns a message corresponding to an error code, ERRCODE, returned
 512    from either regcomp or regexec.   We don't use PREG here.  */
 513
 514 #ifdef _LIBC
 515 size_t
 516 regerror (errcode, preg, errbuf, errbuf_size)
 517     int errcode;
 518     const regex_t *__restrict preg;
 519     char *__restrict errbuf;
 520     size_t errbuf_size;
 521 #else /* size_t might promote */
 522 size_t
 523 regerror (int errcode, const regex_t *__restrict preg,
 524           char *__restrict errbuf, size_t errbuf_size)
 525 #endif
 526 {
 527   const char *msg;
 528   size_t msg_size;
 529
 530   if (BE (errcode < 0
 531           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 532                                / sizeof (__re_error_msgid_idx[0])), 0))
 533     /* Only error codes returned by the rest of the code should be passed
 534        to this routine.  If we are given anything else, or if other regex
 535        code generates an invalid error code, then the program has a bug.
 536        Dump core so we can fix it.  */
 537     abort ();
 538
 539   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 540
 541   msg_size = strlen (msg) + 1; /* Includes the null.  */
 542
 543   if (BE (errbuf_size != 0, 1))
 544     {
 545       if (BE (msg_size > errbuf_size, 0))
 546         {
 547 #if defined HAVE_MEMPCPY || defined _LIBC
 548           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 549 #else
 550           memcpy (errbuf, msg, errbuf_size - 1);
 551           errbuf[errbuf_size - 1] = 0;
 552 #endif
 553         }
 554       else
 555         memcpy (errbuf, msg, msg_size);
 556     }
 557
 558   return msg_size;
 559 }
 560 #ifdef _LIBC
 561 weak_alias (__regerror, regerror)
 562 #endif
 563
 564
 565 #ifdef RE_ENABLE_I18N
 566 /* This static array is used for the map to single-byte characters when
 567    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 568    it the same all the time.  UTF-8 is the preferred encoding so this is
 569    a worthwhile optimization.  */
 570 static const bitset_t utf8_sb_map =
 571 {
 572   /* Set the first 128 bits.  */
 573 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 574 #  error "bitset_word_t is narrower than 32 bits"
 575 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 576   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 577 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 578   BITSET_WORD_MAX, BITSET_WORD_MAX,
 579 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 580   BITSET_WORD_MAX,
 581 # endif
 582   (BITSET_WORD_MAX
 583    >> (SBC_MAX % BITSET_WORD_BITS == 0
 584        ? 0
 585        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 586 };
 587 #endif
 588
 589
 590 static void
 591 free_dfa_content (re_dfa_t *dfa)
 592 {
 593   Idx i, j;
 594
 595   if (dfa->nodes)
 596     for (i = 0; i < dfa->nodes_len; ++i)
 597       free_token (dfa->nodes + i);
 598   re_free (dfa->nexts);
 599   for (i = 0; i < dfa->nodes_len; ++i)
 600     {
 601       if (dfa->eclosures != NULL)
 602         re_node_set_free (dfa->eclosures + i);
 603       if (dfa->inveclosures != NULL)
 604         re_node_set_free (dfa->inveclosures + i);
 605       if (dfa->edests != NULL)
 606         re_node_set_free (dfa->edests + i);
 607     }
 608   re_free (dfa->edests);
 609   re_free (dfa->eclosures);
 610   re_free (dfa->inveclosures);
 611   re_free (dfa->nodes);
 612
 613   if (dfa->state_table)
 614     for (i = 0; i <= dfa->state_hash_mask; ++i)
 615       {
 616         struct re_state_table_entry *entry = dfa->state_table + i;
 617         for (j = 0; j < entry->num; ++j)
 618           {
 619             re_dfastate_t *state = entry->array[j];
 620             free_state (state);
 621           }
 622         re_free (entry->array);
 623       }
 624   re_free (dfa->state_table);
 625 #ifdef RE_ENABLE_I18N
 626   if (dfa->sb_char != utf8_sb_map)
 627     re_free (dfa->sb_char);
 628 #endif
 629   re_free (dfa->subexp_map);
 630 #ifdef DEBUG
 631   re_free (dfa->re_str);
 632 #endif
 633
 634   re_free (dfa);
 635 }
 636
 637
 638 /* Free dynamically allocated space used by PREG.  */
 639
 640 void
 641 regfree (preg)
 642     regex_t *preg;
 643 {
 644   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 645   if (BE (dfa != NULL, 1))
 646     free_dfa_content (dfa);
 647   preg->buffer = NULL;
 648   preg->allocated = 0;
 649
 650   re_free (preg->fastmap);
 651   preg->fastmap = NULL;
 652
 653   re_free (preg->translate);
 654   preg->translate = NULL;
 655 }
 656 #ifdef _LIBC
 657 weak_alias (__regfree, regfree)
 658 #endif
 659 \f
 660 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 661    them unless specifically requested.  */
 662
 663 #if defined _REGEX_RE_COMP || defined _LIBC
 664
 665 /* BSD has one and only one pattern buffer.  */
 666 static struct re_pattern_buffer re_comp_buf;
 667
 668 char *
 669 # ifdef _LIBC
 670 /* Make these definitions weak in libc, so POSIX programs can redefine
 671    these names if they don't use our functions, and still use
 672    regcomp/regexec above without link errors.  */
 673 weak_function
 674 # endif
 675 re_comp (s)
 676      const char *s;
 677 {
 678   reg_errcode_t ret;
 679   char *fastmap;
 680
 681   if (!s)
 682     {
 683       if (!re_comp_buf.buffer)
 684         return gettext ("No previous regular expression");
 685       return 0;
 686     }
 687
 688   if (re_comp_buf.buffer)
 689     {
 690       fastmap = re_comp_buf.fastmap;
 691       re_comp_buf.fastmap = NULL;
 692       __regfree (&re_comp_buf);
 693       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 694       re_comp_buf.fastmap = fastmap;
 695     }
 696
 697   if (re_comp_buf.fastmap == NULL)
 698     {
 699       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 700       if (re_comp_buf.fastmap == NULL)
 701         return (char *) gettext (__re_error_msgid
 702                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 703     }
 704
 705   /* Since `re_exec' always passes NULL for the `regs' argument, we
 706      don't need to initialize the pattern buffer fields which affect it.  */
 707
 708   /* Match anchors at newlines.  */
 709   re_comp_buf.newline_anchor = 1;
 710
 711   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 712
 713   if (!ret)
 714     return NULL;
 715
 716   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 717   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 718 }
 719
 720 #ifdef _LIBC
 721 libc_freeres_fn (free_mem)
 722 {
 723   __regfree (&re_comp_buf);
 724 }
 725 #endif
 726
 727 #endif /* _REGEX_RE_COMP */
 728 \f
 729 /* Internal entry point.
 730    Compile the regular expression PATTERN, whose length is LENGTH.
 731    SYNTAX indicate regular expression's syntax.  */
 732
 733 static reg_errcode_t
 734 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 735                      reg_syntax_t syntax)
 736 {
 737   reg_errcode_t err = REG_NOERROR;
 738   re_dfa_t *dfa;
 739   re_string_t regexp;
 740
 741   /* Initialize the pattern buffer.  */
 742   preg->fastmap_accurate = 0;
 743   preg->syntax = syntax;
 744   preg->not_bol = preg->not_eol = 0;
 745   preg->used = 0;
 746   preg->re_nsub = 0;
 747   preg->can_be_null = 0;
 748   preg->regs_allocated = REGS_UNALLOCATED;
 749
 750   /* Initialize the dfa.  */
 751   dfa = (re_dfa_t *) preg->buffer;
 752   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 753     {
 754       /* If zero allocated, but buffer is non-null, try to realloc
 755          enough space.  This loses if buffer's address is bogus, but
 756          that is the user's responsibility.  If ->buffer is NULL this
 757          is a simple allocation.  */
 758       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 759       if (dfa == NULL)
 760         return REG_ESPACE;
 761       preg->allocated = sizeof (re_dfa_t);
 762       preg->buffer = (unsigned char *) dfa;
 763     }
 764   preg->used = sizeof (re_dfa_t);
 765
 766   err = init_dfa (dfa, length);
 767   if (BE (err != REG_NOERROR, 0))
 768     {
 769       free_dfa_content (dfa);
 770       preg->buffer = NULL;
 771       preg->allocated = 0;
 772       return err;
 773     }
 774 #ifdef DEBUG
 775   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 776   dfa->re_str = re_malloc (char, length + 1);
 777   strncpy (dfa->re_str, pattern, length + 1);
 778 #endif
 779
 780   __libc_lock_init (dfa->lock);
 781
 782   err = re_string_construct (&regexp, pattern, length, preg->translate,
 783                              syntax & RE_ICASE, dfa);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786     re_compile_internal_free_return:
 787       free_workarea_compile (preg);
 788       re_string_destruct (&regexp);
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792       return err;
 793     }
 794
 795   /* Parse the regular expression, and build a structure tree.  */
 796   preg->re_nsub = 0;
 797   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 798   if (BE (dfa->str_tree == NULL, 0))
 799     goto re_compile_internal_free_return;
 800
 801   /* Analyze the tree and create the nfa.  */
 802   err = analyze (preg);
 803   if (BE (err != REG_NOERROR, 0))
 804     goto re_compile_internal_free_return;
 805
 806 #ifdef RE_ENABLE_I18N
 807   /* If possible, do searching in single byte encoding to speed things up.  */
 808   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 809     optimize_utf8 (dfa);
 810 #endif
 811
 812   /* Then create the initial state of the dfa.  */
 813   err = create_initial_state (dfa);
 814
 815   /* Release work areas.  */
 816   free_workarea_compile (preg);
 817   re_string_destruct (&regexp);
 818
 819   if (BE (err != REG_NOERROR, 0))
 820     {
 821       free_dfa_content (dfa);
 822       preg->buffer = NULL;
 823       preg->allocated = 0;
 824     }
 825
 826   return err;
 827 }
 828
 829 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 830    as the initial length of some arrays.  */
 831
 832 static reg_errcode_t
 833 init_dfa (re_dfa_t *dfa, size_t pat_len)
 834 {
 835   __re_size_t table_size;
 836 #ifndef _LIBC
 837   char *codeset_name;
 838 #endif
 839   size_t max_object_size =
 840     MAX (sizeof (struct re_state_table_entry),
 841          MAX (sizeof (re_token_t),
 842               MAX (sizeof (re_node_set),
 843                    MAX (sizeof (regmatch_t),
 844                         MAX (sizeof (regoff_t),
 845                              MAX (sizeof (wchar_t),
 846                                   MAX (sizeof (wctype_t),
 847                                        sizeof (Idx))))))));
 848
 849   memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851   /* Force allocation of str_tree_storage the first time.  */
 852   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 855      calculation below, and for similar doubling calculations
 856      elsewhere.  And it's <= rather than <, because some of the
 857      doubling calculations add 1 afterwards.  */
 858   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 859     return REG_ESPACE;
 860
 861   dfa->nodes_alloc = pat_len + 1;
 862   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 863
 864   /*  table_size = 2 ^ ceil(log pat_len) */
 865   for (table_size = 1; ; table_size <<= 1)
 866     if (table_size > pat_len)
 867       break;
 868
 869   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 870   dfa->state_hash_mask = table_size - 1;
 871
 872   dfa->mb_cur_max = MB_CUR_MAX;
 873 #ifdef _LIBC
 874   if (dfa->mb_cur_max == 6
 875       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 876     dfa->is_utf8 = 1;
 877   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 878                        != 0);
 879 #else
 880 # ifdef HAVE_LANGINFO_CODESET
 881   codeset_name = nl_langinfo (CODESET);
 882 # else
 883   codeset_name = getenv ("LC_ALL");
 884   if (codeset_name == NULL || codeset_name[0] == '\0')
 885     codeset_name = getenv ("LC_CTYPE");
 886   if (codeset_name == NULL || codeset_name[0] == '\0')
 887     codeset_name = getenv ("LANG");
 888   if (codeset_name == NULL)
 889     codeset_name = "";
 890   else if (strchr (codeset_name, '.') !=  NULL)
 891     codeset_name = strchr (codeset_name, '.') + 1;
 892 # endif
 893
 894   if (strcasecmp (codeset_name, "UTF-8") == 0
 895       || strcasecmp (codeset_name, "UTF8") == 0)
 896     dfa->is_utf8 = 1;
 897
 898   /* We check exhaustively in the loop below if this charset is a
 899      superset of ASCII.  */
 900   dfa->map_notascii = 0;
 901 #endif
 902
 903 #ifdef RE_ENABLE_I18N
 904   if (dfa->mb_cur_max > 1)
 905     {
 906       if (dfa->is_utf8)
 907         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 908       else
 909         {
 910           int i, j, ch;
 911
 912           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 913           if (BE (dfa->sb_char == NULL, 0))
 914             return REG_ESPACE;
 915
 916           /* Set the bits corresponding to single byte chars.  */
 917           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 918             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 919               {
 920                 wint_t wch = __btowc (ch);
 921                 if (wch != WEOF)
 922                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 923 # ifndef _LIBC
 924                 if (isascii (ch) && wch != ch)
 925                   dfa->map_notascii = 1;
 926 # endif
 927               }
 928         }
 929     }
 930 #endif
 931
 932   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 933     return REG_ESPACE;
 934   return REG_NOERROR;
 935 }
 936
 937 /* Initialize WORD_CHAR table, which indicate which character is
 938    "word".  In this case "word" means that it is the word construction
 939    character used by some operators like "\<", "\>", etc.  */
 940
 941 static void
 942 internal_function
 943 init_word_char (re_dfa_t *dfa)
 944 {
 945   int i, j, ch;
 946   dfa->word_ops_used = 1;
 947   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 948     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 949       if (isalnum (ch) || ch == '_')
 950         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 951 }
 952
 953 /* Free the work area which are only used while compiling.  */
 954
 955 static void
 956 free_workarea_compile (regex_t *preg)
 957 {
 958   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 959   bin_tree_storage_t *storage, *next;
 960   for (storage = dfa->str_tree_storage; storage; storage = next)
 961     {
 962       next = storage->next;
 963       re_free (storage);
 964     }
 965   dfa->str_tree_storage = NULL;
 966   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 967   dfa->str_tree = NULL;
 968   re_free (dfa->org_indices);
 969   dfa->org_indices = NULL;
 970 }
 971
 972 /* Create initial states for all contexts.  */
 973
 974 static reg_errcode_t
 975 create_initial_state (re_dfa_t *dfa)
 976 {
 977   Idx first, i;
 978   reg_errcode_t err;
 979   re_node_set init_nodes;
 980
 981   /* Initial states have the epsilon closure of the node which is
 982      the first node of the regular expression.  */
 983   first = dfa->str_tree->first->node_idx;
 984   dfa->init_node = first;
 985   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 986   if (BE (err != REG_NOERROR, 0))
 987     return err;
 988
 989   /* The back-references which are in initial states can epsilon transit,
 990      since in this case all of the subexpressions can be null.
 991      Then we add epsilon closures of the nodes which are the next nodes of
 992      the back-references.  */
 993   if (dfa->nbackref > 0)
 994     for (i = 0; i < init_nodes.nelem; ++i)
 995       {
 996         Idx node_idx = init_nodes.elems[i];
 997         re_token_type_t type = dfa->nodes[node_idx].type;
 998
 999         Idx clexp_idx;
1000         if (type != OP_BACK_REF)
1001           continue;
1002         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1003           {
1004             re_token_t *clexp_node;
1005             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1006             if (clexp_node->type == OP_CLOSE_SUBEXP
1007                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1008               break;
1009           }
1010         if (clexp_idx == init_nodes.nelem)
1011           continue;
1012
1013         if (type == OP_BACK_REF)
1014           {
1015             Idx dest_idx = dfa->edests[node_idx].elems[0];
1016             if (!re_node_set_contains (&init_nodes, dest_idx))
1017               {
1018                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1019                 i = 0;
1020               }
1021           }
1022       }
1023
1024   /* It must be the first time to invoke acquire_state.  */
1025   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1026   /* We don't check ERR here, since the initial state must not be NULL.  */
1027   if (BE (dfa->init_state == NULL, 0))
1028     return err;
1029   if (dfa->init_state->has_constraint)
1030     {
1031       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1032                                                        CONTEXT_WORD);
1033       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1034                                                      CONTEXT_NEWLINE);
1035       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1036                                                          &init_nodes,
1037                                                          CONTEXT_NEWLINE
1038                                                          | CONTEXT_BEGBUF);
1039       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1040               || dfa->init_state_begbuf == NULL, 0))
1041         return err;
1042     }
1043   else
1044     dfa->init_state_word = dfa->init_state_nl
1045       = dfa->init_state_begbuf = dfa->init_state;
1046
1047   re_node_set_free (&init_nodes);
1048   return REG_NOERROR;
1049 }
1050 \f
1051 #ifdef RE_ENABLE_I18N
1052 /* If it is possible to do searching in single byte encoding instead of UTF-8
1053    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1054    DFA nodes where needed.  */
1055
1056 static void
1057 optimize_utf8 (re_dfa_t *dfa)
1058 {
1059   Idx node;
1060   int i;
1061   bool mb_chars = false;
1062   bool has_period = false;
1063
1064   for (node = 0; node < dfa->nodes_len; ++node)
1065     switch (dfa->nodes[node].type)
1066       {
1067       case CHARACTER:
1068         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1069           mb_chars = true;
1070         break;
1071       case ANCHOR:
1072         switch (dfa->nodes[node].opr.idx)
1073           {
1074           case LINE_FIRST:
1075           case LINE_LAST:
1076           case BUF_FIRST:
1077           case BUF_LAST:
1078             break;
1079           default:
1080             /* Word anchors etc. cannot be handled.  */
1081             return;
1082           }
1083         break;
1084       case OP_PERIOD:
1085         has_period = true;
1086         break;
1087       case OP_BACK_REF:
1088       case OP_ALT:
1089       case END_OF_RE:
1090       case OP_DUP_ASTERISK:
1091       case OP_OPEN_SUBEXP:
1092       case OP_CLOSE_SUBEXP:
1093         break;
1094       case COMPLEX_BRACKET:
1095         return;
1096       case SIMPLE_BRACKET:
1097         /* Just double check.  */
1098         {
1099           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1100                         ? 0
1101                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1102           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1103             {
1104               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1105                 return;
1106               rshift = 0;
1107             }
1108         }
1109         break;
1110       default:
1111         abort ();
1112       }
1113
1114   if (mb_chars || has_period)
1115     for (node = 0; node < dfa->nodes_len; ++node)
1116       {
1117         if (dfa->nodes[node].type == CHARACTER
1118             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1119           dfa->nodes[node].mb_partial = 0;
1120         else if (dfa->nodes[node].type == OP_PERIOD)
1121           dfa->nodes[node].type = OP_UTF8_PERIOD;
1122       }
1123
1124   /* The search can be in single byte locale.  */
1125   dfa->mb_cur_max = 1;
1126   dfa->is_utf8 = 0;
1127   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1128 }
1129 #endif
1130 \f
1131 /* Analyze the structure tree, and calculate "first", "next", "edest",
1132    "eclosure", and "inveclosure".  */
1133
1134 static reg_errcode_t
1135 analyze (regex_t *preg)
1136 {
1137   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1138   reg_errcode_t ret;
1139
1140   /* Allocate arrays.  */
1141   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1142   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1143   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1144   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1145   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1146           || dfa->eclosures == NULL, 0))
1147     return REG_ESPACE;
1148
1149   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1150   if (dfa->subexp_map != NULL)
1151     {
1152       Idx i;
1153       for (i = 0; i < preg->re_nsub; i++)
1154         dfa->subexp_map[i] = i;
1155       preorder (dfa->str_tree, optimize_subexps, dfa);
1156       for (i = 0; i < preg->re_nsub; i++)
1157         if (dfa->subexp_map[i] != i)
1158           break;
1159       if (i == preg->re_nsub)
1160         {
1161           free (dfa->subexp_map);
1162           dfa->subexp_map = NULL;
1163         }
1164     }
1165
1166   ret = postorder (dfa->str_tree, lower_subexps, preg);
1167   if (BE (ret != REG_NOERROR, 0))
1168     return ret;
1169   ret = postorder (dfa->str_tree, calc_first, dfa);
1170   if (BE (ret != REG_NOERROR, 0))
1171     return ret;
1172   preorder (dfa->str_tree, calc_next, dfa);
1173   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1174   if (BE (ret != REG_NOERROR, 0))
1175     return ret;
1176   ret = calc_eclosure (dfa);
1177   if (BE (ret != REG_NOERROR, 0))
1178     return ret;
1179
1180   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1181      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1182   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1183       || dfa->nbackref)
1184     {
1185       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1186       if (BE (dfa->inveclosures == NULL, 0))
1187         return REG_ESPACE;
1188       ret = calc_inveclosure (dfa);
1189     }
1190
1191   return ret;
1192 }
1193
1194 /* Our parse trees are very unbalanced, so we cannot use a stack to
1195    implement parse tree visits.  Instead, we use parent pointers and
1196    some hairy code in these two functions.  */
1197 static reg_errcode_t
1198 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1199            void *extra)
1200 {
1201   bin_tree_t *node, *prev;
1202
1203   for (node = root; ; )
1204     {
1205       /* Descend down the tree, preferably to the left (or to the right
1206          if that's the only child).  */
1207       while (node->left || node->right)
1208         if (node->left)
1209           node = node->left;
1210         else
1211           node = node->right;
1212
1213       do
1214         {
1215           reg_errcode_t err = fn (extra, node);
1216           if (BE (err != REG_NOERROR, 0))
1217             return err;
1218           if (node->parent == NULL)
1219             return REG_NOERROR;
1220           prev = node;
1221           node = node->parent;
1222         }
1223       /* Go up while we have a node that is reached from the right.  */
1224       while (node->right == prev || node->right == NULL);
1225       node = node->right;
1226     }
1227 }
1228
1229 static reg_errcode_t
1230 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1231           void *extra)
1232 {
1233   bin_tree_t *node;
1234
1235   for (node = root; ; )
1236     {
1237       reg_errcode_t err = fn (extra, node);
1238       if (BE (err != REG_NOERROR, 0))
1239         return err;
1240
1241       /* Go to the left node, or up and to the right.  */
1242       if (node->left)
1243         node = node->left;
1244       else
1245         {
1246           bin_tree_t *prev = NULL;
1247           while (node->right == prev || node->right == NULL)
1248             {
1249               prev = node;
1250               node = node->parent;
1251               if (!node)
1252                 return REG_NOERROR;
1253             }
1254           node = node->right;
1255         }
1256     }
1257 }
1258
1259 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1260    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1261    backreferences as well.  Requires a preorder visit.  */
1262 static reg_errcode_t
1263 optimize_subexps (void *extra, bin_tree_t *node)
1264 {
1265   re_dfa_t *dfa = (re_dfa_t *) extra;
1266
1267   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1268     {
1269       int idx = node->token.opr.idx;
1270       node->token.opr.idx = dfa->subexp_map[idx];
1271       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1272     }
1273
1274   else if (node->token.type == SUBEXP
1275            && node->left && node->left->token.type == SUBEXP)
1276     {
1277       Idx other_idx = node->left->token.opr.idx;
1278
1279       node->left = node->left->left;
1280       if (node->left)
1281         node->left->parent = node;
1282
1283       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1284       if (other_idx < BITSET_WORD_BITS)
1285         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1286     }
1287
1288   return REG_NOERROR;
1289 }
1290
1291 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1292    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1293 static reg_errcode_t
1294 lower_subexps (void *extra, bin_tree_t *node)
1295 {
1296   regex_t *preg = (regex_t *) extra;
1297   reg_errcode_t err = REG_NOERROR;
1298
1299   if (node->left && node->left->token.type == SUBEXP)
1300     {
1301       node->left = lower_subexp (&err, preg, node->left);
1302       if (node->left)
1303         node->left->parent = node;
1304     }
1305   if (node->right && node->right->token.type == SUBEXP)
1306     {
1307       node->right = lower_subexp (&err, preg, node->right);
1308       if (node->right)
1309         node->right->parent = node;
1310     }
1311
1312   return err;
1313 }
1314
1315 static bin_tree_t *
1316 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1317 {
1318   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1319   bin_tree_t *body = node->left;
1320   bin_tree_t *op, *cls, *tree1, *tree;
1321
1322   if (preg->no_sub
1323       /* We do not optimize empty subexpressions, because otherwise we may
1324          have bad CONCAT nodes with NULL children.  This is obviously not
1325          very common, so we do not lose much.  An example that triggers
1326          this case is the sed "script" /\(\)/x.  */
1327       && node->left != NULL
1328       && (node->token.opr.idx >= BITSET_WORD_BITS
1329           || !(dfa->used_bkref_map
1330                & ((bitset_word_t) 1 << node->token.opr.idx))))
1331     return node->left;
1332
1333   /* Convert the SUBEXP node to the concatenation of an
1334      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1335   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1336   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1337   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1338   tree = create_tree (dfa, op, tree1, CONCAT);
1339   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1340     {
1341       *err = REG_ESPACE;
1342       return NULL;
1343     }
1344
1345   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1346   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1347   return tree;
1348 }
1349
1350 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1351    nodes.  Requires a postorder visit.  */
1352 static reg_errcode_t
1353 calc_first (void *extra, bin_tree_t *node)
1354 {
1355   re_dfa_t *dfa = (re_dfa_t *) extra;
1356   if (node->token.type == CONCAT)
1357     {
1358       node->first = node->left->first;
1359       node->node_idx = node->left->node_idx;
1360     }
1361   else
1362     {
1363       node->first = node;
1364       node->node_idx = re_dfa_add_node (dfa, node->token);
1365       if (BE (node->node_idx == REG_MISSING, 0))
1366         return REG_ESPACE;
1367     }
1368   return REG_NOERROR;
1369 }
1370
1371 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1372 static reg_errcode_t
1373 calc_next (void *extra, bin_tree_t *node)
1374 {
1375   switch (node->token.type)
1376     {
1377     case OP_DUP_ASTERISK:
1378       node->left->next = node;
1379       break;
1380     case CONCAT:
1381       node->left->next = node->right->first;
1382       node->right->next = node->next;
1383       break;
1384     default:
1385       if (node->left)
1386         node->left->next = node->next;
1387       if (node->right)
1388         node->right->next = node->next;
1389       break;
1390     }
1391   return REG_NOERROR;
1392 }
1393
1394 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1395 static reg_errcode_t
1396 link_nfa_nodes (void *extra, bin_tree_t *node)
1397 {
1398   re_dfa_t *dfa = (re_dfa_t *) extra;
1399   Idx idx = node->node_idx;
1400   reg_errcode_t err = REG_NOERROR;
1401
1402   switch (node->token.type)
1403     {
1404     case CONCAT:
1405       break;
1406
1407     case END_OF_RE:
1408       assert (node->next == NULL);
1409       break;
1410
1411     case OP_DUP_ASTERISK:
1412     case OP_ALT:
1413       {
1414         Idx left, right;
1415         dfa->has_plural_match = 1;
1416         if (node->left != NULL)
1417           left = node->left->first->node_idx;
1418         else
1419           left = node->next->node_idx;
1420         if (node->right != NULL)
1421           right = node->right->first->node_idx;
1422         else
1423           right = node->next->node_idx;
1424         assert (REG_VALID_INDEX (left));
1425         assert (REG_VALID_INDEX (right));
1426         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1427       }
1428       break;
1429
1430     case ANCHOR:
1431     case OP_OPEN_SUBEXP:
1432     case OP_CLOSE_SUBEXP:
1433       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1434       break;
1435
1436     case OP_BACK_REF:
1437       dfa->nexts[idx] = node->next->node_idx;
1438       if (node->token.type == OP_BACK_REF)
1439         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1440       break;
1441
1442     default:
1443       assert (!IS_EPSILON_NODE (node->token.type));
1444       dfa->nexts[idx] = node->next->node_idx;
1445       break;
1446     }
1447
1448   return err;
1449 }
1450
1451 /* Duplicate the epsilon closure of the node ROOT_NODE.
1452    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1453    to their own constraint.  */
1454
1455 static reg_errcode_t
1456 internal_function
1457 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1458                         Idx root_node, unsigned int init_constraint)
1459 {
1460   Idx org_node, clone_node;
1461   bool ok;
1462   unsigned int constraint = init_constraint;
1463   for (org_node = top_org_node, clone_node = top_clone_node;;)
1464     {
1465       Idx org_dest, clone_dest;
1466       if (dfa->nodes[org_node].type == OP_BACK_REF)
1467         {
1468           /* If the back reference epsilon-transit, its destination must
1469              also have the constraint.  Then duplicate the epsilon closure
1470              of the destination of the back reference, and store it in
1471              edests of the back reference.  */
1472           org_dest = dfa->nexts[org_node];
1473           re_node_set_empty (dfa->edests + clone_node);
1474           clone_dest = duplicate_node (dfa, org_dest, constraint);
1475           if (BE (clone_dest == REG_MISSING, 0))
1476             return REG_ESPACE;
1477           dfa->nexts[clone_node] = dfa->nexts[org_node];
1478           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1479           if (BE (! ok, 0))
1480             return REG_ESPACE;
1481         }
1482       else if (dfa->edests[org_node].nelem == 0)
1483         {
1484           /* In case of the node can't epsilon-transit, don't duplicate the
1485              destination and store the original destination as the
1486              destination of the node.  */
1487           dfa->nexts[clone_node] = dfa->nexts[org_node];
1488           break;
1489         }
1490       else if (dfa->edests[org_node].nelem == 1)
1491         {
1492           /* In case of the node can epsilon-transit, and it has only one
1493              destination.  */
1494           org_dest = dfa->edests[org_node].elems[0];
1495           re_node_set_empty (dfa->edests + clone_node);
1496           if (dfa->nodes[org_node].type == ANCHOR)
1497             {
1498               /* In case of the node has another constraint, append it.  */
1499               if (org_node == root_node && clone_node != org_node)
1500                 {
1501                   /* ...but if the node is root_node itself, it means the
1502                      epsilon closure have a loop, then tie it to the
1503                      destination of the root_node.  */
1504                   ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1505                   if (BE (! ok, 0))
1506                     return REG_ESPACE;
1507                   break;
1508                 }
1509               constraint |= dfa->nodes[org_node].opr.ctx_type;
1510             }
1511           clone_dest = duplicate_node (dfa, org_dest, constraint);
1512           if (BE (clone_dest == REG_MISSING, 0))
1513             return REG_ESPACE;
1514           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1515           if (BE (! ok, 0))
1516             return REG_ESPACE;
1517         }
1518       else /* dfa->edests[org_node].nelem == 2 */
1519         {
1520           /* In case of the node can epsilon-transit, and it has two
1521              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1522           org_dest = dfa->edests[org_node].elems[0];
1523           re_node_set_empty (dfa->edests + clone_node);
1524           /* Search for a duplicated node which satisfies the constraint.  */
1525           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1526           if (clone_dest == REG_MISSING)
1527             {
1528               /* There are no such a duplicated node, create a new one.  */
1529               reg_errcode_t err;
1530               clone_dest = duplicate_node (dfa, org_dest, constraint);
1531               if (BE (clone_dest == REG_MISSING, 0))
1532                 return REG_ESPACE;
1533               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534               if (BE (! ok, 0))
1535                 return REG_ESPACE;
1536               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1537                                             root_node, constraint);
1538               if (BE (err != REG_NOERROR, 0))
1539                 return err;
1540             }
1541           else
1542             {
1543               /* There are a duplicated node which satisfy the constraint,
1544                  use it to avoid infinite loop.  */
1545               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546               if (BE (! ok, 0))
1547                 return REG_ESPACE;
1548             }
1549
1550           org_dest = dfa->edests[org_node].elems[1];
1551           clone_dest = duplicate_node (dfa, org_dest, constraint);
1552           if (BE (clone_dest == REG_MISSING, 0))
1553             return REG_ESPACE;
1554           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555           if (BE (! ok, 0))
1556             return REG_ESPACE;
1557         }
1558       org_node = org_dest;
1559       clone_node = clone_dest;
1560     }
1561   return REG_NOERROR;
1562 }
1563
1564 /* Search for a node which is duplicated from the node ORG_NODE, and
1565    satisfies the constraint CONSTRAINT.  */
1566
1567 static Idx
1568 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1569                         unsigned int constraint)
1570 {
1571   Idx idx;
1572   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1573     {
1574       if (org_node == dfa->org_indices[idx]
1575           && constraint == dfa->nodes[idx].constraint)
1576         return idx; /* Found.  */
1577     }
1578   return REG_MISSING; /* Not found.  */
1579 }
1580
1581 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1582    Return the index of the new node, or REG_MISSING if insufficient storage is
1583    available.  */
1584
1585 static Idx
1586 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1587 {
1588   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1589   if (BE (dup_idx != REG_MISSING, 1))
1590     {
1591       dfa->nodes[dup_idx].constraint = constraint;
1592       if (dfa->nodes[org_idx].type == ANCHOR)
1593         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1594       dfa->nodes[dup_idx].duplicated = 1;
1595
1596       /* Store the index of the original node.  */
1597       dfa->org_indices[dup_idx] = org_idx;
1598     }
1599   return dup_idx;
1600 }
1601
1602 static reg_errcode_t
1603 calc_inveclosure (re_dfa_t *dfa)
1604 {
1605   Idx src, idx;
1606   bool ok;
1607   for (idx = 0; idx < dfa->nodes_len; ++idx)
1608     re_node_set_init_empty (dfa->inveclosures + idx);
1609
1610   for (src = 0; src < dfa->nodes_len; ++src)
1611     {
1612       Idx *elems = dfa->eclosures[src].elems;
1613       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1614         {
1615           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1616           if (BE (! ok, 0))
1617             return REG_ESPACE;
1618         }
1619     }
1620
1621   return REG_NOERROR;
1622 }
1623
1624 /* Calculate "eclosure" for all the node in DFA.  */
1625
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1628 {
1629   Idx node_idx;
1630   bool incomplete;
1631 #ifdef DEBUG
1632   assert (dfa->nodes_len > 0);
1633 #endif
1634   incomplete = false;
1635   /* For each nodes, calculate epsilon closure.  */
1636   for (node_idx = 0; ; ++node_idx)
1637     {
1638       reg_errcode_t err;
1639       re_node_set eclosure_elem;
1640       if (node_idx == dfa->nodes_len)
1641         {
1642           if (!incomplete)
1643             break;
1644           incomplete = false;
1645           node_idx = 0;
1646         }
1647
1648 #ifdef DEBUG
1649       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1650 #endif
1651
1652       /* If we have already calculated, skip it.  */
1653       if (dfa->eclosures[node_idx].nelem != 0)
1654         continue;
1655       /* Calculate epsilon closure of `node_idx'.  */
1656       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1657       if (BE (err != REG_NOERROR, 0))
1658         return err;
1659
1660       if (dfa->eclosures[node_idx].nelem == 0)
1661         {
1662           incomplete = true;
1663           re_node_set_free (&eclosure_elem);
1664         }
1665     }
1666   return REG_NOERROR;
1667 }
1668
1669 /* Calculate epsilon closure of NODE.  */
1670
1671 static reg_errcode_t
1672 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1673 {
1674   reg_errcode_t err;
1675   unsigned int constraint;
1676   Idx i;
1677   bool incomplete;
1678   bool ok;
1679   re_node_set eclosure;
1680   incomplete = false;
1681   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682   if (BE (err != REG_NOERROR, 0))
1683     return err;
1684
1685   /* This indicates that we are calculating this node now.
1686      We reference this value to avoid infinite loop.  */
1687   dfa->eclosures[node].nelem = REG_MISSING;
1688
1689   constraint = ((dfa->nodes[node].type == ANCHOR)
1690                 ? dfa->nodes[node].opr.ctx_type : 0);
1691   /* If the current node has constraints, duplicate all nodes.
1692      Since they must inherit the constraints.  */
1693   if (constraint
1694       && dfa->edests[node].nelem
1695       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1696     {
1697       err = duplicate_node_closure (dfa, node, node, node, constraint);
1698       if (BE (err != REG_NOERROR, 0))
1699         return err;
1700     }
1701
1702   /* Expand each epsilon destination nodes.  */
1703   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1704     for (i = 0; i < dfa->edests[node].nelem; ++i)
1705       {
1706         re_node_set eclosure_elem;
1707         Idx edest = dfa->edests[node].elems[i];
1708         /* If calculating the epsilon closure of `edest' is in progress,
1709            return intermediate result.  */
1710         if (dfa->eclosures[edest].nelem == REG_MISSING)
1711           {
1712             incomplete = true;
1713             continue;
1714           }
1715         /* If we haven't calculated the epsilon closure of `edest' yet,
1716            calculate now. Otherwise use calculated epsilon closure.  */
1717         if (dfa->eclosures[edest].nelem == 0)
1718           {
1719             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1720             if (BE (err != REG_NOERROR, 0))
1721               return err;
1722           }
1723         else
1724           eclosure_elem = dfa->eclosures[edest];
1725         /* Merge the epsilon closure of `edest'.  */
1726         re_node_set_merge (&eclosure, &eclosure_elem);
1727         /* If the epsilon closure of `edest' is incomplete,
1728            the epsilon closure of this node is also incomplete.  */
1729         if (dfa->eclosures[edest].nelem == 0)
1730           {
1731             incomplete = true;
1732             re_node_set_free (&eclosure_elem);
1733           }
1734       }
1735
1736   /* Epsilon closures include itself.  */
1737   ok = re_node_set_insert (&eclosure, node);
1738   if (BE (! ok, 0))
1739     return REG_ESPACE;
1740   if (incomplete && !root)
1741     dfa->eclosures[node].nelem = 0;
1742   else
1743     dfa->eclosures[node] = eclosure;
1744   *new_set = eclosure;
1745   return REG_NOERROR;
1746 }
1747 \f
1748 /* Functions for token which are used in the parser.  */
1749
1750 /* Fetch a token from INPUT.
1751    We must not use this function inside bracket expressions.  */
1752
1753 static void
1754 internal_function
1755 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1756 {
1757   re_string_skip_bytes (input, peek_token (result, input, syntax));
1758 }
1759
1760 /* Peek a token from INPUT, and return the length of the token.
1761    We must not use this function inside bracket expressions.  */
1762
1763 static int
1764 internal_function
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1766 {
1767   unsigned char c;
1768
1769   if (re_string_eoi (input))
1770     {
1771       token->type = END_OF_RE;
1772       return 0;
1773     }
1774
1775   c = re_string_peek_byte (input, 0);
1776   token->opr.c = c;
1777
1778   token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780   token->mb_partial = 0;
1781   if (input->mb_cur_max > 1 &&
1782       !re_string_first_byte (input, re_string_cur_idx (input)))
1783     {
1784       token->type = CHARACTER;
1785       token->mb_partial = 1;
1786       return 1;
1787     }
1788 #endif
1789   if (c == '\\')
1790     {
1791       unsigned char c2;
1792       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793         {
1794           token->type = BACK_SLASH;
1795           return 1;
1796         }
1797
1798       c2 = re_string_peek_byte_case (input, 1);
1799       token->opr.c = c2;
1800       token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802       if (input->mb_cur_max > 1)
1803         {
1804           wint_t wc = re_string_wchar_at (input,
1805                                           re_string_cur_idx (input) + 1);
1806           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1807         }
1808       else
1809 #endif
1810         token->word_char = IS_WORD_CHAR (c2) != 0;
1811
1812       switch (c2)
1813         {
1814         case '|':
1815           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816             token->type = OP_ALT;
1817           break;
1818         case '1': case '2': case '3': case '4': case '5':
1819         case '6': case '7': case '8': case '9':
1820           if (!(syntax & RE_NO_BK_REFS))
1821             {
1822               token->type = OP_BACK_REF;
1823               token->opr.idx = c2 - '1';
1824             }
1825           break;
1826         case '<':
1827           if (!(syntax & RE_NO_GNU_OPS))
1828             {
1829               token->type = ANCHOR;
1830               token->opr.ctx_type = WORD_FIRST;
1831             }
1832           break;
1833         case '>':
1834           if (!(syntax & RE_NO_GNU_OPS))
1835             {
1836               token->type = ANCHOR;
1837               token->opr.ctx_type = WORD_LAST;
1838             }
1839           break;
1840         case 'b':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             {
1843               token->type = ANCHOR;
1844               token->opr.ctx_type = WORD_DELIM;
1845             }
1846           break;
1847         case 'B':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = NOT_WORD_DELIM;
1852             }
1853           break;
1854         case 'w':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             token->type = OP_WORD;
1857           break;
1858         case 'W':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_NOTWORD;
1861           break;
1862         case 's':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_SPACE;
1865           break;
1866         case 'S':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTSPACE;
1869           break;
1870         case '`':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             {
1873               token->type = ANCHOR;
1874               token->opr.ctx_type = BUF_FIRST;
1875             }
1876           break;
1877         case '\'':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = BUF_LAST;
1882             }
1883           break;
1884         case '(':
1885           if (!(syntax & RE_NO_BK_PARENS))
1886             token->type = OP_OPEN_SUBEXP;
1887           break;
1888         case ')':
1889           if (!(syntax & RE_NO_BK_PARENS))
1890             token->type = OP_CLOSE_SUBEXP;
1891           break;
1892         case '+':
1893           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894             token->type = OP_DUP_PLUS;
1895           break;
1896         case '?':
1897           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898             token->type = OP_DUP_QUESTION;
1899           break;
1900         case '{':
1901           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902             token->type = OP_OPEN_DUP_NUM;
1903           break;
1904         case '}':
1905           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906             token->type = OP_CLOSE_DUP_NUM;
1907           break;
1908         default:
1909           break;
1910         }
1911       return 2;
1912     }
1913
1914   token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916   if (input->mb_cur_max > 1)
1917     {
1918       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1920     }
1921   else
1922 #endif
1923     token->word_char = IS_WORD_CHAR (token->opr.c);
1924
1925   switch (c)
1926     {
1927     case '\n':
1928       if (syntax & RE_NEWLINE_ALT)
1929         token->type = OP_ALT;
1930       break;
1931     case '|':
1932       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933         token->type = OP_ALT;
1934       break;
1935     case '*':
1936       token->type = OP_DUP_ASTERISK;
1937       break;
1938     case '+':
1939       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940         token->type = OP_DUP_PLUS;
1941       break;
1942     case '?':
1943       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944         token->type = OP_DUP_QUESTION;
1945       break;
1946     case '{':
1947       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948         token->type = OP_OPEN_DUP_NUM;
1949       break;
1950     case '}':
1951       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952         token->type = OP_CLOSE_DUP_NUM;
1953       break;
1954     case '(':
1955       if (syntax & RE_NO_BK_PARENS)
1956         token->type = OP_OPEN_SUBEXP;
1957       break;
1958     case ')':
1959       if (syntax & RE_NO_BK_PARENS)
1960         token->type = OP_CLOSE_SUBEXP;
1961       break;
1962     case '[':
1963       token->type = OP_OPEN_BRACKET;
1964       break;
1965     case '.':
1966       token->type = OP_PERIOD;
1967       break;
1968     case '^':
1969       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970           re_string_cur_idx (input) != 0)
1971         {
1972           char prev = re_string_peek_byte (input, -1);
1973           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974             break;
1975         }
1976       token->type = ANCHOR;
1977       token->opr.ctx_type = LINE_FIRST;
1978       break;
1979     case '$':
1980       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981           re_string_cur_idx (input) + 1 != re_string_length (input))
1982         {
1983           re_token_t next;
1984           re_string_skip_bytes (input, 1);
1985           peek_token (&next, input, syntax);
1986           re_string_skip_bytes (input, -1);
1987           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988             break;
1989         }
1990       token->type = ANCHOR;
1991       token->opr.ctx_type = LINE_LAST;
1992       break;
1993     default:
1994       break;
1995     }
1996   return 1;
1997 }
1998
1999 /* Peek a token from INPUT, and return the length of the token.
2000    We must not use this function out of bracket expressions.  */
2001
2002 static int
2003 internal_function
2004 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2005 {
2006   unsigned char c;
2007   if (re_string_eoi (input))
2008     {
2009       token->type = END_OF_RE;
2010       return 0;
2011     }
2012   c = re_string_peek_byte (input, 0);
2013   token->opr.c = c;
2014
2015 #ifdef RE_ENABLE_I18N
2016   if (input->mb_cur_max > 1 &&
2017       !re_string_first_byte (input, re_string_cur_idx (input)))
2018     {
2019       token->type = CHARACTER;
2020       return 1;
2021     }
2022 #endif /* RE_ENABLE_I18N */
2023
2024   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2025       && re_string_cur_idx (input) + 1 < re_string_length (input))
2026     {
2027       /* In this case, '\' escape a character.  */
2028       unsigned char c2;
2029       re_string_skip_bytes (input, 1);
2030       c2 = re_string_peek_byte (input, 0);
2031       token->opr.c = c2;
2032       token->type = CHARACTER;
2033       return 1;
2034     }
2035   if (c == '[') /* '[' is a special char in a bracket exps.  */
2036     {
2037       unsigned char c2;
2038       int token_len;
2039       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2040         c2 = re_string_peek_byte (input, 1);
2041       else
2042         c2 = 0;
2043       token->opr.c = c2;
2044       token_len = 2;
2045       switch (c2)
2046         {
2047         case '.':
2048           token->type = OP_OPEN_COLL_ELEM;
2049           break;
2050         case '=':
2051           token->type = OP_OPEN_EQUIV_CLASS;
2052           break;
2053         case ':':
2054           if (syntax & RE_CHAR_CLASSES)
2055             {
2056               token->type = OP_OPEN_CHAR_CLASS;
2057               break;
2058             }
2059           /* else fall through.  */
2060         default:
2061           token->type = CHARACTER;
2062           token->opr.c = c;
2063           token_len = 1;
2064           break;
2065         }
2066       return token_len;
2067     }
2068   switch (c)
2069     {
2070     case '-':
2071       token->type = OP_CHARSET_RANGE;
2072       break;
2073     case ']':
2074       token->type = OP_CLOSE_BRACKET;
2075       break;
2076     case '^':
2077       token->type = OP_NON_MATCH_LIST;
2078       break;
2079     default:
2080       token->type = CHARACTER;
2081     }
2082   return 1;
2083 }
2084 \f
2085 /* Functions for parser.  */
2086
2087 /* Entry point of the parser.
2088    Parse the regular expression REGEXP and return the structure tree.
2089    If an error is occured, ERR is set by error code, and return NULL.
2090    This function build the following tree, from regular expression <reg_exp>:
2091            CAT
2092            / \
2093           /   \
2094    <reg_exp>  EOR
2095
2096    CAT means concatenation.
2097    EOR means end of regular expression.  */
2098
2099 static bin_tree_t *
2100 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2101        reg_errcode_t *err)
2102 {
2103   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104   bin_tree_t *tree, *eor, *root;
2105   re_token_t current_token;
2106   dfa->syntax = syntax;
2107   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2108   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2109   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2110     return NULL;
2111   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2112   if (tree != NULL)
2113     root = create_tree (dfa, tree, eor, CONCAT);
2114   else
2115     root = eor;
2116   if (BE (eor == NULL || root == NULL, 0))
2117     {
2118       *err = REG_ESPACE;
2119       return NULL;
2120     }
2121   return root;
2122 }
2123
2124 /* This function build the following tree, from regular expression
2125    <branch1>|<branch2>:
2126            ALT
2127            / \
2128           /   \
2129    <branch1> <branch2>
2130
2131    ALT means alternative, which represents the operator `|'.  */
2132
2133 static bin_tree_t *
2134 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2136 {
2137   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138   bin_tree_t *tree, *branch = NULL;
2139   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2140   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141     return NULL;
2142
2143   while (token->type == OP_ALT)
2144     {
2145       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2146       if (token->type != OP_ALT && token->type != END_OF_RE
2147           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148         {
2149           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2150           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2151             return NULL;
2152         }
2153       else
2154         branch = NULL;
2155       tree = create_tree (dfa, tree, branch, OP_ALT);
2156       if (BE (tree == NULL, 0))
2157         {
2158           *err = REG_ESPACE;
2159           return NULL;
2160         }
2161     }
2162   return tree;
2163 }
2164
2165 /* This function build the following tree, from regular expression
2166    <exp1><exp2>:
2167         CAT
2168         / \
2169        /   \
2170    <exp1> <exp2>
2171
2172    CAT means concatenation.  */
2173
2174 static bin_tree_t *
2175 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177 {
2178   bin_tree_t *tree, *exp;
2179   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2180   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2181   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182     return NULL;
2183
2184   while (token->type != OP_ALT && token->type != END_OF_RE
2185          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2186     {
2187       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2188       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2189         {
2190           return NULL;
2191         }
2192       if (tree != NULL && exp != NULL)
2193         {
2194           tree = create_tree (dfa, tree, exp, CONCAT);
2195           if (tree == NULL)
2196             {
2197               *err = REG_ESPACE;
2198               return NULL;
2199             }
2200         }
2201       else if (tree == NULL)
2202         tree = exp;
2203       /* Otherwise exp == NULL, we don't need to create new tree.  */
2204     }
2205   return tree;
2206 }
2207
2208 /* This function build the following tree, from regular expression a*:
2209          *
2210          |
2211          a
2212 */
2213
2214 static bin_tree_t *
2215 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2216                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217 {
2218   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2219   bin_tree_t *tree;
2220   switch (token->type)
2221     {
2222     case CHARACTER:
2223       tree = create_token_tree (dfa, NULL, NULL, token);
2224       if (BE (tree == NULL, 0))
2225         {
2226           *err = REG_ESPACE;
2227           return NULL;
2228         }
2229 #ifdef RE_ENABLE_I18N
2230       if (dfa->mb_cur_max > 1)
2231         {
2232           while (!re_string_eoi (regexp)
2233                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2234             {
2235               bin_tree_t *mbc_remain;
2236               fetch_token (token, regexp, syntax);
2237               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2238               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2239               if (BE (mbc_remain == NULL || tree == NULL, 0))
2240                 {
2241                   *err = REG_ESPACE;
2242                   return NULL;
2243                 }
2244             }
2245         }
2246 #endif
2247       break;
2248     case OP_OPEN_SUBEXP:
2249       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2250       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2251         return NULL;
2252       break;
2253     case OP_OPEN_BRACKET:
2254       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2255       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2256         return NULL;
2257       break;
2258     case OP_BACK_REF:
2259       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2260         {
2261           *err = REG_ESUBREG;
2262           return NULL;
2263         }
2264       dfa->used_bkref_map |= 1 << token->opr.idx;
2265       tree = create_token_tree (dfa, NULL, NULL, token);
2266       if (BE (tree == NULL, 0))
2267         {
2268           *err = REG_ESPACE;
2269           return NULL;
2270         }
2271       ++dfa->nbackref;
2272       dfa->has_mb_node = 1;
2273       break;
2274     case OP_OPEN_DUP_NUM:
2275       if (syntax & RE_CONTEXT_INVALID_DUP)
2276         {
2277           *err = REG_BADRPT;
2278           return NULL;
2279         }
2280       /* FALLTHROUGH */
2281     case OP_DUP_ASTERISK:
2282     case OP_DUP_PLUS:
2283     case OP_DUP_QUESTION:
2284       if (syntax & RE_CONTEXT_INVALID_OPS)
2285         {
2286           *err = REG_BADRPT;
2287           return NULL;
2288         }
2289       else if (syntax & RE_CONTEXT_INDEP_OPS)
2290         {
2291           fetch_token (token, regexp, syntax);
2292           return parse_expression (regexp, preg, token, syntax, nest, err);
2293         }
2294       /* else fall through  */
2295     case OP_CLOSE_SUBEXP:
2296       if ((token->type == OP_CLOSE_SUBEXP) &&
2297           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2298         {
2299           *err = REG_ERPAREN;
2300           return NULL;
2301         }
2302       /* else fall through  */
2303     case OP_CLOSE_DUP_NUM:
2304       /* We treat it as a normal character.  */
2305
2306       /* Then we can these characters as normal characters.  */
2307       token->type = CHARACTER;
2308       /* mb_partial and word_char bits should be initialized already
2309          by peek_token.  */
2310       tree = create_token_tree (dfa, NULL, NULL, token);
2311       if (BE (tree == NULL, 0))
2312         {
2313           *err = REG_ESPACE;
2314           return NULL;
2315         }
2316       break;
2317     case ANCHOR:
2318       if ((token->opr.ctx_type
2319            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2320           && dfa->word_ops_used == 0)
2321         init_word_char (dfa);
2322       if (token->opr.ctx_type == WORD_DELIM
2323           || token->opr.ctx_type == NOT_WORD_DELIM)
2324         {
2325           bin_tree_t *tree_first, *tree_last;
2326           if (token->opr.ctx_type == WORD_DELIM)
2327             {
2328               token->opr.ctx_type = WORD_FIRST;
2329               tree_first = create_token_tree (dfa, NULL, NULL, token);
2330               token->opr.ctx_type = WORD_LAST;
2331             }
2332           else
2333             {
2334               token->opr.ctx_type = INSIDE_WORD;
2335               tree_first = create_token_tree (dfa, NULL, NULL, token);
2336               token->opr.ctx_type = INSIDE_NOTWORD;
2337             }
2338           tree_last = create_token_tree (dfa, NULL, NULL, token);
2339           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2340           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2341             {
2342               *err = REG_ESPACE;
2343               return NULL;
2344             }
2345         }
2346       else
2347         {
2348           tree = create_token_tree (dfa, NULL, NULL, token);
2349           if (BE (tree == NULL, 0))
2350             {
2351               *err = REG_ESPACE;
2352               return NULL;
2353             }
2354         }
2355       /* We must return here, since ANCHORs can't be followed
2356          by repetition operators.
2357          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2358              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2359       fetch_token (token, regexp, syntax);
2360       return tree;
2361     case OP_PERIOD:
2362       tree = create_token_tree (dfa, NULL, NULL, token);
2363       if (BE (tree == NULL, 0))
2364         {
2365           *err = REG_ESPACE;
2366           return NULL;
2367         }
2368       if (dfa->mb_cur_max > 1)
2369         dfa->has_mb_node = 1;
2370       break;
2371     case OP_WORD:
2372     case OP_NOTWORD:
2373       tree = build_charclass_op (dfa, regexp->trans,
2374                                  (const unsigned char *) "alnum",
2375                                  (const unsigned char *) "_",
2376                                  token->type == OP_NOTWORD, err);
2377       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2378         return NULL;
2379       break;
2380     case OP_SPACE:
2381     case OP_NOTSPACE:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383                                  (const unsigned char *) "space",
2384                                  (const unsigned char *) "",
2385                                  token->type == OP_NOTSPACE, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387         return NULL;
2388       break;
2389     case OP_ALT:
2390     case END_OF_RE:
2391       return NULL;
2392     case BACK_SLASH:
2393       *err = REG_EESCAPE;
2394       return NULL;
2395     default:
2396       /* Must not happen?  */
2397 #ifdef DEBUG
2398       assert (0);
2399 #endif
2400       return NULL;
2401     }
2402   fetch_token (token, regexp, syntax);
2403
2404   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2405          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2406     {
2407       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2408       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2409         return NULL;
2410       /* In BRE consecutive duplications are not allowed.  */
2411       if ((syntax & RE_CONTEXT_INVALID_DUP)
2412           && (token->type == OP_DUP_ASTERISK
2413               || token->type == OP_OPEN_DUP_NUM))
2414         {
2415           *err = REG_BADRPT;
2416           return NULL;
2417         }
2418     }
2419
2420   return tree;
2421 }
2422
2423 /* This function build the following tree, from regular expression
2424    (<reg_exp>):
2425          SUBEXP
2426             |
2427         <reg_exp>
2428 */
2429
2430 static bin_tree_t *
2431 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2432                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433 {
2434   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2435   bin_tree_t *tree;
2436   size_t cur_nsub;
2437   cur_nsub = preg->re_nsub++;
2438
2439   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440
2441   /* The subexpression may be a null string.  */
2442   if (token->type == OP_CLOSE_SUBEXP)
2443     tree = NULL;
2444   else
2445     {
2446       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2447       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2448         *err = REG_EPAREN;
2449       if (BE (*err != REG_NOERROR, 0))
2450         return NULL;
2451     }
2452
2453   if (cur_nsub <= '9' - '1')
2454     dfa->completed_bkref_map |= 1 << cur_nsub;
2455
2456   tree = create_tree (dfa, tree, NULL, SUBEXP);
2457   if (BE (tree == NULL, 0))
2458     {
2459       *err = REG_ESPACE;
2460       return NULL;
2461     }
2462   tree->token.opr.idx = cur_nsub;
2463   return tree;
2464 }
2465
2466 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2467
2468 static bin_tree_t *
2469 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2470               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2471 {
2472   bin_tree_t *tree = NULL, *old_tree = NULL;
2473   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2474   re_token_t start_token = *token;
2475
2476   if (token->type == OP_OPEN_DUP_NUM)
2477     {
2478       end = 0;
2479       start = fetch_number (regexp, token, syntax);
2480       if (start == REG_MISSING)
2481         {
2482           if (token->type == CHARACTER && token->opr.c == ',')
2483             start = 0; /* We treat "{,m}" as "{0,m}".  */
2484           else
2485             {
2486               *err = REG_BADBR; /* <re>{} is invalid.  */
2487               return NULL;
2488             }
2489         }
2490       if (BE (start != REG_ERROR, 1))
2491         {
2492           /* We treat "{n}" as "{n,n}".  */
2493           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2494                  : ((token->type == CHARACTER && token->opr.c == ',')
2495                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2496         }
2497       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2498         {
2499           /* Invalid sequence.  */
2500           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2501             {
2502               if (token->type == END_OF_RE)
2503                 *err = REG_EBRACE;
2504               else
2505                 *err = REG_BADBR;
2506
2507               return NULL;
2508             }
2509
2510           /* If the syntax bit is set, rollback.  */
2511           re_string_set_index (regexp, start_idx);
2512           *token = start_token;
2513           token->type = CHARACTER;
2514           /* mb_partial and word_char bits should be already initialized by
2515              peek_token.  */
2516           return elem;
2517         }
2518
2519       if (BE (end != REG_MISSING && start > end, 0))
2520         {
2521           /* First number greater than second.  */
2522           *err = REG_BADBR;
2523           return NULL;
2524         }
2525     }
2526   else
2527     {
2528       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2529       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2530     }
2531
2532   fetch_token (token, regexp, syntax);
2533
2534   if (BE (elem == NULL, 0))
2535     return NULL;
2536   if (BE (start == 0 && end == 0, 0))
2537     {
2538       postorder (elem, free_tree, NULL);
2539       return NULL;
2540     }
2541
2542   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2543   if (BE (start > 0, 0))
2544     {
2545       tree = elem;
2546       for (i = 2; i <= start; ++i)
2547         {
2548           elem = duplicate_tree (elem, dfa);
2549           tree = create_tree (dfa, tree, elem, CONCAT);
2550           if (BE (elem == NULL || tree == NULL, 0))
2551             goto parse_dup_op_espace;
2552         }
2553
2554       if (start == end)
2555         return tree;
2556
2557       /* Duplicate ELEM before it is marked optional.  */
2558       elem = duplicate_tree (elem, dfa);
2559       old_tree = tree;
2560     }
2561   else
2562     old_tree = NULL;
2563
2564   if (elem->token.type == SUBEXP)
2565     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2566
2567   tree = create_tree (dfa, elem, NULL,
2568                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2569   if (BE (tree == NULL, 0))
2570     goto parse_dup_op_espace;
2571
2572   /* This loop is actually executed only when end != REG_MISSING,
2573      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2574      already created the start+1-th copy.  */
2575   if ((Idx) -1 < 0 || end != REG_MISSING)
2576     for (i = start + 2; i <= end; ++i)
2577       {
2578         elem = duplicate_tree (elem, dfa);
2579         tree = create_tree (dfa, tree, elem, CONCAT);
2580         if (BE (elem == NULL || tree == NULL, 0))
2581           goto parse_dup_op_espace;
2582
2583         tree = create_tree (dfa, tree, NULL, OP_ALT);
2584         if (BE (tree == NULL, 0))
2585           goto parse_dup_op_espace;
2586       }
2587
2588   if (old_tree)
2589     tree = create_tree (dfa, old_tree, tree, CONCAT);
2590
2591   return tree;
2592
2593  parse_dup_op_espace:
2594   *err = REG_ESPACE;
2595   return NULL;
2596 }
2597
2598 /* Size of the names for collating symbol/equivalence_class/character_class.
2599    I'm not sure, but maybe enough.  */
2600 #define BRACKET_NAME_BUF_SIZE 32
2601
2602 #ifndef _LIBC
2603   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2604      Build the range expression which starts from START_ELEM, and ends
2605      at END_ELEM.  The result are written to MBCSET and SBCSET.
2606      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2607      mbcset->range_ends, is a pointer argument sinse we may
2608      update it.  */
2609
2610 static reg_errcode_t
2611 internal_function
2612 # ifdef RE_ENABLE_I18N
2613 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2614                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2615 # else /* not RE_ENABLE_I18N */
2616 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2617                  bracket_elem_t *end_elem)
2618 # endif /* not RE_ENABLE_I18N */
2619 {
2620   unsigned int start_ch, end_ch;
2621   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2622   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2623           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2624           0))
2625     return REG_ERANGE;
2626
2627   /* We can handle no multi character collating elements without libc
2628      support.  */
2629   if (BE ((start_elem->type == COLL_SYM
2630            && strlen ((char *) start_elem->opr.name) > 1)
2631           || (end_elem->type == COLL_SYM
2632               && strlen ((char *) end_elem->opr.name) > 1), 0))
2633     return REG_ECOLLATE;
2634
2635 # ifdef RE_ENABLE_I18N
2636   {
2637     wchar_t wc;
2638     wint_t start_wc;
2639     wint_t end_wc;
2640     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2641
2642     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2643                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2644                    : 0));
2645     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2646               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2647                  : 0));
2648     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2649                 ? __btowc (start_ch) : start_elem->opr.wch);
2650     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2651               ? __btowc (end_ch) : end_elem->opr.wch);
2652     if (start_wc == WEOF || end_wc == WEOF)
2653       return REG_ECOLLATE;
2654     cmp_buf[0] = start_wc;
2655     cmp_buf[4] = end_wc;
2656     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2657       return REG_ERANGE;
2658
2659     /* Got valid collation sequence values, add them as a new entry.
2660        However, for !_LIBC we have no collation elements: if the
2661        character set is single byte, the single byte character set
2662        that we build below suffices.  parse_bracket_exp passes
2663        no MBCSET if dfa->mb_cur_max == 1.  */
2664     if (mbcset)
2665       {
2666         /* Check the space of the arrays.  */
2667         if (BE (*range_alloc == mbcset->nranges, 0))
2668           {
2669             /* There is not enough space, need realloc.  */
2670             wchar_t *new_array_start, *new_array_end;
2671             Idx new_nranges;
2672
2673             /* +1 in case of mbcset->nranges is 0.  */
2674             new_nranges = 2 * mbcset->nranges + 1;
2675             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2676                are NULL if *range_alloc == 0.  */
2677             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2678                                           new_nranges);
2679             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2680                                         new_nranges);
2681
2682             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2683               return REG_ESPACE;
2684
2685             mbcset->range_starts = new_array_start;
2686             mbcset->range_ends = new_array_end;
2687             *range_alloc = new_nranges;
2688           }
2689
2690         mbcset->range_starts[mbcset->nranges] = start_wc;
2691         mbcset->range_ends[mbcset->nranges++] = end_wc;
2692       }
2693
2694     /* Build the table for single byte characters.  */
2695     for (wc = 0; wc < SBC_MAX; ++wc)
2696       {
2697         cmp_buf[2] = wc;
2698         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2699             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2700           bitset_set (sbcset, wc);
2701       }
2702   }
2703 # else /* not RE_ENABLE_I18N */
2704   {
2705     unsigned int ch;
2706     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2707                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2708                    : 0));
2709     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2710               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2711                  : 0));
2712     if (start_ch > end_ch)
2713       return REG_ERANGE;
2714     /* Build the table for single byte characters.  */
2715     for (ch = 0; ch < SBC_MAX; ++ch)
2716       if (start_ch <= ch  && ch <= end_ch)
2717         bitset_set (sbcset, ch);
2718   }
2719 # endif /* not RE_ENABLE_I18N */
2720   return REG_NOERROR;
2721 }
2722 #endif /* not _LIBC */
2723
2724 #ifndef _LIBC
2725 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2726    Build the collating element which is represented by NAME.
2727    The result are written to MBCSET and SBCSET.
2728    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2729    pointer argument since we may update it.  */
2730
2731 static reg_errcode_t
2732 internal_function
2733 build_collating_symbol (bitset_t sbcset,
2734 # ifdef RE_ENABLE_I18N
2735                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2736 # endif
2737                         const unsigned char *name)
2738 {
2739   size_t name_len = strlen ((const char *) name);
2740   if (BE (name_len != 1, 0))
2741     return REG_ECOLLATE;
2742   else
2743     {
2744       bitset_set (sbcset, name[0]);
2745       return REG_NOERROR;
2746     }
2747 }
2748 #endif /* not _LIBC */
2749
2750 /* This function parse bracket expression like "[abc]", "[a-c]",
2751    "[[.a-a.]]" etc.  */
2752
2753 static bin_tree_t *
2754 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2755                    reg_syntax_t syntax, reg_errcode_t *err)
2756 {
2757 #ifdef _LIBC
2758   const unsigned char *collseqmb;
2759   const char *collseqwc;
2760   uint32_t nrules;
2761   int32_t table_size;
2762   const int32_t *symb_table;
2763   const unsigned char *extra;
2764
2765   /* Local function for parse_bracket_exp used in _LIBC environement.
2766      Seek the collating symbol entry correspondings to NAME.
2767      Return the index of the symbol in the SYMB_TABLE.  */
2768
2769   auto inline int32_t
2770   __attribute ((always_inline))
2771   seek_collating_symbol_entry (name, name_len)
2772          const unsigned char *name;
2773          size_t name_len;
2774     {
2775       int32_t hash = elem_hash ((const char *) name, name_len);
2776       int32_t elem = hash % table_size;
2777       if (symb_table[2 * elem] != 0)
2778         {
2779           int32_t second = hash % (table_size - 2) + 1;
2780
2781           do
2782             {
2783               /* First compare the hashing value.  */
2784               if (symb_table[2 * elem] == hash
2785                   /* Compare the length of the name.  */
2786                   && name_len == extra[symb_table[2 * elem + 1]]
2787                   /* Compare the name.  */
2788                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2789                              name_len) == 0)
2790                 {
2791                   /* Yep, this is the entry.  */
2792                   break;
2793                 }
2794
2795               /* Next entry.  */
2796               elem += second;
2797             }
2798           while (symb_table[2 * elem] != 0);
2799         }
2800       return elem;
2801     }
2802
2803   /* Local function for parse_bracket_exp used in _LIBC environement.
2804      Look up the collation sequence value of BR_ELEM.
2805      Return the value if succeeded, UINT_MAX otherwise.  */
2806
2807   auto inline unsigned int
2808   __attribute ((always_inline))
2809   lookup_collation_sequence_value (br_elem)
2810          bracket_elem_t *br_elem;
2811     {
2812       if (br_elem->type == SB_CHAR)
2813         {
2814           /*
2815           if (MB_CUR_MAX == 1)
2816           */
2817           if (nrules == 0)
2818             return collseqmb[br_elem->opr.ch];
2819           else
2820             {
2821               wint_t wc = __btowc (br_elem->opr.ch);
2822               return __collseq_table_lookup (collseqwc, wc);
2823             }
2824         }
2825       else if (br_elem->type == MB_CHAR)
2826         {
2827           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2828         }
2829       else if (br_elem->type == COLL_SYM)
2830         {
2831           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2832           if (nrules != 0)
2833             {
2834               int32_t elem, idx;
2835               elem = seek_collating_symbol_entry (br_elem->opr.name,
2836                                                   sym_name_len);
2837               if (symb_table[2 * elem] != 0)
2838                 {
2839                   /* We found the entry.  */
2840                   idx = symb_table[2 * elem + 1];
2841                   /* Skip the name of collating element name.  */
2842                   idx += 1 + extra[idx];
2843                   /* Skip the byte sequence of the collating element.  */
2844                   idx += 1 + extra[idx];
2845                   /* Adjust for the alignment.  */
2846                   idx = (idx + 3) & ~3;
2847                   /* Skip the multibyte collation sequence value.  */
2848                   idx += sizeof (unsigned int);
2849                   /* Skip the wide char sequence of the collating element.  */
2850                   idx += sizeof (unsigned int) *
2851                     (1 + *(unsigned int *) (extra + idx));
2852                   /* Return the collation sequence value.  */
2853                   return *(unsigned int *) (extra + idx);
2854                 }
2855               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2856                 {
2857                   /* No valid character.  Match it as a single byte
2858                      character.  */
2859                   return collseqmb[br_elem->opr.name[0]];
2860                 }
2861             }
2862           else if (sym_name_len == 1)
2863             return collseqmb[br_elem->opr.name[0]];
2864         }
2865       return UINT_MAX;
2866     }
2867
2868   /* Local function for parse_bracket_exp used in _LIBC environement.
2869      Build the range expression which starts from START_ELEM, and ends
2870      at END_ELEM.  The result are written to MBCSET and SBCSET.
2871      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2872      mbcset->range_ends, is a pointer argument sinse we may
2873      update it.  */
2874
2875   auto inline reg_errcode_t
2876   __attribute ((always_inline))
2877   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2878          re_charset_t *mbcset;
2879          Idx *range_alloc;
2880          bitset_t sbcset;
2881          bracket_elem_t *start_elem, *end_elem;
2882     {
2883       unsigned int ch;
2884       uint32_t start_collseq;
2885       uint32_t end_collseq;
2886
2887       /* Equivalence Classes and Character Classes can't be a range
2888          start/end.  */
2889       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2890               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2891               0))
2892         return REG_ERANGE;
2893
2894       start_collseq = lookup_collation_sequence_value (start_elem);
2895       end_collseq = lookup_collation_sequence_value (end_elem);
2896       /* Check start/end collation sequence values.  */
2897       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2898         return REG_ECOLLATE;
2899       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2900         return REG_ERANGE;
2901
2902       /* Got valid collation sequence values, add them as a new entry.
2903          However, if we have no collation elements, and the character set
2904          is single byte, the single byte character set that we
2905          build below suffices. */
2906       if (nrules > 0 || dfa->mb_cur_max > 1)
2907         {
2908           /* Check the space of the arrays.  */
2909           if (BE (*range_alloc == mbcset->nranges, 0))
2910             {
2911               /* There is not enough space, need realloc.  */
2912               uint32_t *new_array_start;
2913               uint32_t *new_array_end;
2914               Idx new_nranges;
2915
2916               /* +1 in case of mbcset->nranges is 0.  */
2917               new_nranges = 2 * mbcset->nranges + 1;
2918               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2919                                             new_nranges);
2920               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2921                                           new_nranges);
2922
2923               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2924                 return REG_ESPACE;
2925
2926               mbcset->range_starts = new_array_start;
2927               mbcset->range_ends = new_array_end;
2928               *range_alloc = new_nranges;
2929             }
2930
2931           mbcset->range_starts[mbcset->nranges] = start_collseq;
2932           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2933         }
2934
2935       /* Build the table for single byte characters.  */
2936       for (ch = 0; ch < SBC_MAX; ch++)
2937         {
2938           uint32_t ch_collseq;
2939           /*
2940           if (MB_CUR_MAX == 1)
2941           */
2942           if (nrules == 0)
2943             ch_collseq = collseqmb[ch];
2944           else
2945             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947             bitset_set (sbcset, ch);
2948         }
2949       return REG_NOERROR;
2950     }
2951
2952   /* Local function for parse_bracket_exp used in _LIBC environement.
2953      Build the collating element which is represented by NAME.
2954      The result are written to MBCSET and SBCSET.
2955      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956      pointer argument sinse we may update it.  */
2957
2958   auto inline reg_errcode_t
2959   __attribute ((always_inline))
2960   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2961          re_charset_t *mbcset;
2962          Idx *coll_sym_alloc;
2963          bitset_t sbcset;
2964          const unsigned char *name;
2965     {
2966       int32_t elem, idx;
2967       size_t name_len = strlen ((const char *) name);
2968       if (nrules != 0)
2969         {
2970           elem = seek_collating_symbol_entry (name, name_len);
2971           if (symb_table[2 * elem] != 0)
2972             {
2973               /* We found the entry.  */
2974               idx = symb_table[2 * elem + 1];
2975               /* Skip the name of collating element name.  */
2976               idx += 1 + extra[idx];
2977             }
2978           else if (symb_table[2 * elem] == 0 && name_len == 1)
2979             {
2980               /* No valid character, treat it as a normal
2981                  character.  */
2982               bitset_set (sbcset, name[0]);
2983               return REG_NOERROR;
2984             }
2985           else
2986             return REG_ECOLLATE;
2987
2988           /* Got valid collation sequence, add it as a new entry.  */
2989           /* Check the space of the arrays.  */
2990           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2991             {
2992               /* Not enough, realloc it.  */
2993               /* +1 in case of mbcset->ncoll_syms is 0.  */
2994               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2995               /* Use realloc since mbcset->coll_syms is NULL
2996                  if *alloc == 0.  */
2997               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2998                                                    new_coll_sym_alloc);
2999               if (BE (new_coll_syms == NULL, 0))
3000                 return REG_ESPACE;
3001               mbcset->coll_syms = new_coll_syms;
3002               *coll_sym_alloc = new_coll_sym_alloc;
3003             }
3004           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3005           return REG_NOERROR;
3006         }
3007       else
3008         {
3009           if (BE (name_len != 1, 0))
3010             return REG_ECOLLATE;
3011           else
3012             {
3013               bitset_set (sbcset, name[0]);
3014               return REG_NOERROR;
3015             }
3016         }
3017     }
3018 #endif
3019
3020   re_token_t br_token;
3021   re_bitset_ptr_t sbcset;
3022 #ifdef RE_ENABLE_I18N
3023   re_charset_t *mbcset;
3024   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3025   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3026 #endif /* not RE_ENABLE_I18N */
3027   bool non_match = false;
3028   bin_tree_t *work_tree;
3029   int token_len;
3030   bool first_round = true;
3031 #ifdef _LIBC
3032   collseqmb = (const unsigned char *)
3033     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3034   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3035   if (nrules)
3036     {
3037       /*
3038       if (MB_CUR_MAX > 1)
3039       */
3040       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3041       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3042       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3043                                                   _NL_COLLATE_SYMB_TABLEMB);
3044       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045                                                    _NL_COLLATE_SYMB_EXTRAMB);
3046     }
3047 #endif
3048   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049 #ifdef RE_ENABLE_I18N
3050   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3051 #endif /* RE_ENABLE_I18N */
3052 #ifdef RE_ENABLE_I18N
3053   if (BE (sbcset == NULL || mbcset == NULL, 0))
3054 #else
3055   if (BE (sbcset == NULL, 0))
3056 #endif /* RE_ENABLE_I18N */
3057     {
3058       *err = REG_ESPACE;
3059       return NULL;
3060     }
3061
3062   token_len = peek_token_bracket (token, regexp, syntax);
3063   if (BE (token->type == END_OF_RE, 0))
3064     {
3065       *err = REG_BADPAT;
3066       goto parse_bracket_exp_free_return;
3067     }
3068   if (token->type == OP_NON_MATCH_LIST)
3069     {
3070 #ifdef RE_ENABLE_I18N
3071       mbcset->non_match = 1;
3072 #endif /* not RE_ENABLE_I18N */
3073       non_match = true;
3074       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3075         bitset_set (sbcset, '\0');
3076       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3077       token_len = peek_token_bracket (token, regexp, syntax);
3078       if (BE (token->type == END_OF_RE, 0))
3079         {
3080           *err = REG_BADPAT;
3081           goto parse_bracket_exp_free_return;
3082         }
3083     }
3084
3085   /* We treat the first ']' as a normal character.  */
3086   if (token->type == OP_CLOSE_BRACKET)
3087     token->type = CHARACTER;
3088
3089   while (1)
3090     {
3091       bracket_elem_t start_elem, end_elem;
3092       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3093       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3094       reg_errcode_t ret;
3095       int token_len2 = 0;
3096       bool is_range_exp = false;
3097       re_token_t token2;
3098
3099       start_elem.opr.name = start_name_buf;
3100       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3101                                    syntax, first_round);
3102       if (BE (ret != REG_NOERROR, 0))
3103         {
3104           *err = ret;
3105           goto parse_bracket_exp_free_return;
3106         }
3107       first_round = false;
3108
3109       /* Get information about the next token.  We need it in any case.  */
3110       token_len = peek_token_bracket (token, regexp, syntax);
3111
3112       /* Do not check for ranges if we know they are not allowed.  */
3113       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3114         {
3115           if (BE (token->type == END_OF_RE, 0))
3116             {
3117               *err = REG_EBRACK;
3118               goto parse_bracket_exp_free_return;
3119             }
3120           if (token->type == OP_CHARSET_RANGE)
3121             {
3122               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3123               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3124               if (BE (token2.type == END_OF_RE, 0))
3125                 {
3126                   *err = REG_EBRACK;
3127                   goto parse_bracket_exp_free_return;
3128                 }
3129               if (token2.type == OP_CLOSE_BRACKET)
3130                 {
3131                   /* We treat the last '-' as a normal character.  */
3132                   re_string_skip_bytes (regexp, -token_len);
3133                   token->type = CHARACTER;
3134                 }
3135               else
3136                 is_range_exp = true;
3137             }
3138         }
3139
3140       if (is_range_exp == true)
3141         {
3142           end_elem.opr.name = end_name_buf;
3143           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3144                                        dfa, syntax, true);
3145           if (BE (ret != REG_NOERROR, 0))
3146             {
3147               *err = ret;
3148               goto parse_bracket_exp_free_return;
3149             }
3150
3151           token_len = peek_token_bracket (token, regexp, syntax);
3152
3153 #ifdef _LIBC
3154           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3155                                   &start_elem, &end_elem);
3156 #else
3157 # ifdef RE_ENABLE_I18N
3158           *err = build_range_exp (sbcset,
3159                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3160                                   &range_alloc, &start_elem, &end_elem);
3161 # else
3162           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3163 # endif
3164 #endif /* RE_ENABLE_I18N */
3165           if (BE (*err != REG_NOERROR, 0))
3166             goto parse_bracket_exp_free_return;
3167         }
3168       else
3169         {
3170           switch (start_elem.type)
3171             {
3172             case SB_CHAR:
3173               bitset_set (sbcset, start_elem.opr.ch);
3174               break;
3175 #ifdef RE_ENABLE_I18N
3176             case MB_CHAR:
3177               /* Check whether the array has enough space.  */
3178               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3179                 {
3180                   wchar_t *new_mbchars;
3181                   /* Not enough, realloc it.  */
3182                   /* +1 in case of mbcset->nmbchars is 0.  */
3183                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3184                   /* Use realloc since array is NULL if *alloc == 0.  */
3185                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3186                                             mbchar_alloc);
3187                   if (BE (new_mbchars == NULL, 0))
3188                     goto parse_bracket_exp_espace;
3189                   mbcset->mbchars = new_mbchars;
3190                 }
3191               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3192               break;
3193 #endif /* RE_ENABLE_I18N */
3194             case EQUIV_CLASS:
3195               *err = build_equiv_class (sbcset,
3196 #ifdef RE_ENABLE_I18N
3197                                         mbcset, &equiv_class_alloc,
3198 #endif /* RE_ENABLE_I18N */
3199                                         start_elem.opr.name);
3200               if (BE (*err != REG_NOERROR, 0))
3201                 goto parse_bracket_exp_free_return;
3202               break;
3203             case COLL_SYM:
3204               *err = build_collating_symbol (sbcset,
3205 #ifdef RE_ENABLE_I18N
3206                                              mbcset, &coll_sym_alloc,
3207 #endif /* RE_ENABLE_I18N */
3208                                              start_elem.opr.name);
3209               if (BE (*err != REG_NOERROR, 0))
3210                 goto parse_bracket_exp_free_return;
3211               break;
3212             case CHAR_CLASS:
3213               *err = build_charclass (regexp->trans, sbcset,
3214 #ifdef RE_ENABLE_I18N
3215                                       mbcset, &char_class_alloc,
3216 #endif /* RE_ENABLE_I18N */
3217                                       start_elem.opr.name, syntax);
3218               if (BE (*err != REG_NOERROR, 0))
3219                goto parse_bracket_exp_free_return;
3220               break;
3221             default:
3222               assert (0);
3223               break;
3224             }
3225         }
3226       if (BE (token->type == END_OF_RE, 0))
3227         {
3228           *err = REG_EBRACK;
3229           goto parse_bracket_exp_free_return;
3230         }
3231       if (token->type == OP_CLOSE_BRACKET)
3232         break;
3233     }
3234
3235   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3236
3237   /* If it is non-matching list.  */
3238   if (non_match)
3239     bitset_not (sbcset);
3240
3241 #ifdef RE_ENABLE_I18N
3242   /* Ensure only single byte characters are set.  */
3243   if (dfa->mb_cur_max > 1)
3244     bitset_mask (sbcset, dfa->sb_char);
3245
3246   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3247       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3248                                                      || mbcset->non_match)))
3249     {
3250       bin_tree_t *mbc_tree;
3251       int sbc_idx;
3252       /* Build a tree for complex bracket.  */
3253       dfa->has_mb_node = 1;
3254       br_token.type = COMPLEX_BRACKET;
3255       br_token.opr.mbcset = mbcset;
3256       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3257       if (BE (mbc_tree == NULL, 0))
3258         goto parse_bracket_exp_espace;
3259       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3260         if (sbcset[sbc_idx])
3261           break;
3262       /* If there are no bits set in sbcset, there is no point
3263          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3264       if (sbc_idx < BITSET_WORDS)
3265         {
3266           /* Build a tree for simple bracket.  */
3267           br_token.type = SIMPLE_BRACKET;
3268           br_token.opr.sbcset = sbcset;
3269           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270           if (BE (work_tree == NULL, 0))
3271             goto parse_bracket_exp_espace;
3272
3273           /* Then join them by ALT node.  */
3274           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3275           if (BE (work_tree == NULL, 0))
3276             goto parse_bracket_exp_espace;
3277         }
3278       else
3279         {
3280           re_free (sbcset);
3281           work_tree = mbc_tree;
3282         }
3283     }
3284   else
3285 #endif /* not RE_ENABLE_I18N */
3286     {
3287 #ifdef RE_ENABLE_I18N
3288       free_charset (mbcset);
3289 #endif
3290       /* Build a tree for simple bracket.  */
3291       br_token.type = SIMPLE_BRACKET;
3292       br_token.opr.sbcset = sbcset;
3293       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294       if (BE (work_tree == NULL, 0))
3295         goto parse_bracket_exp_espace;
3296     }
3297   return work_tree;
3298
3299  parse_bracket_exp_espace:
3300   *err = REG_ESPACE;
3301  parse_bracket_exp_free_return:
3302   re_free (sbcset);
3303 #ifdef RE_ENABLE_I18N
3304   free_charset (mbcset);
3305 #endif /* RE_ENABLE_I18N */
3306   return NULL;
3307 }
3308
3309 /* Parse an element in the bracket expression.  */
3310
3311 static reg_errcode_t
3312 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3313                        re_token_t *token, int token_len, re_dfa_t *dfa,
3314                        reg_syntax_t syntax, bool accept_hyphen)
3315 {
3316 #ifdef RE_ENABLE_I18N
3317   int cur_char_size;
3318   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3319   if (cur_char_size > 1)
3320     {
3321       elem->type = MB_CHAR;
3322       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3323       re_string_skip_bytes (regexp, cur_char_size);
3324       return REG_NOERROR;
3325     }
3326 #endif /* RE_ENABLE_I18N */
3327   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3328   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3329       || token->type == OP_OPEN_EQUIV_CLASS)
3330     return parse_bracket_symbol (elem, regexp, token);
3331   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3332     {
3333       /* A '-' must only appear as anything but a range indicator before
3334          the closing bracket.  Everything else is an error.  */
3335       re_token_t token2;
3336       (void) peek_token_bracket (&token2, regexp, syntax);
3337       if (token2.type != OP_CLOSE_BRACKET)
3338         /* The actual error value is not standardized since this whole
3339            case is undefined.  But ERANGE makes good sense.  */
3340         return REG_ERANGE;
3341     }
3342   elem->type = SB_CHAR;
3343   elem->opr.ch = token->opr.c;
3344   return REG_NOERROR;
3345 }
3346
3347 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3348    such as [:<character_class>:], [.<collating_element>.], and
3349    [=<equivalent_class>=].  */
3350
3351 static reg_errcode_t
3352 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3353                       re_token_t *token)
3354 {
3355   unsigned char ch, delim = token->opr.c;
3356   int i = 0;
3357   if (re_string_eoi(regexp))
3358     return REG_EBRACK;
3359   for (;; ++i)
3360     {
3361       if (i >= BRACKET_NAME_BUF_SIZE)
3362         return REG_EBRACK;
3363       if (token->type == OP_OPEN_CHAR_CLASS)
3364         ch = re_string_fetch_byte_case (regexp);
3365       else
3366         ch = re_string_fetch_byte (regexp);
3367       if (re_string_eoi(regexp))
3368         return REG_EBRACK;
3369       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3370         break;
3371       elem->opr.name[i] = ch;
3372     }
3373   re_string_skip_bytes (regexp, 1);
3374   elem->opr.name[i] = '\0';
3375   switch (token->type)
3376     {
3377     case OP_OPEN_COLL_ELEM:
3378       elem->type = COLL_SYM;
3379       break;
3380     case OP_OPEN_EQUIV_CLASS:
3381       elem->type = EQUIV_CLASS;
3382       break;
3383     case OP_OPEN_CHAR_CLASS:
3384       elem->type = CHAR_CLASS;
3385       break;
3386     default:
3387       break;
3388     }
3389   return REG_NOERROR;
3390 }
3391
3392   /* Helper function for parse_bracket_exp.
3393      Build the equivalence class which is represented by NAME.
3394      The result are written to MBCSET and SBCSET.
3395      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3396      is a pointer argument sinse we may update it.  */
3397
3398 static reg_errcode_t
3399 #ifdef RE_ENABLE_I18N
3400 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3401                    Idx *equiv_class_alloc, const unsigned char *name)
3402 #else /* not RE_ENABLE_I18N */
3403 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3404 #endif /* not RE_ENABLE_I18N */
3405 {
3406 #ifdef _LIBC
3407   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3408   if (nrules != 0)
3409     {
3410       const int32_t *table, *indirect;
3411       const unsigned char *weights, *extra, *cp;
3412       unsigned char char_buf[2];
3413       int32_t idx1, idx2;
3414       unsigned int ch;
3415       size_t len;
3416       /* This #include defines a local function!  */
3417 # include <locale/weight.h>
3418       /* Calculate the index for equivalence class.  */
3419       cp = name;
3420       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3421       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422                                                _NL_COLLATE_WEIGHTMB);
3423       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424                                                    _NL_COLLATE_EXTRAMB);
3425       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3426                                                 _NL_COLLATE_INDIRECTMB);
3427       idx1 = findidx (&cp);
3428       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3429         /* This isn't a valid character.  */
3430         return REG_ECOLLATE;
3431
3432       /* Build single byte matcing table for this equivalence class.  */
3433       char_buf[1] = (unsigned char) '\0';
3434       len = weights[idx1];
3435       for (ch = 0; ch < SBC_MAX; ++ch)
3436         {
3437           char_buf[0] = ch;
3438           cp = char_buf;
3439           idx2 = findidx (&cp);
3440 /*
3441           idx2 = table[ch];
3442 */
3443           if (idx2 == 0)
3444             /* This isn't a valid character.  */
3445             continue;
3446           if (len == weights[idx2])
3447             {
3448               int cnt = 0;
3449               while (cnt <= len &&
3450                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3451                 ++cnt;
3452
3453               if (cnt > len)
3454                 bitset_set (sbcset, ch);
3455             }
3456         }
3457       /* Check whether the array has enough space.  */
3458       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3459         {
3460           /* Not enough, realloc it.  */
3461           /* +1 in case of mbcset->nequiv_classes is 0.  */
3462           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3463           /* Use realloc since the array is NULL if *alloc == 0.  */
3464           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3465                                                    int32_t,
3466                                                    new_equiv_class_alloc);
3467           if (BE (new_equiv_classes == NULL, 0))
3468             return REG_ESPACE;
3469           mbcset->equiv_classes = new_equiv_classes;
3470           *equiv_class_alloc = new_equiv_class_alloc;
3471         }
3472       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3473     }
3474   else
3475 #endif /* _LIBC */
3476     {
3477       if (BE (strlen ((const char *) name) != 1, 0))
3478         return REG_ECOLLATE;
3479       bitset_set (sbcset, *name);
3480     }
3481   return REG_NOERROR;
3482 }
3483
3484   /* Helper function for parse_bracket_exp.
3485      Build the character class which is represented by NAME.
3486      The result are written to MBCSET and SBCSET.
3487      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3488      is a pointer argument sinse we may update it.  */
3489
3490 static reg_errcode_t
3491 #ifdef RE_ENABLE_I18N
3492 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3493                  re_charset_t *mbcset, Idx *char_class_alloc,
3494                  const unsigned char *class_name, reg_syntax_t syntax)
3495 #else /* not RE_ENABLE_I18N */
3496 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3497                  const unsigned char *class_name, reg_syntax_t syntax)
3498 #endif /* not RE_ENABLE_I18N */
3499 {
3500   int i;
3501   const char *name = (const char *) class_name;
3502
3503   /* In case of REG_ICASE "upper" and "lower" match the both of
3504      upper and lower cases.  */
3505   if ((syntax & RE_ICASE)
3506       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3507     name = "alpha";
3508
3509 #ifdef RE_ENABLE_I18N
3510   /* Check the space of the arrays.  */
3511   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3512     {
3513       /* Not enough, realloc it.  */
3514       /* +1 in case of mbcset->nchar_classes is 0.  */
3515       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3516       /* Use realloc since array is NULL if *alloc == 0.  */
3517       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3518                                                new_char_class_alloc);
3519       if (BE (new_char_classes == NULL, 0))
3520         return REG_ESPACE;
3521       mbcset->char_classes = new_char_classes;
3522       *char_class_alloc = new_char_class_alloc;
3523     }
3524   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3525 #endif /* RE_ENABLE_I18N */
3526
3527 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3528   do {                                          \
3529     if (BE (trans != NULL, 0))                  \
3530       {                                         \
3531         for (i = 0; i < SBC_MAX; ++i)           \
3532           if (ctype_func (i))                   \
3533             bitset_set (sbcset, trans[i]);      \
3534       }                                         \
3535     else                                        \
3536       {                                         \
3537         for (i = 0; i < SBC_MAX; ++i)           \
3538           if (ctype_func (i))                   \
3539             bitset_set (sbcset, i);             \
3540       }                                         \
3541   } while (0)
3542
3543   if (strcmp (name, "alnum") == 0)
3544     BUILD_CHARCLASS_LOOP (isalnum);
3545   else if (strcmp (name, "cntrl") == 0)
3546     BUILD_CHARCLASS_LOOP (iscntrl);
3547   else if (strcmp (name, "lower") == 0)
3548     BUILD_CHARCLASS_LOOP (islower);
3549   else if (strcmp (name, "space") == 0)
3550     BUILD_CHARCLASS_LOOP (isspace);
3551   else if (strcmp (name, "alpha") == 0)
3552     BUILD_CHARCLASS_LOOP (isalpha);
3553   else if (strcmp (name, "digit") == 0)
3554     BUILD_CHARCLASS_LOOP (isdigit);
3555   else if (strcmp (name, "print") == 0)
3556     BUILD_CHARCLASS_LOOP (isprint);
3557   else if (strcmp (name, "upper") == 0)
3558     BUILD_CHARCLASS_LOOP (isupper);
3559   else if (strcmp (name, "blank") == 0)
3560     BUILD_CHARCLASS_LOOP (isblank);
3561   else if (strcmp (name, "graph") == 0)
3562     BUILD_CHARCLASS_LOOP (isgraph);
3563   else if (strcmp (name, "punct") == 0)
3564     BUILD_CHARCLASS_LOOP (ispunct);
3565   else if (strcmp (name, "xdigit") == 0)
3566     BUILD_CHARCLASS_LOOP (isxdigit);
3567   else
3568     return REG_ECTYPE;
3569
3570   return REG_NOERROR;
3571 }
3572
3573 static bin_tree_t *
3574 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3575                     const unsigned char *class_name,
3576                     const unsigned char *extra, bool non_match,
3577                     reg_errcode_t *err)
3578 {
3579   re_bitset_ptr_t sbcset;
3580 #ifdef RE_ENABLE_I18N
3581   re_charset_t *mbcset;
3582   Idx alloc = 0;
3583 #endif /* not RE_ENABLE_I18N */
3584   reg_errcode_t ret;
3585   re_token_t br_token;
3586   bin_tree_t *tree;
3587
3588   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3589 #ifdef RE_ENABLE_I18N
3590   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3591 #endif /* RE_ENABLE_I18N */
3592
3593 #ifdef RE_ENABLE_I18N
3594   if (BE (sbcset == NULL || mbcset == NULL, 0))
3595 #else /* not RE_ENABLE_I18N */
3596   if (BE (sbcset == NULL, 0))
3597 #endif /* not RE_ENABLE_I18N */
3598     {
3599       *err = REG_ESPACE;
3600       return NULL;
3601     }
3602
3603   if (non_match)
3604     {
3605 #ifdef RE_ENABLE_I18N
3606       /*
3607       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3608         bitset_set(cset->sbcset, '\0');
3609       */
3610       mbcset->non_match = 1;
3611 #endif /* not RE_ENABLE_I18N */
3612     }
3613
3614   /* We don't care the syntax in this case.  */
3615   ret = build_charclass (trans, sbcset,
3616 #ifdef RE_ENABLE_I18N
3617                          mbcset, &alloc,
3618 #endif /* RE_ENABLE_I18N */
3619                          class_name, 0);
3620
3621   if (BE (ret != REG_NOERROR, 0))
3622     {
3623       re_free (sbcset);
3624 #ifdef RE_ENABLE_I18N
3625       free_charset (mbcset);
3626 #endif /* RE_ENABLE_I18N */
3627       *err = ret;
3628       return NULL;
3629     }
3630   /* \w match '_' also.  */
3631   for (; *extra; extra++)
3632     bitset_set (sbcset, *extra);
3633
3634   /* If it is non-matching list.  */
3635   if (non_match)
3636     bitset_not (sbcset);
3637
3638 #ifdef RE_ENABLE_I18N
3639   /* Ensure only single byte characters are set.  */
3640   if (dfa->mb_cur_max > 1)
3641     bitset_mask (sbcset, dfa->sb_char);
3642 #endif
3643
3644   /* Build a tree for simple bracket.  */
3645   br_token.type = SIMPLE_BRACKET;
3646   br_token.opr.sbcset = sbcset;
3647   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3648   if (BE (tree == NULL, 0))
3649     goto build_word_op_espace;
3650
3651 #ifdef RE_ENABLE_I18N
3652   if (dfa->mb_cur_max > 1)
3653     {
3654       bin_tree_t *mbc_tree;
3655       /* Build a tree for complex bracket.  */
3656       br_token.type = COMPLEX_BRACKET;
3657       br_token.opr.mbcset = mbcset;
3658       dfa->has_mb_node = 1;
3659       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660       if (BE (mbc_tree == NULL, 0))
3661         goto build_word_op_espace;
3662       /* Then join them by ALT node.  */
3663       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3664       if (BE (mbc_tree != NULL, 1))
3665         return tree;
3666     }
3667   else
3668     {
3669       free_charset (mbcset);
3670       return tree;
3671     }
3672 #else /* not RE_ENABLE_I18N */
3673   return tree;
3674 #endif /* not RE_ENABLE_I18N */
3675
3676  build_word_op_espace:
3677   re_free (sbcset);
3678 #ifdef RE_ENABLE_I18N
3679   free_charset (mbcset);
3680 #endif /* RE_ENABLE_I18N */
3681   *err = REG_ESPACE;
3682   return NULL;
3683 }
3684
3685 /* This is intended for the expressions like "a{1,3}".
3686    Fetch a number from `input', and return the number.
3687    Return REG_MISSING if the number field is empty like "{,1}".
3688    Return REG_ERROR if an error occurred.  */
3689
3690 static Idx
3691 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3692 {
3693   Idx num = REG_MISSING;
3694   unsigned char c;
3695   while (1)
3696     {
3697       fetch_token (token, input, syntax);
3698       c = token->opr.c;
3699       if (BE (token->type == END_OF_RE, 0))
3700         return REG_ERROR;
3701       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3702         break;
3703       num = ((token->type != CHARACTER || c < '0' || '9' < c
3704               || num == REG_ERROR)
3705              ? REG_ERROR
3706              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3707       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3708     }
3709   return num;
3710 }
3711 \f
3712 #ifdef RE_ENABLE_I18N
3713 static void
3714 free_charset (re_charset_t *cset)
3715 {
3716   re_free (cset->mbchars);
3717 # ifdef _LIBC
3718   re_free (cset->coll_syms);
3719   re_free (cset->equiv_classes);
3720   re_free (cset->range_starts);
3721   re_free (cset->range_ends);
3722 # endif
3723   re_free (cset->char_classes);
3724   re_free (cset);
3725 }
3726 #endif /* RE_ENABLE_I18N */
3727 \f
3728 /* Functions for binary tree operation.  */
3729
3730 /* Create a tree node.  */
3731
3732 static bin_tree_t *
3733 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3734              re_token_type_t type)
3735 {
3736   re_token_t t;
3737   t.type = type;
3738   return create_token_tree (dfa, left, right, &t);
3739 }
3740
3741 static bin_tree_t *
3742 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743                    const re_token_t *token)
3744 {
3745   bin_tree_t *tree;
3746   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3747     {
3748       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3749
3750       if (storage == NULL)
3751         return NULL;
3752       storage->next = dfa->str_tree_storage;
3753       dfa->str_tree_storage = storage;
3754       dfa->str_tree_storage_idx = 0;
3755     }
3756   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3757
3758   tree->parent = NULL;
3759   tree->left = left;
3760   tree->right = right;
3761   tree->token = *token;
3762   tree->token.duplicated = 0;
3763   tree->token.opt_subexp = 0;
3764   tree->first = NULL;
3765   tree->next = NULL;
3766   tree->node_idx = REG_MISSING;
3767
3768   if (left != NULL)
3769     left->parent = tree;
3770   if (right != NULL)
3771     right->parent = tree;
3772   return tree;
3773 }
3774
3775 /* Mark the tree SRC as an optional subexpression.
3776    To be called from preorder or postorder.  */
3777
3778 static reg_errcode_t
3779 mark_opt_subexp (void *extra, bin_tree_t *node)
3780 {
3781   Idx idx = (Idx) (long) extra;
3782   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3783     node->token.opt_subexp = 1;
3784
3785   return REG_NOERROR;
3786 }
3787
3788 /* Free the allocated memory inside NODE. */
3789
3790 static void
3791 free_token (re_token_t *node)
3792 {
3793 #ifdef RE_ENABLE_I18N
3794   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3795     free_charset (node->opr.mbcset);
3796   else
3797 #endif /* RE_ENABLE_I18N */
3798     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3799       re_free (node->opr.sbcset);
3800 }
3801
3802 /* Worker function for tree walking.  Free the allocated memory inside NODE
3803    and its children. */
3804
3805 static reg_errcode_t
3806 free_tree (void *extra, bin_tree_t *node)
3807 {
3808   free_token (&node->token);
3809   return REG_NOERROR;
3810 }
3811
3812
3813 /* Duplicate the node SRC, and return new node.  This is a preorder
3814    visit similar to the one implemented by the generic visitor, but
3815    we need more infrastructure to maintain two parallel trees --- so,
3816    it's easier to duplicate.  */
3817
3818 static bin_tree_t *
3819 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3820 {
3821   const bin_tree_t *node;
3822   bin_tree_t *dup_root;
3823   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3824
3825   for (node = root; ; )
3826     {
3827       /* Create a new tree and link it back to the current parent.  */
3828       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3829       if (*p_new == NULL)
3830         return NULL;
3831       (*p_new)->parent = dup_node;
3832       (*p_new)->token.duplicated = 1;
3833       dup_node = *p_new;
3834
3835       /* Go to the left node, or up and to the right.  */
3836       if (node->left)
3837         {
3838           node = node->left;
3839           p_new = &dup_node->left;
3840         }
3841       else
3842         {
3843           const bin_tree_t *prev = NULL;
3844           while (node->right == prev || node->right == NULL)
3845             {
3846               prev = node;
3847               node = node->parent;
3848               dup_node = dup_node->parent;
3849               if (!node)
3850                 return dup_root;
3851             }
3852           node = node->right;
3853           p_new = &dup_node->right;
3854         }
3855     }
3856 }