lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 337                              &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           Idx i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa->mb_cur_max > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      'buffer' to the compiled pattern;
 443      'used' to the length of the compiled pattern;
 444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      'fastmap' to an allocated space for the fastmap;
 449      'fastmap_accurate' to zero;
 450      're_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *_Restrict_ preg;
 475     const char *_Restrict_ pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 #ifdef _LIBC
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *_Restrict_ preg;
 539     char *_Restrict_ errbuf;
 540     size_t errbuf_size;
 541 #else /* size_t might promote */
 542 size_t
 543 regerror (int errcode, const regex_t *_Restrict_ preg,
 544           char *_Restrict_ errbuf, size_t errbuf_size)
 545 #endif
 546 {
 547   const char *msg;
 548   size_t msg_size;
 549
 550   if (BE (errcode < 0
 551           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                                / sizeof (__re_error_msgid_idx[0])), 0))
 553     /* Only error codes returned by the rest of the code should be passed
 554        to this routine.  If we are given anything else, or if other regex
 555        code generates an invalid error code, then the program has a bug.
 556        Dump core so we can fix it.  */
 557     abort ();
 558
 559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561   msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563   if (BE (errbuf_size != 0, 1))
 564     {
 565       size_t cpy_size = msg_size;
 566       if (BE (msg_size > errbuf_size, 0))
 567         {
 568           cpy_size = errbuf_size - 1;
 569           errbuf[cpy_size] = '\0';
 570         }
 571       memcpy (errbuf, msg, cpy_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 590 #  error "bitset_word_t is narrower than 32 bits"
 591 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 592   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 593 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX,
 597 # endif
 598   (BITSET_WORD_MAX
 599    >> (SBC_MAX % BITSET_WORD_BITS == 0
 600        ? 0
 601        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 602 };
 603 #endif
 604
 605
 606 static void
 607 free_dfa_content (re_dfa_t *dfa)
 608 {
 609   Idx i, j;
 610
 611   if (dfa->nodes)
 612     for (i = 0; i < dfa->nodes_len; ++i)
 613       free_token (dfa->nodes + i);
 614   re_free (dfa->nexts);
 615   for (i = 0; i < dfa->nodes_len; ++i)
 616     {
 617       if (dfa->eclosures != NULL)
 618         re_node_set_free (dfa->eclosures + i);
 619       if (dfa->inveclosures != NULL)
 620         re_node_set_free (dfa->inveclosures + i);
 621       if (dfa->edests != NULL)
 622         re_node_set_free (dfa->edests + i);
 623     }
 624   re_free (dfa->edests);
 625   re_free (dfa->eclosures);
 626   re_free (dfa->inveclosures);
 627   re_free (dfa->nodes);
 628
 629   if (dfa->state_table)
 630     for (i = 0; i <= dfa->state_hash_mask; ++i)
 631       {
 632         struct re_state_table_entry *entry = dfa->state_table + i;
 633         for (j = 0; j < entry->num; ++j)
 634           {
 635             re_dfastate_t *state = entry->array[j];
 636             free_state (state);
 637           }
 638         re_free (entry->array);
 639       }
 640   re_free (dfa->state_table);
 641 #ifdef RE_ENABLE_I18N
 642   if (dfa->sb_char != utf8_sb_map)
 643     re_free (dfa->sb_char);
 644 #endif
 645   re_free (dfa->subexp_map);
 646 #ifdef DEBUG
 647   re_free (dfa->re_str);
 648 #endif
 649
 650   re_free (dfa);
 651 }
 652
 653
 654 /* Free dynamically allocated space used by PREG.  */
 655
 656 void
 657 regfree (preg)
 658     regex_t *preg;
 659 {
 660   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 661   if (BE (dfa != NULL, 1))
 662     free_dfa_content (dfa);
 663   preg->buffer = NULL;
 664   preg->allocated = 0;
 665
 666   re_free (preg->fastmap);
 667   preg->fastmap = NULL;
 668
 669   re_free (preg->translate);
 670   preg->translate = NULL;
 671 }
 672 #ifdef _LIBC
 673 weak_alias (__regfree, regfree)
 674 #endif
 675 \f
 676 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 677    them unless specifically requested.  */
 678
 679 #if defined _REGEX_RE_COMP || defined _LIBC
 680
 681 /* BSD has one and only one pattern buffer.  */
 682 static struct re_pattern_buffer re_comp_buf;
 683
 684 char *
 685 # ifdef _LIBC
 686 /* Make these definitions weak in libc, so POSIX programs can redefine
 687    these names if they don't use our functions, and still use
 688    regcomp/regexec above without link errors.  */
 689 weak_function
 690 # endif
 691 re_comp (s)
 692      const char *s;
 693 {
 694   reg_errcode_t ret;
 695   char *fastmap;
 696
 697   if (!s)
 698     {
 699       if (!re_comp_buf.buffer)
 700         return gettext ("No previous regular expression");
 701       return 0;
 702     }
 703
 704   if (re_comp_buf.buffer)
 705     {
 706       fastmap = re_comp_buf.fastmap;
 707       re_comp_buf.fastmap = NULL;
 708       __regfree (&re_comp_buf);
 709       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 710       re_comp_buf.fastmap = fastmap;
 711     }
 712
 713   if (re_comp_buf.fastmap == NULL)
 714     {
 715       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 716       if (re_comp_buf.fastmap == NULL)
 717         return (char *) gettext (__re_error_msgid
 718                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 719     }
 720
 721   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 722      don't need to initialize the pattern buffer fields which affect it.  */
 723
 724   /* Match anchors at newlines.  */
 725   re_comp_buf.newline_anchor = 1;
 726
 727   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 728
 729   if (!ret)
 730     return NULL;
 731
 732   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 733   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 734 }
 735
 736 #ifdef _LIBC
 737 libc_freeres_fn (free_mem)
 738 {
 739   __regfree (&re_comp_buf);
 740 }
 741 #endif
 742
 743 #endif /* _REGEX_RE_COMP */
 744 \f
 745 /* Internal entry point.
 746    Compile the regular expression PATTERN, whose length is LENGTH.
 747    SYNTAX indicate regular expression's syntax.  */
 748
 749 static reg_errcode_t
 750 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 751                      reg_syntax_t syntax)
 752 {
 753   reg_errcode_t err = REG_NOERROR;
 754   re_dfa_t *dfa;
 755   re_string_t regexp;
 756
 757   /* Initialize the pattern buffer.  */
 758   preg->fastmap_accurate = 0;
 759   preg->syntax = syntax;
 760   preg->not_bol = preg->not_eol = 0;
 761   preg->used = 0;
 762   preg->re_nsub = 0;
 763   preg->can_be_null = 0;
 764   preg->regs_allocated = REGS_UNALLOCATED;
 765
 766   /* Initialize the dfa.  */
 767   dfa = (re_dfa_t *) preg->buffer;
 768   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 769     {
 770       /* If zero allocated, but buffer is non-null, try to realloc
 771          enough space.  This loses if buffer's address is bogus, but
 772          that is the user's responsibility.  If ->buffer is NULL this
 773          is a simple allocation.  */
 774       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 775       if (dfa == NULL)
 776         return REG_ESPACE;
 777       preg->allocated = sizeof (re_dfa_t);
 778       preg->buffer = (unsigned char *) dfa;
 779     }
 780   preg->used = sizeof (re_dfa_t);
 781
 782   err = init_dfa (dfa, length);
 783   if (BE (err != REG_NOERROR, 0))
 784     {
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790 #ifdef DEBUG
 791   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 792   dfa->re_str = re_malloc (char, length + 1);
 793   strncpy (dfa->re_str, pattern, length + 1);
 794 #endif
 795
 796   __libc_lock_init (dfa->lock);
 797
 798   err = re_string_construct (&regexp, pattern, length, preg->translate,
 799                              (syntax & RE_ICASE) != 0, dfa);
 800   if (BE (err != REG_NOERROR, 0))
 801     {
 802     re_compile_internal_free_return:
 803       free_workarea_compile (preg);
 804       re_string_destruct (&regexp);
 805       free_dfa_content (dfa);
 806       preg->buffer = NULL;
 807       preg->allocated = 0;
 808       return err;
 809     }
 810
 811   /* Parse the regular expression, and build a structure tree.  */
 812   preg->re_nsub = 0;
 813   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 814   if (BE (dfa->str_tree == NULL, 0))
 815     goto re_compile_internal_free_return;
 816
 817   /* Analyze the tree and create the nfa.  */
 818   err = analyze (preg);
 819   if (BE (err != REG_NOERROR, 0))
 820     goto re_compile_internal_free_return;
 821
 822 #ifdef RE_ENABLE_I18N
 823   /* If possible, do searching in single byte encoding to speed things up.  */
 824   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 825     optimize_utf8 (dfa);
 826 #endif
 827
 828   /* Then create the initial state of the dfa.  */
 829   err = create_initial_state (dfa);
 830
 831   /* Release work areas.  */
 832   free_workarea_compile (preg);
 833   re_string_destruct (&regexp);
 834
 835   if (BE (err != REG_NOERROR, 0))
 836     {
 837       free_dfa_content (dfa);
 838       preg->buffer = NULL;
 839       preg->allocated = 0;
 840     }
 841
 842   return err;
 843 }
 844
 845 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 846    as the initial length of some arrays.  */
 847
 848 static reg_errcode_t
 849 init_dfa (re_dfa_t *dfa, size_t pat_len)
 850 {
 851   __re_size_t table_size;
 852 #ifndef _LIBC
 853   char *codeset_name;
 854 #endif
 855 #ifdef RE_ENABLE_I18N
 856   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 857 #else
 858   size_t max_i18n_object_size = 0;
 859 #endif
 860   size_t max_object_size =
 861     MAX (sizeof (struct re_state_table_entry),
 862          MAX (sizeof (re_token_t),
 863               MAX (sizeof (re_node_set),
 864                    MAX (sizeof (regmatch_t),
 865                         max_i18n_object_size))));
 866
 867   memset (dfa, '\0', sizeof (re_dfa_t));
 868
 869   /* Force allocation of str_tree_storage the first time.  */
 870   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 871
 872   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 873      calculation below, and for similar doubling calculations
 874      elsewhere.  And it's <= rather than <, because some of the
 875      doubling calculations add 1 afterwards.  */
 876   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 877     return REG_ESPACE;
 878
 879   dfa->nodes_alloc = pat_len + 1;
 880   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 881
 882   /*  table_size = 2 ^ ceil(log pat_len) */
 883   for (table_size = 1; ; table_size <<= 1)
 884     if (table_size > pat_len)
 885       break;
 886
 887   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 888   dfa->state_hash_mask = table_size - 1;
 889
 890   dfa->mb_cur_max = MB_CUR_MAX;
 891 #ifdef _LIBC
 892   if (dfa->mb_cur_max == 6
 893       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 894     dfa->is_utf8 = 1;
 895   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 896                        != 0);
 897 #else
 898   codeset_name = nl_langinfo (CODESET);
 899   if (strcasecmp (codeset_name, "UTF-8") == 0
 900       || strcasecmp (codeset_name, "UTF8") == 0)
 901     dfa->is_utf8 = 1;
 902
 903   /* We check exhaustively in the loop below if this charset is a
 904      superset of ASCII.  */
 905   dfa->map_notascii = 0;
 906 #endif
 907
 908 #ifdef RE_ENABLE_I18N
 909   if (dfa->mb_cur_max > 1)
 910     {
 911       if (dfa->is_utf8)
 912         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 913       else
 914         {
 915           int i, j, ch;
 916
 917           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 918           if (BE (dfa->sb_char == NULL, 0))
 919             return REG_ESPACE;
 920
 921           /* Set the bits corresponding to single byte chars.  */
 922           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 923             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 924               {
 925                 wint_t wch = __btowc (ch);
 926                 if (wch != WEOF)
 927                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 928 # ifndef _LIBC
 929                 if (isascii (ch) && wch != ch)
 930                   dfa->map_notascii = 1;
 931 # endif
 932               }
 933         }
 934     }
 935 #endif
 936
 937   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 938     return REG_ESPACE;
 939   return REG_NOERROR;
 940 }
 941
 942 /* Initialize WORD_CHAR table, which indicate which character is
 943    "word".  In this case "word" means that it is the word construction
 944    character used by some operators like "\<", "\>", etc.  */
 945
 946 static void
 947 internal_function
 948 init_word_char (re_dfa_t *dfa)
 949 {
 950   int i, j, ch;
 951   dfa->word_ops_used = 1;
 952   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 953     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 954       if (isalnum (ch) || ch == '_')
 955         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 956 }
 957
 958 /* Free the work area which are only used while compiling.  */
 959
 960 static void
 961 free_workarea_compile (regex_t *preg)
 962 {
 963   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 964   bin_tree_storage_t *storage, *next;
 965   for (storage = dfa->str_tree_storage; storage; storage = next)
 966     {
 967       next = storage->next;
 968       re_free (storage);
 969     }
 970   dfa->str_tree_storage = NULL;
 971   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 972   dfa->str_tree = NULL;
 973   re_free (dfa->org_indices);
 974   dfa->org_indices = NULL;
 975 }
 976
 977 /* Create initial states for all contexts.  */
 978
 979 static reg_errcode_t
 980 create_initial_state (re_dfa_t *dfa)
 981 {
 982   Idx first, i;
 983   reg_errcode_t err;
 984   re_node_set init_nodes;
 985
 986   /* Initial states have the epsilon closure of the node which is
 987      the first node of the regular expression.  */
 988   first = dfa->str_tree->first->node_idx;
 989   dfa->init_node = first;
 990   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 991   if (BE (err != REG_NOERROR, 0))
 992     return err;
 993
 994   /* The back-references which are in initial states can epsilon transit,
 995      since in this case all of the subexpressions can be null.
 996      Then we add epsilon closures of the nodes which are the next nodes of
 997      the back-references.  */
 998   if (dfa->nbackref > 0)
 999     for (i = 0; i < init_nodes.nelem; ++i)
1000       {
1001         Idx node_idx = init_nodes.elems[i];
1002         re_token_type_t type = dfa->nodes[node_idx].type;
1003
1004         Idx clexp_idx;
1005         if (type != OP_BACK_REF)
1006           continue;
1007         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1008           {
1009             re_token_t *clexp_node;
1010             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1011             if (clexp_node->type == OP_CLOSE_SUBEXP
1012                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1013               break;
1014           }
1015         if (clexp_idx == init_nodes.nelem)
1016           continue;
1017
1018         if (type == OP_BACK_REF)
1019           {
1020             Idx dest_idx = dfa->edests[node_idx].elems[0];
1021             if (!re_node_set_contains (&init_nodes, dest_idx))
1022               {
1023                 reg_errcode_t merge_err
1024                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025                 if (merge_err != REG_NOERROR)
1026                   return merge_err;
1027                 i = 0;
1028               }
1029           }
1030       }
1031
1032   /* It must be the first time to invoke acquire_state.  */
1033   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1034   /* We don't check ERR here, since the initial state must not be NULL.  */
1035   if (BE (dfa->init_state == NULL, 0))
1036     return err;
1037   if (dfa->init_state->has_constraint)
1038     {
1039       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1040                                                        CONTEXT_WORD);
1041       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1042                                                      CONTEXT_NEWLINE);
1043       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1044                                                          &init_nodes,
1045                                                          CONTEXT_NEWLINE
1046                                                          | CONTEXT_BEGBUF);
1047       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1048               || dfa->init_state_begbuf == NULL, 0))
1049         return err;
1050     }
1051   else
1052     dfa->init_state_word = dfa->init_state_nl
1053       = dfa->init_state_begbuf = dfa->init_state;
1054
1055   re_node_set_free (&init_nodes);
1056   return REG_NOERROR;
1057 }
1058 \f
1059 #ifdef RE_ENABLE_I18N
1060 /* If it is possible to do searching in single byte encoding instead of UTF-8
1061    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062    DFA nodes where needed.  */
1063
1064 static void
1065 optimize_utf8 (re_dfa_t *dfa)
1066 {
1067   Idx node;
1068   int i;
1069   bool mb_chars = false;
1070   bool has_period = false;
1071
1072   for (node = 0; node < dfa->nodes_len; ++node)
1073     switch (dfa->nodes[node].type)
1074       {
1075       case CHARACTER:
1076         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1077           mb_chars = true;
1078         break;
1079       case ANCHOR:
1080         switch (dfa->nodes[node].opr.ctx_type)
1081           {
1082           case LINE_FIRST:
1083           case LINE_LAST:
1084           case BUF_FIRST:
1085           case BUF_LAST:
1086             break;
1087           default:
1088             /* Word anchors etc. cannot be handled.  It's okay to test
1089                opr.ctx_type since constraints (for all DFA nodes) are
1090                created by ORing one or more opr.ctx_type values.  */
1091             return;
1092           }
1093         break;
1094       case OP_PERIOD:
1095         has_period = true;
1096         break;
1097       case OP_BACK_REF:
1098       case OP_ALT:
1099       case END_OF_RE:
1100       case OP_DUP_ASTERISK:
1101       case OP_OPEN_SUBEXP:
1102       case OP_CLOSE_SUBEXP:
1103         break;
1104       case COMPLEX_BRACKET:
1105         return;
1106       case SIMPLE_BRACKET:
1107         /* Just double check.  */
1108         {
1109           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1110                         ? 0
1111                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1112           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1113             {
1114               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1115                 return;
1116               rshift = 0;
1117             }
1118         }
1119         break;
1120       default:
1121         abort ();
1122       }
1123
1124   if (mb_chars || has_period)
1125     for (node = 0; node < dfa->nodes_len; ++node)
1126       {
1127         if (dfa->nodes[node].type == CHARACTER
1128             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1129           dfa->nodes[node].mb_partial = 0;
1130         else if (dfa->nodes[node].type == OP_PERIOD)
1131           dfa->nodes[node].type = OP_UTF8_PERIOD;
1132       }
1133
1134   /* The search can be in single byte locale.  */
1135   dfa->mb_cur_max = 1;
1136   dfa->is_utf8 = 0;
1137   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1138 }
1139 #endif
1140 \f
1141 /* Analyze the structure tree, and calculate "first", "next", "edest",
1142    "eclosure", and "inveclosure".  */
1143
1144 static reg_errcode_t
1145 analyze (regex_t *preg)
1146 {
1147   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1148   reg_errcode_t ret;
1149
1150   /* Allocate arrays.  */
1151   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1152   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1153   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1154   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1155   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1156           || dfa->eclosures == NULL, 0))
1157     return REG_ESPACE;
1158
1159   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1160   if (dfa->subexp_map != NULL)
1161     {
1162       Idx i;
1163       for (i = 0; i < preg->re_nsub; i++)
1164         dfa->subexp_map[i] = i;
1165       preorder (dfa->str_tree, optimize_subexps, dfa);
1166       for (i = 0; i < preg->re_nsub; i++)
1167         if (dfa->subexp_map[i] != i)
1168           break;
1169       if (i == preg->re_nsub)
1170         {
1171           free (dfa->subexp_map);
1172           dfa->subexp_map = NULL;
1173         }
1174     }
1175
1176   ret = postorder (dfa->str_tree, lower_subexps, preg);
1177   if (BE (ret != REG_NOERROR, 0))
1178     return ret;
1179   ret = postorder (dfa->str_tree, calc_first, dfa);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   preorder (dfa->str_tree, calc_next, dfa);
1183   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1184   if (BE (ret != REG_NOERROR, 0))
1185     return ret;
1186   ret = calc_eclosure (dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189
1190   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1192   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1193       || dfa->nbackref)
1194     {
1195       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1196       if (BE (dfa->inveclosures == NULL, 0))
1197         return REG_ESPACE;
1198       ret = calc_inveclosure (dfa);
1199     }
1200
1201   return ret;
1202 }
1203
1204 /* Our parse trees are very unbalanced, so we cannot use a stack to
1205    implement parse tree visits.  Instead, we use parent pointers and
1206    some hairy code in these two functions.  */
1207 static reg_errcode_t
1208 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1209            void *extra)
1210 {
1211   bin_tree_t *node, *prev;
1212
1213   for (node = root; ; )
1214     {
1215       /* Descend down the tree, preferably to the left (or to the right
1216          if that's the only child).  */
1217       while (node->left || node->right)
1218         if (node->left)
1219           node = node->left;
1220         else
1221           node = node->right;
1222
1223       do
1224         {
1225           reg_errcode_t err = fn (extra, node);
1226           if (BE (err != REG_NOERROR, 0))
1227             return err;
1228           if (node->parent == NULL)
1229             return REG_NOERROR;
1230           prev = node;
1231           node = node->parent;
1232         }
1233       /* Go up while we have a node that is reached from the right.  */
1234       while (node->right == prev || node->right == NULL);
1235       node = node->right;
1236     }
1237 }
1238
1239 static reg_errcode_t
1240 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1241           void *extra)
1242 {
1243   bin_tree_t *node;
1244
1245   for (node = root; ; )
1246     {
1247       reg_errcode_t err = fn (extra, node);
1248       if (BE (err != REG_NOERROR, 0))
1249         return err;
1250
1251       /* Go to the left node, or up and to the right.  */
1252       if (node->left)
1253         node = node->left;
1254       else
1255         {
1256           bin_tree_t *prev = NULL;
1257           while (node->right == prev || node->right == NULL)
1258             {
1259               prev = node;
1260               node = node->parent;
1261               if (!node)
1262                 return REG_NOERROR;
1263             }
1264           node = node->right;
1265         }
1266     }
1267 }
1268
1269 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1271    backreferences as well.  Requires a preorder visit.  */
1272 static reg_errcode_t
1273 optimize_subexps (void *extra, bin_tree_t *node)
1274 {
1275   re_dfa_t *dfa = (re_dfa_t *) extra;
1276
1277   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1278     {
1279       int idx = node->token.opr.idx;
1280       node->token.opr.idx = dfa->subexp_map[idx];
1281       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1282     }
1283
1284   else if (node->token.type == SUBEXP
1285            && node->left && node->left->token.type == SUBEXP)
1286     {
1287       Idx other_idx = node->left->token.opr.idx;
1288
1289       node->left = node->left->left;
1290       if (node->left)
1291         node->left->parent = node;
1292
1293       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1294       if (other_idx < BITSET_WORD_BITS)
1295         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1296     }
1297
1298   return REG_NOERROR;
1299 }
1300
1301 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1303 static reg_errcode_t
1304 lower_subexps (void *extra, bin_tree_t *node)
1305 {
1306   regex_t *preg = (regex_t *) extra;
1307   reg_errcode_t err = REG_NOERROR;
1308
1309   if (node->left && node->left->token.type == SUBEXP)
1310     {
1311       node->left = lower_subexp (&err, preg, node->left);
1312       if (node->left)
1313         node->left->parent = node;
1314     }
1315   if (node->right && node->right->token.type == SUBEXP)
1316     {
1317       node->right = lower_subexp (&err, preg, node->right);
1318       if (node->right)
1319         node->right->parent = node;
1320     }
1321
1322   return err;
1323 }
1324
1325 static bin_tree_t *
1326 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1327 {
1328   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1329   bin_tree_t *body = node->left;
1330   bin_tree_t *op, *cls, *tree1, *tree;
1331
1332   if (preg->no_sub
1333       /* We do not optimize empty subexpressions, because otherwise we may
1334          have bad CONCAT nodes with NULL children.  This is obviously not
1335          very common, so we do not lose much.  An example that triggers
1336          this case is the sed "script" /\(\)/x.  */
1337       && node->left != NULL
1338       && (node->token.opr.idx >= BITSET_WORD_BITS
1339           || !(dfa->used_bkref_map
1340                & ((bitset_word_t) 1 << node->token.opr.idx))))
1341     return node->left;
1342
1343   /* Convert the SUBEXP node to the concatenation of an
1344      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1345   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348   tree = create_tree (dfa, op, tree1, CONCAT);
1349   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1350     {
1351       *err = REG_ESPACE;
1352       return NULL;
1353     }
1354
1355   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357   return tree;
1358 }
1359
1360 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361    nodes.  Requires a postorder visit.  */
1362 static reg_errcode_t
1363 calc_first (void *extra, bin_tree_t *node)
1364 {
1365   re_dfa_t *dfa = (re_dfa_t *) extra;
1366   if (node->token.type == CONCAT)
1367     {
1368       node->first = node->left->first;
1369       node->node_idx = node->left->node_idx;
1370     }
1371   else
1372     {
1373       node->first = node;
1374       node->node_idx = re_dfa_add_node (dfa, node->token);
1375       if (BE (node->node_idx == REG_MISSING, 0))
1376         return REG_ESPACE;
1377       if (node->token.type == ANCHOR)
1378         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1379     }
1380   return REG_NOERROR;
1381 }
1382
1383 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1384 static reg_errcode_t
1385 calc_next (void *extra, bin_tree_t *node)
1386 {
1387   switch (node->token.type)
1388     {
1389     case OP_DUP_ASTERISK:
1390       node->left->next = node;
1391       break;
1392     case CONCAT:
1393       node->left->next = node->right->first;
1394       node->right->next = node->next;
1395       break;
1396     default:
1397       if (node->left)
1398         node->left->next = node->next;
1399       if (node->right)
1400         node->right->next = node->next;
1401       break;
1402     }
1403   return REG_NOERROR;
1404 }
1405
1406 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1407 static reg_errcode_t
1408 link_nfa_nodes (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   Idx idx = node->node_idx;
1412   reg_errcode_t err = REG_NOERROR;
1413
1414   switch (node->token.type)
1415     {
1416     case CONCAT:
1417       break;
1418
1419     case END_OF_RE:
1420       assert (node->next == NULL);
1421       break;
1422
1423     case OP_DUP_ASTERISK:
1424     case OP_ALT:
1425       {
1426         Idx left, right;
1427         dfa->has_plural_match = 1;
1428         if (node->left != NULL)
1429           left = node->left->first->node_idx;
1430         else
1431           left = node->next->node_idx;
1432         if (node->right != NULL)
1433           right = node->right->first->node_idx;
1434         else
1435           right = node->next->node_idx;
1436         assert (REG_VALID_INDEX (left));
1437         assert (REG_VALID_INDEX (right));
1438         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1439       }
1440       break;
1441
1442     case ANCHOR:
1443     case OP_OPEN_SUBEXP:
1444     case OP_CLOSE_SUBEXP:
1445       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446       break;
1447
1448     case OP_BACK_REF:
1449       dfa->nexts[idx] = node->next->node_idx;
1450       if (node->token.type == OP_BACK_REF)
1451         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452       break;
1453
1454     default:
1455       assert (!IS_EPSILON_NODE (node->token.type));
1456       dfa->nexts[idx] = node->next->node_idx;
1457       break;
1458     }
1459
1460   return err;
1461 }
1462
1463 /* Duplicate the epsilon closure of the node ROOT_NODE.
1464    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465    to their own constraint.  */
1466
1467 static reg_errcode_t
1468 internal_function
1469 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1470                         Idx root_node, unsigned int init_constraint)
1471 {
1472   Idx org_node, clone_node;
1473   bool ok;
1474   unsigned int constraint = init_constraint;
1475   for (org_node = top_org_node, clone_node = top_clone_node;;)
1476     {
1477       Idx org_dest, clone_dest;
1478       if (dfa->nodes[org_node].type == OP_BACK_REF)
1479         {
1480           /* If the back reference epsilon-transit, its destination must
1481              also have the constraint.  Then duplicate the epsilon closure
1482              of the destination of the back reference, and store it in
1483              edests of the back reference.  */
1484           org_dest = dfa->nexts[org_node];
1485           re_node_set_empty (dfa->edests + clone_node);
1486           clone_dest = duplicate_node (dfa, org_dest, constraint);
1487           if (BE (clone_dest == REG_MISSING, 0))
1488             return REG_ESPACE;
1489           dfa->nexts[clone_node] = dfa->nexts[org_node];
1490           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491           if (BE (! ok, 0))
1492             return REG_ESPACE;
1493         }
1494       else if (dfa->edests[org_node].nelem == 0)
1495         {
1496           /* In case of the node can't epsilon-transit, don't duplicate the
1497              destination and store the original destination as the
1498              destination of the node.  */
1499           dfa->nexts[clone_node] = dfa->nexts[org_node];
1500           break;
1501         }
1502       else if (dfa->edests[org_node].nelem == 1)
1503         {
1504           /* In case of the node can epsilon-transit, and it has only one
1505              destination.  */
1506           org_dest = dfa->edests[org_node].elems[0];
1507           re_node_set_empty (dfa->edests + clone_node);
1508           /* If the node is root_node itself, it means the epsilon closure
1509              has a loop.  Then tie it to the destination of the root_node.  */
1510           if (org_node == root_node && clone_node != org_node)
1511             {
1512               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513               if (BE (! ok, 0))
1514                 return REG_ESPACE;
1515               break;
1516             }
1517           /* In case the node has another constraint, append it.  */
1518           constraint |= dfa->nodes[org_node].constraint;
1519           clone_dest = duplicate_node (dfa, org_dest, constraint);
1520           if (BE (clone_dest == REG_MISSING, 0))
1521             return REG_ESPACE;
1522           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523           if (BE (! ok, 0))
1524             return REG_ESPACE;
1525         }
1526       else /* dfa->edests[org_node].nelem == 2 */
1527         {
1528           /* In case of the node can epsilon-transit, and it has two
1529              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530           org_dest = dfa->edests[org_node].elems[0];
1531           re_node_set_empty (dfa->edests + clone_node);
1532           /* Search for a duplicated node which satisfies the constraint.  */
1533           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534           if (clone_dest == REG_MISSING)
1535             {
1536               /* There is no such duplicated node, create a new one.  */
1537               reg_errcode_t err;
1538               clone_dest = duplicate_node (dfa, org_dest, constraint);
1539               if (BE (clone_dest == REG_MISSING, 0))
1540                 return REG_ESPACE;
1541               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542               if (BE (! ok, 0))
1543                 return REG_ESPACE;
1544               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545                                             root_node, constraint);
1546               if (BE (err != REG_NOERROR, 0))
1547                 return err;
1548             }
1549           else
1550             {
1551               /* There is a duplicated node which satisfies the constraint,
1552                  use it to avoid infinite loop.  */
1553               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554               if (BE (! ok, 0))
1555                 return REG_ESPACE;
1556             }
1557
1558           org_dest = dfa->edests[org_node].elems[1];
1559           clone_dest = duplicate_node (dfa, org_dest, constraint);
1560           if (BE (clone_dest == REG_MISSING, 0))
1561             return REG_ESPACE;
1562           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563           if (BE (! ok, 0))
1564             return REG_ESPACE;
1565         }
1566       org_node = org_dest;
1567       clone_node = clone_dest;
1568     }
1569   return REG_NOERROR;
1570 }
1571
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573    satisfies the constraint CONSTRAINT.  */
1574
1575 static Idx
1576 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1577                         unsigned int constraint)
1578 {
1579   Idx idx;
1580   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581     {
1582       if (org_node == dfa->org_indices[idx]
1583           && constraint == dfa->nodes[idx].constraint)
1584         return idx; /* Found.  */
1585     }
1586   return REG_MISSING; /* Not found.  */
1587 }
1588
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590    Return the index of the new node, or REG_MISSING if insufficient storage is
1591    available.  */
1592
1593 static Idx
1594 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1595 {
1596   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597   if (BE (dup_idx != REG_MISSING, 1))
1598     {
1599       dfa->nodes[dup_idx].constraint = constraint;
1600       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601       dfa->nodes[dup_idx].duplicated = 1;
1602
1603       /* Store the index of the original node.  */
1604       dfa->org_indices[dup_idx] = org_idx;
1605     }
1606   return dup_idx;
1607 }
1608
1609 static reg_errcode_t
1610 calc_inveclosure (re_dfa_t *dfa)
1611 {
1612   Idx src, idx;
1613   bool ok;
1614   for (idx = 0; idx < dfa->nodes_len; ++idx)
1615     re_node_set_init_empty (dfa->inveclosures + idx);
1616
1617   for (src = 0; src < dfa->nodes_len; ++src)
1618     {
1619       Idx *elems = dfa->eclosures[src].elems;
1620       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1621         {
1622           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1623           if (BE (! ok, 0))
1624             return REG_ESPACE;
1625         }
1626     }
1627
1628   return REG_NOERROR;
1629 }
1630
1631 /* Calculate "eclosure" for all the node in DFA.  */
1632
1633 static reg_errcode_t
1634 calc_eclosure (re_dfa_t *dfa)
1635 {
1636   Idx node_idx;
1637   bool incomplete;
1638 #ifdef DEBUG
1639   assert (dfa->nodes_len > 0);
1640 #endif
1641   incomplete = false;
1642   /* For each nodes, calculate epsilon closure.  */
1643   for (node_idx = 0; ; ++node_idx)
1644     {
1645       reg_errcode_t err;
1646       re_node_set eclosure_elem;
1647       if (node_idx == dfa->nodes_len)
1648         {
1649           if (!incomplete)
1650             break;
1651           incomplete = false;
1652           node_idx = 0;
1653         }
1654
1655 #ifdef DEBUG
1656       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1657 #endif
1658
1659       /* If we have already calculated, skip it.  */
1660       if (dfa->eclosures[node_idx].nelem != 0)
1661         continue;
1662       /* Calculate epsilon closure of 'node_idx'.  */
1663       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1664       if (BE (err != REG_NOERROR, 0))
1665         return err;
1666
1667       if (dfa->eclosures[node_idx].nelem == 0)
1668         {
1669           incomplete = true;
1670           re_node_set_free (&eclosure_elem);
1671         }
1672     }
1673   return REG_NOERROR;
1674 }
1675
1676 /* Calculate epsilon closure of NODE.  */
1677
1678 static reg_errcode_t
1679 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1680 {
1681   reg_errcode_t err;
1682   Idx i;
1683   re_node_set eclosure;
1684   bool ok;
1685   bool incomplete = false;
1686   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1687   if (BE (err != REG_NOERROR, 0))
1688     return err;
1689
1690   /* This indicates that we are calculating this node now.
1691      We reference this value to avoid infinite loop.  */
1692   dfa->eclosures[node].nelem = REG_MISSING;
1693
1694   /* If the current node has constraints, duplicate all nodes
1695      since they must inherit the constraints.  */
1696   if (dfa->nodes[node].constraint
1697       && dfa->edests[node].nelem
1698       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1699     {
1700       err = duplicate_node_closure (dfa, node, node, node,
1701                                     dfa->nodes[node].constraint);
1702       if (BE (err != REG_NOERROR, 0))
1703         return err;
1704     }
1705
1706   /* Expand each epsilon destination nodes.  */
1707   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1708     for (i = 0; i < dfa->edests[node].nelem; ++i)
1709       {
1710         re_node_set eclosure_elem;
1711         Idx edest = dfa->edests[node].elems[i];
1712         /* If calculating the epsilon closure of 'edest' is in progress,
1713            return intermediate result.  */
1714         if (dfa->eclosures[edest].nelem == REG_MISSING)
1715           {
1716             incomplete = true;
1717             continue;
1718           }
1719         /* If we haven't calculated the epsilon closure of 'edest' yet,
1720            calculate now. Otherwise use calculated epsilon closure.  */
1721         if (dfa->eclosures[edest].nelem == 0)
1722           {
1723             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1724             if (BE (err != REG_NOERROR, 0))
1725               return err;
1726           }
1727         else
1728           eclosure_elem = dfa->eclosures[edest];
1729         /* Merge the epsilon closure of 'edest'.  */
1730         err = re_node_set_merge (&eclosure, &eclosure_elem);
1731         if (BE (err != REG_NOERROR, 0))
1732           return err;
1733         /* If the epsilon closure of 'edest' is incomplete,
1734            the epsilon closure of this node is also incomplete.  */
1735         if (dfa->eclosures[edest].nelem == 0)
1736           {
1737             incomplete = true;
1738             re_node_set_free (&eclosure_elem);
1739           }
1740       }
1741
1742   /* An epsilon closure includes itself.  */
1743   ok = re_node_set_insert (&eclosure, node);
1744   if (BE (! ok, 0))
1745     return REG_ESPACE;
1746   if (incomplete && !root)
1747     dfa->eclosures[node].nelem = 0;
1748   else
1749     dfa->eclosures[node] = eclosure;
1750   *new_set = eclosure;
1751   return REG_NOERROR;
1752 }
1753 \f
1754 /* Functions for token which are used in the parser.  */
1755
1756 /* Fetch a token from INPUT.
1757    We must not use this function inside bracket expressions.  */
1758
1759 static void
1760 internal_function
1761 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1762 {
1763   re_string_skip_bytes (input, peek_token (result, input, syntax));
1764 }
1765
1766 /* Peek a token from INPUT, and return the length of the token.
1767    We must not use this function inside bracket expressions.  */
1768
1769 static int
1770 internal_function
1771 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1772 {
1773   unsigned char c;
1774
1775   if (re_string_eoi (input))
1776     {
1777       token->type = END_OF_RE;
1778       return 0;
1779     }
1780
1781   c = re_string_peek_byte (input, 0);
1782   token->opr.c = c;
1783
1784   token->word_char = 0;
1785 #ifdef RE_ENABLE_I18N
1786   token->mb_partial = 0;
1787   if (input->mb_cur_max > 1 &&
1788       !re_string_first_byte (input, re_string_cur_idx (input)))
1789     {
1790       token->type = CHARACTER;
1791       token->mb_partial = 1;
1792       return 1;
1793     }
1794 #endif
1795   if (c == '\\')
1796     {
1797       unsigned char c2;
1798       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1799         {
1800           token->type = BACK_SLASH;
1801           return 1;
1802         }
1803
1804       c2 = re_string_peek_byte_case (input, 1);
1805       token->opr.c = c2;
1806       token->type = CHARACTER;
1807 #ifdef RE_ENABLE_I18N
1808       if (input->mb_cur_max > 1)
1809         {
1810           wint_t wc = re_string_wchar_at (input,
1811                                           re_string_cur_idx (input) + 1);
1812           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1813         }
1814       else
1815 #endif
1816         token->word_char = IS_WORD_CHAR (c2) != 0;
1817
1818       switch (c2)
1819         {
1820         case '|':
1821           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1822             token->type = OP_ALT;
1823           break;
1824         case '1': case '2': case '3': case '4': case '5':
1825         case '6': case '7': case '8': case '9':
1826           if (!(syntax & RE_NO_BK_REFS))
1827             {
1828               token->type = OP_BACK_REF;
1829               token->opr.idx = c2 - '1';
1830             }
1831           break;
1832         case '<':
1833           if (!(syntax & RE_NO_GNU_OPS))
1834             {
1835               token->type = ANCHOR;
1836               token->opr.ctx_type = WORD_FIRST;
1837             }
1838           break;
1839         case '>':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             {
1842               token->type = ANCHOR;
1843               token->opr.ctx_type = WORD_LAST;
1844             }
1845           break;
1846         case 'b':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_DELIM;
1851             }
1852           break;
1853         case 'B':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = NOT_WORD_DELIM;
1858             }
1859           break;
1860         case 'w':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_WORD;
1863           break;
1864         case 'W':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_NOTWORD;
1867           break;
1868         case 's':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             token->type = OP_SPACE;
1871           break;
1872         case 'S':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             token->type = OP_NOTSPACE;
1875           break;
1876         case '`':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             {
1879               token->type = ANCHOR;
1880               token->opr.ctx_type = BUF_FIRST;
1881             }
1882           break;
1883         case '\'':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = BUF_LAST;
1888             }
1889           break;
1890         case '(':
1891           if (!(syntax & RE_NO_BK_PARENS))
1892             token->type = OP_OPEN_SUBEXP;
1893           break;
1894         case ')':
1895           if (!(syntax & RE_NO_BK_PARENS))
1896             token->type = OP_CLOSE_SUBEXP;
1897           break;
1898         case '+':
1899           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900             token->type = OP_DUP_PLUS;
1901           break;
1902         case '?':
1903           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904             token->type = OP_DUP_QUESTION;
1905           break;
1906         case '{':
1907           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908             token->type = OP_OPEN_DUP_NUM;
1909           break;
1910         case '}':
1911           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912             token->type = OP_CLOSE_DUP_NUM;
1913           break;
1914         default:
1915           break;
1916         }
1917       return 2;
1918     }
1919
1920   token->type = CHARACTER;
1921 #ifdef RE_ENABLE_I18N
1922   if (input->mb_cur_max > 1)
1923     {
1924       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1925       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1926     }
1927   else
1928 #endif
1929     token->word_char = IS_WORD_CHAR (token->opr.c);
1930
1931   switch (c)
1932     {
1933     case '\n':
1934       if (syntax & RE_NEWLINE_ALT)
1935         token->type = OP_ALT;
1936       break;
1937     case '|':
1938       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1939         token->type = OP_ALT;
1940       break;
1941     case '*':
1942       token->type = OP_DUP_ASTERISK;
1943       break;
1944     case '+':
1945       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946         token->type = OP_DUP_PLUS;
1947       break;
1948     case '?':
1949       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950         token->type = OP_DUP_QUESTION;
1951       break;
1952     case '{':
1953       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954         token->type = OP_OPEN_DUP_NUM;
1955       break;
1956     case '}':
1957       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958         token->type = OP_CLOSE_DUP_NUM;
1959       break;
1960     case '(':
1961       if (syntax & RE_NO_BK_PARENS)
1962         token->type = OP_OPEN_SUBEXP;
1963       break;
1964     case ')':
1965       if (syntax & RE_NO_BK_PARENS)
1966         token->type = OP_CLOSE_SUBEXP;
1967       break;
1968     case '[':
1969       token->type = OP_OPEN_BRACKET;
1970       break;
1971     case '.':
1972       token->type = OP_PERIOD;
1973       break;
1974     case '^':
1975       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1976           re_string_cur_idx (input) != 0)
1977         {
1978           char prev = re_string_peek_byte (input, -1);
1979           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1980             break;
1981         }
1982       token->type = ANCHOR;
1983       token->opr.ctx_type = LINE_FIRST;
1984       break;
1985     case '$':
1986       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1987           re_string_cur_idx (input) + 1 != re_string_length (input))
1988         {
1989           re_token_t next;
1990           re_string_skip_bytes (input, 1);
1991           peek_token (&next, input, syntax);
1992           re_string_skip_bytes (input, -1);
1993           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1994             break;
1995         }
1996       token->type = ANCHOR;
1997       token->opr.ctx_type = LINE_LAST;
1998       break;
1999     default:
2000       break;
2001     }
2002   return 1;
2003 }
2004
2005 /* Peek a token from INPUT, and return the length of the token.
2006    We must not use this function out of bracket expressions.  */
2007
2008 static int
2009 internal_function
2010 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2011 {
2012   unsigned char c;
2013   if (re_string_eoi (input))
2014     {
2015       token->type = END_OF_RE;
2016       return 0;
2017     }
2018   c = re_string_peek_byte (input, 0);
2019   token->opr.c = c;
2020
2021 #ifdef RE_ENABLE_I18N
2022   if (input->mb_cur_max > 1 &&
2023       !re_string_first_byte (input, re_string_cur_idx (input)))
2024     {
2025       token->type = CHARACTER;
2026       return 1;
2027     }
2028 #endif /* RE_ENABLE_I18N */
2029
2030   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2031       && re_string_cur_idx (input) + 1 < re_string_length (input))
2032     {
2033       /* In this case, '\' escape a character.  */
2034       unsigned char c2;
2035       re_string_skip_bytes (input, 1);
2036       c2 = re_string_peek_byte (input, 0);
2037       token->opr.c = c2;
2038       token->type = CHARACTER;
2039       return 1;
2040     }
2041   if (c == '[') /* '[' is a special char in a bracket exps.  */
2042     {
2043       unsigned char c2;
2044       int token_len;
2045       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2046         c2 = re_string_peek_byte (input, 1);
2047       else
2048         c2 = 0;
2049       token->opr.c = c2;
2050       token_len = 2;
2051       switch (c2)
2052         {
2053         case '.':
2054           token->type = OP_OPEN_COLL_ELEM;
2055           break;
2056         case '=':
2057           token->type = OP_OPEN_EQUIV_CLASS;
2058           break;
2059         case ':':
2060           if (syntax & RE_CHAR_CLASSES)
2061             {
2062               token->type = OP_OPEN_CHAR_CLASS;
2063               break;
2064             }
2065           /* else fall through.  */
2066         default:
2067           token->type = CHARACTER;
2068           token->opr.c = c;
2069           token_len = 1;
2070           break;
2071         }
2072       return token_len;
2073     }
2074   switch (c)
2075     {
2076     case '-':
2077       token->type = OP_CHARSET_RANGE;
2078       break;
2079     case ']':
2080       token->type = OP_CLOSE_BRACKET;
2081       break;
2082     case '^':
2083       token->type = OP_NON_MATCH_LIST;
2084       break;
2085     default:
2086       token->type = CHARACTER;
2087     }
2088   return 1;
2089 }
2090 \f
2091 /* Functions for parser.  */
2092
2093 /* Entry point of the parser.
2094    Parse the regular expression REGEXP and return the structure tree.
2095    If an error occurs, ERR is set by error code, and return NULL.
2096    This function build the following tree, from regular expression <reg_exp>:
2097            CAT
2098            / \
2099           /   \
2100    <reg_exp>  EOR
2101
2102    CAT means concatenation.
2103    EOR means end of regular expression.  */
2104
2105 static bin_tree_t *
2106 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2107        reg_errcode_t *err)
2108 {
2109   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2110   bin_tree_t *tree, *eor, *root;
2111   re_token_t current_token;
2112   dfa->syntax = syntax;
2113   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2115   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116     return NULL;
2117   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2118   if (tree != NULL)
2119     root = create_tree (dfa, tree, eor, CONCAT);
2120   else
2121     root = eor;
2122   if (BE (eor == NULL || root == NULL, 0))
2123     {
2124       *err = REG_ESPACE;
2125       return NULL;
2126     }
2127   return root;
2128 }
2129
2130 /* This function build the following tree, from regular expression
2131    <branch1>|<branch2>:
2132            ALT
2133            / \
2134           /   \
2135    <branch1> <branch2>
2136
2137    ALT means alternative, which represents the operator '|'.  */
2138
2139 static bin_tree_t *
2140 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2142 {
2143   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144   bin_tree_t *tree, *branch = NULL;
2145   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147     return NULL;
2148
2149   while (token->type == OP_ALT)
2150     {
2151       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152       if (token->type != OP_ALT && token->type != END_OF_RE
2153           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154         {
2155           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157             return NULL;
2158         }
2159       else
2160         branch = NULL;
2161       tree = create_tree (dfa, tree, branch, OP_ALT);
2162       if (BE (tree == NULL, 0))
2163         {
2164           *err = REG_ESPACE;
2165           return NULL;
2166         }
2167     }
2168   return tree;
2169 }
2170
2171 /* This function build the following tree, from regular expression
2172    <exp1><exp2>:
2173         CAT
2174         / \
2175        /   \
2176    <exp1> <exp2>
2177
2178    CAT means concatenation.  */
2179
2180 static bin_tree_t *
2181 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2183 {
2184   bin_tree_t *tree, *expr;
2185   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2187   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188     return NULL;
2189
2190   while (token->type != OP_ALT && token->type != END_OF_RE
2191          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2192     {
2193       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2194       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2195         {
2196           return NULL;
2197         }
2198       if (tree != NULL && expr != NULL)
2199         {
2200           tree = create_tree (dfa, tree, expr, CONCAT);
2201           if (tree == NULL)
2202             {
2203               *err = REG_ESPACE;
2204               return NULL;
2205             }
2206         }
2207       else if (tree == NULL)
2208         tree = expr;
2209       /* Otherwise expr == NULL, we don't need to create new tree.  */
2210     }
2211   return tree;
2212 }
2213
2214 /* This function build the following tree, from regular expression a*:
2215          *
2216          |
2217          a
2218 */
2219
2220 static bin_tree_t *
2221 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223 {
2224   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2225   bin_tree_t *tree;
2226   switch (token->type)
2227     {
2228     case CHARACTER:
2229       tree = create_token_tree (dfa, NULL, NULL, token);
2230       if (BE (tree == NULL, 0))
2231         {
2232           *err = REG_ESPACE;
2233           return NULL;
2234         }
2235 #ifdef RE_ENABLE_I18N
2236       if (dfa->mb_cur_max > 1)
2237         {
2238           while (!re_string_eoi (regexp)
2239                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2240             {
2241               bin_tree_t *mbc_remain;
2242               fetch_token (token, regexp, syntax);
2243               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2244               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2245               if (BE (mbc_remain == NULL || tree == NULL, 0))
2246                 {
2247                   *err = REG_ESPACE;
2248                   return NULL;
2249                 }
2250             }
2251         }
2252 #endif
2253       break;
2254     case OP_OPEN_SUBEXP:
2255       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2256       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257         return NULL;
2258       break;
2259     case OP_OPEN_BRACKET:
2260       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2261       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262         return NULL;
2263       break;
2264     case OP_BACK_REF:
2265       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2266         {
2267           *err = REG_ESUBREG;
2268           return NULL;
2269         }
2270       dfa->used_bkref_map |= 1 << token->opr.idx;
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (BE (tree == NULL, 0))
2273         {
2274           *err = REG_ESPACE;
2275           return NULL;
2276         }
2277       ++dfa->nbackref;
2278       dfa->has_mb_node = 1;
2279       break;
2280     case OP_OPEN_DUP_NUM:
2281       if (syntax & RE_CONTEXT_INVALID_DUP)
2282         {
2283           *err = REG_BADRPT;
2284           return NULL;
2285         }
2286       /* FALLTHROUGH */
2287     case OP_DUP_ASTERISK:
2288     case OP_DUP_PLUS:
2289     case OP_DUP_QUESTION:
2290       if (syntax & RE_CONTEXT_INVALID_OPS)
2291         {
2292           *err = REG_BADRPT;
2293           return NULL;
2294         }
2295       else if (syntax & RE_CONTEXT_INDEP_OPS)
2296         {
2297           fetch_token (token, regexp, syntax);
2298           return parse_expression (regexp, preg, token, syntax, nest, err);
2299         }
2300       /* else fall through  */
2301     case OP_CLOSE_SUBEXP:
2302       if ((token->type == OP_CLOSE_SUBEXP) &&
2303           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2304         {
2305           *err = REG_ERPAREN;
2306           return NULL;
2307         }
2308       /* else fall through  */
2309     case OP_CLOSE_DUP_NUM:
2310       /* We treat it as a normal character.  */
2311
2312       /* Then we can these characters as normal characters.  */
2313       token->type = CHARACTER;
2314       /* mb_partial and word_char bits should be initialized already
2315          by peek_token.  */
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (BE (tree == NULL, 0))
2318         {
2319           *err = REG_ESPACE;
2320           return NULL;
2321         }
2322       break;
2323     case ANCHOR:
2324       if ((token->opr.ctx_type
2325            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2326           && dfa->word_ops_used == 0)
2327         init_word_char (dfa);
2328       if (token->opr.ctx_type == WORD_DELIM
2329           || token->opr.ctx_type == NOT_WORD_DELIM)
2330         {
2331           bin_tree_t *tree_first, *tree_last;
2332           if (token->opr.ctx_type == WORD_DELIM)
2333             {
2334               token->opr.ctx_type = WORD_FIRST;
2335               tree_first = create_token_tree (dfa, NULL, NULL, token);
2336               token->opr.ctx_type = WORD_LAST;
2337             }
2338           else
2339             {
2340               token->opr.ctx_type = INSIDE_WORD;
2341               tree_first = create_token_tree (dfa, NULL, NULL, token);
2342               token->opr.ctx_type = INSIDE_NOTWORD;
2343             }
2344           tree_last = create_token_tree (dfa, NULL, NULL, token);
2345           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2346           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347             {
2348               *err = REG_ESPACE;
2349               return NULL;
2350             }
2351         }
2352       else
2353         {
2354           tree = create_token_tree (dfa, NULL, NULL, token);
2355           if (BE (tree == NULL, 0))
2356             {
2357               *err = REG_ESPACE;
2358               return NULL;
2359             }
2360         }
2361       /* We must return here, since ANCHORs can't be followed
2362          by repetition operators.
2363          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2365       fetch_token (token, regexp, syntax);
2366       return tree;
2367     case OP_PERIOD:
2368       tree = create_token_tree (dfa, NULL, NULL, token);
2369       if (BE (tree == NULL, 0))
2370         {
2371           *err = REG_ESPACE;
2372           return NULL;
2373         }
2374       if (dfa->mb_cur_max > 1)
2375         dfa->has_mb_node = 1;
2376       break;
2377     case OP_WORD:
2378     case OP_NOTWORD:
2379       tree = build_charclass_op (dfa, regexp->trans,
2380                                  (const unsigned char *) "alnum",
2381                                  (const unsigned char *) "_",
2382                                  token->type == OP_NOTWORD, err);
2383       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2384         return NULL;
2385       break;
2386     case OP_SPACE:
2387     case OP_NOTSPACE:
2388       tree = build_charclass_op (dfa, regexp->trans,
2389                                  (const unsigned char *) "space",
2390                                  (const unsigned char *) "",
2391                                  token->type == OP_NOTSPACE, err);
2392       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2393         return NULL;
2394       break;
2395     case OP_ALT:
2396     case END_OF_RE:
2397       return NULL;
2398     case BACK_SLASH:
2399       *err = REG_EESCAPE;
2400       return NULL;
2401     default:
2402       /* Must not happen?  */
2403 #ifdef DEBUG
2404       assert (0);
2405 #endif
2406       return NULL;
2407     }
2408   fetch_token (token, regexp, syntax);
2409
2410   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2411          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2412     {
2413       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2414       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2415         return NULL;
2416       /* In BRE consecutive duplications are not allowed.  */
2417       if ((syntax & RE_CONTEXT_INVALID_DUP)
2418           && (token->type == OP_DUP_ASTERISK
2419               || token->type == OP_OPEN_DUP_NUM))
2420         {
2421           *err = REG_BADRPT;
2422           return NULL;
2423         }
2424     }
2425
2426   return tree;
2427 }
2428
2429 /* This function build the following tree, from regular expression
2430    (<reg_exp>):
2431          SUBEXP
2432             |
2433         <reg_exp>
2434 */
2435
2436 static bin_tree_t *
2437 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2438                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2439 {
2440   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2441   bin_tree_t *tree;
2442   size_t cur_nsub;
2443   cur_nsub = preg->re_nsub++;
2444
2445   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2446
2447   /* The subexpression may be a null string.  */
2448   if (token->type == OP_CLOSE_SUBEXP)
2449     tree = NULL;
2450   else
2451     {
2452       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2453       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2454         *err = REG_EPAREN;
2455       if (BE (*err != REG_NOERROR, 0))
2456         return NULL;
2457     }
2458
2459   if (cur_nsub <= '9' - '1')
2460     dfa->completed_bkref_map |= 1 << cur_nsub;
2461
2462   tree = create_tree (dfa, tree, NULL, SUBEXP);
2463   if (BE (tree == NULL, 0))
2464     {
2465       *err = REG_ESPACE;
2466       return NULL;
2467     }
2468   tree->token.opr.idx = cur_nsub;
2469   return tree;
2470 }
2471
2472 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2473
2474 static bin_tree_t *
2475 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2476               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2477 {
2478   bin_tree_t *tree = NULL, *old_tree = NULL;
2479   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2480   re_token_t start_token = *token;
2481
2482   if (token->type == OP_OPEN_DUP_NUM)
2483     {
2484       end = 0;
2485       start = fetch_number (regexp, token, syntax);
2486       if (start == REG_MISSING)
2487         {
2488           if (token->type == CHARACTER && token->opr.c == ',')
2489             start = 0; /* We treat "{,m}" as "{0,m}".  */
2490           else
2491             {
2492               *err = REG_BADBR; /* <re>{} is invalid.  */
2493               return NULL;
2494             }
2495         }
2496       if (BE (start != REG_ERROR, 1))
2497         {
2498           /* We treat "{n}" as "{n,n}".  */
2499           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2500                  : ((token->type == CHARACTER && token->opr.c == ',')
2501                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2502         }
2503       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2504         {
2505           /* Invalid sequence.  */
2506           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2507             {
2508               if (token->type == END_OF_RE)
2509                 *err = REG_EBRACE;
2510               else
2511                 *err = REG_BADBR;
2512
2513               return NULL;
2514             }
2515
2516           /* If the syntax bit is set, rollback.  */
2517           re_string_set_index (regexp, start_idx);
2518           *token = start_token;
2519           token->type = CHARACTER;
2520           /* mb_partial and word_char bits should be already initialized by
2521              peek_token.  */
2522           return elem;
2523         }
2524
2525       if (BE ((end != REG_MISSING && start > end)
2526               || token->type != OP_CLOSE_DUP_NUM, 0))
2527         {
2528           /* First number greater than second.  */
2529           *err = REG_BADBR;
2530           return NULL;
2531         }
2532     }
2533   else
2534     {
2535       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2536       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2537     }
2538
2539   fetch_token (token, regexp, syntax);
2540
2541   if (BE (elem == NULL, 0))
2542     return NULL;
2543   if (BE (start == 0 && end == 0, 0))
2544     {
2545       postorder (elem, free_tree, NULL);
2546       return NULL;
2547     }
2548
2549   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2550   if (BE (start > 0, 0))
2551     {
2552       tree = elem;
2553       for (i = 2; i <= start; ++i)
2554         {
2555           elem = duplicate_tree (elem, dfa);
2556           tree = create_tree (dfa, tree, elem, CONCAT);
2557           if (BE (elem == NULL || tree == NULL, 0))
2558             goto parse_dup_op_espace;
2559         }
2560
2561       if (start == end)
2562         return tree;
2563
2564       /* Duplicate ELEM before it is marked optional.  */
2565       elem = duplicate_tree (elem, dfa);
2566       old_tree = tree;
2567     }
2568   else
2569     old_tree = NULL;
2570
2571   if (elem->token.type == SUBEXP)
2572     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2573
2574   tree = create_tree (dfa, elem, NULL,
2575                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2576   if (BE (tree == NULL, 0))
2577     goto parse_dup_op_espace;
2578
2579 /* From gnulib's "intprops.h":
2580    True if the arithmetic type T is signed.  */
2581 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2582
2583   /* This loop is actually executed only when end != REG_MISSING,
2584      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2585      already created the start+1-th copy.  */
2586   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2587     for (i = start + 2; i <= end; ++i)
2588       {
2589         elem = duplicate_tree (elem, dfa);
2590         tree = create_tree (dfa, tree, elem, CONCAT);
2591         if (BE (elem == NULL || tree == NULL, 0))
2592           goto parse_dup_op_espace;
2593
2594         tree = create_tree (dfa, tree, NULL, OP_ALT);
2595         if (BE (tree == NULL, 0))
2596           goto parse_dup_op_espace;
2597       }
2598
2599   if (old_tree)
2600     tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602   return tree;
2603
2604  parse_dup_op_espace:
2605   *err = REG_ESPACE;
2606   return NULL;
2607 }
2608
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610    I'm not sure, but maybe enough.  */
2611 #define BRACKET_NAME_BUF_SIZE 32
2612
2613 #ifndef _LIBC
2614   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615      Build the range expression which starts from START_ELEM, and ends
2616      at END_ELEM.  The result are written to MBCSET and SBCSET.
2617      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618      mbcset->range_ends, is a pointer argument since we may
2619      update it.  */
2620
2621 static reg_errcode_t
2622 internal_function
2623 # ifdef RE_ENABLE_I18N
2624 build_range_exp (const reg_syntax_t syntax,
2625                  bitset_t sbcset,
2626                  re_charset_t *mbcset,
2627                  Idx *range_alloc,
2628                  const bracket_elem_t *start_elem,
2629                  const bracket_elem_t *end_elem)
2630 # else /* not RE_ENABLE_I18N */
2631 build_range_exp (const reg_syntax_t syntax,
2632                  bitset_t sbcset,
2633                  const bracket_elem_t *start_elem,
2634                  const bracket_elem_t *end_elem)
2635 # endif /* not RE_ENABLE_I18N */
2636 {
2637   unsigned int start_ch, end_ch;
2638   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2639   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2640           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2641           0))
2642     return REG_ERANGE;
2643
2644   /* We can handle no multi character collating elements without libc
2645      support.  */
2646   if (BE ((start_elem->type == COLL_SYM
2647            && strlen ((char *) start_elem->opr.name) > 1)
2648           || (end_elem->type == COLL_SYM
2649               && strlen ((char *) end_elem->opr.name) > 1), 0))
2650     return REG_ECOLLATE;
2651
2652 # ifdef RE_ENABLE_I18N
2653   {
2654     wchar_t wc;
2655     wint_t start_wc;
2656     wint_t end_wc;
2657     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2658
2659     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2660                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2661                    : 0));
2662     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2663               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2664                  : 0));
2665     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2666                 ? __btowc (start_ch) : start_elem->opr.wch);
2667     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2668               ? __btowc (end_ch) : end_elem->opr.wch);
2669     if (start_wc == WEOF || end_wc == WEOF)
2670       return REG_ECOLLATE;
2671     cmp_buf[0] = start_wc;
2672     cmp_buf[4] = end_wc;
2673
2674     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2675             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2676       return REG_ERANGE;
2677
2678     /* Got valid collation sequence values, add them as a new entry.
2679        However, for !_LIBC we have no collation elements: if the
2680        character set is single byte, the single byte character set
2681        that we build below suffices.  parse_bracket_exp passes
2682        no MBCSET if dfa->mb_cur_max == 1.  */
2683     if (mbcset)
2684       {
2685         /* Check the space of the arrays.  */
2686         if (BE (*range_alloc == mbcset->nranges, 0))
2687           {
2688             /* There is not enough space, need realloc.  */
2689             wchar_t *new_array_start, *new_array_end;
2690             Idx new_nranges;
2691
2692             /* +1 in case of mbcset->nranges is 0.  */
2693             new_nranges = 2 * mbcset->nranges + 1;
2694             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2695                are NULL if *range_alloc == 0.  */
2696             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2697                                           new_nranges);
2698             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2699                                         new_nranges);
2700
2701             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2702               return REG_ESPACE;
2703
2704             mbcset->range_starts = new_array_start;
2705             mbcset->range_ends = new_array_end;
2706             *range_alloc = new_nranges;
2707           }
2708
2709         mbcset->range_starts[mbcset->nranges] = start_wc;
2710         mbcset->range_ends[mbcset->nranges++] = end_wc;
2711       }
2712
2713     /* Build the table for single byte characters.  */
2714     for (wc = 0; wc < SBC_MAX; ++wc)
2715       {
2716         cmp_buf[2] = wc;
2717         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2718             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2719           bitset_set (sbcset, wc);
2720       }
2721   }
2722 # else /* not RE_ENABLE_I18N */
2723   {
2724     unsigned int ch;
2725     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2726                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727                    : 0));
2728     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2729               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730                  : 0));
2731     if (start_ch > end_ch)
2732       return REG_ERANGE;
2733     /* Build the table for single byte characters.  */
2734     for (ch = 0; ch < SBC_MAX; ++ch)
2735       if (start_ch <= ch  && ch <= end_ch)
2736         bitset_set (sbcset, ch);
2737   }
2738 # endif /* not RE_ENABLE_I18N */
2739   return REG_NOERROR;
2740 }
2741 #endif /* not _LIBC */
2742
2743 #ifndef _LIBC
2744 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2745    Build the collating element which is represented by NAME.
2746    The result are written to MBCSET and SBCSET.
2747    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2748    pointer argument since we may update it.  */
2749
2750 static reg_errcode_t
2751 internal_function
2752 build_collating_symbol (bitset_t sbcset,
2753 # ifdef RE_ENABLE_I18N
2754                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2755 # endif
2756                         const unsigned char *name)
2757 {
2758   size_t name_len = strlen ((const char *) name);
2759   if (BE (name_len != 1, 0))
2760     return REG_ECOLLATE;
2761   else
2762     {
2763       bitset_set (sbcset, name[0]);
2764       return REG_NOERROR;
2765     }
2766 }
2767 #endif /* not _LIBC */
2768
2769 /* This function parse bracket expression like "[abc]", "[a-c]",
2770    "[[.a-a.]]" etc.  */
2771
2772 static bin_tree_t *
2773 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2774                    reg_syntax_t syntax, reg_errcode_t *err)
2775 {
2776 #ifdef _LIBC
2777   const unsigned char *collseqmb;
2778   const char *collseqwc;
2779   uint32_t nrules;
2780   int32_t table_size;
2781   const int32_t *symb_table;
2782   const unsigned char *extra;
2783
2784   /* Local function for parse_bracket_exp used in _LIBC environment.
2785      Seek the collating symbol entry corresponding to NAME.
2786      Return the index of the symbol in the SYMB_TABLE.  */
2787
2788   auto inline int32_t
2789   __attribute ((always_inline))
2790   seek_collating_symbol_entry (name, name_len)
2791          const unsigned char *name;
2792          size_t name_len;
2793     {
2794       int32_t hash = elem_hash ((const char *) name, name_len);
2795       int32_t elem = hash % table_size;
2796       if (symb_table[2 * elem] != 0)
2797         {
2798           int32_t second = hash % (table_size - 2) + 1;
2799
2800           do
2801             {
2802               /* First compare the hashing value.  */
2803               if (symb_table[2 * elem] == hash
2804                   /* Compare the length of the name.  */
2805                   && name_len == extra[symb_table[2 * elem + 1]]
2806                   /* Compare the name.  */
2807                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2808                              name_len) == 0)
2809                 {
2810                   /* Yep, this is the entry.  */
2811                   break;
2812                 }
2813
2814               /* Next entry.  */
2815               elem += second;
2816             }
2817           while (symb_table[2 * elem] != 0);
2818         }
2819       return elem;
2820     }
2821
2822   /* Local function for parse_bracket_exp used in _LIBC environment.
2823      Look up the collation sequence value of BR_ELEM.
2824      Return the value if succeeded, UINT_MAX otherwise.  */
2825
2826   auto inline unsigned int
2827   __attribute ((always_inline))
2828   lookup_collation_sequence_value (br_elem)
2829          bracket_elem_t *br_elem;
2830     {
2831       if (br_elem->type == SB_CHAR)
2832         {
2833           /*
2834           if (MB_CUR_MAX == 1)
2835           */
2836           if (nrules == 0)
2837             return collseqmb[br_elem->opr.ch];
2838           else
2839             {
2840               wint_t wc = __btowc (br_elem->opr.ch);
2841               return __collseq_table_lookup (collseqwc, wc);
2842             }
2843         }
2844       else if (br_elem->type == MB_CHAR)
2845         {
2846           if (nrules != 0)
2847             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2848         }
2849       else if (br_elem->type == COLL_SYM)
2850         {
2851           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2852           if (nrules != 0)
2853             {
2854               int32_t elem, idx;
2855               elem = seek_collating_symbol_entry (br_elem->opr.name,
2856                                                   sym_name_len);
2857               if (symb_table[2 * elem] != 0)
2858                 {
2859                   /* We found the entry.  */
2860                   idx = symb_table[2 * elem + 1];
2861                   /* Skip the name of collating element name.  */
2862                   idx += 1 + extra[idx];
2863                   /* Skip the byte sequence of the collating element.  */
2864                   idx += 1 + extra[idx];
2865                   /* Adjust for the alignment.  */
2866                   idx = (idx + 3) & ~3;
2867                   /* Skip the multibyte collation sequence value.  */
2868                   idx += sizeof (unsigned int);
2869                   /* Skip the wide char sequence of the collating element.  */
2870                   idx += sizeof (unsigned int) *
2871                     (1 + *(unsigned int *) (extra + idx));
2872                   /* Return the collation sequence value.  */
2873                   return *(unsigned int *) (extra + idx);
2874                 }
2875               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2876                 {
2877                   /* No valid character.  Match it as a single byte
2878                      character.  */
2879                   return collseqmb[br_elem->opr.name[0]];
2880                 }
2881             }
2882           else if (sym_name_len == 1)
2883             return collseqmb[br_elem->opr.name[0]];
2884         }
2885       return UINT_MAX;
2886     }
2887
2888   /* Local function for parse_bracket_exp used in _LIBC environment.
2889      Build the range expression which starts from START_ELEM, and ends
2890      at END_ELEM.  The result are written to MBCSET and SBCSET.
2891      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892      mbcset->range_ends, is a pointer argument since we may
2893      update it.  */
2894
2895   auto inline reg_errcode_t
2896   __attribute ((always_inline))
2897   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2898          re_charset_t *mbcset;
2899          Idx *range_alloc;
2900          bitset_t sbcset;
2901          bracket_elem_t *start_elem, *end_elem;
2902     {
2903       unsigned int ch;
2904       uint32_t start_collseq;
2905       uint32_t end_collseq;
2906
2907       /* Equivalence Classes and Character Classes can't be a range
2908          start/end.  */
2909       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2910               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2911               0))
2912         return REG_ERANGE;
2913
2914       start_collseq = lookup_collation_sequence_value (start_elem);
2915       end_collseq = lookup_collation_sequence_value (end_elem);
2916       /* Check start/end collation sequence values.  */
2917       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2918         return REG_ECOLLATE;
2919       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2920         return REG_ERANGE;
2921
2922       /* Got valid collation sequence values, add them as a new entry.
2923          However, if we have no collation elements, and the character set
2924          is single byte, the single byte character set that we
2925          build below suffices. */
2926       if (nrules > 0 || dfa->mb_cur_max > 1)
2927         {
2928           /* Check the space of the arrays.  */
2929           if (BE (*range_alloc == mbcset->nranges, 0))
2930             {
2931               /* There is not enough space, need realloc.  */
2932               uint32_t *new_array_start;
2933               uint32_t *new_array_end;
2934               Idx new_nranges;
2935
2936               /* +1 in case of mbcset->nranges is 0.  */
2937               new_nranges = 2 * mbcset->nranges + 1;
2938               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2939                                             new_nranges);
2940               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2941                                           new_nranges);
2942
2943               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2944                 return REG_ESPACE;
2945
2946               mbcset->range_starts = new_array_start;
2947               mbcset->range_ends = new_array_end;
2948               *range_alloc = new_nranges;
2949             }
2950
2951           mbcset->range_starts[mbcset->nranges] = start_collseq;
2952           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2953         }
2954
2955       /* Build the table for single byte characters.  */
2956       for (ch = 0; ch < SBC_MAX; ch++)
2957         {
2958           uint32_t ch_collseq;
2959           /*
2960           if (MB_CUR_MAX == 1)
2961           */
2962           if (nrules == 0)
2963             ch_collseq = collseqmb[ch];
2964           else
2965             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2966           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2967             bitset_set (sbcset, ch);
2968         }
2969       return REG_NOERROR;
2970     }
2971
2972   /* Local function for parse_bracket_exp used in _LIBC environment.
2973      Build the collating element which is represented by NAME.
2974      The result are written to MBCSET and SBCSET.
2975      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2976      pointer argument since we may update it.  */
2977
2978   auto inline reg_errcode_t
2979   __attribute ((always_inline))
2980   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2981          re_charset_t *mbcset;
2982          Idx *coll_sym_alloc;
2983          bitset_t sbcset;
2984          const unsigned char *name;
2985     {
2986       int32_t elem, idx;
2987       size_t name_len = strlen ((const char *) name);
2988       if (nrules != 0)
2989         {
2990           elem = seek_collating_symbol_entry (name, name_len);
2991           if (symb_table[2 * elem] != 0)
2992             {
2993               /* We found the entry.  */
2994               idx = symb_table[2 * elem + 1];
2995               /* Skip the name of collating element name.  */
2996               idx += 1 + extra[idx];
2997             }
2998           else if (symb_table[2 * elem] == 0 && name_len == 1)
2999             {
3000               /* No valid character, treat it as a normal
3001                  character.  */
3002               bitset_set (sbcset, name[0]);
3003               return REG_NOERROR;
3004             }
3005           else
3006             return REG_ECOLLATE;
3007
3008           /* Got valid collation sequence, add it as a new entry.  */
3009           /* Check the space of the arrays.  */
3010           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3011             {
3012               /* Not enough, realloc it.  */
3013               /* +1 in case of mbcset->ncoll_syms is 0.  */
3014               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3015               /* Use realloc since mbcset->coll_syms is NULL
3016                  if *alloc == 0.  */
3017               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3018                                                    new_coll_sym_alloc);
3019               if (BE (new_coll_syms == NULL, 0))
3020                 return REG_ESPACE;
3021               mbcset->coll_syms = new_coll_syms;
3022               *coll_sym_alloc = new_coll_sym_alloc;
3023             }
3024           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3025           return REG_NOERROR;
3026         }
3027       else
3028         {
3029           if (BE (name_len != 1, 0))
3030             return REG_ECOLLATE;
3031           else
3032             {
3033               bitset_set (sbcset, name[0]);
3034               return REG_NOERROR;
3035             }
3036         }
3037     }
3038 #endif
3039
3040   re_token_t br_token;
3041   re_bitset_ptr_t sbcset;
3042 #ifdef RE_ENABLE_I18N
3043   re_charset_t *mbcset;
3044   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3045   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3046 #endif /* not RE_ENABLE_I18N */
3047   bool non_match = false;
3048   bin_tree_t *work_tree;
3049   int token_len;
3050   bool first_round = true;
3051 #ifdef _LIBC
3052   collseqmb = (const unsigned char *)
3053     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3054   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3055   if (nrules)
3056     {
3057       /*
3058       if (MB_CUR_MAX > 1)
3059       */
3060       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3061       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3062       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3063                                                   _NL_COLLATE_SYMB_TABLEMB);
3064       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3065                                                    _NL_COLLATE_SYMB_EXTRAMB);
3066     }
3067 #endif
3068   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3069 #ifdef RE_ENABLE_I18N
3070   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3071 #endif /* RE_ENABLE_I18N */
3072 #ifdef RE_ENABLE_I18N
3073   if (BE (sbcset == NULL || mbcset == NULL, 0))
3074 #else
3075   if (BE (sbcset == NULL, 0))
3076 #endif /* RE_ENABLE_I18N */
3077     {
3078       *err = REG_ESPACE;
3079       return NULL;
3080     }
3081
3082   token_len = peek_token_bracket (token, regexp, syntax);
3083   if (BE (token->type == END_OF_RE, 0))
3084     {
3085       *err = REG_BADPAT;
3086       goto parse_bracket_exp_free_return;
3087     }
3088   if (token->type == OP_NON_MATCH_LIST)
3089     {
3090 #ifdef RE_ENABLE_I18N
3091       mbcset->non_match = 1;
3092 #endif /* not RE_ENABLE_I18N */
3093       non_match = true;
3094       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3095         bitset_set (sbcset, '\n');
3096       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3097       token_len = peek_token_bracket (token, regexp, syntax);
3098       if (BE (token->type == END_OF_RE, 0))
3099         {
3100           *err = REG_BADPAT;
3101           goto parse_bracket_exp_free_return;
3102         }
3103     }
3104
3105   /* We treat the first ']' as a normal character.  */
3106   if (token->type == OP_CLOSE_BRACKET)
3107     token->type = CHARACTER;
3108
3109   while (1)
3110     {
3111       bracket_elem_t start_elem, end_elem;
3112       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3113       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3114       reg_errcode_t ret;
3115       int token_len2 = 0;
3116       bool is_range_exp = false;
3117       re_token_t token2;
3118
3119       start_elem.opr.name = start_name_buf;
3120       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3121                                    syntax, first_round);
3122       if (BE (ret != REG_NOERROR, 0))
3123         {
3124           *err = ret;
3125           goto parse_bracket_exp_free_return;
3126         }
3127       first_round = false;
3128
3129       /* Get information about the next token.  We need it in any case.  */
3130       token_len = peek_token_bracket (token, regexp, syntax);
3131
3132       /* Do not check for ranges if we know they are not allowed.  */
3133       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3134         {
3135           if (BE (token->type == END_OF_RE, 0))
3136             {
3137               *err = REG_EBRACK;
3138               goto parse_bracket_exp_free_return;
3139             }
3140           if (token->type == OP_CHARSET_RANGE)
3141             {
3142               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3143               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3144               if (BE (token2.type == END_OF_RE, 0))
3145                 {
3146                   *err = REG_EBRACK;
3147                   goto parse_bracket_exp_free_return;
3148                 }
3149               if (token2.type == OP_CLOSE_BRACKET)
3150                 {
3151                   /* We treat the last '-' as a normal character.  */
3152                   re_string_skip_bytes (regexp, -token_len);
3153                   token->type = CHARACTER;
3154                 }
3155               else
3156                 is_range_exp = true;
3157             }
3158         }
3159
3160       if (is_range_exp == true)
3161         {
3162           end_elem.opr.name = end_name_buf;
3163           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3164                                        dfa, syntax, true);
3165           if (BE (ret != REG_NOERROR, 0))
3166             {
3167               *err = ret;
3168               goto parse_bracket_exp_free_return;
3169             }
3170
3171           token_len = peek_token_bracket (token, regexp, syntax);
3172
3173 #ifdef _LIBC
3174           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3175                                   &start_elem, &end_elem);
3176 #else
3177 # ifdef RE_ENABLE_I18N
3178           *err = build_range_exp (syntax, sbcset,
3179                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3180                                   &range_alloc, &start_elem, &end_elem);
3181 # else
3182           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3183 # endif
3184 #endif /* RE_ENABLE_I18N */
3185           if (BE (*err != REG_NOERROR, 0))
3186             goto parse_bracket_exp_free_return;
3187         }
3188       else
3189         {
3190           switch (start_elem.type)
3191             {
3192             case SB_CHAR:
3193               bitset_set (sbcset, start_elem.opr.ch);
3194               break;
3195 #ifdef RE_ENABLE_I18N
3196             case MB_CHAR:
3197               /* Check whether the array has enough space.  */
3198               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3199                 {
3200                   wchar_t *new_mbchars;
3201                   /* Not enough, realloc it.  */
3202                   /* +1 in case of mbcset->nmbchars is 0.  */
3203                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3204                   /* Use realloc since array is NULL if *alloc == 0.  */
3205                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3206                                             mbchar_alloc);
3207                   if (BE (new_mbchars == NULL, 0))
3208                     goto parse_bracket_exp_espace;
3209                   mbcset->mbchars = new_mbchars;
3210                 }
3211               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3212               break;
3213 #endif /* RE_ENABLE_I18N */
3214             case EQUIV_CLASS:
3215               *err = build_equiv_class (sbcset,
3216 #ifdef RE_ENABLE_I18N
3217                                         mbcset, &equiv_class_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219                                         start_elem.opr.name);
3220               if (BE (*err != REG_NOERROR, 0))
3221                 goto parse_bracket_exp_free_return;
3222               break;
3223             case COLL_SYM:
3224               *err = build_collating_symbol (sbcset,
3225 #ifdef RE_ENABLE_I18N
3226                                              mbcset, &coll_sym_alloc,
3227 #endif /* RE_ENABLE_I18N */
3228                                              start_elem.opr.name);
3229               if (BE (*err != REG_NOERROR, 0))
3230                 goto parse_bracket_exp_free_return;
3231               break;
3232             case CHAR_CLASS:
3233               *err = build_charclass (regexp->trans, sbcset,
3234 #ifdef RE_ENABLE_I18N
3235                                       mbcset, &char_class_alloc,
3236 #endif /* RE_ENABLE_I18N */
3237                                       start_elem.opr.name, syntax);
3238               if (BE (*err != REG_NOERROR, 0))
3239                goto parse_bracket_exp_free_return;
3240               break;
3241             default:
3242               assert (0);
3243               break;
3244             }
3245         }
3246       if (BE (token->type == END_OF_RE, 0))
3247         {
3248           *err = REG_EBRACK;
3249           goto parse_bracket_exp_free_return;
3250         }
3251       if (token->type == OP_CLOSE_BRACKET)
3252         break;
3253     }
3254
3255   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3256
3257   /* If it is non-matching list.  */
3258   if (non_match)
3259     bitset_not (sbcset);
3260
3261 #ifdef RE_ENABLE_I18N
3262   /* Ensure only single byte characters are set.  */
3263   if (dfa->mb_cur_max > 1)
3264     bitset_mask (sbcset, dfa->sb_char);
3265
3266   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3267       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3268                                                      || mbcset->non_match)))
3269     {
3270       bin_tree_t *mbc_tree;
3271       int sbc_idx;
3272       /* Build a tree for complex bracket.  */
3273       dfa->has_mb_node = 1;
3274       br_token.type = COMPLEX_BRACKET;
3275       br_token.opr.mbcset = mbcset;
3276       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277       if (BE (mbc_tree == NULL, 0))
3278         goto parse_bracket_exp_espace;
3279       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3280         if (sbcset[sbc_idx])
3281           break;
3282       /* If there are no bits set in sbcset, there is no point
3283          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3284       if (sbc_idx < BITSET_WORDS)
3285         {
3286           /* Build a tree for simple bracket.  */
3287           br_token.type = SIMPLE_BRACKET;
3288           br_token.opr.sbcset = sbcset;
3289           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290           if (BE (work_tree == NULL, 0))
3291             goto parse_bracket_exp_espace;
3292
3293           /* Then join them by ALT node.  */
3294           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3295           if (BE (work_tree == NULL, 0))
3296             goto parse_bracket_exp_espace;
3297         }
3298       else
3299         {
3300           re_free (sbcset);
3301           work_tree = mbc_tree;
3302         }
3303     }
3304   else
3305 #endif /* not RE_ENABLE_I18N */
3306     {
3307 #ifdef RE_ENABLE_I18N
3308       free_charset (mbcset);
3309 #endif
3310       /* Build a tree for simple bracket.  */
3311       br_token.type = SIMPLE_BRACKET;
3312       br_token.opr.sbcset = sbcset;
3313       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314       if (BE (work_tree == NULL, 0))
3315         goto parse_bracket_exp_espace;
3316     }
3317   return work_tree;
3318
3319  parse_bracket_exp_espace:
3320   *err = REG_ESPACE;
3321  parse_bracket_exp_free_return:
3322   re_free (sbcset);
3323 #ifdef RE_ENABLE_I18N
3324   free_charset (mbcset);
3325 #endif /* RE_ENABLE_I18N */
3326   return NULL;
3327 }
3328
3329 /* Parse an element in the bracket expression.  */
3330
3331 static reg_errcode_t
3332 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3333                        re_token_t *token, int token_len, re_dfa_t *dfa,
3334                        reg_syntax_t syntax, bool accept_hyphen)
3335 {
3336 #ifdef RE_ENABLE_I18N
3337   int cur_char_size;
3338   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3339   if (cur_char_size > 1)
3340     {
3341       elem->type = MB_CHAR;
3342       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3343       re_string_skip_bytes (regexp, cur_char_size);
3344       return REG_NOERROR;
3345     }
3346 #endif /* RE_ENABLE_I18N */
3347   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3348   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3349       || token->type == OP_OPEN_EQUIV_CLASS)
3350     return parse_bracket_symbol (elem, regexp, token);
3351   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3352     {
3353       /* A '-' must only appear as anything but a range indicator before
3354          the closing bracket.  Everything else is an error.  */
3355       re_token_t token2;
3356       (void) peek_token_bracket (&token2, regexp, syntax);
3357       if (token2.type != OP_CLOSE_BRACKET)
3358         /* The actual error value is not standardized since this whole
3359            case is undefined.  But ERANGE makes good sense.  */
3360         return REG_ERANGE;
3361     }
3362   elem->type = SB_CHAR;
3363   elem->opr.ch = token->opr.c;
3364   return REG_NOERROR;
3365 }
3366
3367 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3368    such as [:<character_class>:], [.<collating_element>.], and
3369    [=<equivalent_class>=].  */
3370
3371 static reg_errcode_t
3372 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3373                       re_token_t *token)
3374 {
3375   unsigned char ch, delim = token->opr.c;
3376   int i = 0;
3377   if (re_string_eoi(regexp))
3378     return REG_EBRACK;
3379   for (;; ++i)
3380     {
3381       if (i >= BRACKET_NAME_BUF_SIZE)
3382         return REG_EBRACK;
3383       if (token->type == OP_OPEN_CHAR_CLASS)
3384         ch = re_string_fetch_byte_case (regexp);
3385       else
3386         ch = re_string_fetch_byte (regexp);
3387       if (re_string_eoi(regexp))
3388         return REG_EBRACK;
3389       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3390         break;
3391       elem->opr.name[i] = ch;
3392     }
3393   re_string_skip_bytes (regexp, 1);
3394   elem->opr.name[i] = '\0';
3395   switch (token->type)
3396     {
3397     case OP_OPEN_COLL_ELEM:
3398       elem->type = COLL_SYM;
3399       break;
3400     case OP_OPEN_EQUIV_CLASS:
3401       elem->type = EQUIV_CLASS;
3402       break;
3403     case OP_OPEN_CHAR_CLASS:
3404       elem->type = CHAR_CLASS;
3405       break;
3406     default:
3407       break;
3408     }
3409   return REG_NOERROR;
3410 }
3411
3412   /* Helper function for parse_bracket_exp.
3413      Build the equivalence class which is represented by NAME.
3414      The result are written to MBCSET and SBCSET.
3415      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3416      is a pointer argument since we may update it.  */
3417
3418 static reg_errcode_t
3419 #ifdef RE_ENABLE_I18N
3420 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3421                    Idx *equiv_class_alloc, const unsigned char *name)
3422 #else /* not RE_ENABLE_I18N */
3423 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3424 #endif /* not RE_ENABLE_I18N */
3425 {
3426 #ifdef _LIBC
3427   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3428   if (nrules != 0)
3429     {
3430       const int32_t *table, *indirect;
3431       const unsigned char *weights, *extra, *cp;
3432       unsigned char char_buf[2];
3433       int32_t idx1, idx2;
3434       unsigned int ch;
3435       size_t len;
3436       /* This #include defines a local function!  */
3437 # include <locale/weight.h>
3438       /* Calculate the index for equivalence class.  */
3439       cp = name;
3440       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3441       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3442                                                _NL_COLLATE_WEIGHTMB);
3443       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3444                                                    _NL_COLLATE_EXTRAMB);
3445       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3446                                                 _NL_COLLATE_INDIRECTMB);
3447       idx1 = findidx (&cp);
3448       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3449         /* This isn't a valid character.  */
3450         return REG_ECOLLATE;
3451
3452       /* Build single byte matching table for this equivalence class.  */
3453       char_buf[1] = (unsigned char) '\0';
3454       len = weights[idx1 & 0xffffff];
3455       for (ch = 0; ch < SBC_MAX; ++ch)
3456         {
3457           char_buf[0] = ch;
3458           cp = char_buf;
3459           idx2 = findidx (&cp);
3460 /*
3461           idx2 = table[ch];
3462 */
3463           if (idx2 == 0)
3464             /* This isn't a valid character.  */
3465             continue;
3466           /* Compare only if the length matches and the collation rule
3467              index is the same.  */
3468           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3469             {
3470               int cnt = 0;
3471
3472               while (cnt <= len &&
3473                      weights[(idx1 & 0xffffff) + 1 + cnt]
3474                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3475                 ++cnt;
3476
3477               if (cnt > len)
3478                 bitset_set (sbcset, ch);
3479             }
3480         }
3481       /* Check whether the array has enough space.  */
3482       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3483         {
3484           /* Not enough, realloc it.  */
3485           /* +1 in case of mbcset->nequiv_classes is 0.  */
3486           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3487           /* Use realloc since the array is NULL if *alloc == 0.  */
3488           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3489                                                    int32_t,
3490                                                    new_equiv_class_alloc);
3491           if (BE (new_equiv_classes == NULL, 0))
3492             return REG_ESPACE;
3493           mbcset->equiv_classes = new_equiv_classes;
3494           *equiv_class_alloc = new_equiv_class_alloc;
3495         }
3496       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3497     }
3498   else
3499 #endif /* _LIBC */
3500     {
3501       if (BE (strlen ((const char *) name) != 1, 0))
3502         return REG_ECOLLATE;
3503       bitset_set (sbcset, *name);
3504     }
3505   return REG_NOERROR;
3506 }
3507
3508   /* Helper function for parse_bracket_exp.
3509      Build the character class which is represented by NAME.
3510      The result are written to MBCSET and SBCSET.
3511      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3512      is a pointer argument since we may update it.  */
3513
3514 static reg_errcode_t
3515 #ifdef RE_ENABLE_I18N
3516 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3517                  re_charset_t *mbcset, Idx *char_class_alloc,
3518                  const unsigned char *class_name, reg_syntax_t syntax)
3519 #else /* not RE_ENABLE_I18N */
3520 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3521                  const unsigned char *class_name, reg_syntax_t syntax)
3522 #endif /* not RE_ENABLE_I18N */
3523 {
3524   int i;
3525   const char *name = (const char *) class_name;
3526
3527   /* In case of REG_ICASE "upper" and "lower" match the both of
3528      upper and lower cases.  */
3529   if ((syntax & RE_ICASE)
3530       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3531     name = "alpha";
3532
3533 #ifdef RE_ENABLE_I18N
3534   /* Check the space of the arrays.  */
3535   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3536     {
3537       /* Not enough, realloc it.  */
3538       /* +1 in case of mbcset->nchar_classes is 0.  */
3539       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3540       /* Use realloc since array is NULL if *alloc == 0.  */
3541       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3542                                                new_char_class_alloc);
3543       if (BE (new_char_classes == NULL, 0))
3544         return REG_ESPACE;
3545       mbcset->char_classes = new_char_classes;
3546       *char_class_alloc = new_char_class_alloc;
3547     }
3548   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3549 #endif /* RE_ENABLE_I18N */
3550
3551 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3552   do {                                          \
3553     if (BE (trans != NULL, 0))                  \
3554       {                                         \
3555         for (i = 0; i < SBC_MAX; ++i)           \
3556           if (ctype_func (i))                   \
3557             bitset_set (sbcset, trans[i]);      \
3558       }                                         \
3559     else                                        \
3560       {                                         \
3561         for (i = 0; i < SBC_MAX; ++i)           \
3562           if (ctype_func (i))                   \
3563             bitset_set (sbcset, i);             \
3564       }                                         \
3565   } while (0)
3566
3567   if (strcmp (name, "alnum") == 0)
3568     BUILD_CHARCLASS_LOOP (isalnum);
3569   else if (strcmp (name, "cntrl") == 0)
3570     BUILD_CHARCLASS_LOOP (iscntrl);
3571   else if (strcmp (name, "lower") == 0)
3572     BUILD_CHARCLASS_LOOP (islower);
3573   else if (strcmp (name, "space") == 0)
3574     BUILD_CHARCLASS_LOOP (isspace);
3575   else if (strcmp (name, "alpha") == 0)
3576     BUILD_CHARCLASS_LOOP (isalpha);
3577   else if (strcmp (name, "digit") == 0)
3578     BUILD_CHARCLASS_LOOP (isdigit);
3579   else if (strcmp (name, "print") == 0)
3580     BUILD_CHARCLASS_LOOP (isprint);
3581   else if (strcmp (name, "upper") == 0)
3582     BUILD_CHARCLASS_LOOP (isupper);
3583   else if (strcmp (name, "blank") == 0)
3584     BUILD_CHARCLASS_LOOP (isblank);
3585   else if (strcmp (name, "graph") == 0)
3586     BUILD_CHARCLASS_LOOP (isgraph);
3587   else if (strcmp (name, "punct") == 0)
3588     BUILD_CHARCLASS_LOOP (ispunct);
3589   else if (strcmp (name, "xdigit") == 0)
3590     BUILD_CHARCLASS_LOOP (isxdigit);
3591   else
3592     return REG_ECTYPE;
3593
3594   return REG_NOERROR;
3595 }
3596
3597 static bin_tree_t *
3598 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3599                     const unsigned char *class_name,
3600                     const unsigned char *extra, bool non_match,
3601                     reg_errcode_t *err)
3602 {
3603   re_bitset_ptr_t sbcset;
3604 #ifdef RE_ENABLE_I18N
3605   re_charset_t *mbcset;
3606   Idx alloc = 0;
3607 #endif /* not RE_ENABLE_I18N */
3608   reg_errcode_t ret;
3609   re_token_t br_token;
3610   bin_tree_t *tree;
3611
3612   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3613 #ifdef RE_ENABLE_I18N
3614   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3615 #endif /* RE_ENABLE_I18N */
3616
3617 #ifdef RE_ENABLE_I18N
3618   if (BE (sbcset == NULL || mbcset == NULL, 0))
3619 #else /* not RE_ENABLE_I18N */
3620   if (BE (sbcset == NULL, 0))
3621 #endif /* not RE_ENABLE_I18N */
3622     {
3623       *err = REG_ESPACE;
3624       return NULL;
3625     }
3626
3627   if (non_match)
3628     {
3629 #ifdef RE_ENABLE_I18N
3630       mbcset->non_match = 1;
3631 #endif /* not RE_ENABLE_I18N */
3632     }
3633
3634   /* We don't care the syntax in this case.  */
3635   ret = build_charclass (trans, sbcset,
3636 #ifdef RE_ENABLE_I18N
3637                          mbcset, &alloc,
3638 #endif /* RE_ENABLE_I18N */
3639                          class_name, 0);
3640
3641   if (BE (ret != REG_NOERROR, 0))
3642     {
3643       re_free (sbcset);
3644 #ifdef RE_ENABLE_I18N
3645       free_charset (mbcset);
3646 #endif /* RE_ENABLE_I18N */
3647       *err = ret;
3648       return NULL;
3649     }
3650   /* \w match '_' also.  */
3651   for (; *extra; extra++)
3652     bitset_set (sbcset, *extra);
3653
3654   /* If it is non-matching list.  */
3655   if (non_match)
3656     bitset_not (sbcset);
3657
3658 #ifdef RE_ENABLE_I18N
3659   /* Ensure only single byte characters are set.  */
3660   if (dfa->mb_cur_max > 1)
3661     bitset_mask (sbcset, dfa->sb_char);
3662 #endif
3663
3664   /* Build a tree for simple bracket.  */
3665   br_token.type = SIMPLE_BRACKET;
3666   br_token.opr.sbcset = sbcset;
3667   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3668   if (BE (tree == NULL, 0))
3669     goto build_word_op_espace;
3670
3671 #ifdef RE_ENABLE_I18N
3672   if (dfa->mb_cur_max > 1)
3673     {
3674       bin_tree_t *mbc_tree;
3675       /* Build a tree for complex bracket.  */
3676       br_token.type = COMPLEX_BRACKET;
3677       br_token.opr.mbcset = mbcset;
3678       dfa->has_mb_node = 1;
3679       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680       if (BE (mbc_tree == NULL, 0))
3681         goto build_word_op_espace;
3682       /* Then join them by ALT node.  */
3683       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3684       if (BE (mbc_tree != NULL, 1))
3685         return tree;
3686     }
3687   else
3688     {
3689       free_charset (mbcset);
3690       return tree;
3691     }
3692 #else /* not RE_ENABLE_I18N */
3693   return tree;
3694 #endif /* not RE_ENABLE_I18N */
3695
3696  build_word_op_espace:
3697   re_free (sbcset);
3698 #ifdef RE_ENABLE_I18N
3699   free_charset (mbcset);
3700 #endif /* RE_ENABLE_I18N */
3701   *err = REG_ESPACE;
3702   return NULL;
3703 }
3704
3705 /* This is intended for the expressions like "a{1,3}".
3706    Fetch a number from 'input', and return the number.
3707    Return REG_MISSING if the number field is empty like "{,1}".
3708    Return REG_ERROR if an error occurred.  */
3709
3710 static Idx
3711 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3712 {
3713   Idx num = REG_MISSING;
3714   unsigned char c;
3715   while (1)
3716     {
3717       fetch_token (token, input, syntax);
3718       c = token->opr.c;
3719       if (BE (token->type == END_OF_RE, 0))
3720         return REG_ERROR;
3721       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3722         break;
3723       num = ((token->type != CHARACTER || c < '0' || '9' < c
3724               || num == REG_ERROR)
3725              ? REG_ERROR
3726              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3727       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3728     }
3729   return num;
3730 }
3731 \f
3732 #ifdef RE_ENABLE_I18N
3733 static void
3734 free_charset (re_charset_t *cset)
3735 {
3736   re_free (cset->mbchars);
3737 # ifdef _LIBC
3738   re_free (cset->coll_syms);
3739   re_free (cset->equiv_classes);
3740   re_free (cset->range_starts);
3741   re_free (cset->range_ends);
3742 # endif
3743   re_free (cset->char_classes);
3744   re_free (cset);
3745 }
3746 #endif /* RE_ENABLE_I18N */
3747 \f
3748 /* Functions for binary tree operation.  */
3749
3750 /* Create a tree node.  */
3751
3752 static bin_tree_t *
3753 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3754              re_token_type_t type)
3755 {
3756   re_token_t t;
3757   t.type = type;
3758   return create_token_tree (dfa, left, right, &t);
3759 }
3760
3761 static bin_tree_t *
3762 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3763                    const re_token_t *token)
3764 {
3765   bin_tree_t *tree;
3766   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3767     {
3768       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3769
3770       if (storage == NULL)
3771         return NULL;
3772       storage->next = dfa->str_tree_storage;
3773       dfa->str_tree_storage = storage;
3774       dfa->str_tree_storage_idx = 0;
3775     }
3776   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3777
3778   tree->parent = NULL;
3779   tree->left = left;
3780   tree->right = right;
3781   tree->token = *token;
3782   tree->token.duplicated = 0;
3783   tree->token.opt_subexp = 0;
3784   tree->first = NULL;
3785   tree->next = NULL;
3786   tree->node_idx = REG_MISSING;
3787
3788   if (left != NULL)
3789     left->parent = tree;
3790   if (right != NULL)
3791     right->parent = tree;
3792   return tree;
3793 }
3794
3795 /* Mark the tree SRC as an optional subexpression.
3796    To be called from preorder or postorder.  */
3797
3798 static reg_errcode_t
3799 mark_opt_subexp (void *extra, bin_tree_t *node)
3800 {
3801   Idx idx = (Idx) (long) extra;
3802   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3803     node->token.opt_subexp = 1;
3804
3805   return REG_NOERROR;
3806 }
3807
3808 /* Free the allocated memory inside NODE. */
3809
3810 static void
3811 free_token (re_token_t *node)
3812 {
3813 #ifdef RE_ENABLE_I18N
3814   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3815     free_charset (node->opr.mbcset);
3816   else
3817 #endif /* RE_ENABLE_I18N */
3818     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3819       re_free (node->opr.sbcset);
3820 }
3821
3822 /* Worker function for tree walking.  Free the allocated memory inside NODE
3823    and its children. */
3824
3825 static reg_errcode_t
3826 free_tree (void *extra, bin_tree_t *node)
3827 {
3828   free_token (&node->token);
3829   return REG_NOERROR;
3830 }
3831
3832
3833 /* Duplicate the node SRC, and return new node.  This is a preorder
3834    visit similar to the one implemented by the generic visitor, but
3835    we need more infrastructure to maintain two parallel trees --- so,
3836    it's easier to duplicate.  */
3837
3838 static bin_tree_t *
3839 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3840 {
3841   const bin_tree_t *node;
3842   bin_tree_t *dup_root;
3843   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3844
3845   for (node = root; ; )
3846     {
3847       /* Create a new tree and link it back to the current parent.  */
3848       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3849       if (*p_new == NULL)
3850         return NULL;
3851       (*p_new)->parent = dup_node;
3852       (*p_new)->token.duplicated = 1;
3853       dup_node = *p_new;
3854
3855       /* Go to the left node, or up and to the right.  */
3856       if (node->left)
3857         {
3858           node = node->left;
3859           p_new = &dup_node->left;
3860         }
3861       else
3862         {
3863           const bin_tree_t *prev = NULL;
3864           while (node->right == prev || node->right == NULL)
3865             {
3866               prev = node;
3867               node = node->parent;
3868               dup_node = dup_node->parent;
3869               if (!node)
3870                 return dup_root;
3871             }
3872           node = node->right;
3873           p_new = &dup_node->right;
3874         }
3875     }
3876 }