lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 337                              &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           Idx i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa->mb_cur_max > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      'buffer' to the compiled pattern;
 443      'used' to the length of the compiled pattern;
 444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      'fastmap' to an allocated space for the fastmap;
 449      'fastmap_accurate' to zero;
 450      're_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *_Restrict_ preg;
 475     const char *_Restrict_ pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 #ifdef _LIBC
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *_Restrict_ preg;
 539     char *_Restrict_ errbuf;
 540     size_t errbuf_size;
 541 #else /* size_t might promote */
 542 size_t
 543 regerror (int errcode, const regex_t *_Restrict_ preg,
 544           char *_Restrict_ errbuf, size_t errbuf_size)
 545 #endif
 546 {
 547   const char *msg;
 548   size_t msg_size;
 549
 550   if (BE (errcode < 0
 551           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                                / sizeof (__re_error_msgid_idx[0])), 0))
 553     /* Only error codes returned by the rest of the code should be passed
 554        to this routine.  If we are given anything else, or if other regex
 555        code generates an invalid error code, then the program has a bug.
 556        Dump core so we can fix it.  */
 557     abort ();
 558
 559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561   msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563   if (BE (errbuf_size != 0, 1))
 564     {
 565       size_t cpy_size = msg_size;
 566       if (BE (msg_size > errbuf_size, 0))
 567         {
 568           cpy_size = errbuf_size - 1;
 569           errbuf[cpy_size] = '\0';
 570         }
 571       memcpy (errbuf, msg, cpy_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # ifdef __GNUC__
 590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 591 # else
 592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #   error "bitset_word_t is narrower than 32 bits"
 594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 #  endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 # endif
 606 };
 607 #endif
 608
 609
 610 static void
 611 free_dfa_content (re_dfa_t *dfa)
 612 {
 613   Idx i, j;
 614
 615   if (dfa->nodes)
 616     for (i = 0; i < dfa->nodes_len; ++i)
 617       free_token (dfa->nodes + i);
 618   re_free (dfa->nexts);
 619   for (i = 0; i < dfa->nodes_len; ++i)
 620     {
 621       if (dfa->eclosures != NULL)
 622         re_node_set_free (dfa->eclosures + i);
 623       if (dfa->inveclosures != NULL)
 624         re_node_set_free (dfa->inveclosures + i);
 625       if (dfa->edests != NULL)
 626         re_node_set_free (dfa->edests + i);
 627     }
 628   re_free (dfa->edests);
 629   re_free (dfa->eclosures);
 630   re_free (dfa->inveclosures);
 631   re_free (dfa->nodes);
 632
 633   if (dfa->state_table)
 634     for (i = 0; i <= dfa->state_hash_mask; ++i)
 635       {
 636         struct re_state_table_entry *entry = dfa->state_table + i;
 637         for (j = 0; j < entry->num; ++j)
 638           {
 639             re_dfastate_t *state = entry->array[j];
 640             free_state (state);
 641           }
 642         re_free (entry->array);
 643       }
 644   re_free (dfa->state_table);
 645 #ifdef RE_ENABLE_I18N
 646   if (dfa->sb_char != utf8_sb_map)
 647     re_free (dfa->sb_char);
 648 #endif
 649   re_free (dfa->subexp_map);
 650 #ifdef DEBUG
 651   re_free (dfa->re_str);
 652 #endif
 653
 654   re_free (dfa);
 655 }
 656
 657
 658 /* Free dynamically allocated space used by PREG.  */
 659
 660 void
 661 regfree (preg)
 662     regex_t *preg;
 663 {
 664   re_dfa_t *dfa = preg->buffer;
 665   if (BE (dfa != NULL, 1))
 666     free_dfa_content (dfa);
 667   preg->buffer = NULL;
 668   preg->allocated = 0;
 669
 670   re_free (preg->fastmap);
 671   preg->fastmap = NULL;
 672
 673   re_free (preg->translate);
 674   preg->translate = NULL;
 675 }
 676 #ifdef _LIBC
 677 weak_alias (__regfree, regfree)
 678 #endif
 679 \f
 680 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 681    them unless specifically requested.  */
 682
 683 #if defined _REGEX_RE_COMP || defined _LIBC
 684
 685 /* BSD has one and only one pattern buffer.  */
 686 static struct re_pattern_buffer re_comp_buf;
 687
 688 char *
 689 # ifdef _LIBC
 690 /* Make these definitions weak in libc, so POSIX programs can redefine
 691    these names if they don't use our functions, and still use
 692    regcomp/regexec above without link errors.  */
 693 weak_function
 694 # endif
 695 re_comp (s)
 696      const char *s;
 697 {
 698   reg_errcode_t ret;
 699   char *fastmap;
 700
 701   if (!s)
 702     {
 703       if (!re_comp_buf.buffer)
 704         return gettext ("No previous regular expression");
 705       return 0;
 706     }
 707
 708   if (re_comp_buf.buffer)
 709     {
 710       fastmap = re_comp_buf.fastmap;
 711       re_comp_buf.fastmap = NULL;
 712       __regfree (&re_comp_buf);
 713       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 714       re_comp_buf.fastmap = fastmap;
 715     }
 716
 717   if (re_comp_buf.fastmap == NULL)
 718     {
 719       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 720       if (re_comp_buf.fastmap == NULL)
 721         return (char *) gettext (__re_error_msgid
 722                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 723     }
 724
 725   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 726      don't need to initialize the pattern buffer fields which affect it.  */
 727
 728   /* Match anchors at newlines.  */
 729   re_comp_buf.newline_anchor = 1;
 730
 731   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 732
 733   if (!ret)
 734     return NULL;
 735
 736   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 737   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 738 }
 739
 740 #ifdef _LIBC
 741 libc_freeres_fn (free_mem)
 742 {
 743   __regfree (&re_comp_buf);
 744 }
 745 #endif
 746
 747 #endif /* _REGEX_RE_COMP */
 748 \f
 749 /* Internal entry point.
 750    Compile the regular expression PATTERN, whose length is LENGTH.
 751    SYNTAX indicate regular expression's syntax.  */
 752
 753 static reg_errcode_t
 754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 755                      reg_syntax_t syntax)
 756 {
 757   reg_errcode_t err = REG_NOERROR;
 758   re_dfa_t *dfa;
 759   re_string_t regexp;
 760
 761   /* Initialize the pattern buffer.  */
 762   preg->fastmap_accurate = 0;
 763   preg->syntax = syntax;
 764   preg->not_bol = preg->not_eol = 0;
 765   preg->used = 0;
 766   preg->re_nsub = 0;
 767   preg->can_be_null = 0;
 768   preg->regs_allocated = REGS_UNALLOCATED;
 769
 770   /* Initialize the dfa.  */
 771   dfa = preg->buffer;
 772   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 773     {
 774       /* If zero allocated, but buffer is non-null, try to realloc
 775          enough space.  This loses if buffer's address is bogus, but
 776          that is the user's responsibility.  If ->buffer is NULL this
 777          is a simple allocation.  */
 778       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 779       if (dfa == NULL)
 780         return REG_ESPACE;
 781       preg->allocated = sizeof (re_dfa_t);
 782       preg->buffer = dfa;
 783     }
 784   preg->used = sizeof (re_dfa_t);
 785
 786   err = init_dfa (dfa, length);
 787   if (BE (err != REG_NOERROR, 0))
 788     {
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792       return err;
 793     }
 794 #ifdef DEBUG
 795   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 796   dfa->re_str = re_malloc (char, length + 1);
 797   strncpy (dfa->re_str, pattern, length + 1);
 798 #endif
 799
 800   __libc_lock_init (dfa->lock);
 801
 802   err = re_string_construct (&regexp, pattern, length, preg->translate,
 803                              (syntax & RE_ICASE) != 0, dfa);
 804   if (BE (err != REG_NOERROR, 0))
 805     {
 806     re_compile_internal_free_return:
 807       free_workarea_compile (preg);
 808       re_string_destruct (&regexp);
 809       free_dfa_content (dfa);
 810       preg->buffer = NULL;
 811       preg->allocated = 0;
 812       return err;
 813     }
 814
 815   /* Parse the regular expression, and build a structure tree.  */
 816   preg->re_nsub = 0;
 817   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 818   if (BE (dfa->str_tree == NULL, 0))
 819     goto re_compile_internal_free_return;
 820
 821   /* Analyze the tree and create the nfa.  */
 822   err = analyze (preg);
 823   if (BE (err != REG_NOERROR, 0))
 824     goto re_compile_internal_free_return;
 825
 826 #ifdef RE_ENABLE_I18N
 827   /* If possible, do searching in single byte encoding to speed things up.  */
 828   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 829     optimize_utf8 (dfa);
 830 #endif
 831
 832   /* Then create the initial state of the dfa.  */
 833   err = create_initial_state (dfa);
 834
 835   /* Release work areas.  */
 836   free_workarea_compile (preg);
 837   re_string_destruct (&regexp);
 838
 839   if (BE (err != REG_NOERROR, 0))
 840     {
 841       free_dfa_content (dfa);
 842       preg->buffer = NULL;
 843       preg->allocated = 0;
 844     }
 845
 846   return err;
 847 }
 848
 849 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 850    as the initial length of some arrays.  */
 851
 852 static reg_errcode_t
 853 init_dfa (re_dfa_t *dfa, size_t pat_len)
 854 {
 855   __re_size_t table_size;
 856 #ifndef _LIBC
 857   const char *codeset_name;
 858 #endif
 859 #ifdef RE_ENABLE_I18N
 860   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 861 #else
 862   size_t max_i18n_object_size = 0;
 863 #endif
 864   size_t max_object_size =
 865     MAX (sizeof (struct re_state_table_entry),
 866          MAX (sizeof (re_token_t),
 867               MAX (sizeof (re_node_set),
 868                    MAX (sizeof (regmatch_t),
 869                         max_i18n_object_size))));
 870
 871   memset (dfa, '\0', sizeof (re_dfa_t));
 872
 873   /* Force allocation of str_tree_storage the first time.  */
 874   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 875
 876   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 877      calculation below, and for similar doubling calculations
 878      elsewhere.  And it's <= rather than <, because some of the
 879      doubling calculations add 1 afterwards.  */
 880   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 881     return REG_ESPACE;
 882
 883   dfa->nodes_alloc = pat_len + 1;
 884   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 885
 886   /*  table_size = 2 ^ ceil(log pat_len) */
 887   for (table_size = 1; ; table_size <<= 1)
 888     if (table_size > pat_len)
 889       break;
 890
 891   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 892   dfa->state_hash_mask = table_size - 1;
 893
 894   dfa->mb_cur_max = MB_CUR_MAX;
 895 #ifdef _LIBC
 896   if (dfa->mb_cur_max == 6
 897       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 898     dfa->is_utf8 = 1;
 899   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 900                        != 0);
 901 #else
 902   codeset_name = nl_langinfo (CODESET);
 903   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 904       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 905       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 906       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 907     dfa->is_utf8 = 1;
 908
 909   /* We check exhaustively in the loop below if this charset is a
 910      superset of ASCII.  */
 911   dfa->map_notascii = 0;
 912 #endif
 913
 914 #ifdef RE_ENABLE_I18N
 915   if (dfa->mb_cur_max > 1)
 916     {
 917       if (dfa->is_utf8)
 918         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 919       else
 920         {
 921           int i, j, ch;
 922
 923           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 924           if (BE (dfa->sb_char == NULL, 0))
 925             return REG_ESPACE;
 926
 927           /* Set the bits corresponding to single byte chars.  */
 928           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 929             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 930               {
 931                 wint_t wch = __btowc (ch);
 932                 if (wch != WEOF)
 933                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 934 # ifndef _LIBC
 935                 if (isascii (ch) && wch != ch)
 936                   dfa->map_notascii = 1;
 937 # endif
 938               }
 939         }
 940     }
 941 #endif
 942
 943   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 944     return REG_ESPACE;
 945   return REG_NOERROR;
 946 }
 947
 948 /* Initialize WORD_CHAR table, which indicate which character is
 949    "word".  In this case "word" means that it is the word construction
 950    character used by some operators like "\<", "\>", etc.  */
 951
 952 static void
 953 internal_function
 954 init_word_char (re_dfa_t *dfa)
 955 {
 956   dfa->word_ops_used = 1;
 957   int i = 0;
 958   int j;
 959   int ch = 0;
 960   if (BE (dfa->map_notascii == 0, 1))
 961     {
 962       bitset_word_t bits0 = 0x00000000;
 963       bitset_word_t bits1 = 0x03ff0000;
 964       bitset_word_t bits2 = 0x87fffffe;
 965       bitset_word_t bits3 = 0x07fffffe;
 966       if (BITSET_WORD_BITS == 64)
 967         {
 968           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 969           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 970           i = 2;
 971         }
 972       else if (BITSET_WORD_BITS == 32)
 973         {
 974           dfa->word_char[0] = bits0;
 975           dfa->word_char[1] = bits1;
 976           dfa->word_char[2] = bits2;
 977           dfa->word_char[3] = bits3;
 978           i = 4;
 979         }
 980       else
 981         goto general_case;
 982       ch = 128;
 983
 984       if (BE (dfa->is_utf8, 1))
 985         {
 986           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 987           return;
 988         }
 989     }
 990
 991  general_case:
 992   for (; i < BITSET_WORDS; ++i)
 993     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 994       if (isalnum (ch) || ch == '_')
 995         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 996 }
 997
 998 /* Free the work area which are only used while compiling.  */
 999
1000 static void
1001 free_workarea_compile (regex_t *preg)
1002 {
1003   re_dfa_t *dfa = preg->buffer;
1004   bin_tree_storage_t *storage, *next;
1005   for (storage = dfa->str_tree_storage; storage; storage = next)
1006     {
1007       next = storage->next;
1008       re_free (storage);
1009     }
1010   dfa->str_tree_storage = NULL;
1011   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1012   dfa->str_tree = NULL;
1013   re_free (dfa->org_indices);
1014   dfa->org_indices = NULL;
1015 }
1016
1017 /* Create initial states for all contexts.  */
1018
1019 static reg_errcode_t
1020 create_initial_state (re_dfa_t *dfa)
1021 {
1022   Idx first, i;
1023   reg_errcode_t err;
1024   re_node_set init_nodes;
1025
1026   /* Initial states have the epsilon closure of the node which is
1027      the first node of the regular expression.  */
1028   first = dfa->str_tree->first->node_idx;
1029   dfa->init_node = first;
1030   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1031   if (BE (err != REG_NOERROR, 0))
1032     return err;
1033
1034   /* The back-references which are in initial states can epsilon transit,
1035      since in this case all of the subexpressions can be null.
1036      Then we add epsilon closures of the nodes which are the next nodes of
1037      the back-references.  */
1038   if (dfa->nbackref > 0)
1039     for (i = 0; i < init_nodes.nelem; ++i)
1040       {
1041         Idx node_idx = init_nodes.elems[i];
1042         re_token_type_t type = dfa->nodes[node_idx].type;
1043
1044         Idx clexp_idx;
1045         if (type != OP_BACK_REF)
1046           continue;
1047         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1048           {
1049             re_token_t *clexp_node;
1050             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1051             if (clexp_node->type == OP_CLOSE_SUBEXP
1052                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1053               break;
1054           }
1055         if (clexp_idx == init_nodes.nelem)
1056           continue;
1057
1058         if (type == OP_BACK_REF)
1059           {
1060             Idx dest_idx = dfa->edests[node_idx].elems[0];
1061             if (!re_node_set_contains (&init_nodes, dest_idx))
1062               {
1063                 reg_errcode_t merge_err
1064                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1065                 if (merge_err != REG_NOERROR)
1066                   return merge_err;
1067                 i = 0;
1068               }
1069           }
1070       }
1071
1072   /* It must be the first time to invoke acquire_state.  */
1073   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1074   /* We don't check ERR here, since the initial state must not be NULL.  */
1075   if (BE (dfa->init_state == NULL, 0))
1076     return err;
1077   if (dfa->init_state->has_constraint)
1078     {
1079       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1080                                                        CONTEXT_WORD);
1081       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1082                                                      CONTEXT_NEWLINE);
1083       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1084                                                          &init_nodes,
1085                                                          CONTEXT_NEWLINE
1086                                                          | CONTEXT_BEGBUF);
1087       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1088               || dfa->init_state_begbuf == NULL, 0))
1089         return err;
1090     }
1091   else
1092     dfa->init_state_word = dfa->init_state_nl
1093       = dfa->init_state_begbuf = dfa->init_state;
1094
1095   re_node_set_free (&init_nodes);
1096   return REG_NOERROR;
1097 }
1098 \f
1099 #ifdef RE_ENABLE_I18N
1100 /* If it is possible to do searching in single byte encoding instead of UTF-8
1101    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1102    DFA nodes where needed.  */
1103
1104 static void
1105 optimize_utf8 (re_dfa_t *dfa)
1106 {
1107   Idx node;
1108   int i;
1109   bool mb_chars = false;
1110   bool has_period = false;
1111
1112   for (node = 0; node < dfa->nodes_len; ++node)
1113     switch (dfa->nodes[node].type)
1114       {
1115       case CHARACTER:
1116         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1117           mb_chars = true;
1118         break;
1119       case ANCHOR:
1120         switch (dfa->nodes[node].opr.ctx_type)
1121           {
1122           case LINE_FIRST:
1123           case LINE_LAST:
1124           case BUF_FIRST:
1125           case BUF_LAST:
1126             break;
1127           default:
1128             /* Word anchors etc. cannot be handled.  It's okay to test
1129                opr.ctx_type since constraints (for all DFA nodes) are
1130                created by ORing one or more opr.ctx_type values.  */
1131             return;
1132           }
1133         break;
1134       case OP_PERIOD:
1135         has_period = true;
1136         break;
1137       case OP_BACK_REF:
1138       case OP_ALT:
1139       case END_OF_RE:
1140       case OP_DUP_ASTERISK:
1141       case OP_OPEN_SUBEXP:
1142       case OP_CLOSE_SUBEXP:
1143         break;
1144       case COMPLEX_BRACKET:
1145         return;
1146       case SIMPLE_BRACKET:
1147         /* Just double check.  */
1148         {
1149           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1150                         ? 0
1151                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1152           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1153             {
1154               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1155                 return;
1156               rshift = 0;
1157             }
1158         }
1159         break;
1160       default:
1161         abort ();
1162       }
1163
1164   if (mb_chars || has_period)
1165     for (node = 0; node < dfa->nodes_len; ++node)
1166       {
1167         if (dfa->nodes[node].type == CHARACTER
1168             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1169           dfa->nodes[node].mb_partial = 0;
1170         else if (dfa->nodes[node].type == OP_PERIOD)
1171           dfa->nodes[node].type = OP_UTF8_PERIOD;
1172       }
1173
1174   /* The search can be in single byte locale.  */
1175   dfa->mb_cur_max = 1;
1176   dfa->is_utf8 = 0;
1177   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1178 }
1179 #endif
1180 \f
1181 /* Analyze the structure tree, and calculate "first", "next", "edest",
1182    "eclosure", and "inveclosure".  */
1183
1184 static reg_errcode_t
1185 analyze (regex_t *preg)
1186 {
1187   re_dfa_t *dfa = preg->buffer;
1188   reg_errcode_t ret;
1189
1190   /* Allocate arrays.  */
1191   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1196           || dfa->eclosures == NULL, 0))
1197     return REG_ESPACE;
1198
1199   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200   if (dfa->subexp_map != NULL)
1201     {
1202       Idx i;
1203       for (i = 0; i < preg->re_nsub; i++)
1204         dfa->subexp_map[i] = i;
1205       preorder (dfa->str_tree, optimize_subexps, dfa);
1206       for (i = 0; i < preg->re_nsub; i++)
1207         if (dfa->subexp_map[i] != i)
1208           break;
1209       if (i == preg->re_nsub)
1210         {
1211           free (dfa->subexp_map);
1212           dfa->subexp_map = NULL;
1213         }
1214     }
1215
1216   ret = postorder (dfa->str_tree, lower_subexps, preg);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   ret = postorder (dfa->str_tree, calc_first, dfa);
1220   if (BE (ret != REG_NOERROR, 0))
1221     return ret;
1222   preorder (dfa->str_tree, calc_next, dfa);
1223   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226   ret = calc_eclosure (dfa);
1227   if (BE (ret != REG_NOERROR, 0))
1228     return ret;
1229
1230   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1231      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1232   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1233       || dfa->nbackref)
1234     {
1235       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236       if (BE (dfa->inveclosures == NULL, 0))
1237         return REG_ESPACE;
1238       ret = calc_inveclosure (dfa);
1239     }
1240
1241   return ret;
1242 }
1243
1244 /* Our parse trees are very unbalanced, so we cannot use a stack to
1245    implement parse tree visits.  Instead, we use parent pointers and
1246    some hairy code in these two functions.  */
1247 static reg_errcode_t
1248 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249            void *extra)
1250 {
1251   bin_tree_t *node, *prev;
1252
1253   for (node = root; ; )
1254     {
1255       /* Descend down the tree, preferably to the left (or to the right
1256          if that's the only child).  */
1257       while (node->left || node->right)
1258         if (node->left)
1259           node = node->left;
1260         else
1261           node = node->right;
1262
1263       do
1264         {
1265           reg_errcode_t err = fn (extra, node);
1266           if (BE (err != REG_NOERROR, 0))
1267             return err;
1268           if (node->parent == NULL)
1269             return REG_NOERROR;
1270           prev = node;
1271           node = node->parent;
1272         }
1273       /* Go up while we have a node that is reached from the right.  */
1274       while (node->right == prev || node->right == NULL);
1275       node = node->right;
1276     }
1277 }
1278
1279 static reg_errcode_t
1280 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1281           void *extra)
1282 {
1283   bin_tree_t *node;
1284
1285   for (node = root; ; )
1286     {
1287       reg_errcode_t err = fn (extra, node);
1288       if (BE (err != REG_NOERROR, 0))
1289         return err;
1290
1291       /* Go to the left node, or up and to the right.  */
1292       if (node->left)
1293         node = node->left;
1294       else
1295         {
1296           bin_tree_t *prev = NULL;
1297           while (node->right == prev || node->right == NULL)
1298             {
1299               prev = node;
1300               node = node->parent;
1301               if (!node)
1302                 return REG_NOERROR;
1303             }
1304           node = node->right;
1305         }
1306     }
1307 }
1308
1309 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1311    backreferences as well.  Requires a preorder visit.  */
1312 static reg_errcode_t
1313 optimize_subexps (void *extra, bin_tree_t *node)
1314 {
1315   re_dfa_t *dfa = (re_dfa_t *) extra;
1316
1317   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318     {
1319       int idx = node->token.opr.idx;
1320       node->token.opr.idx = dfa->subexp_map[idx];
1321       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1322     }
1323
1324   else if (node->token.type == SUBEXP
1325            && node->left && node->left->token.type == SUBEXP)
1326     {
1327       Idx other_idx = node->left->token.opr.idx;
1328
1329       node->left = node->left->left;
1330       if (node->left)
1331         node->left->parent = node;
1332
1333       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334       if (other_idx < BITSET_WORD_BITS)
1335         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1336     }
1337
1338   return REG_NOERROR;
1339 }
1340
1341 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1343 static reg_errcode_t
1344 lower_subexps (void *extra, bin_tree_t *node)
1345 {
1346   regex_t *preg = (regex_t *) extra;
1347   reg_errcode_t err = REG_NOERROR;
1348
1349   if (node->left && node->left->token.type == SUBEXP)
1350     {
1351       node->left = lower_subexp (&err, preg, node->left);
1352       if (node->left)
1353         node->left->parent = node;
1354     }
1355   if (node->right && node->right->token.type == SUBEXP)
1356     {
1357       node->right = lower_subexp (&err, preg, node->right);
1358       if (node->right)
1359         node->right->parent = node;
1360     }
1361
1362   return err;
1363 }
1364
1365 static bin_tree_t *
1366 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = preg->buffer;
1369   bin_tree_t *body = node->left;
1370   bin_tree_t *op, *cls, *tree1, *tree;
1371
1372   if (preg->no_sub
1373       /* We do not optimize empty subexpressions, because otherwise we may
1374          have bad CONCAT nodes with NULL children.  This is obviously not
1375          very common, so we do not lose much.  An example that triggers
1376          this case is the sed "script" /\(\)/x.  */
1377       && node->left != NULL
1378       && (node->token.opr.idx >= BITSET_WORD_BITS
1379           || !(dfa->used_bkref_map
1380                & ((bitset_word_t) 1 << node->token.opr.idx))))
1381     return node->left;
1382
1383   /* Convert the SUBEXP node to the concatenation of an
1384      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1385   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388   tree = create_tree (dfa, op, tree1, CONCAT);
1389   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1390     {
1391       *err = REG_ESPACE;
1392       return NULL;
1393     }
1394
1395   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397   return tree;
1398 }
1399
1400 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401    nodes.  Requires a postorder visit.  */
1402 static reg_errcode_t
1403 calc_first (void *extra, bin_tree_t *node)
1404 {
1405   re_dfa_t *dfa = (re_dfa_t *) extra;
1406   if (node->token.type == CONCAT)
1407     {
1408       node->first = node->left->first;
1409       node->node_idx = node->left->node_idx;
1410     }
1411   else
1412     {
1413       node->first = node;
1414       node->node_idx = re_dfa_add_node (dfa, node->token);
1415       if (BE (node->node_idx == REG_MISSING, 0))
1416         return REG_ESPACE;
1417       if (node->token.type == ANCHOR)
1418         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1419     }
1420   return REG_NOERROR;
1421 }
1422
1423 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1424 static reg_errcode_t
1425 calc_next (void *extra, bin_tree_t *node)
1426 {
1427   switch (node->token.type)
1428     {
1429     case OP_DUP_ASTERISK:
1430       node->left->next = node;
1431       break;
1432     case CONCAT:
1433       node->left->next = node->right->first;
1434       node->right->next = node->next;
1435       break;
1436     default:
1437       if (node->left)
1438         node->left->next = node->next;
1439       if (node->right)
1440         node->right->next = node->next;
1441       break;
1442     }
1443   return REG_NOERROR;
1444 }
1445
1446 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1447 static reg_errcode_t
1448 link_nfa_nodes (void *extra, bin_tree_t *node)
1449 {
1450   re_dfa_t *dfa = (re_dfa_t *) extra;
1451   Idx idx = node->node_idx;
1452   reg_errcode_t err = REG_NOERROR;
1453
1454   switch (node->token.type)
1455     {
1456     case CONCAT:
1457       break;
1458
1459     case END_OF_RE:
1460       assert (node->next == NULL);
1461       break;
1462
1463     case OP_DUP_ASTERISK:
1464     case OP_ALT:
1465       {
1466         Idx left, right;
1467         dfa->has_plural_match = 1;
1468         if (node->left != NULL)
1469           left = node->left->first->node_idx;
1470         else
1471           left = node->next->node_idx;
1472         if (node->right != NULL)
1473           right = node->right->first->node_idx;
1474         else
1475           right = node->next->node_idx;
1476         assert (REG_VALID_INDEX (left));
1477         assert (REG_VALID_INDEX (right));
1478         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1479       }
1480       break;
1481
1482     case ANCHOR:
1483     case OP_OPEN_SUBEXP:
1484     case OP_CLOSE_SUBEXP:
1485       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486       break;
1487
1488     case OP_BACK_REF:
1489       dfa->nexts[idx] = node->next->node_idx;
1490       if (node->token.type == OP_BACK_REF)
1491         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492       break;
1493
1494     default:
1495       assert (!IS_EPSILON_NODE (node->token.type));
1496       dfa->nexts[idx] = node->next->node_idx;
1497       break;
1498     }
1499
1500   return err;
1501 }
1502
1503 /* Duplicate the epsilon closure of the node ROOT_NODE.
1504    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505    to their own constraint.  */
1506
1507 static reg_errcode_t
1508 internal_function
1509 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510                         Idx root_node, unsigned int init_constraint)
1511 {
1512   Idx org_node, clone_node;
1513   bool ok;
1514   unsigned int constraint = init_constraint;
1515   for (org_node = top_org_node, clone_node = top_clone_node;;)
1516     {
1517       Idx org_dest, clone_dest;
1518       if (dfa->nodes[org_node].type == OP_BACK_REF)
1519         {
1520           /* If the back reference epsilon-transit, its destination must
1521              also have the constraint.  Then duplicate the epsilon closure
1522              of the destination of the back reference, and store it in
1523              edests of the back reference.  */
1524           org_dest = dfa->nexts[org_node];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           clone_dest = duplicate_node (dfa, org_dest, constraint);
1527           if (BE (clone_dest == REG_MISSING, 0))
1528             return REG_ESPACE;
1529           dfa->nexts[clone_node] = dfa->nexts[org_node];
1530           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531           if (BE (! ok, 0))
1532             return REG_ESPACE;
1533         }
1534       else if (dfa->edests[org_node].nelem == 0)
1535         {
1536           /* In case of the node can't epsilon-transit, don't duplicate the
1537              destination and store the original destination as the
1538              destination of the node.  */
1539           dfa->nexts[clone_node] = dfa->nexts[org_node];
1540           break;
1541         }
1542       else if (dfa->edests[org_node].nelem == 1)
1543         {
1544           /* In case of the node can epsilon-transit, and it has only one
1545              destination.  */
1546           org_dest = dfa->edests[org_node].elems[0];
1547           re_node_set_empty (dfa->edests + clone_node);
1548           /* If the node is root_node itself, it means the epsilon closure
1549              has a loop.  Then tie it to the destination of the root_node.  */
1550           if (org_node == root_node && clone_node != org_node)
1551             {
1552               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1553               if (BE (! ok, 0))
1554                 return REG_ESPACE;
1555               break;
1556             }
1557           /* In case the node has another constraint, append it.  */
1558           constraint |= dfa->nodes[org_node].constraint;
1559           clone_dest = duplicate_node (dfa, org_dest, constraint);
1560           if (BE (clone_dest == REG_MISSING, 0))
1561             return REG_ESPACE;
1562           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563           if (BE (! ok, 0))
1564             return REG_ESPACE;
1565         }
1566       else /* dfa->edests[org_node].nelem == 2 */
1567         {
1568           /* In case of the node can epsilon-transit, and it has two
1569              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1570           org_dest = dfa->edests[org_node].elems[0];
1571           re_node_set_empty (dfa->edests + clone_node);
1572           /* Search for a duplicated node which satisfies the constraint.  */
1573           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574           if (clone_dest == REG_MISSING)
1575             {
1576               /* There is no such duplicated node, create a new one.  */
1577               reg_errcode_t err;
1578               clone_dest = duplicate_node (dfa, org_dest, constraint);
1579               if (BE (clone_dest == REG_MISSING, 0))
1580                 return REG_ESPACE;
1581               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582               if (BE (! ok, 0))
1583                 return REG_ESPACE;
1584               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585                                             root_node, constraint);
1586               if (BE (err != REG_NOERROR, 0))
1587                 return err;
1588             }
1589           else
1590             {
1591               /* There is a duplicated node which satisfies the constraint,
1592                  use it to avoid infinite loop.  */
1593               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1594               if (BE (! ok, 0))
1595                 return REG_ESPACE;
1596             }
1597
1598           org_dest = dfa->edests[org_node].elems[1];
1599           clone_dest = duplicate_node (dfa, org_dest, constraint);
1600           if (BE (clone_dest == REG_MISSING, 0))
1601             return REG_ESPACE;
1602           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603           if (BE (! ok, 0))
1604             return REG_ESPACE;
1605         }
1606       org_node = org_dest;
1607       clone_node = clone_dest;
1608     }
1609   return REG_NOERROR;
1610 }
1611
1612 /* Search for a node which is duplicated from the node ORG_NODE, and
1613    satisfies the constraint CONSTRAINT.  */
1614
1615 static Idx
1616 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617                         unsigned int constraint)
1618 {
1619   Idx idx;
1620   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621     {
1622       if (org_node == dfa->org_indices[idx]
1623           && constraint == dfa->nodes[idx].constraint)
1624         return idx; /* Found.  */
1625     }
1626   return REG_MISSING; /* Not found.  */
1627 }
1628
1629 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630    Return the index of the new node, or REG_MISSING if insufficient storage is
1631    available.  */
1632
1633 static Idx
1634 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635 {
1636   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637   if (BE (dup_idx != REG_MISSING, 1))
1638     {
1639       dfa->nodes[dup_idx].constraint = constraint;
1640       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1641       dfa->nodes[dup_idx].duplicated = 1;
1642
1643       /* Store the index of the original node.  */
1644       dfa->org_indices[dup_idx] = org_idx;
1645     }
1646   return dup_idx;
1647 }
1648
1649 static reg_errcode_t
1650 calc_inveclosure (re_dfa_t *dfa)
1651 {
1652   Idx src, idx;
1653   bool ok;
1654   for (idx = 0; idx < dfa->nodes_len; ++idx)
1655     re_node_set_init_empty (dfa->inveclosures + idx);
1656
1657   for (src = 0; src < dfa->nodes_len; ++src)
1658     {
1659       Idx *elems = dfa->eclosures[src].elems;
1660       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661         {
1662           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1663           if (BE (! ok, 0))
1664             return REG_ESPACE;
1665         }
1666     }
1667
1668   return REG_NOERROR;
1669 }
1670
1671 /* Calculate "eclosure" for all the node in DFA.  */
1672
1673 static reg_errcode_t
1674 calc_eclosure (re_dfa_t *dfa)
1675 {
1676   Idx node_idx;
1677   bool incomplete;
1678 #ifdef DEBUG
1679   assert (dfa->nodes_len > 0);
1680 #endif
1681   incomplete = false;
1682   /* For each nodes, calculate epsilon closure.  */
1683   for (node_idx = 0; ; ++node_idx)
1684     {
1685       reg_errcode_t err;
1686       re_node_set eclosure_elem;
1687       if (node_idx == dfa->nodes_len)
1688         {
1689           if (!incomplete)
1690             break;
1691           incomplete = false;
1692           node_idx = 0;
1693         }
1694
1695 #ifdef DEBUG
1696       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1697 #endif
1698
1699       /* If we have already calculated, skip it.  */
1700       if (dfa->eclosures[node_idx].nelem != 0)
1701         continue;
1702       /* Calculate epsilon closure of 'node_idx'.  */
1703       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704       if (BE (err != REG_NOERROR, 0))
1705         return err;
1706
1707       if (dfa->eclosures[node_idx].nelem == 0)
1708         {
1709           incomplete = true;
1710           re_node_set_free (&eclosure_elem);
1711         }
1712     }
1713   return REG_NOERROR;
1714 }
1715
1716 /* Calculate epsilon closure of NODE.  */
1717
1718 static reg_errcode_t
1719 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1720 {
1721   reg_errcode_t err;
1722   Idx i;
1723   re_node_set eclosure;
1724   bool ok;
1725   bool incomplete = false;
1726   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727   if (BE (err != REG_NOERROR, 0))
1728     return err;
1729
1730   /* This indicates that we are calculating this node now.
1731      We reference this value to avoid infinite loop.  */
1732   dfa->eclosures[node].nelem = REG_MISSING;
1733
1734   /* If the current node has constraints, duplicate all nodes
1735      since they must inherit the constraints.  */
1736   if (dfa->nodes[node].constraint
1737       && dfa->edests[node].nelem
1738       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739     {
1740       err = duplicate_node_closure (dfa, node, node, node,
1741                                     dfa->nodes[node].constraint);
1742       if (BE (err != REG_NOERROR, 0))
1743         return err;
1744     }
1745
1746   /* Expand each epsilon destination nodes.  */
1747   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748     for (i = 0; i < dfa->edests[node].nelem; ++i)
1749       {
1750         re_node_set eclosure_elem;
1751         Idx edest = dfa->edests[node].elems[i];
1752         /* If calculating the epsilon closure of 'edest' is in progress,
1753            return intermediate result.  */
1754         if (dfa->eclosures[edest].nelem == REG_MISSING)
1755           {
1756             incomplete = true;
1757             continue;
1758           }
1759         /* If we haven't calculated the epsilon closure of 'edest' yet,
1760            calculate now. Otherwise use calculated epsilon closure.  */
1761         if (dfa->eclosures[edest].nelem == 0)
1762           {
1763             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764             if (BE (err != REG_NOERROR, 0))
1765               return err;
1766           }
1767         else
1768           eclosure_elem = dfa->eclosures[edest];
1769         /* Merge the epsilon closure of 'edest'.  */
1770         err = re_node_set_merge (&eclosure, &eclosure_elem);
1771         if (BE (err != REG_NOERROR, 0))
1772           return err;
1773         /* If the epsilon closure of 'edest' is incomplete,
1774            the epsilon closure of this node is also incomplete.  */
1775         if (dfa->eclosures[edest].nelem == 0)
1776           {
1777             incomplete = true;
1778             re_node_set_free (&eclosure_elem);
1779           }
1780       }
1781
1782   /* An epsilon closure includes itself.  */
1783   ok = re_node_set_insert (&eclosure, node);
1784   if (BE (! ok, 0))
1785     return REG_ESPACE;
1786   if (incomplete && !root)
1787     dfa->eclosures[node].nelem = 0;
1788   else
1789     dfa->eclosures[node] = eclosure;
1790   *new_set = eclosure;
1791   return REG_NOERROR;
1792 }
1793 \f
1794 /* Functions for token which are used in the parser.  */
1795
1796 /* Fetch a token from INPUT.
1797    We must not use this function inside bracket expressions.  */
1798
1799 static void
1800 internal_function
1801 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1802 {
1803   re_string_skip_bytes (input, peek_token (result, input, syntax));
1804 }
1805
1806 /* Peek a token from INPUT, and return the length of the token.
1807    We must not use this function inside bracket expressions.  */
1808
1809 static int
1810 internal_function
1811 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1812 {
1813   unsigned char c;
1814
1815   if (re_string_eoi (input))
1816     {
1817       token->type = END_OF_RE;
1818       return 0;
1819     }
1820
1821   c = re_string_peek_byte (input, 0);
1822   token->opr.c = c;
1823
1824   token->word_char = 0;
1825 #ifdef RE_ENABLE_I18N
1826   token->mb_partial = 0;
1827   if (input->mb_cur_max > 1 &&
1828       !re_string_first_byte (input, re_string_cur_idx (input)))
1829     {
1830       token->type = CHARACTER;
1831       token->mb_partial = 1;
1832       return 1;
1833     }
1834 #endif
1835   if (c == '\\')
1836     {
1837       unsigned char c2;
1838       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1839         {
1840           token->type = BACK_SLASH;
1841           return 1;
1842         }
1843
1844       c2 = re_string_peek_byte_case (input, 1);
1845       token->opr.c = c2;
1846       token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848       if (input->mb_cur_max > 1)
1849         {
1850           wint_t wc = re_string_wchar_at (input,
1851                                           re_string_cur_idx (input) + 1);
1852           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1853         }
1854       else
1855 #endif
1856         token->word_char = IS_WORD_CHAR (c2) != 0;
1857
1858       switch (c2)
1859         {
1860         case '|':
1861           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1862             token->type = OP_ALT;
1863           break;
1864         case '1': case '2': case '3': case '4': case '5':
1865         case '6': case '7': case '8': case '9':
1866           if (!(syntax & RE_NO_BK_REFS))
1867             {
1868               token->type = OP_BACK_REF;
1869               token->opr.idx = c2 - '1';
1870             }
1871           break;
1872         case '<':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             {
1875               token->type = ANCHOR;
1876               token->opr.ctx_type = WORD_FIRST;
1877             }
1878           break;
1879         case '>':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = WORD_LAST;
1884             }
1885           break;
1886         case 'b':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = WORD_DELIM;
1891             }
1892           break;
1893         case 'B':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = NOT_WORD_DELIM;
1898             }
1899           break;
1900         case 'w':
1901           if (!(syntax & RE_NO_GNU_OPS))
1902             token->type = OP_WORD;
1903           break;
1904         case 'W':
1905           if (!(syntax & RE_NO_GNU_OPS))
1906             token->type = OP_NOTWORD;
1907           break;
1908         case 's':
1909           if (!(syntax & RE_NO_GNU_OPS))
1910             token->type = OP_SPACE;
1911           break;
1912         case 'S':
1913           if (!(syntax & RE_NO_GNU_OPS))
1914             token->type = OP_NOTSPACE;
1915           break;
1916         case '`':
1917           if (!(syntax & RE_NO_GNU_OPS))
1918             {
1919               token->type = ANCHOR;
1920               token->opr.ctx_type = BUF_FIRST;
1921             }
1922           break;
1923         case '\'':
1924           if (!(syntax & RE_NO_GNU_OPS))
1925             {
1926               token->type = ANCHOR;
1927               token->opr.ctx_type = BUF_LAST;
1928             }
1929           break;
1930         case '(':
1931           if (!(syntax & RE_NO_BK_PARENS))
1932             token->type = OP_OPEN_SUBEXP;
1933           break;
1934         case ')':
1935           if (!(syntax & RE_NO_BK_PARENS))
1936             token->type = OP_CLOSE_SUBEXP;
1937           break;
1938         case '+':
1939           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1940             token->type = OP_DUP_PLUS;
1941           break;
1942         case '?':
1943           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1944             token->type = OP_DUP_QUESTION;
1945           break;
1946         case '{':
1947           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1948             token->type = OP_OPEN_DUP_NUM;
1949           break;
1950         case '}':
1951           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1952             token->type = OP_CLOSE_DUP_NUM;
1953           break;
1954         default:
1955           break;
1956         }
1957       return 2;
1958     }
1959
1960   token->type = CHARACTER;
1961 #ifdef RE_ENABLE_I18N
1962   if (input->mb_cur_max > 1)
1963     {
1964       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1965       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1966     }
1967   else
1968 #endif
1969     token->word_char = IS_WORD_CHAR (token->opr.c);
1970
1971   switch (c)
1972     {
1973     case '\n':
1974       if (syntax & RE_NEWLINE_ALT)
1975         token->type = OP_ALT;
1976       break;
1977     case '|':
1978       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1979         token->type = OP_ALT;
1980       break;
1981     case '*':
1982       token->type = OP_DUP_ASTERISK;
1983       break;
1984     case '+':
1985       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1986         token->type = OP_DUP_PLUS;
1987       break;
1988     case '?':
1989       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1990         token->type = OP_DUP_QUESTION;
1991       break;
1992     case '{':
1993       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1994         token->type = OP_OPEN_DUP_NUM;
1995       break;
1996     case '}':
1997       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1998         token->type = OP_CLOSE_DUP_NUM;
1999       break;
2000     case '(':
2001       if (syntax & RE_NO_BK_PARENS)
2002         token->type = OP_OPEN_SUBEXP;
2003       break;
2004     case ')':
2005       if (syntax & RE_NO_BK_PARENS)
2006         token->type = OP_CLOSE_SUBEXP;
2007       break;
2008     case '[':
2009       token->type = OP_OPEN_BRACKET;
2010       break;
2011     case '.':
2012       token->type = OP_PERIOD;
2013       break;
2014     case '^':
2015       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2016           re_string_cur_idx (input) != 0)
2017         {
2018           char prev = re_string_peek_byte (input, -1);
2019           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2020             break;
2021         }
2022       token->type = ANCHOR;
2023       token->opr.ctx_type = LINE_FIRST;
2024       break;
2025     case '$':
2026       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2027           re_string_cur_idx (input) + 1 != re_string_length (input))
2028         {
2029           re_token_t next;
2030           re_string_skip_bytes (input, 1);
2031           peek_token (&next, input, syntax);
2032           re_string_skip_bytes (input, -1);
2033           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2034             break;
2035         }
2036       token->type = ANCHOR;
2037       token->opr.ctx_type = LINE_LAST;
2038       break;
2039     default:
2040       break;
2041     }
2042   return 1;
2043 }
2044
2045 /* Peek a token from INPUT, and return the length of the token.
2046    We must not use this function out of bracket expressions.  */
2047
2048 static int
2049 internal_function
2050 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2051 {
2052   unsigned char c;
2053   if (re_string_eoi (input))
2054     {
2055       token->type = END_OF_RE;
2056       return 0;
2057     }
2058   c = re_string_peek_byte (input, 0);
2059   token->opr.c = c;
2060
2061 #ifdef RE_ENABLE_I18N
2062   if (input->mb_cur_max > 1 &&
2063       !re_string_first_byte (input, re_string_cur_idx (input)))
2064     {
2065       token->type = CHARACTER;
2066       return 1;
2067     }
2068 #endif /* RE_ENABLE_I18N */
2069
2070   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2071       && re_string_cur_idx (input) + 1 < re_string_length (input))
2072     {
2073       /* In this case, '\' escape a character.  */
2074       unsigned char c2;
2075       re_string_skip_bytes (input, 1);
2076       c2 = re_string_peek_byte (input, 0);
2077       token->opr.c = c2;
2078       token->type = CHARACTER;
2079       return 1;
2080     }
2081   if (c == '[') /* '[' is a special char in a bracket exps.  */
2082     {
2083       unsigned char c2;
2084       int token_len;
2085       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2086         c2 = re_string_peek_byte (input, 1);
2087       else
2088         c2 = 0;
2089       token->opr.c = c2;
2090       token_len = 2;
2091       switch (c2)
2092         {
2093         case '.':
2094           token->type = OP_OPEN_COLL_ELEM;
2095           break;
2096         case '=':
2097           token->type = OP_OPEN_EQUIV_CLASS;
2098           break;
2099         case ':':
2100           if (syntax & RE_CHAR_CLASSES)
2101             {
2102               token->type = OP_OPEN_CHAR_CLASS;
2103               break;
2104             }
2105           /* else fall through.  */
2106         default:
2107           token->type = CHARACTER;
2108           token->opr.c = c;
2109           token_len = 1;
2110           break;
2111         }
2112       return token_len;
2113     }
2114   switch (c)
2115     {
2116     case '-':
2117       token->type = OP_CHARSET_RANGE;
2118       break;
2119     case ']':
2120       token->type = OP_CLOSE_BRACKET;
2121       break;
2122     case '^':
2123       token->type = OP_NON_MATCH_LIST;
2124       break;
2125     default:
2126       token->type = CHARACTER;
2127     }
2128   return 1;
2129 }
2130 \f
2131 /* Functions for parser.  */
2132
2133 /* Entry point of the parser.
2134    Parse the regular expression REGEXP and return the structure tree.
2135    If an error occurs, ERR is set by error code, and return NULL.
2136    This function build the following tree, from regular expression <reg_exp>:
2137            CAT
2138            / \
2139           /   \
2140    <reg_exp>  EOR
2141
2142    CAT means concatenation.
2143    EOR means end of regular expression.  */
2144
2145 static bin_tree_t *
2146 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2147        reg_errcode_t *err)
2148 {
2149   re_dfa_t *dfa = preg->buffer;
2150   bin_tree_t *tree, *eor, *root;
2151   re_token_t current_token;
2152   dfa->syntax = syntax;
2153   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2155   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2156     return NULL;
2157   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2158   if (tree != NULL)
2159     root = create_tree (dfa, tree, eor, CONCAT);
2160   else
2161     root = eor;
2162   if (BE (eor == NULL || root == NULL, 0))
2163     {
2164       *err = REG_ESPACE;
2165       return NULL;
2166     }
2167   return root;
2168 }
2169
2170 /* This function build the following tree, from regular expression
2171    <branch1>|<branch2>:
2172            ALT
2173            / \
2174           /   \
2175    <branch1> <branch2>
2176
2177    ALT means alternative, which represents the operator '|'.  */
2178
2179 static bin_tree_t *
2180 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2182 {
2183   re_dfa_t *dfa = preg->buffer;
2184   bin_tree_t *tree, *branch = NULL;
2185   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2186   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187     return NULL;
2188
2189   while (token->type == OP_ALT)
2190     {
2191       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2192       if (token->type != OP_ALT && token->type != END_OF_RE
2193           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194         {
2195           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2196           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2197             return NULL;
2198         }
2199       else
2200         branch = NULL;
2201       tree = create_tree (dfa, tree, branch, OP_ALT);
2202       if (BE (tree == NULL, 0))
2203         {
2204           *err = REG_ESPACE;
2205           return NULL;
2206         }
2207     }
2208   return tree;
2209 }
2210
2211 /* This function build the following tree, from regular expression
2212    <exp1><exp2>:
2213         CAT
2214         / \
2215        /   \
2216    <exp1> <exp2>
2217
2218    CAT means concatenation.  */
2219
2220 static bin_tree_t *
2221 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223 {
2224   bin_tree_t *tree, *expr;
2225   re_dfa_t *dfa = preg->buffer;
2226   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2228     return NULL;
2229
2230   while (token->type != OP_ALT && token->type != END_OF_RE
2231          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2232     {
2233       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2234       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2235         {
2236           if (tree != NULL)
2237             postorder (tree, free_tree, NULL);
2238           return NULL;
2239         }
2240       if (tree != NULL && expr != NULL)
2241         {
2242           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2243           if (newtree == NULL)
2244             {
2245               postorder (expr, free_tree, NULL);
2246               postorder (tree, free_tree, NULL);
2247               *err = REG_ESPACE;
2248               return NULL;
2249             }
2250           tree = newtree;
2251         }
2252       else if (tree == NULL)
2253         tree = expr;
2254       /* Otherwise expr == NULL, we don't need to create new tree.  */
2255     }
2256   return tree;
2257 }
2258
2259 /* This function build the following tree, from regular expression a*:
2260          *
2261          |
2262          a
2263 */
2264
2265 static bin_tree_t *
2266 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2267                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2268 {
2269   re_dfa_t *dfa = preg->buffer;
2270   bin_tree_t *tree;
2271   switch (token->type)
2272     {
2273     case CHARACTER:
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276         {
2277           *err = REG_ESPACE;
2278           return NULL;
2279         }
2280 #ifdef RE_ENABLE_I18N
2281       if (dfa->mb_cur_max > 1)
2282         {
2283           while (!re_string_eoi (regexp)
2284                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285             {
2286               bin_tree_t *mbc_remain;
2287               fetch_token (token, regexp, syntax);
2288               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290               if (BE (mbc_remain == NULL || tree == NULL, 0))
2291                 {
2292                   *err = REG_ESPACE;
2293                   return NULL;
2294                 }
2295             }
2296         }
2297 #endif
2298       break;
2299     case OP_OPEN_SUBEXP:
2300       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2302         return NULL;
2303       break;
2304     case OP_OPEN_BRACKET:
2305       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307         return NULL;
2308       break;
2309     case OP_BACK_REF:
2310       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311         {
2312           *err = REG_ESUBREG;
2313           return NULL;
2314         }
2315       dfa->used_bkref_map |= 1 << token->opr.idx;
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (BE (tree == NULL, 0))
2318         {
2319           *err = REG_ESPACE;
2320           return NULL;
2321         }
2322       ++dfa->nbackref;
2323       dfa->has_mb_node = 1;
2324       break;
2325     case OP_OPEN_DUP_NUM:
2326       if (syntax & RE_CONTEXT_INVALID_DUP)
2327         {
2328           *err = REG_BADRPT;
2329           return NULL;
2330         }
2331       /* FALLTHROUGH */
2332     case OP_DUP_ASTERISK:
2333     case OP_DUP_PLUS:
2334     case OP_DUP_QUESTION:
2335       if (syntax & RE_CONTEXT_INVALID_OPS)
2336         {
2337           *err = REG_BADRPT;
2338           return NULL;
2339         }
2340       else if (syntax & RE_CONTEXT_INDEP_OPS)
2341         {
2342           fetch_token (token, regexp, syntax);
2343           return parse_expression (regexp, preg, token, syntax, nest, err);
2344         }
2345       /* else fall through  */
2346     case OP_CLOSE_SUBEXP:
2347       if ((token->type == OP_CLOSE_SUBEXP) &&
2348           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349         {
2350           *err = REG_ERPAREN;
2351           return NULL;
2352         }
2353       /* else fall through  */
2354     case OP_CLOSE_DUP_NUM:
2355       /* We treat it as a normal character.  */
2356
2357       /* Then we can these characters as normal characters.  */
2358       token->type = CHARACTER;
2359       /* mb_partial and word_char bits should be initialized already
2360          by peek_token.  */
2361       tree = create_token_tree (dfa, NULL, NULL, token);
2362       if (BE (tree == NULL, 0))
2363         {
2364           *err = REG_ESPACE;
2365           return NULL;
2366         }
2367       break;
2368     case ANCHOR:
2369       if ((token->opr.ctx_type
2370            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371           && dfa->word_ops_used == 0)
2372         init_word_char (dfa);
2373       if (token->opr.ctx_type == WORD_DELIM
2374           || token->opr.ctx_type == NOT_WORD_DELIM)
2375         {
2376           bin_tree_t *tree_first, *tree_last;
2377           if (token->opr.ctx_type == WORD_DELIM)
2378             {
2379               token->opr.ctx_type = WORD_FIRST;
2380               tree_first = create_token_tree (dfa, NULL, NULL, token);
2381               token->opr.ctx_type = WORD_LAST;
2382             }
2383           else
2384             {
2385               token->opr.ctx_type = INSIDE_WORD;
2386               tree_first = create_token_tree (dfa, NULL, NULL, token);
2387               token->opr.ctx_type = INSIDE_NOTWORD;
2388             }
2389           tree_last = create_token_tree (dfa, NULL, NULL, token);
2390           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392             {
2393               *err = REG_ESPACE;
2394               return NULL;
2395             }
2396         }
2397       else
2398         {
2399           tree = create_token_tree (dfa, NULL, NULL, token);
2400           if (BE (tree == NULL, 0))
2401             {
2402               *err = REG_ESPACE;
2403               return NULL;
2404             }
2405         }
2406       /* We must return here, since ANCHORs can't be followed
2407          by repetition operators.
2408          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2410       fetch_token (token, regexp, syntax);
2411       return tree;
2412     case OP_PERIOD:
2413       tree = create_token_tree (dfa, NULL, NULL, token);
2414       if (BE (tree == NULL, 0))
2415         {
2416           *err = REG_ESPACE;
2417           return NULL;
2418         }
2419       if (dfa->mb_cur_max > 1)
2420         dfa->has_mb_node = 1;
2421       break;
2422     case OP_WORD:
2423     case OP_NOTWORD:
2424       tree = build_charclass_op (dfa, regexp->trans,
2425                                  (const unsigned char *) "alnum",
2426                                  (const unsigned char *) "_",
2427                                  token->type == OP_NOTWORD, err);
2428       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2429         return NULL;
2430       break;
2431     case OP_SPACE:
2432     case OP_NOTSPACE:
2433       tree = build_charclass_op (dfa, regexp->trans,
2434                                  (const unsigned char *) "space",
2435                                  (const unsigned char *) "",
2436                                  token->type == OP_NOTSPACE, err);
2437       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438         return NULL;
2439       break;
2440     case OP_ALT:
2441     case END_OF_RE:
2442       return NULL;
2443     case BACK_SLASH:
2444       *err = REG_EESCAPE;
2445       return NULL;
2446     default:
2447       /* Must not happen?  */
2448 #ifdef DEBUG
2449       assert (0);
2450 #endif
2451       return NULL;
2452     }
2453   fetch_token (token, regexp, syntax);
2454
2455   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2456          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2457     {
2458       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460         return NULL;
2461       /* In BRE consecutive duplications are not allowed.  */
2462       if ((syntax & RE_CONTEXT_INVALID_DUP)
2463           && (token->type == OP_DUP_ASTERISK
2464               || token->type == OP_OPEN_DUP_NUM))
2465         {
2466           *err = REG_BADRPT;
2467           return NULL;
2468         }
2469     }
2470
2471   return tree;
2472 }
2473
2474 /* This function build the following tree, from regular expression
2475    (<reg_exp>):
2476          SUBEXP
2477             |
2478         <reg_exp>
2479 */
2480
2481 static bin_tree_t *
2482 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2483                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2484 {
2485   re_dfa_t *dfa = preg->buffer;
2486   bin_tree_t *tree;
2487   size_t cur_nsub;
2488   cur_nsub = preg->re_nsub++;
2489
2490   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2491
2492   /* The subexpression may be a null string.  */
2493   if (token->type == OP_CLOSE_SUBEXP)
2494     tree = NULL;
2495   else
2496     {
2497       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2498       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2499         {
2500           if (tree != NULL)
2501             postorder (tree, free_tree, NULL);
2502           *err = REG_EPAREN;
2503         }
2504       if (BE (*err != REG_NOERROR, 0))
2505         return NULL;
2506     }
2507
2508   if (cur_nsub <= '9' - '1')
2509     dfa->completed_bkref_map |= 1 << cur_nsub;
2510
2511   tree = create_tree (dfa, tree, NULL, SUBEXP);
2512   if (BE (tree == NULL, 0))
2513     {
2514       *err = REG_ESPACE;
2515       return NULL;
2516     }
2517   tree->token.opr.idx = cur_nsub;
2518   return tree;
2519 }
2520
2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2522
2523 static bin_tree_t *
2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2526 {
2527   bin_tree_t *tree = NULL, *old_tree = NULL;
2528   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529   re_token_t start_token = *token;
2530
2531   if (token->type == OP_OPEN_DUP_NUM)
2532     {
2533       end = 0;
2534       start = fetch_number (regexp, token, syntax);
2535       if (start == REG_MISSING)
2536         {
2537           if (token->type == CHARACTER && token->opr.c == ',')
2538             start = 0; /* We treat "{,m}" as "{0,m}".  */
2539           else
2540             {
2541               *err = REG_BADBR; /* <re>{} is invalid.  */
2542               return NULL;
2543             }
2544         }
2545       if (BE (start != REG_ERROR, 1))
2546         {
2547           /* We treat "{n}" as "{n,n}".  */
2548           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549                  : ((token->type == CHARACTER && token->opr.c == ',')
2550                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2551         }
2552       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2553         {
2554           /* Invalid sequence.  */
2555           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2556             {
2557               if (token->type == END_OF_RE)
2558                 *err = REG_EBRACE;
2559               else
2560                 *err = REG_BADBR;
2561
2562               return NULL;
2563             }
2564
2565           /* If the syntax bit is set, rollback.  */
2566           re_string_set_index (regexp, start_idx);
2567           *token = start_token;
2568           token->type = CHARACTER;
2569           /* mb_partial and word_char bits should be already initialized by
2570              peek_token.  */
2571           return elem;
2572         }
2573
2574       if (BE ((end != REG_MISSING && start > end)
2575               || token->type != OP_CLOSE_DUP_NUM, 0))
2576         {
2577           /* First number greater than second.  */
2578           *err = REG_BADBR;
2579           return NULL;
2580         }
2581
2582       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2583         {
2584           *err = REG_ESIZE;
2585           return NULL;
2586         }
2587     }
2588   else
2589     {
2590       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2592     }
2593
2594   fetch_token (token, regexp, syntax);
2595
2596   if (BE (elem == NULL, 0))
2597     return NULL;
2598   if (BE (start == 0 && end == 0, 0))
2599     {
2600       postorder (elem, free_tree, NULL);
2601       return NULL;
2602     }
2603
2604   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2605   if (BE (start > 0, 0))
2606     {
2607       tree = elem;
2608       for (i = 2; i <= start; ++i)
2609         {
2610           elem = duplicate_tree (elem, dfa);
2611           tree = create_tree (dfa, tree, elem, CONCAT);
2612           if (BE (elem == NULL || tree == NULL, 0))
2613             goto parse_dup_op_espace;
2614         }
2615
2616       if (start == end)
2617         return tree;
2618
2619       /* Duplicate ELEM before it is marked optional.  */
2620       elem = duplicate_tree (elem, dfa);
2621       old_tree = tree;
2622     }
2623   else
2624     old_tree = NULL;
2625
2626   if (elem->token.type == SUBEXP)
2627     {
2628       uintptr_t subidx = elem->token.opr.idx;
2629       postorder (elem, mark_opt_subexp, (void *) subidx);
2630     }
2631
2632   tree = create_tree (dfa, elem, NULL,
2633                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2634   if (BE (tree == NULL, 0))
2635     goto parse_dup_op_espace;
2636
2637 /* From gnulib's "intprops.h":
2638    True if the arithmetic type T is signed.  */
2639 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2640
2641   /* This loop is actually executed only when end != REG_MISSING,
2642      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2643      already created the start+1-th copy.  */
2644   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2645     for (i = start + 2; i <= end; ++i)
2646       {
2647         elem = duplicate_tree (elem, dfa);
2648         tree = create_tree (dfa, tree, elem, CONCAT);
2649         if (BE (elem == NULL || tree == NULL, 0))
2650           goto parse_dup_op_espace;
2651
2652         tree = create_tree (dfa, tree, NULL, OP_ALT);
2653         if (BE (tree == NULL, 0))
2654           goto parse_dup_op_espace;
2655       }
2656
2657   if (old_tree)
2658     tree = create_tree (dfa, old_tree, tree, CONCAT);
2659
2660   return tree;
2661
2662  parse_dup_op_espace:
2663   *err = REG_ESPACE;
2664   return NULL;
2665 }
2666
2667 /* Size of the names for collating symbol/equivalence_class/character_class.
2668    I'm not sure, but maybe enough.  */
2669 #define BRACKET_NAME_BUF_SIZE 32
2670
2671 #ifndef _LIBC
2672   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2673      Build the range expression which starts from START_ELEM, and ends
2674      at END_ELEM.  The result are written to MBCSET and SBCSET.
2675      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2676      mbcset->range_ends, is a pointer argument since we may
2677      update it.  */
2678
2679 static reg_errcode_t
2680 internal_function
2681 # ifdef RE_ENABLE_I18N
2682 build_range_exp (const reg_syntax_t syntax,
2683                  bitset_t sbcset,
2684                  re_charset_t *mbcset,
2685                  Idx *range_alloc,
2686                  const bracket_elem_t *start_elem,
2687                  const bracket_elem_t *end_elem)
2688 # else /* not RE_ENABLE_I18N */
2689 build_range_exp (const reg_syntax_t syntax,
2690                  bitset_t sbcset,
2691                  const bracket_elem_t *start_elem,
2692                  const bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2694 {
2695   unsigned int start_ch, end_ch;
2696   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2697   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2699           0))
2700     return REG_ERANGE;
2701
2702   /* We can handle no multi character collating elements without libc
2703      support.  */
2704   if (BE ((start_elem->type == COLL_SYM
2705            && strlen ((char *) start_elem->opr.name) > 1)
2706           || (end_elem->type == COLL_SYM
2707               && strlen ((char *) end_elem->opr.name) > 1), 0))
2708     return REG_ECOLLATE;
2709
2710 # ifdef RE_ENABLE_I18N
2711   {
2712     wchar_t wc;
2713     wint_t start_wc;
2714     wint_t end_wc;
2715     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2716
2717     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2718                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719                    : 0));
2720     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2721               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722                  : 0));
2723     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2724                 ? __btowc (start_ch) : start_elem->opr.wch);
2725     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2726               ? __btowc (end_ch) : end_elem->opr.wch);
2727     if (start_wc == WEOF || end_wc == WEOF)
2728       return REG_ECOLLATE;
2729     cmp_buf[0] = start_wc;
2730     cmp_buf[4] = end_wc;
2731
2732     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2733             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2734       return REG_ERANGE;
2735
2736     /* Got valid collation sequence values, add them as a new entry.
2737        However, for !_LIBC we have no collation elements: if the
2738        character set is single byte, the single byte character set
2739        that we build below suffices.  parse_bracket_exp passes
2740        no MBCSET if dfa->mb_cur_max == 1.  */
2741     if (mbcset)
2742       {
2743         /* Check the space of the arrays.  */
2744         if (BE (*range_alloc == mbcset->nranges, 0))
2745           {
2746             /* There is not enough space, need realloc.  */
2747             wchar_t *new_array_start, *new_array_end;
2748             Idx new_nranges;
2749
2750             /* +1 in case of mbcset->nranges is 0.  */
2751             new_nranges = 2 * mbcset->nranges + 1;
2752             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753                are NULL if *range_alloc == 0.  */
2754             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755                                           new_nranges);
2756             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2757                                         new_nranges);
2758
2759             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760               return REG_ESPACE;
2761
2762             mbcset->range_starts = new_array_start;
2763             mbcset->range_ends = new_array_end;
2764             *range_alloc = new_nranges;
2765           }
2766
2767         mbcset->range_starts[mbcset->nranges] = start_wc;
2768         mbcset->range_ends[mbcset->nranges++] = end_wc;
2769       }
2770
2771     /* Build the table for single byte characters.  */
2772     for (wc = 0; wc < SBC_MAX; ++wc)
2773       {
2774         cmp_buf[2] = wc;
2775         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2776             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2777           bitset_set (sbcset, wc);
2778       }
2779   }
2780 # else /* not RE_ENABLE_I18N */
2781   {
2782     unsigned int ch;
2783     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2784                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2785                    : 0));
2786     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2787               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2788                  : 0));
2789     if (start_ch > end_ch)
2790       return REG_ERANGE;
2791     /* Build the table for single byte characters.  */
2792     for (ch = 0; ch < SBC_MAX; ++ch)
2793       if (start_ch <= ch  && ch <= end_ch)
2794         bitset_set (sbcset, ch);
2795   }
2796 # endif /* not RE_ENABLE_I18N */
2797   return REG_NOERROR;
2798 }
2799 #endif /* not _LIBC */
2800
2801 #ifndef _LIBC
2802 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2803    Build the collating element which is represented by NAME.
2804    The result are written to MBCSET and SBCSET.
2805    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2806    pointer argument since we may update it.  */
2807
2808 static reg_errcode_t
2809 internal_function
2810 # ifdef RE_ENABLE_I18N
2811 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2812                         Idx *coll_sym_alloc, const unsigned char *name)
2813 # else /* not RE_ENABLE_I18N */
2814 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2815 # endif /* not RE_ENABLE_I18N */
2816 {
2817   size_t name_len = strlen ((const char *) name);
2818   if (BE (name_len != 1, 0))
2819     return REG_ECOLLATE;
2820   else
2821     {
2822       bitset_set (sbcset, name[0]);
2823       return REG_NOERROR;
2824     }
2825 }
2826 #endif /* not _LIBC */
2827
2828 /* This function parse bracket expression like "[abc]", "[a-c]",
2829    "[[.a-a.]]" etc.  */
2830
2831 static bin_tree_t *
2832 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2833                    reg_syntax_t syntax, reg_errcode_t *err)
2834 {
2835 #ifdef _LIBC
2836   const unsigned char *collseqmb;
2837   const char *collseqwc;
2838   uint32_t nrules;
2839   int32_t table_size;
2840   const int32_t *symb_table;
2841   const unsigned char *extra;
2842
2843   /* Local function for parse_bracket_exp used in _LIBC environment.
2844      Seek the collating symbol entry corresponding to NAME.
2845      Return the index of the symbol in the SYMB_TABLE.  */
2846
2847   auto inline int32_t
2848   __attribute ((always_inline))
2849   seek_collating_symbol_entry (name, name_len)
2850          const unsigned char *name;
2851          size_t name_len;
2852     {
2853       int32_t hash = elem_hash ((const char *) name, name_len);
2854       int32_t elem = hash % table_size;
2855       if (symb_table[2 * elem] != 0)
2856         {
2857           int32_t second = hash % (table_size - 2) + 1;
2858
2859           do
2860             {
2861               /* First compare the hashing value.  */
2862               if (symb_table[2 * elem] == hash
2863                   /* Compare the length of the name.  */
2864                   && name_len == extra[symb_table[2 * elem + 1]]
2865                   /* Compare the name.  */
2866                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2867                              name_len) == 0)
2868                 {
2869                   /* Yep, this is the entry.  */
2870                   break;
2871                 }
2872
2873               /* Next entry.  */
2874               elem += second;
2875             }
2876           while (symb_table[2 * elem] != 0);
2877         }
2878       return elem;
2879     }
2880
2881   /* Local function for parse_bracket_exp used in _LIBC environment.
2882      Look up the collation sequence value of BR_ELEM.
2883      Return the value if succeeded, UINT_MAX otherwise.  */
2884
2885   auto inline unsigned int
2886   __attribute ((always_inline))
2887   lookup_collation_sequence_value (br_elem)
2888          bracket_elem_t *br_elem;
2889     {
2890       if (br_elem->type == SB_CHAR)
2891         {
2892           /*
2893           if (MB_CUR_MAX == 1)
2894           */
2895           if (nrules == 0)
2896             return collseqmb[br_elem->opr.ch];
2897           else
2898             {
2899               wint_t wc = __btowc (br_elem->opr.ch);
2900               return __collseq_table_lookup (collseqwc, wc);
2901             }
2902         }
2903       else if (br_elem->type == MB_CHAR)
2904         {
2905           if (nrules != 0)
2906             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2907         }
2908       else if (br_elem->type == COLL_SYM)
2909         {
2910           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2911           if (nrules != 0)
2912             {
2913               int32_t elem, idx;
2914               elem = seek_collating_symbol_entry (br_elem->opr.name,
2915                                                   sym_name_len);
2916               if (symb_table[2 * elem] != 0)
2917                 {
2918                   /* We found the entry.  */
2919                   idx = symb_table[2 * elem + 1];
2920                   /* Skip the name of collating element name.  */
2921                   idx += 1 + extra[idx];
2922                   /* Skip the byte sequence of the collating element.  */
2923                   idx += 1 + extra[idx];
2924                   /* Adjust for the alignment.  */
2925                   idx = (idx + 3) & ~3;
2926                   /* Skip the multibyte collation sequence value.  */
2927                   idx += sizeof (unsigned int);
2928                   /* Skip the wide char sequence of the collating element.  */
2929                   idx += sizeof (unsigned int) *
2930                     (1 + *(unsigned int *) (extra + idx));
2931                   /* Return the collation sequence value.  */
2932                   return *(unsigned int *) (extra + idx);
2933                 }
2934               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2935                 {
2936                   /* No valid character.  Match it as a single byte
2937                      character.  */
2938                   return collseqmb[br_elem->opr.name[0]];
2939                 }
2940             }
2941           else if (sym_name_len == 1)
2942             return collseqmb[br_elem->opr.name[0]];
2943         }
2944       return UINT_MAX;
2945     }
2946
2947   /* Local function for parse_bracket_exp used in _LIBC environment.
2948      Build the range expression which starts from START_ELEM, and ends
2949      at END_ELEM.  The result are written to MBCSET and SBCSET.
2950      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2951      mbcset->range_ends, is a pointer argument since we may
2952      update it.  */
2953
2954   auto inline reg_errcode_t
2955   __attribute ((always_inline))
2956   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2957          re_charset_t *mbcset;
2958          Idx *range_alloc;
2959          bitset_t sbcset;
2960          bracket_elem_t *start_elem, *end_elem;
2961     {
2962       unsigned int ch;
2963       uint32_t start_collseq;
2964       uint32_t end_collseq;
2965
2966       /* Equivalence Classes and Character Classes can't be a range
2967          start/end.  */
2968       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2969               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2970               0))
2971         return REG_ERANGE;
2972
2973       start_collseq = lookup_collation_sequence_value (start_elem);
2974       end_collseq = lookup_collation_sequence_value (end_elem);
2975       /* Check start/end collation sequence values.  */
2976       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2977         return REG_ECOLLATE;
2978       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2979         return REG_ERANGE;
2980
2981       /* Got valid collation sequence values, add them as a new entry.
2982          However, if we have no collation elements, and the character set
2983          is single byte, the single byte character set that we
2984          build below suffices. */
2985       if (nrules > 0 || dfa->mb_cur_max > 1)
2986         {
2987           /* Check the space of the arrays.  */
2988           if (BE (*range_alloc == mbcset->nranges, 0))
2989             {
2990               /* There is not enough space, need realloc.  */
2991               uint32_t *new_array_start;
2992               uint32_t *new_array_end;
2993               Idx new_nranges;
2994
2995               /* +1 in case of mbcset->nranges is 0.  */
2996               new_nranges = 2 * mbcset->nranges + 1;
2997               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2998                                             new_nranges);
2999               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3000                                           new_nranges);
3001
3002               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3003                 return REG_ESPACE;
3004
3005               mbcset->range_starts = new_array_start;
3006               mbcset->range_ends = new_array_end;
3007               *range_alloc = new_nranges;
3008             }
3009
3010           mbcset->range_starts[mbcset->nranges] = start_collseq;
3011           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3012         }
3013
3014       /* Build the table for single byte characters.  */
3015       for (ch = 0; ch < SBC_MAX; ch++)
3016         {
3017           uint32_t ch_collseq;
3018           /*
3019           if (MB_CUR_MAX == 1)
3020           */
3021           if (nrules == 0)
3022             ch_collseq = collseqmb[ch];
3023           else
3024             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3025           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3026             bitset_set (sbcset, ch);
3027         }
3028       return REG_NOERROR;
3029     }
3030
3031   /* Local function for parse_bracket_exp used in _LIBC environment.
3032      Build the collating element which is represented by NAME.
3033      The result are written to MBCSET and SBCSET.
3034      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3035      pointer argument since we may update it.  */
3036
3037   auto inline reg_errcode_t
3038   __attribute ((always_inline))
3039   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3040          re_charset_t *mbcset;
3041          Idx *coll_sym_alloc;
3042          bitset_t sbcset;
3043          const unsigned char *name;
3044     {
3045       int32_t elem, idx;
3046       size_t name_len = strlen ((const char *) name);
3047       if (nrules != 0)
3048         {
3049           elem = seek_collating_symbol_entry (name, name_len);
3050           if (symb_table[2 * elem] != 0)
3051             {
3052               /* We found the entry.  */
3053               idx = symb_table[2 * elem + 1];
3054               /* Skip the name of collating element name.  */
3055               idx += 1 + extra[idx];
3056             }
3057           else if (symb_table[2 * elem] == 0 && name_len == 1)
3058             {
3059               /* No valid character, treat it as a normal
3060                  character.  */
3061               bitset_set (sbcset, name[0]);
3062               return REG_NOERROR;
3063             }
3064           else
3065             return REG_ECOLLATE;
3066
3067           /* Got valid collation sequence, add it as a new entry.  */
3068           /* Check the space of the arrays.  */
3069           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3070             {
3071               /* Not enough, realloc it.  */
3072               /* +1 in case of mbcset->ncoll_syms is 0.  */
3073               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3074               /* Use realloc since mbcset->coll_syms is NULL
3075                  if *alloc == 0.  */
3076               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3077                                                    new_coll_sym_alloc);
3078               if (BE (new_coll_syms == NULL, 0))
3079                 return REG_ESPACE;
3080               mbcset->coll_syms = new_coll_syms;
3081               *coll_sym_alloc = new_coll_sym_alloc;
3082             }
3083           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3084           return REG_NOERROR;
3085         }
3086       else
3087         {
3088           if (BE (name_len != 1, 0))
3089             return REG_ECOLLATE;
3090           else
3091             {
3092               bitset_set (sbcset, name[0]);
3093               return REG_NOERROR;
3094             }
3095         }
3096     }
3097 #endif
3098
3099   re_token_t br_token;
3100   re_bitset_ptr_t sbcset;
3101 #ifdef RE_ENABLE_I18N
3102   re_charset_t *mbcset;
3103   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3104   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3105 #endif /* not RE_ENABLE_I18N */
3106   bool non_match = false;
3107   bin_tree_t *work_tree;
3108   int token_len;
3109   bool first_round = true;
3110 #ifdef _LIBC
3111   collseqmb = (const unsigned char *)
3112     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3113   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3114   if (nrules)
3115     {
3116       /*
3117       if (MB_CUR_MAX > 1)
3118       */
3119       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3120       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3121       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3122                                                   _NL_COLLATE_SYMB_TABLEMB);
3123       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3124                                                    _NL_COLLATE_SYMB_EXTRAMB);
3125     }
3126 #endif
3127   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3128 #ifdef RE_ENABLE_I18N
3129   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3130 #endif /* RE_ENABLE_I18N */
3131 #ifdef RE_ENABLE_I18N
3132   if (BE (sbcset == NULL || mbcset == NULL, 0))
3133 #else
3134   if (BE (sbcset == NULL, 0))
3135 #endif /* RE_ENABLE_I18N */
3136     {
3137       re_free (sbcset);
3138 #ifdef RE_ENABLE_I18N
3139       re_free (mbcset);
3140 #endif
3141       *err = REG_ESPACE;
3142       return NULL;
3143     }
3144
3145   token_len = peek_token_bracket (token, regexp, syntax);
3146   if (BE (token->type == END_OF_RE, 0))
3147     {
3148       *err = REG_BADPAT;
3149       goto parse_bracket_exp_free_return;
3150     }
3151   if (token->type == OP_NON_MATCH_LIST)
3152     {
3153 #ifdef RE_ENABLE_I18N
3154       mbcset->non_match = 1;
3155 #endif /* not RE_ENABLE_I18N */
3156       non_match = true;
3157       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3158         bitset_set (sbcset, '\n');
3159       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3160       token_len = peek_token_bracket (token, regexp, syntax);
3161       if (BE (token->type == END_OF_RE, 0))
3162         {
3163           *err = REG_BADPAT;
3164           goto parse_bracket_exp_free_return;
3165         }
3166     }
3167
3168   /* We treat the first ']' as a normal character.  */
3169   if (token->type == OP_CLOSE_BRACKET)
3170     token->type = CHARACTER;
3171
3172   while (1)
3173     {
3174       bracket_elem_t start_elem, end_elem;
3175       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3176       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3177       reg_errcode_t ret;
3178       int token_len2 = 0;
3179       bool is_range_exp = false;
3180       re_token_t token2;
3181
3182       start_elem.opr.name = start_name_buf;
3183       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3184                                    syntax, first_round);
3185       if (BE (ret != REG_NOERROR, 0))
3186         {
3187           *err = ret;
3188           goto parse_bracket_exp_free_return;
3189         }
3190       first_round = false;
3191
3192       /* Get information about the next token.  We need it in any case.  */
3193       token_len = peek_token_bracket (token, regexp, syntax);
3194
3195       /* Do not check for ranges if we know they are not allowed.  */
3196       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3197         {
3198           if (BE (token->type == END_OF_RE, 0))
3199             {
3200               *err = REG_EBRACK;
3201               goto parse_bracket_exp_free_return;
3202             }
3203           if (token->type == OP_CHARSET_RANGE)
3204             {
3205               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3206               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3207               if (BE (token2.type == END_OF_RE, 0))
3208                 {
3209                   *err = REG_EBRACK;
3210                   goto parse_bracket_exp_free_return;
3211                 }
3212               if (token2.type == OP_CLOSE_BRACKET)
3213                 {
3214                   /* We treat the last '-' as a normal character.  */
3215                   re_string_skip_bytes (regexp, -token_len);
3216                   token->type = CHARACTER;
3217                 }
3218               else
3219                 is_range_exp = true;
3220             }
3221         }
3222
3223       if (is_range_exp == true)
3224         {
3225           end_elem.opr.name = end_name_buf;
3226           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3227                                        dfa, syntax, true);
3228           if (BE (ret != REG_NOERROR, 0))
3229             {
3230               *err = ret;
3231               goto parse_bracket_exp_free_return;
3232             }
3233
3234           token_len = peek_token_bracket (token, regexp, syntax);
3235
3236 #ifdef _LIBC
3237           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3238                                   &start_elem, &end_elem);
3239 #else
3240 # ifdef RE_ENABLE_I18N
3241           *err = build_range_exp (syntax, sbcset,
3242                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3243                                   &range_alloc, &start_elem, &end_elem);
3244 # else
3245           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3246 # endif
3247 #endif /* RE_ENABLE_I18N */
3248           if (BE (*err != REG_NOERROR, 0))
3249             goto parse_bracket_exp_free_return;
3250         }
3251       else
3252         {
3253           switch (start_elem.type)
3254             {
3255             case SB_CHAR:
3256               bitset_set (sbcset, start_elem.opr.ch);
3257               break;
3258 #ifdef RE_ENABLE_I18N
3259             case MB_CHAR:
3260               /* Check whether the array has enough space.  */
3261               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3262                 {
3263                   wchar_t *new_mbchars;
3264                   /* Not enough, realloc it.  */
3265                   /* +1 in case of mbcset->nmbchars is 0.  */
3266                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3267                   /* Use realloc since array is NULL if *alloc == 0.  */
3268                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3269                                             mbchar_alloc);
3270                   if (BE (new_mbchars == NULL, 0))
3271                     goto parse_bracket_exp_espace;
3272                   mbcset->mbchars = new_mbchars;
3273                 }
3274               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3275               break;
3276 #endif /* RE_ENABLE_I18N */
3277             case EQUIV_CLASS:
3278               *err = build_equiv_class (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280                                         mbcset, &equiv_class_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282                                         start_elem.opr.name);
3283               if (BE (*err != REG_NOERROR, 0))
3284                 goto parse_bracket_exp_free_return;
3285               break;
3286             case COLL_SYM:
3287               *err = build_collating_symbol (sbcset,
3288 #ifdef RE_ENABLE_I18N
3289                                              mbcset, &coll_sym_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291                                              start_elem.opr.name);
3292               if (BE (*err != REG_NOERROR, 0))
3293                 goto parse_bracket_exp_free_return;
3294               break;
3295             case CHAR_CLASS:
3296               *err = build_charclass (regexp->trans, sbcset,
3297 #ifdef RE_ENABLE_I18N
3298                                       mbcset, &char_class_alloc,
3299 #endif /* RE_ENABLE_I18N */
3300                                       start_elem.opr.name, syntax);
3301               if (BE (*err != REG_NOERROR, 0))
3302                goto parse_bracket_exp_free_return;
3303               break;
3304             default:
3305               assert (0);
3306               break;
3307             }
3308         }
3309       if (BE (token->type == END_OF_RE, 0))
3310         {
3311           *err = REG_EBRACK;
3312           goto parse_bracket_exp_free_return;
3313         }
3314       if (token->type == OP_CLOSE_BRACKET)
3315         break;
3316     }
3317
3318   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3319
3320   /* If it is non-matching list.  */
3321   if (non_match)
3322     bitset_not (sbcset);
3323
3324 #ifdef RE_ENABLE_I18N
3325   /* Ensure only single byte characters are set.  */
3326   if (dfa->mb_cur_max > 1)
3327     bitset_mask (sbcset, dfa->sb_char);
3328
3329   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3330       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3331                                                      || mbcset->non_match)))
3332     {
3333       bin_tree_t *mbc_tree;
3334       int sbc_idx;
3335       /* Build a tree for complex bracket.  */
3336       dfa->has_mb_node = 1;
3337       br_token.type = COMPLEX_BRACKET;
3338       br_token.opr.mbcset = mbcset;
3339       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3340       if (BE (mbc_tree == NULL, 0))
3341         goto parse_bracket_exp_espace;
3342       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3343         if (sbcset[sbc_idx])
3344           break;
3345       /* If there are no bits set in sbcset, there is no point
3346          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3347       if (sbc_idx < BITSET_WORDS)
3348         {
3349           /* Build a tree for simple bracket.  */
3350           br_token.type = SIMPLE_BRACKET;
3351           br_token.opr.sbcset = sbcset;
3352           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3353           if (BE (work_tree == NULL, 0))
3354             goto parse_bracket_exp_espace;
3355
3356           /* Then join them by ALT node.  */
3357           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3358           if (BE (work_tree == NULL, 0))
3359             goto parse_bracket_exp_espace;
3360         }
3361       else
3362         {
3363           re_free (sbcset);
3364           work_tree = mbc_tree;
3365         }
3366     }
3367   else
3368 #endif /* not RE_ENABLE_I18N */
3369     {
3370 #ifdef RE_ENABLE_I18N
3371       free_charset (mbcset);
3372 #endif
3373       /* Build a tree for simple bracket.  */
3374       br_token.type = SIMPLE_BRACKET;
3375       br_token.opr.sbcset = sbcset;
3376       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3377       if (BE (work_tree == NULL, 0))
3378         goto parse_bracket_exp_espace;
3379     }
3380   return work_tree;
3381
3382  parse_bracket_exp_espace:
3383   *err = REG_ESPACE;
3384  parse_bracket_exp_free_return:
3385   re_free (sbcset);
3386 #ifdef RE_ENABLE_I18N
3387   free_charset (mbcset);
3388 #endif /* RE_ENABLE_I18N */
3389   return NULL;
3390 }
3391
3392 /* Parse an element in the bracket expression.  */
3393
3394 static reg_errcode_t
3395 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3396                        re_token_t *token, int token_len, re_dfa_t *dfa,
3397                        reg_syntax_t syntax, bool accept_hyphen)
3398 {
3399 #ifdef RE_ENABLE_I18N
3400   int cur_char_size;
3401   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3402   if (cur_char_size > 1)
3403     {
3404       elem->type = MB_CHAR;
3405       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3406       re_string_skip_bytes (regexp, cur_char_size);
3407       return REG_NOERROR;
3408     }
3409 #endif /* RE_ENABLE_I18N */
3410   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3411   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3412       || token->type == OP_OPEN_EQUIV_CLASS)
3413     return parse_bracket_symbol (elem, regexp, token);
3414   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3415     {
3416       /* A '-' must only appear as anything but a range indicator before
3417          the closing bracket.  Everything else is an error.  */
3418       re_token_t token2;
3419       (void) peek_token_bracket (&token2, regexp, syntax);
3420       if (token2.type != OP_CLOSE_BRACKET)
3421         /* The actual error value is not standardized since this whole
3422            case is undefined.  But ERANGE makes good sense.  */
3423         return REG_ERANGE;
3424     }
3425   elem->type = SB_CHAR;
3426   elem->opr.ch = token->opr.c;
3427   return REG_NOERROR;
3428 }
3429
3430 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3431    such as [:<character_class>:], [.<collating_element>.], and
3432    [=<equivalent_class>=].  */
3433
3434 static reg_errcode_t
3435 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3436                       re_token_t *token)
3437 {
3438   unsigned char ch, delim = token->opr.c;
3439   int i = 0;
3440   if (re_string_eoi(regexp))
3441     return REG_EBRACK;
3442   for (;; ++i)
3443     {
3444       if (i >= BRACKET_NAME_BUF_SIZE)
3445         return REG_EBRACK;
3446       if (token->type == OP_OPEN_CHAR_CLASS)
3447         ch = re_string_fetch_byte_case (regexp);
3448       else
3449         ch = re_string_fetch_byte (regexp);
3450       if (re_string_eoi(regexp))
3451         return REG_EBRACK;
3452       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3453         break;
3454       elem->opr.name[i] = ch;
3455     }
3456   re_string_skip_bytes (regexp, 1);
3457   elem->opr.name[i] = '\0';
3458   switch (token->type)
3459     {
3460     case OP_OPEN_COLL_ELEM:
3461       elem->type = COLL_SYM;
3462       break;
3463     case OP_OPEN_EQUIV_CLASS:
3464       elem->type = EQUIV_CLASS;
3465       break;
3466     case OP_OPEN_CHAR_CLASS:
3467       elem->type = CHAR_CLASS;
3468       break;
3469     default:
3470       break;
3471     }
3472   return REG_NOERROR;
3473 }
3474
3475   /* Helper function for parse_bracket_exp.
3476      Build the equivalence class which is represented by NAME.
3477      The result are written to MBCSET and SBCSET.
3478      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3479      is a pointer argument since we may update it.  */
3480
3481 static reg_errcode_t
3482 #ifdef RE_ENABLE_I18N
3483 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3484                    Idx *equiv_class_alloc, const unsigned char *name)
3485 #else /* not RE_ENABLE_I18N */
3486 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3487 #endif /* not RE_ENABLE_I18N */
3488 {
3489 #ifdef _LIBC
3490   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3491   if (nrules != 0)
3492     {
3493       const int32_t *table, *indirect;
3494       const unsigned char *weights, *extra, *cp;
3495       unsigned char char_buf[2];
3496       int32_t idx1, idx2;
3497       unsigned int ch;
3498       size_t len;
3499       /* This #include defines a local function!  */
3500 # include <locale/weight.h>
3501       /* Calculate the index for equivalence class.  */
3502       cp = name;
3503       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3504       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3505                                                _NL_COLLATE_WEIGHTMB);
3506       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3507                                                    _NL_COLLATE_EXTRAMB);
3508       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3509                                                 _NL_COLLATE_INDIRECTMB);
3510       idx1 = findidx (&cp, -1);
3511       if (BE (idx1 == 0 || *cp != '\0', 0))
3512         /* This isn't a valid character.  */
3513         return REG_ECOLLATE;
3514
3515       /* Build single byte matching table for this equivalence class.  */
3516       len = weights[idx1 & 0xffffff];
3517       for (ch = 0; ch < SBC_MAX; ++ch)
3518         {
3519           char_buf[0] = ch;
3520           cp = char_buf;
3521           idx2 = findidx (&cp, 1);
3522 /*
3523           idx2 = table[ch];
3524 */
3525           if (idx2 == 0)
3526             /* This isn't a valid character.  */
3527             continue;
3528           /* Compare only if the length matches and the collation rule
3529              index is the same.  */
3530           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3531             {
3532               int cnt = 0;
3533
3534               while (cnt <= len &&
3535                      weights[(idx1 & 0xffffff) + 1 + cnt]
3536                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3537                 ++cnt;
3538
3539               if (cnt > len)
3540                 bitset_set (sbcset, ch);
3541             }
3542         }
3543       /* Check whether the array has enough space.  */
3544       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3545         {
3546           /* Not enough, realloc it.  */
3547           /* +1 in case of mbcset->nequiv_classes is 0.  */
3548           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3549           /* Use realloc since the array is NULL if *alloc == 0.  */
3550           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3551                                                    int32_t,
3552                                                    new_equiv_class_alloc);
3553           if (BE (new_equiv_classes == NULL, 0))
3554             return REG_ESPACE;
3555           mbcset->equiv_classes = new_equiv_classes;
3556           *equiv_class_alloc = new_equiv_class_alloc;
3557         }
3558       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3559     }
3560   else
3561 #endif /* _LIBC */
3562     {
3563       if (BE (strlen ((const char *) name) != 1, 0))
3564         return REG_ECOLLATE;
3565       bitset_set (sbcset, *name);
3566     }
3567   return REG_NOERROR;
3568 }
3569
3570   /* Helper function for parse_bracket_exp.
3571      Build the character class which is represented by NAME.
3572      The result are written to MBCSET and SBCSET.
3573      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3574      is a pointer argument since we may update it.  */
3575
3576 static reg_errcode_t
3577 #ifdef RE_ENABLE_I18N
3578 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579                  re_charset_t *mbcset, Idx *char_class_alloc,
3580                  const unsigned char *class_name, reg_syntax_t syntax)
3581 #else /* not RE_ENABLE_I18N */
3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583                  const unsigned char *class_name, reg_syntax_t syntax)
3584 #endif /* not RE_ENABLE_I18N */
3585 {
3586   int i;
3587   const char *name = (const char *) class_name;
3588
3589   /* In case of REG_ICASE "upper" and "lower" match the both of
3590      upper and lower cases.  */
3591   if ((syntax & RE_ICASE)
3592       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3593     name = "alpha";
3594
3595 #ifdef RE_ENABLE_I18N
3596   /* Check the space of the arrays.  */
3597   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3598     {
3599       /* Not enough, realloc it.  */
3600       /* +1 in case of mbcset->nchar_classes is 0.  */
3601       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3602       /* Use realloc since array is NULL if *alloc == 0.  */
3603       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3604                                                new_char_class_alloc);
3605       if (BE (new_char_classes == NULL, 0))
3606         return REG_ESPACE;
3607       mbcset->char_classes = new_char_classes;
3608       *char_class_alloc = new_char_class_alloc;
3609     }
3610   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3611 #endif /* RE_ENABLE_I18N */
3612
3613 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3614   do {                                          \
3615     if (BE (trans != NULL, 0))                  \
3616       {                                         \
3617         for (i = 0; i < SBC_MAX; ++i)           \
3618           if (ctype_func (i))                   \
3619             bitset_set (sbcset, trans[i]);      \
3620       }                                         \
3621     else                                        \
3622       {                                         \
3623         for (i = 0; i < SBC_MAX; ++i)           \
3624           if (ctype_func (i))                   \
3625             bitset_set (sbcset, i);             \
3626       }                                         \
3627   } while (0)
3628
3629   if (strcmp (name, "alnum") == 0)
3630     BUILD_CHARCLASS_LOOP (isalnum);
3631   else if (strcmp (name, "cntrl") == 0)
3632     BUILD_CHARCLASS_LOOP (iscntrl);
3633   else if (strcmp (name, "lower") == 0)
3634     BUILD_CHARCLASS_LOOP (islower);
3635   else if (strcmp (name, "space") == 0)
3636     BUILD_CHARCLASS_LOOP (isspace);
3637   else if (strcmp (name, "alpha") == 0)
3638     BUILD_CHARCLASS_LOOP (isalpha);
3639   else if (strcmp (name, "digit") == 0)
3640     BUILD_CHARCLASS_LOOP (isdigit);
3641   else if (strcmp (name, "print") == 0)
3642     BUILD_CHARCLASS_LOOP (isprint);
3643   else if (strcmp (name, "upper") == 0)
3644     BUILD_CHARCLASS_LOOP (isupper);
3645   else if (strcmp (name, "blank") == 0)
3646     BUILD_CHARCLASS_LOOP (isblank);
3647   else if (strcmp (name, "graph") == 0)
3648     BUILD_CHARCLASS_LOOP (isgraph);
3649   else if (strcmp (name, "punct") == 0)
3650     BUILD_CHARCLASS_LOOP (ispunct);
3651   else if (strcmp (name, "xdigit") == 0)
3652     BUILD_CHARCLASS_LOOP (isxdigit);
3653   else
3654     return REG_ECTYPE;
3655
3656   return REG_NOERROR;
3657 }
3658
3659 static bin_tree_t *
3660 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3661                     const unsigned char *class_name,
3662                     const unsigned char *extra, bool non_match,
3663                     reg_errcode_t *err)
3664 {
3665   re_bitset_ptr_t sbcset;
3666 #ifdef RE_ENABLE_I18N
3667   re_charset_t *mbcset;
3668   Idx alloc = 0;
3669 #endif /* not RE_ENABLE_I18N */
3670   reg_errcode_t ret;
3671   re_token_t br_token;
3672   bin_tree_t *tree;
3673
3674   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3675 #ifdef RE_ENABLE_I18N
3676   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3677 #endif /* RE_ENABLE_I18N */
3678
3679 #ifdef RE_ENABLE_I18N
3680   if (BE (sbcset == NULL || mbcset == NULL, 0))
3681 #else /* not RE_ENABLE_I18N */
3682   if (BE (sbcset == NULL, 0))
3683 #endif /* not RE_ENABLE_I18N */
3684     {
3685       *err = REG_ESPACE;
3686       return NULL;
3687     }
3688
3689   if (non_match)
3690     {
3691 #ifdef RE_ENABLE_I18N
3692       mbcset->non_match = 1;
3693 #endif /* not RE_ENABLE_I18N */
3694     }
3695
3696   /* We don't care the syntax in this case.  */
3697   ret = build_charclass (trans, sbcset,
3698 #ifdef RE_ENABLE_I18N
3699                          mbcset, &alloc,
3700 #endif /* RE_ENABLE_I18N */
3701                          class_name, 0);
3702
3703   if (BE (ret != REG_NOERROR, 0))
3704     {
3705       re_free (sbcset);
3706 #ifdef RE_ENABLE_I18N
3707       free_charset (mbcset);
3708 #endif /* RE_ENABLE_I18N */
3709       *err = ret;
3710       return NULL;
3711     }
3712   /* \w match '_' also.  */
3713   for (; *extra; extra++)
3714     bitset_set (sbcset, *extra);
3715
3716   /* If it is non-matching list.  */
3717   if (non_match)
3718     bitset_not (sbcset);
3719
3720 #ifdef RE_ENABLE_I18N
3721   /* Ensure only single byte characters are set.  */
3722   if (dfa->mb_cur_max > 1)
3723     bitset_mask (sbcset, dfa->sb_char);
3724 #endif
3725
3726   /* Build a tree for simple bracket.  */
3727   br_token.type = SIMPLE_BRACKET;
3728   br_token.opr.sbcset = sbcset;
3729   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3730   if (BE (tree == NULL, 0))
3731     goto build_word_op_espace;
3732
3733 #ifdef RE_ENABLE_I18N
3734   if (dfa->mb_cur_max > 1)
3735     {
3736       bin_tree_t *mbc_tree;
3737       /* Build a tree for complex bracket.  */
3738       br_token.type = COMPLEX_BRACKET;
3739       br_token.opr.mbcset = mbcset;
3740       dfa->has_mb_node = 1;
3741       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3742       if (BE (mbc_tree == NULL, 0))
3743         goto build_word_op_espace;
3744       /* Then join them by ALT node.  */
3745       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3746       if (BE (mbc_tree != NULL, 1))
3747         return tree;
3748     }
3749   else
3750     {
3751       free_charset (mbcset);
3752       return tree;
3753     }
3754 #else /* not RE_ENABLE_I18N */
3755   return tree;
3756 #endif /* not RE_ENABLE_I18N */
3757
3758  build_word_op_espace:
3759   re_free (sbcset);
3760 #ifdef RE_ENABLE_I18N
3761   free_charset (mbcset);
3762 #endif /* RE_ENABLE_I18N */
3763   *err = REG_ESPACE;
3764   return NULL;
3765 }
3766
3767 /* This is intended for the expressions like "a{1,3}".
3768    Fetch a number from 'input', and return the number.
3769    Return REG_MISSING if the number field is empty like "{,1}".
3770    Return RE_DUP_MAX + 1 if the number field is too large.
3771    Return REG_ERROR if an error occurred.  */
3772
3773 static Idx
3774 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3775 {
3776   Idx num = REG_MISSING;
3777   unsigned char c;
3778   while (1)
3779     {
3780       fetch_token (token, input, syntax);
3781       c = token->opr.c;
3782       if (BE (token->type == END_OF_RE, 0))
3783         return REG_ERROR;
3784       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3785         break;
3786       num = ((token->type != CHARACTER || c < '0' || '9' < c
3787               || num == REG_ERROR)
3788              ? REG_ERROR
3789              : num == REG_MISSING
3790              ? c - '0'
3791              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3792     }
3793   return num;
3794 }
3795 \f
3796 #ifdef RE_ENABLE_I18N
3797 static void
3798 free_charset (re_charset_t *cset)
3799 {
3800   re_free (cset->mbchars);
3801 # ifdef _LIBC
3802   re_free (cset->coll_syms);
3803   re_free (cset->equiv_classes);
3804   re_free (cset->range_starts);
3805   re_free (cset->range_ends);
3806 # endif
3807   re_free (cset->char_classes);
3808   re_free (cset);
3809 }
3810 #endif /* RE_ENABLE_I18N */
3811 \f
3812 /* Functions for binary tree operation.  */
3813
3814 /* Create a tree node.  */
3815
3816 static bin_tree_t *
3817 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3818              re_token_type_t type)
3819 {
3820   re_token_t t;
3821   t.type = type;
3822   return create_token_tree (dfa, left, right, &t);
3823 }
3824
3825 static bin_tree_t *
3826 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3827                    const re_token_t *token)
3828 {
3829   bin_tree_t *tree;
3830   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3831     {
3832       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3833
3834       if (storage == NULL)
3835         return NULL;
3836       storage->next = dfa->str_tree_storage;
3837       dfa->str_tree_storage = storage;
3838       dfa->str_tree_storage_idx = 0;
3839     }
3840   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3841
3842   tree->parent = NULL;
3843   tree->left = left;
3844   tree->right = right;
3845   tree->token = *token;
3846   tree->token.duplicated = 0;
3847   tree->token.opt_subexp = 0;
3848   tree->first = NULL;
3849   tree->next = NULL;
3850   tree->node_idx = REG_MISSING;
3851
3852   if (left != NULL)
3853     left->parent = tree;
3854   if (right != NULL)
3855     right->parent = tree;
3856   return tree;
3857 }
3858
3859 /* Mark the tree SRC as an optional subexpression.
3860    To be called from preorder or postorder.  */
3861
3862 static reg_errcode_t
3863 mark_opt_subexp (void *extra, bin_tree_t *node)
3864 {
3865   Idx idx = (uintptr_t) extra;
3866   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3867     node->token.opt_subexp = 1;
3868
3869   return REG_NOERROR;
3870 }
3871
3872 /* Free the allocated memory inside NODE. */
3873
3874 static void
3875 free_token (re_token_t *node)
3876 {
3877 #ifdef RE_ENABLE_I18N
3878   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3879     free_charset (node->opr.mbcset);
3880   else
3881 #endif /* RE_ENABLE_I18N */
3882     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3883       re_free (node->opr.sbcset);
3884 }
3885
3886 /* Worker function for tree walking.  Free the allocated memory inside NODE
3887    and its children. */
3888
3889 static reg_errcode_t
3890 free_tree (void *extra, bin_tree_t *node)
3891 {
3892   free_token (&node->token);
3893   return REG_NOERROR;
3894 }
3895
3896
3897 /* Duplicate the node SRC, and return new node.  This is a preorder
3898    visit similar to the one implemented by the generic visitor, but
3899    we need more infrastructure to maintain two parallel trees --- so,
3900    it's easier to duplicate.  */
3901
3902 static bin_tree_t *
3903 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3904 {
3905   const bin_tree_t *node;
3906   bin_tree_t *dup_root;
3907   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3908
3909   for (node = root; ; )
3910     {
3911       /* Create a new tree and link it back to the current parent.  */
3912       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3913       if (*p_new == NULL)
3914         return NULL;
3915       (*p_new)->parent = dup_node;
3916       (*p_new)->token.duplicated = 1;
3917       dup_node = *p_new;
3918
3919       /* Go to the left node, or up and to the right.  */
3920       if (node->left)
3921         {
3922           node = node->left;
3923           p_new = &dup_node->left;
3924         }
3925       else
3926         {
3927           const bin_tree_t *prev = NULL;
3928           while (node->right == prev || node->right == NULL)
3929             {
3930               prev = node;
3931               node = node->parent;
3932               dup_node = dup_node->parent;
3933               if (!node)
3934                 return dup_root;
3935             }
3936           node = node->right;
3937           p_new = &dup_node->right;
3938         }
3939     }
3940 }