lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
  18
  19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  20                                           size_t length, reg_syntax_t syntax);
  21 static void re_compile_fastmap_iter (regex_t *bufp,
  22                                      const re_dfastate_t *init_state,
  23                                      char *fastmap);
  24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  25 #ifdef RE_ENABLE_I18N
  26 static void free_charset (re_charset_t *cset);
  27 #endif /* RE_ENABLE_I18N */
  28 static void free_workarea_compile (regex_t *preg);
  29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  30 #ifdef RE_ENABLE_I18N
  31 static void optimize_utf8 (re_dfa_t *dfa);
  32 #endif
  33 static reg_errcode_t analyze (regex_t *preg);
  34 static reg_errcode_t preorder (bin_tree_t *root,
  35                                reg_errcode_t (fn (void *, bin_tree_t *)),
  36                                void *extra);
  37 static reg_errcode_t postorder (bin_tree_t *root,
  38                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                 void *extra);
  40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  43                                  bin_tree_t *node);
  44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  49                                    unsigned int constraint);
  50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  52                                          Idx node, bool root);
  53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  54 static Idx fetch_number (re_string_t *input, re_token_t *token,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax) internal_function;
  58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  59                           reg_syntax_t syntax, reg_errcode_t *err);
  60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  61                                   re_token_t *token, reg_syntax_t syntax,
  62                                   Idx nest, reg_errcode_t *err);
  63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  64                                  re_token_t *token, reg_syntax_t syntax,
  65                                  Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  67                                      re_token_t *token, reg_syntax_t syntax,
  68                                      Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  70                                   re_token_t *token, reg_syntax_t syntax,
  71                                   Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  73                                  re_dfa_t *dfa, re_token_t *token,
  74                                  reg_syntax_t syntax, reg_errcode_t *err);
  75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  76                                       re_token_t *token, reg_syntax_t syntax,
  77                                       reg_errcode_t *err);
  78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  79                                             re_string_t *regexp,
  80                                             re_token_t *token, int token_len,
  81                                             re_dfa_t *dfa,
  82                                             reg_syntax_t syntax,
  83                                             bool accept_hyphen);
  84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  85                                           re_string_t *regexp,
  86                                           re_token_t *token);
  87 #ifdef RE_ENABLE_I18N
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const unsigned char *class_name,
  97                                       reg_syntax_t syntax);
  98 #else  /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 100                                         const unsigned char *name);
 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                       bitset_t sbcset,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #endif /* not RE_ENABLE_I18N */
 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 107                                        RE_TRANSLATE_TYPE trans,
 108                                        const unsigned char *class_name,
 109                                        const unsigned char *extra,
 110                                        bool non_match, reg_errcode_t *err);
 111 static bin_tree_t *create_tree (re_dfa_t *dfa,
 112                                 bin_tree_t *left, bin_tree_t *right,
 113                                 re_token_type_t type);
 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 115                                       bin_tree_t *left, bin_tree_t *right,
 116                                       const re_token_t *token);
 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 118 static void free_token (re_token_t *node);
 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 121 \f
 122 /* This table gives an error message for each of the error codes listed
 123    in regex.h.  Obviously the order here has to be same as there.
 124    POSIX doesn't require that we do anything for REG_NOERROR,
 125    but why not be nice?  */
 126
 127 static const char __re_error_msgid[] =
 128   {
 129 #define REG_NOERROR_IDX 0
 130     gettext_noop ("Success")    /* REG_NOERROR */
 131     "\0"
 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 133     gettext_noop ("No match")   /* REG_NOMATCH */
 134     "\0"
 135 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 137     "\0"
 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 140     "\0"
 141 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 143     "\0"
 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 146     "\0"
 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 149     "\0"
 150 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 151     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 152     "\0"
 153 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 155     "\0"
 156 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 158     "\0"
 159 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 161     "\0"
 162 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 163     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 164     "\0"
 165 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 167     "\0"
 168 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 170     "\0"
 171 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 173     "\0"
 174 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 176     "\0"
 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 179   };
 180
 181 static const size_t __re_error_msgid_idx[] =
 182   {
 183     REG_NOERROR_IDX,
 184     REG_NOMATCH_IDX,
 185     REG_BADPAT_IDX,
 186     REG_ECOLLATE_IDX,
 187     REG_ECTYPE_IDX,
 188     REG_EESCAPE_IDX,
 189     REG_ESUBREG_IDX,
 190     REG_EBRACK_IDX,
 191     REG_EPAREN_IDX,
 192     REG_EBRACE_IDX,
 193     REG_BADBR_IDX,
 194     REG_ERANGE_IDX,
 195     REG_ESPACE_IDX,
 196     REG_BADRPT_IDX,
 197     REG_EEND_IDX,
 198     REG_ESIZE_IDX,
 199     REG_ERPAREN_IDX
 200   };
 201 \f
 202 /* Entry points for GNU code.  */
 203
 204 /* re_compile_pattern is the GNU regular expression compiler: it
 205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 206    Returns 0 if the pattern was valid, otherwise an error string.
 207
 208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 209    are set in BUFP on entry.  */
 210
 211 #ifdef _LIBC
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 #else /* size_t might promote */
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 #endif
 222 {
 223   reg_errcode_t ret;
 224
 225   /* And GNU code determines whether or not to get register information
 226      by passing null for the REGS argument to re_match, etc., not by
 227      setting no_sub, unless RE_NO_SUB is set.  */
 228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 229
 230   /* Match anchors at newline.  */
 231   bufp->newline_anchor = 1;
 232
 233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 234
 235   if (!ret)
 236     return NULL;
 237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 238 }
 239 #ifdef _LIBC
 240 weak_alias (__re_compile_pattern, re_compile_pattern)
 241 #endif
 242
 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 244    also be assigned to arbitrarily: each pattern buffer stores its own
 245    syntax, so it can be changed between regex compilations.  */
 246 /* This has no initializer because initialized variables in Emacs
 247    become read-only after dumping.  */
 248 reg_syntax_t re_syntax_options;
 249
 250
 251 /* Specify the precise syntax of regexps for compilation.  This provides
 252    for compatibility for various utilities which historically have
 253    different, incompatible syntaxes.
 254
 255    The argument SYNTAX is a bit mask comprised of the various bits
 256    defined in regex.h.  We return the old syntax.  */
 257
 258 reg_syntax_t
 259 re_set_syntax (syntax)
 260     reg_syntax_t syntax;
 261 {
 262   reg_syntax_t ret = re_syntax_options;
 263
 264   re_syntax_options = syntax;
 265   return ret;
 266 }
 267 #ifdef _LIBC
 268 weak_alias (__re_set_syntax, re_set_syntax)
 269 #endif
 270
 271 int
 272 re_compile_fastmap (bufp)
 273     struct re_pattern_buffer *bufp;
 274 {
 275   re_dfa_t *dfa = bufp->buffer;
 276   char *fastmap = bufp->fastmap;
 277
 278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 280   if (dfa->init_state != dfa->init_state_word)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 282   if (dfa->init_state != dfa->init_state_nl)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 284   if (dfa->init_state != dfa->init_state_begbuf)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 286   bufp->fastmap_accurate = 1;
 287   return 0;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 291 #endif
 292
 293 static inline void
 294 __attribute ((always_inline))
 295 re_set_fastmap (char *fastmap, bool icase, int ch)
 296 {
 297   fastmap[ch] = 1;
 298   if (icase)
 299     fastmap[tolower (ch)] = 1;
 300 }
 301
 302 /* Helper function for re_compile_fastmap.
 303    Compile fastmap for the initial_state INIT_STATE.  */
 304
 305 static void
 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 307                          char *fastmap)
 308 {
 309   re_dfa_t *dfa = bufp->buffer;
 310   Idx node_cnt;
 311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 313     {
 314       Idx node = init_state->nodes.elems[node_cnt];
 315       re_token_type_t type = dfa->nodes[node].type;
 316
 317       if (type == CHARACTER)
 318         {
 319           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 320 #ifdef RE_ENABLE_I18N
 321           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 322             {
 323               unsigned char buf[MB_LEN_MAX];
 324               unsigned char *p;
 325               wchar_t wc;
 326               mbstate_t state;
 327
 328               p = buf;
 329               *p++ = dfa->nodes[node].opr.c;
 330               while (++node < dfa->nodes_len
 331                      && dfa->nodes[node].type == CHARACTER
 332                      && dfa->nodes[node].mb_partial)
 333                 *p++ = dfa->nodes[node].opr.c;
 334               memset (&state, '\0', sizeof (state));
 335               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 336                              &state) == p - buf
 337                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 338                       != (size_t) -1))
 339                 re_set_fastmap (fastmap, false, buf[0]);
 340             }
 341 #endif
 342         }
 343       else if (type == SIMPLE_BRACKET)
 344         {
 345           int i, ch;
 346           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 347             {
 348               int j;
 349               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 350               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 351                 if (w & ((bitset_word_t) 1 << j))
 352                   re_set_fastmap (fastmap, icase, ch);
 353             }
 354         }
 355 #ifdef RE_ENABLE_I18N
 356       else if (type == COMPLEX_BRACKET)
 357         {
 358           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 359           Idx i;
 360
 361 # ifdef _LIBC
 362           /* See if we have to try all bytes which start multiple collation
 363              elements.
 364              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 365                   collation element, and don't catch 'b' since 'b' is
 366                   the only collation element which starts from 'b' (and
 367                   it is caught by SIMPLE_BRACKET).  */
 368               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 369                   && (cset->ncoll_syms || cset->nranges))
 370                 {
 371                   const int32_t *table = (const int32_t *)
 372                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 373                   for (i = 0; i < SBC_MAX; ++i)
 374                     if (table[i] < 0)
 375                       re_set_fastmap (fastmap, icase, i);
 376                 }
 377 # endif /* _LIBC */
 378
 379           /* See if we have to start the match at all multibyte characters,
 380              i.e. where we would not find an invalid sequence.  This only
 381              applies to multibyte character sets; for single byte character
 382              sets, the SIMPLE_BRACKET again suffices.  */
 383           if (dfa->mb_cur_max > 1
 384               && (cset->nchar_classes || cset->non_match || cset->nranges
 385 # ifdef _LIBC
 386                   || cset->nequiv_classes
 387 # endif /* _LIBC */
 388                  ))
 389             {
 390               unsigned char c = 0;
 391               do
 392                 {
 393                   mbstate_t mbs;
 394                   memset (&mbs, 0, sizeof (mbs));
 395                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 396                     re_set_fastmap (fastmap, false, (int) c);
 397                 }
 398               while (++c != 0);
 399             }
 400
 401           else
 402             {
 403               /* ... Else catch all bytes which can start the mbchars.  */
 404               for (i = 0; i < cset->nmbchars; ++i)
 405                 {
 406                   char buf[256];
 407                   mbstate_t state;
 408                   memset (&state, '\0', sizeof (state));
 409                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 410                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 411                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 412                     {
 413                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 414                           != (size_t) -1)
 415                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 416                     }
 417                 }
 418             }
 419         }
 420 #endif /* RE_ENABLE_I18N */
 421       else if (type == OP_PERIOD
 422 #ifdef RE_ENABLE_I18N
 423                || type == OP_UTF8_PERIOD
 424 #endif /* RE_ENABLE_I18N */
 425                || type == END_OF_RE)
 426         {
 427           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 428           if (type == END_OF_RE)
 429             bufp->can_be_null = 1;
 430           return;
 431         }
 432     }
 433 }
 434 \f
 435 /* Entry point for POSIX code.  */
 436 /* regcomp takes a regular expression as a string and compiles it.
 437
 438    PREG is a regex_t *.  We do not expect any fields to be initialized,
 439    since POSIX says we shouldn't.  Thus, we set
 440
 441      'buffer' to the compiled pattern;
 442      'used' to the length of the compiled pattern;
 443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 445        RE_SYNTAX_POSIX_BASIC;
 446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 447      'fastmap' to an allocated space for the fastmap;
 448      'fastmap_accurate' to zero;
 449      're_nsub' to the number of subexpressions in PATTERN.
 450
 451    PATTERN is the address of the pattern string.
 452
 453    CFLAGS is a series of bits which affect compilation.
 454
 455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 456      use POSIX basic syntax.
 457
 458      If REG_NEWLINE is set, then . and [^...] don't match newline.
 459      Also, regexec will try a match beginning after every newline.
 460
 461      If REG_ICASE is set, then we considers upper- and lowercase
 462      versions of letters to be equivalent when matching.
 463
 464      If REG_NOSUB is set, then when PREG is passed to regexec, that
 465      routine will report only success or failure, and nothing about the
 466      registers.
 467
 468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 469    the return codes and their meanings.)  */
 470
 471 int
 472 regcomp (preg, pattern, cflags)
 473     regex_t *_Restrict_ preg;
 474     const char *_Restrict_ pattern;
 475     int cflags;
 476 {
 477   reg_errcode_t ret;
 478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 479                          : RE_SYNTAX_POSIX_BASIC);
 480
 481   preg->buffer = NULL;
 482   preg->allocated = 0;
 483   preg->used = 0;
 484
 485   /* Try to allocate space for the fastmap.  */
 486   preg->fastmap = re_malloc (char, SBC_MAX);
 487   if (BE (preg->fastmap == NULL, 0))
 488     return REG_ESPACE;
 489
 490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 491
 492   /* If REG_NEWLINE is set, newlines are treated differently.  */
 493   if (cflags & REG_NEWLINE)
 494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 495       syntax &= ~RE_DOT_NEWLINE;
 496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 497       /* It also changes the matching behavior.  */
 498       preg->newline_anchor = 1;
 499     }
 500   else
 501     preg->newline_anchor = 0;
 502   preg->no_sub = !!(cflags & REG_NOSUB);
 503   preg->translate = NULL;
 504
 505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 506
 507   /* POSIX doesn't distinguish between an unmatched open-group and an
 508      unmatched close-group: both are REG_EPAREN.  */
 509   if (ret == REG_ERPAREN)
 510     ret = REG_EPAREN;
 511
 512   /* We have already checked preg->fastmap != NULL.  */
 513   if (BE (ret == REG_NOERROR, 1))
 514     /* Compute the fastmap now, since regexec cannot modify the pattern
 515        buffer.  This function never fails in this implementation.  */
 516     (void) re_compile_fastmap (preg);
 517   else
 518     {
 519       /* Some error occurred while compiling the expression.  */
 520       re_free (preg->fastmap);
 521       preg->fastmap = NULL;
 522     }
 523
 524   return (int) ret;
 525 }
 526 #ifdef _LIBC
 527 weak_alias (__regcomp, regcomp)
 528 #endif
 529
 530 /* Returns a message corresponding to an error code, ERRCODE, returned
 531    from either regcomp or regexec.   We don't use PREG here.  */
 532
 533 #ifdef _LIBC
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *_Restrict_ preg;
 538     char *_Restrict_ errbuf;
 539     size_t errbuf_size;
 540 #else /* size_t might promote */
 541 size_t
 542 regerror (int errcode, const regex_t *_Restrict_ preg,
 543           char *_Restrict_ errbuf, size_t errbuf_size)
 544 #endif
 545 {
 546   const char *msg;
 547   size_t msg_size;
 548
 549   if (BE (errcode < 0
 550           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 551                                / sizeof (__re_error_msgid_idx[0])), 0))
 552     /* Only error codes returned by the rest of the code should be passed
 553        to this routine.  If we are given anything else, or if other regex
 554        code generates an invalid error code, then the program has a bug.
 555        Dump core so we can fix it.  */
 556     abort ();
 557
 558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 559
 560   msg_size = strlen (msg) + 1; /* Includes the null.  */
 561
 562   if (BE (errbuf_size != 0, 1))
 563     {
 564       size_t cpy_size = msg_size;
 565       if (BE (msg_size > errbuf_size, 0))
 566         {
 567           cpy_size = errbuf_size - 1;
 568           errbuf[cpy_size] = '\0';
 569         }
 570       memcpy (errbuf, msg, cpy_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.  */
 585 static const bitset_t utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # ifdef __GNUC__
 589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590 # else
 591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #   error "bitset_word_t is narrower than 32 bits"
 593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 #  endif
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 # endif
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   const char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 903       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 904       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 905       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 906     dfa->is_utf8 = 1;
 907
 908   /* We check exhaustively in the loop below if this charset is a
 909      superset of ASCII.  */
 910   dfa->map_notascii = 0;
 911 #endif
 912
 913 #ifdef RE_ENABLE_I18N
 914   if (dfa->mb_cur_max > 1)
 915     {
 916       if (dfa->is_utf8)
 917         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 918       else
 919         {
 920           int i, j, ch;
 921
 922           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 923           if (BE (dfa->sb_char == NULL, 0))
 924             return REG_ESPACE;
 925
 926           /* Set the bits corresponding to single byte chars.  */
 927           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 928             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 929               {
 930                 wint_t wch = __btowc (ch);
 931                 if (wch != WEOF)
 932                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 933 # ifndef _LIBC
 934                 if (isascii (ch) && wch != ch)
 935                   dfa->map_notascii = 1;
 936 # endif
 937               }
 938         }
 939     }
 940 #endif
 941
 942   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 943     return REG_ESPACE;
 944   return REG_NOERROR;
 945 }
 946
 947 /* Initialize WORD_CHAR table, which indicate which character is
 948    "word".  In this case "word" means that it is the word construction
 949    character used by some operators like "\<", "\>", etc.  */
 950
 951 static void
 952 internal_function
 953 init_word_char (re_dfa_t *dfa)
 954 {
 955   dfa->word_ops_used = 1;
 956   int i = 0;
 957   int j;
 958   int ch = 0;
 959   if (BE (dfa->map_notascii == 0, 1))
 960     {
 961       bitset_word_t bits0 = 0x00000000;
 962       bitset_word_t bits1 = 0x03ff0000;
 963       bitset_word_t bits2 = 0x87fffffe;
 964       bitset_word_t bits3 = 0x07fffffe;
 965       if (BITSET_WORD_BITS == 64)
 966         {
 967           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 968           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 969           i = 2;
 970         }
 971       else if (BITSET_WORD_BITS == 32)
 972         {
 973           dfa->word_char[0] = bits0;
 974           dfa->word_char[1] = bits1;
 975           dfa->word_char[2] = bits2;
 976           dfa->word_char[3] = bits3;
 977           i = 4;
 978         }
 979       else
 980         goto general_case;
 981       ch = 128;
 982
 983       if (BE (dfa->is_utf8, 1))
 984         {
 985           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 986           return;
 987         }
 988     }
 989
 990  general_case:
 991   for (; i < BITSET_WORDS; ++i)
 992     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 993       if (isalnum (ch) || ch == '_')
 994         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 995 }
 996
 997 /* Free the work area which are only used while compiling.  */
 998
 999 static void
1000 free_workarea_compile (regex_t *preg)
1001 {
1002   re_dfa_t *dfa = preg->buffer;
1003   bin_tree_storage_t *storage, *next;
1004   for (storage = dfa->str_tree_storage; storage; storage = next)
1005     {
1006       next = storage->next;
1007       re_free (storage);
1008     }
1009   dfa->str_tree_storage = NULL;
1010   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1011   dfa->str_tree = NULL;
1012   re_free (dfa->org_indices);
1013   dfa->org_indices = NULL;
1014 }
1015
1016 /* Create initial states for all contexts.  */
1017
1018 static reg_errcode_t
1019 create_initial_state (re_dfa_t *dfa)
1020 {
1021   Idx first, i;
1022   reg_errcode_t err;
1023   re_node_set init_nodes;
1024
1025   /* Initial states have the epsilon closure of the node which is
1026      the first node of the regular expression.  */
1027   first = dfa->str_tree->first->node_idx;
1028   dfa->init_node = first;
1029   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1030   if (BE (err != REG_NOERROR, 0))
1031     return err;
1032
1033   /* The back-references which are in initial states can epsilon transit,
1034      since in this case all of the subexpressions can be null.
1035      Then we add epsilon closures of the nodes which are the next nodes of
1036      the back-references.  */
1037   if (dfa->nbackref > 0)
1038     for (i = 0; i < init_nodes.nelem; ++i)
1039       {
1040         Idx node_idx = init_nodes.elems[i];
1041         re_token_type_t type = dfa->nodes[node_idx].type;
1042
1043         Idx clexp_idx;
1044         if (type != OP_BACK_REF)
1045           continue;
1046         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1047           {
1048             re_token_t *clexp_node;
1049             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1050             if (clexp_node->type == OP_CLOSE_SUBEXP
1051                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1052               break;
1053           }
1054         if (clexp_idx == init_nodes.nelem)
1055           continue;
1056
1057         if (type == OP_BACK_REF)
1058           {
1059             Idx dest_idx = dfa->edests[node_idx].elems[0];
1060             if (!re_node_set_contains (&init_nodes, dest_idx))
1061               {
1062                 reg_errcode_t merge_err
1063                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1064                 if (merge_err != REG_NOERROR)
1065                   return merge_err;
1066                 i = 0;
1067               }
1068           }
1069       }
1070
1071   /* It must be the first time to invoke acquire_state.  */
1072   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1073   /* We don't check ERR here, since the initial state must not be NULL.  */
1074   if (BE (dfa->init_state == NULL, 0))
1075     return err;
1076   if (dfa->init_state->has_constraint)
1077     {
1078       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1079                                                        CONTEXT_WORD);
1080       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1081                                                      CONTEXT_NEWLINE);
1082       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1083                                                          &init_nodes,
1084                                                          CONTEXT_NEWLINE
1085                                                          | CONTEXT_BEGBUF);
1086       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1087               || dfa->init_state_begbuf == NULL, 0))
1088         return err;
1089     }
1090   else
1091     dfa->init_state_word = dfa->init_state_nl
1092       = dfa->init_state_begbuf = dfa->init_state;
1093
1094   re_node_set_free (&init_nodes);
1095   return REG_NOERROR;
1096 }
1097 \f
1098 #ifdef RE_ENABLE_I18N
1099 /* If it is possible to do searching in single byte encoding instead of UTF-8
1100    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101    DFA nodes where needed.  */
1102
1103 static void
1104 optimize_utf8 (re_dfa_t *dfa)
1105 {
1106   Idx node;
1107   int i;
1108   bool mb_chars = false;
1109   bool has_period = false;
1110
1111   for (node = 0; node < dfa->nodes_len; ++node)
1112     switch (dfa->nodes[node].type)
1113       {
1114       case CHARACTER:
1115         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1116           mb_chars = true;
1117         break;
1118       case ANCHOR:
1119         switch (dfa->nodes[node].opr.ctx_type)
1120           {
1121           case LINE_FIRST:
1122           case LINE_LAST:
1123           case BUF_FIRST:
1124           case BUF_LAST:
1125             break;
1126           default:
1127             /* Word anchors etc. cannot be handled.  It's okay to test
1128                opr.ctx_type since constraints (for all DFA nodes) are
1129                created by ORing one or more opr.ctx_type values.  */
1130             return;
1131           }
1132         break;
1133       case OP_PERIOD:
1134         has_period = true;
1135         break;
1136       case OP_BACK_REF:
1137       case OP_ALT:
1138       case END_OF_RE:
1139       case OP_DUP_ASTERISK:
1140       case OP_OPEN_SUBEXP:
1141       case OP_CLOSE_SUBEXP:
1142         break;
1143       case COMPLEX_BRACKET:
1144         return;
1145       case SIMPLE_BRACKET:
1146         /* Just double check.  */
1147         {
1148           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1149                         ? 0
1150                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1152             {
1153               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1154                 return;
1155               rshift = 0;
1156             }
1157         }
1158         break;
1159       default:
1160         abort ();
1161       }
1162
1163   if (mb_chars || has_period)
1164     for (node = 0; node < dfa->nodes_len; ++node)
1165       {
1166         if (dfa->nodes[node].type == CHARACTER
1167             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1168           dfa->nodes[node].mb_partial = 0;
1169         else if (dfa->nodes[node].type == OP_PERIOD)
1170           dfa->nodes[node].type = OP_UTF8_PERIOD;
1171       }
1172
1173   /* The search can be in single byte locale.  */
1174   dfa->mb_cur_max = 1;
1175   dfa->is_utf8 = 0;
1176   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1177 }
1178 #endif
1179 \f
1180 /* Analyze the structure tree, and calculate "first", "next", "edest",
1181    "eclosure", and "inveclosure".  */
1182
1183 static reg_errcode_t
1184 analyze (regex_t *preg)
1185 {
1186   re_dfa_t *dfa = preg->buffer;
1187   reg_errcode_t ret;
1188
1189   /* Allocate arrays.  */
1190   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1191   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1192   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1193   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1194   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1195           || dfa->eclosures == NULL, 0))
1196     return REG_ESPACE;
1197
1198   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1199   if (dfa->subexp_map != NULL)
1200     {
1201       Idx i;
1202       for (i = 0; i < preg->re_nsub; i++)
1203         dfa->subexp_map[i] = i;
1204       preorder (dfa->str_tree, optimize_subexps, dfa);
1205       for (i = 0; i < preg->re_nsub; i++)
1206         if (dfa->subexp_map[i] != i)
1207           break;
1208       if (i == preg->re_nsub)
1209         {
1210           free (dfa->subexp_map);
1211           dfa->subexp_map = NULL;
1212         }
1213     }
1214
1215   ret = postorder (dfa->str_tree, lower_subexps, preg);
1216   if (BE (ret != REG_NOERROR, 0))
1217     return ret;
1218   ret = postorder (dfa->str_tree, calc_first, dfa);
1219   if (BE (ret != REG_NOERROR, 0))
1220     return ret;
1221   preorder (dfa->str_tree, calc_next, dfa);
1222   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1223   if (BE (ret != REG_NOERROR, 0))
1224     return ret;
1225   ret = calc_eclosure (dfa);
1226   if (BE (ret != REG_NOERROR, 0))
1227     return ret;
1228
1229   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1230      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1231   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1232       || dfa->nbackref)
1233     {
1234       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1235       if (BE (dfa->inveclosures == NULL, 0))
1236         return REG_ESPACE;
1237       ret = calc_inveclosure (dfa);
1238     }
1239
1240   return ret;
1241 }
1242
1243 /* Our parse trees are very unbalanced, so we cannot use a stack to
1244    implement parse tree visits.  Instead, we use parent pointers and
1245    some hairy code in these two functions.  */
1246 static reg_errcode_t
1247 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248            void *extra)
1249 {
1250   bin_tree_t *node, *prev;
1251
1252   for (node = root; ; )
1253     {
1254       /* Descend down the tree, preferably to the left (or to the right
1255          if that's the only child).  */
1256       while (node->left || node->right)
1257         if (node->left)
1258           node = node->left;
1259         else
1260           node = node->right;
1261
1262       do
1263         {
1264           reg_errcode_t err = fn (extra, node);
1265           if (BE (err != REG_NOERROR, 0))
1266             return err;
1267           if (node->parent == NULL)
1268             return REG_NOERROR;
1269           prev = node;
1270           node = node->parent;
1271         }
1272       /* Go up while we have a node that is reached from the right.  */
1273       while (node->right == prev || node->right == NULL);
1274       node = node->right;
1275     }
1276 }
1277
1278 static reg_errcode_t
1279 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1280           void *extra)
1281 {
1282   bin_tree_t *node;
1283
1284   for (node = root; ; )
1285     {
1286       reg_errcode_t err = fn (extra, node);
1287       if (BE (err != REG_NOERROR, 0))
1288         return err;
1289
1290       /* Go to the left node, or up and to the right.  */
1291       if (node->left)
1292         node = node->left;
1293       else
1294         {
1295           bin_tree_t *prev = NULL;
1296           while (node->right == prev || node->right == NULL)
1297             {
1298               prev = node;
1299               node = node->parent;
1300               if (!node)
1301                 return REG_NOERROR;
1302             }
1303           node = node->right;
1304         }
1305     }
1306 }
1307
1308 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1309    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1310    backreferences as well.  Requires a preorder visit.  */
1311 static reg_errcode_t
1312 optimize_subexps (void *extra, bin_tree_t *node)
1313 {
1314   re_dfa_t *dfa = (re_dfa_t *) extra;
1315
1316   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1317     {
1318       int idx = node->token.opr.idx;
1319       node->token.opr.idx = dfa->subexp_map[idx];
1320       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1321     }
1322
1323   else if (node->token.type == SUBEXP
1324            && node->left && node->left->token.type == SUBEXP)
1325     {
1326       Idx other_idx = node->left->token.opr.idx;
1327
1328       node->left = node->left->left;
1329       if (node->left)
1330         node->left->parent = node;
1331
1332       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1333       if (other_idx < BITSET_WORD_BITS)
1334         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1335     }
1336
1337   return REG_NOERROR;
1338 }
1339
1340 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1341    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1342 static reg_errcode_t
1343 lower_subexps (void *extra, bin_tree_t *node)
1344 {
1345   regex_t *preg = (regex_t *) extra;
1346   reg_errcode_t err = REG_NOERROR;
1347
1348   if (node->left && node->left->token.type == SUBEXP)
1349     {
1350       node->left = lower_subexp (&err, preg, node->left);
1351       if (node->left)
1352         node->left->parent = node;
1353     }
1354   if (node->right && node->right->token.type == SUBEXP)
1355     {
1356       node->right = lower_subexp (&err, preg, node->right);
1357       if (node->right)
1358         node->right->parent = node;
1359     }
1360
1361   return err;
1362 }
1363
1364 static bin_tree_t *
1365 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = preg->buffer;
1368   bin_tree_t *body = node->left;
1369   bin_tree_t *op, *cls, *tree1, *tree;
1370
1371   if (preg->no_sub
1372       /* We do not optimize empty subexpressions, because otherwise we may
1373          have bad CONCAT nodes with NULL children.  This is obviously not
1374          very common, so we do not lose much.  An example that triggers
1375          this case is the sed "script" /\(\)/x.  */
1376       && node->left != NULL
1377       && (node->token.opr.idx >= BITSET_WORD_BITS
1378           || !(dfa->used_bkref_map
1379                & ((bitset_word_t) 1 << node->token.opr.idx))))
1380     return node->left;
1381
1382   /* Convert the SUBEXP node to the concatenation of an
1383      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1384   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1385   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1386   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1387   tree = create_tree (dfa, op, tree1, CONCAT);
1388   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1389     {
1390       *err = REG_ESPACE;
1391       return NULL;
1392     }
1393
1394   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1395   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1396   return tree;
1397 }
1398
1399 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1400    nodes.  Requires a postorder visit.  */
1401 static reg_errcode_t
1402 calc_first (void *extra, bin_tree_t *node)
1403 {
1404   re_dfa_t *dfa = (re_dfa_t *) extra;
1405   if (node->token.type == CONCAT)
1406     {
1407       node->first = node->left->first;
1408       node->node_idx = node->left->node_idx;
1409     }
1410   else
1411     {
1412       node->first = node;
1413       node->node_idx = re_dfa_add_node (dfa, node->token);
1414       if (BE (node->node_idx == REG_MISSING, 0))
1415         return REG_ESPACE;
1416       if (node->token.type == ANCHOR)
1417         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1418     }
1419   return REG_NOERROR;
1420 }
1421
1422 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1423 static reg_errcode_t
1424 calc_next (void *extra, bin_tree_t *node)
1425 {
1426   switch (node->token.type)
1427     {
1428     case OP_DUP_ASTERISK:
1429       node->left->next = node;
1430       break;
1431     case CONCAT:
1432       node->left->next = node->right->first;
1433       node->right->next = node->next;
1434       break;
1435     default:
1436       if (node->left)
1437         node->left->next = node->next;
1438       if (node->right)
1439         node->right->next = node->next;
1440       break;
1441     }
1442   return REG_NOERROR;
1443 }
1444
1445 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1446 static reg_errcode_t
1447 link_nfa_nodes (void *extra, bin_tree_t *node)
1448 {
1449   re_dfa_t *dfa = (re_dfa_t *) extra;
1450   Idx idx = node->node_idx;
1451   reg_errcode_t err = REG_NOERROR;
1452
1453   switch (node->token.type)
1454     {
1455     case CONCAT:
1456       break;
1457
1458     case END_OF_RE:
1459       assert (node->next == NULL);
1460       break;
1461
1462     case OP_DUP_ASTERISK:
1463     case OP_ALT:
1464       {
1465         Idx left, right;
1466         dfa->has_plural_match = 1;
1467         if (node->left != NULL)
1468           left = node->left->first->node_idx;
1469         else
1470           left = node->next->node_idx;
1471         if (node->right != NULL)
1472           right = node->right->first->node_idx;
1473         else
1474           right = node->next->node_idx;
1475         assert (REG_VALID_INDEX (left));
1476         assert (REG_VALID_INDEX (right));
1477         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1478       }
1479       break;
1480
1481     case ANCHOR:
1482     case OP_OPEN_SUBEXP:
1483     case OP_CLOSE_SUBEXP:
1484       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1485       break;
1486
1487     case OP_BACK_REF:
1488       dfa->nexts[idx] = node->next->node_idx;
1489       if (node->token.type == OP_BACK_REF)
1490         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1491       break;
1492
1493     default:
1494       assert (!IS_EPSILON_NODE (node->token.type));
1495       dfa->nexts[idx] = node->next->node_idx;
1496       break;
1497     }
1498
1499   return err;
1500 }
1501
1502 /* Duplicate the epsilon closure of the node ROOT_NODE.
1503    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1504    to their own constraint.  */
1505
1506 static reg_errcode_t
1507 internal_function
1508 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1509                         Idx root_node, unsigned int init_constraint)
1510 {
1511   Idx org_node, clone_node;
1512   bool ok;
1513   unsigned int constraint = init_constraint;
1514   for (org_node = top_org_node, clone_node = top_clone_node;;)
1515     {
1516       Idx org_dest, clone_dest;
1517       if (dfa->nodes[org_node].type == OP_BACK_REF)
1518         {
1519           /* If the back reference epsilon-transit, its destination must
1520              also have the constraint.  Then duplicate the epsilon closure
1521              of the destination of the back reference, and store it in
1522              edests of the back reference.  */
1523           org_dest = dfa->nexts[org_node];
1524           re_node_set_empty (dfa->edests + clone_node);
1525           clone_dest = duplicate_node (dfa, org_dest, constraint);
1526           if (BE (clone_dest == REG_MISSING, 0))
1527             return REG_ESPACE;
1528           dfa->nexts[clone_node] = dfa->nexts[org_node];
1529           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530           if (BE (! ok, 0))
1531             return REG_ESPACE;
1532         }
1533       else if (dfa->edests[org_node].nelem == 0)
1534         {
1535           /* In case of the node can't epsilon-transit, don't duplicate the
1536              destination and store the original destination as the
1537              destination of the node.  */
1538           dfa->nexts[clone_node] = dfa->nexts[org_node];
1539           break;
1540         }
1541       else if (dfa->edests[org_node].nelem == 1)
1542         {
1543           /* In case of the node can epsilon-transit, and it has only one
1544              destination.  */
1545           org_dest = dfa->edests[org_node].elems[0];
1546           re_node_set_empty (dfa->edests + clone_node);
1547           /* If the node is root_node itself, it means the epsilon closure
1548              has a loop.  Then tie it to the destination of the root_node.  */
1549           if (org_node == root_node && clone_node != org_node)
1550             {
1551               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1552               if (BE (! ok, 0))
1553                 return REG_ESPACE;
1554               break;
1555             }
1556           /* In case the node has another constraint, append it.  */
1557           constraint |= dfa->nodes[org_node].constraint;
1558           clone_dest = duplicate_node (dfa, org_dest, constraint);
1559           if (BE (clone_dest == REG_MISSING, 0))
1560             return REG_ESPACE;
1561           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1562           if (BE (! ok, 0))
1563             return REG_ESPACE;
1564         }
1565       else /* dfa->edests[org_node].nelem == 2 */
1566         {
1567           /* In case of the node can epsilon-transit, and it has two
1568              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1569           org_dest = dfa->edests[org_node].elems[0];
1570           re_node_set_empty (dfa->edests + clone_node);
1571           /* Search for a duplicated node which satisfies the constraint.  */
1572           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1573           if (clone_dest == REG_MISSING)
1574             {
1575               /* There is no such duplicated node, create a new one.  */
1576               reg_errcode_t err;
1577               clone_dest = duplicate_node (dfa, org_dest, constraint);
1578               if (BE (clone_dest == REG_MISSING, 0))
1579                 return REG_ESPACE;
1580               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581               if (BE (! ok, 0))
1582                 return REG_ESPACE;
1583               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1584                                             root_node, constraint);
1585               if (BE (err != REG_NOERROR, 0))
1586                 return err;
1587             }
1588           else
1589             {
1590               /* There is a duplicated node which satisfies the constraint,
1591                  use it to avoid infinite loop.  */
1592               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593               if (BE (! ok, 0))
1594                 return REG_ESPACE;
1595             }
1596
1597           org_dest = dfa->edests[org_node].elems[1];
1598           clone_dest = duplicate_node (dfa, org_dest, constraint);
1599           if (BE (clone_dest == REG_MISSING, 0))
1600             return REG_ESPACE;
1601           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1602           if (BE (! ok, 0))
1603             return REG_ESPACE;
1604         }
1605       org_node = org_dest;
1606       clone_node = clone_dest;
1607     }
1608   return REG_NOERROR;
1609 }
1610
1611 /* Search for a node which is duplicated from the node ORG_NODE, and
1612    satisfies the constraint CONSTRAINT.  */
1613
1614 static Idx
1615 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1616                         unsigned int constraint)
1617 {
1618   Idx idx;
1619   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1620     {
1621       if (org_node == dfa->org_indices[idx]
1622           && constraint == dfa->nodes[idx].constraint)
1623         return idx; /* Found.  */
1624     }
1625   return REG_MISSING; /* Not found.  */
1626 }
1627
1628 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1629    Return the index of the new node, or REG_MISSING if insufficient storage is
1630    available.  */
1631
1632 static Idx
1633 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1634 {
1635   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1636   if (BE (dup_idx != REG_MISSING, 1))
1637     {
1638       dfa->nodes[dup_idx].constraint = constraint;
1639       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1640       dfa->nodes[dup_idx].duplicated = 1;
1641
1642       /* Store the index of the original node.  */
1643       dfa->org_indices[dup_idx] = org_idx;
1644     }
1645   return dup_idx;
1646 }
1647
1648 static reg_errcode_t
1649 calc_inveclosure (re_dfa_t *dfa)
1650 {
1651   Idx src, idx;
1652   bool ok;
1653   for (idx = 0; idx < dfa->nodes_len; ++idx)
1654     re_node_set_init_empty (dfa->inveclosures + idx);
1655
1656   for (src = 0; src < dfa->nodes_len; ++src)
1657     {
1658       Idx *elems = dfa->eclosures[src].elems;
1659       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1660         {
1661           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1662           if (BE (! ok, 0))
1663             return REG_ESPACE;
1664         }
1665     }
1666
1667   return REG_NOERROR;
1668 }
1669
1670 /* Calculate "eclosure" for all the node in DFA.  */
1671
1672 static reg_errcode_t
1673 calc_eclosure (re_dfa_t *dfa)
1674 {
1675   Idx node_idx;
1676   bool incomplete;
1677 #ifdef DEBUG
1678   assert (dfa->nodes_len > 0);
1679 #endif
1680   incomplete = false;
1681   /* For each nodes, calculate epsilon closure.  */
1682   for (node_idx = 0; ; ++node_idx)
1683     {
1684       reg_errcode_t err;
1685       re_node_set eclosure_elem;
1686       if (node_idx == dfa->nodes_len)
1687         {
1688           if (!incomplete)
1689             break;
1690           incomplete = false;
1691           node_idx = 0;
1692         }
1693
1694 #ifdef DEBUG
1695       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1696 #endif
1697
1698       /* If we have already calculated, skip it.  */
1699       if (dfa->eclosures[node_idx].nelem != 0)
1700         continue;
1701       /* Calculate epsilon closure of 'node_idx'.  */
1702       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1703       if (BE (err != REG_NOERROR, 0))
1704         return err;
1705
1706       if (dfa->eclosures[node_idx].nelem == 0)
1707         {
1708           incomplete = true;
1709           re_node_set_free (&eclosure_elem);
1710         }
1711     }
1712   return REG_NOERROR;
1713 }
1714
1715 /* Calculate epsilon closure of NODE.  */
1716
1717 static reg_errcode_t
1718 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1719 {
1720   reg_errcode_t err;
1721   Idx i;
1722   re_node_set eclosure;
1723   bool ok;
1724   bool incomplete = false;
1725   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1726   if (BE (err != REG_NOERROR, 0))
1727     return err;
1728
1729   /* This indicates that we are calculating this node now.
1730      We reference this value to avoid infinite loop.  */
1731   dfa->eclosures[node].nelem = REG_MISSING;
1732
1733   /* If the current node has constraints, duplicate all nodes
1734      since they must inherit the constraints.  */
1735   if (dfa->nodes[node].constraint
1736       && dfa->edests[node].nelem
1737       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1738     {
1739       err = duplicate_node_closure (dfa, node, node, node,
1740                                     dfa->nodes[node].constraint);
1741       if (BE (err != REG_NOERROR, 0))
1742         return err;
1743     }
1744
1745   /* Expand each epsilon destination nodes.  */
1746   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1747     for (i = 0; i < dfa->edests[node].nelem; ++i)
1748       {
1749         re_node_set eclosure_elem;
1750         Idx edest = dfa->edests[node].elems[i];
1751         /* If calculating the epsilon closure of 'edest' is in progress,
1752            return intermediate result.  */
1753         if (dfa->eclosures[edest].nelem == REG_MISSING)
1754           {
1755             incomplete = true;
1756             continue;
1757           }
1758         /* If we haven't calculated the epsilon closure of 'edest' yet,
1759            calculate now. Otherwise use calculated epsilon closure.  */
1760         if (dfa->eclosures[edest].nelem == 0)
1761           {
1762             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1763             if (BE (err != REG_NOERROR, 0))
1764               return err;
1765           }
1766         else
1767           eclosure_elem = dfa->eclosures[edest];
1768         /* Merge the epsilon closure of 'edest'.  */
1769         err = re_node_set_merge (&eclosure, &eclosure_elem);
1770         if (BE (err != REG_NOERROR, 0))
1771           return err;
1772         /* If the epsilon closure of 'edest' is incomplete,
1773            the epsilon closure of this node is also incomplete.  */
1774         if (dfa->eclosures[edest].nelem == 0)
1775           {
1776             incomplete = true;
1777             re_node_set_free (&eclosure_elem);
1778           }
1779       }
1780
1781   /* An epsilon closure includes itself.  */
1782   ok = re_node_set_insert (&eclosure, node);
1783   if (BE (! ok, 0))
1784     return REG_ESPACE;
1785   if (incomplete && !root)
1786     dfa->eclosures[node].nelem = 0;
1787   else
1788     dfa->eclosures[node] = eclosure;
1789   *new_set = eclosure;
1790   return REG_NOERROR;
1791 }
1792 \f
1793 /* Functions for token which are used in the parser.  */
1794
1795 /* Fetch a token from INPUT.
1796    We must not use this function inside bracket expressions.  */
1797
1798 static void
1799 internal_function
1800 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1801 {
1802   re_string_skip_bytes (input, peek_token (result, input, syntax));
1803 }
1804
1805 /* Peek a token from INPUT, and return the length of the token.
1806    We must not use this function inside bracket expressions.  */
1807
1808 static int
1809 internal_function
1810 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1811 {
1812   unsigned char c;
1813
1814   if (re_string_eoi (input))
1815     {
1816       token->type = END_OF_RE;
1817       return 0;
1818     }
1819
1820   c = re_string_peek_byte (input, 0);
1821   token->opr.c = c;
1822
1823   token->word_char = 0;
1824 #ifdef RE_ENABLE_I18N
1825   token->mb_partial = 0;
1826   if (input->mb_cur_max > 1 &&
1827       !re_string_first_byte (input, re_string_cur_idx (input)))
1828     {
1829       token->type = CHARACTER;
1830       token->mb_partial = 1;
1831       return 1;
1832     }
1833 #endif
1834   if (c == '\\')
1835     {
1836       unsigned char c2;
1837       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1838         {
1839           token->type = BACK_SLASH;
1840           return 1;
1841         }
1842
1843       c2 = re_string_peek_byte_case (input, 1);
1844       token->opr.c = c2;
1845       token->type = CHARACTER;
1846 #ifdef RE_ENABLE_I18N
1847       if (input->mb_cur_max > 1)
1848         {
1849           wint_t wc = re_string_wchar_at (input,
1850                                           re_string_cur_idx (input) + 1);
1851           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1852         }
1853       else
1854 #endif
1855         token->word_char = IS_WORD_CHAR (c2) != 0;
1856
1857       switch (c2)
1858         {
1859         case '|':
1860           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1861             token->type = OP_ALT;
1862           break;
1863         case '1': case '2': case '3': case '4': case '5':
1864         case '6': case '7': case '8': case '9':
1865           if (!(syntax & RE_NO_BK_REFS))
1866             {
1867               token->type = OP_BACK_REF;
1868               token->opr.idx = c2 - '1';
1869             }
1870           break;
1871         case '<':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             {
1874               token->type = ANCHOR;
1875               token->opr.ctx_type = WORD_FIRST;
1876             }
1877           break;
1878         case '>':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = WORD_LAST;
1883             }
1884           break;
1885         case 'b':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             {
1888               token->type = ANCHOR;
1889               token->opr.ctx_type = WORD_DELIM;
1890             }
1891           break;
1892         case 'B':
1893           if (!(syntax & RE_NO_GNU_OPS))
1894             {
1895               token->type = ANCHOR;
1896               token->opr.ctx_type = NOT_WORD_DELIM;
1897             }
1898           break;
1899         case 'w':
1900           if (!(syntax & RE_NO_GNU_OPS))
1901             token->type = OP_WORD;
1902           break;
1903         case 'W':
1904           if (!(syntax & RE_NO_GNU_OPS))
1905             token->type = OP_NOTWORD;
1906           break;
1907         case 's':
1908           if (!(syntax & RE_NO_GNU_OPS))
1909             token->type = OP_SPACE;
1910           break;
1911         case 'S':
1912           if (!(syntax & RE_NO_GNU_OPS))
1913             token->type = OP_NOTSPACE;
1914           break;
1915         case '`':
1916           if (!(syntax & RE_NO_GNU_OPS))
1917             {
1918               token->type = ANCHOR;
1919               token->opr.ctx_type = BUF_FIRST;
1920             }
1921           break;
1922         case '\'':
1923           if (!(syntax & RE_NO_GNU_OPS))
1924             {
1925               token->type = ANCHOR;
1926               token->opr.ctx_type = BUF_LAST;
1927             }
1928           break;
1929         case '(':
1930           if (!(syntax & RE_NO_BK_PARENS))
1931             token->type = OP_OPEN_SUBEXP;
1932           break;
1933         case ')':
1934           if (!(syntax & RE_NO_BK_PARENS))
1935             token->type = OP_CLOSE_SUBEXP;
1936           break;
1937         case '+':
1938           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1939             token->type = OP_DUP_PLUS;
1940           break;
1941         case '?':
1942           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1943             token->type = OP_DUP_QUESTION;
1944           break;
1945         case '{':
1946           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1947             token->type = OP_OPEN_DUP_NUM;
1948           break;
1949         case '}':
1950           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1951             token->type = OP_CLOSE_DUP_NUM;
1952           break;
1953         default:
1954           break;
1955         }
1956       return 2;
1957     }
1958
1959   token->type = CHARACTER;
1960 #ifdef RE_ENABLE_I18N
1961   if (input->mb_cur_max > 1)
1962     {
1963       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1964       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1965     }
1966   else
1967 #endif
1968     token->word_char = IS_WORD_CHAR (token->opr.c);
1969
1970   switch (c)
1971     {
1972     case '\n':
1973       if (syntax & RE_NEWLINE_ALT)
1974         token->type = OP_ALT;
1975       break;
1976     case '|':
1977       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1978         token->type = OP_ALT;
1979       break;
1980     case '*':
1981       token->type = OP_DUP_ASTERISK;
1982       break;
1983     case '+':
1984       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1985         token->type = OP_DUP_PLUS;
1986       break;
1987     case '?':
1988       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1989         token->type = OP_DUP_QUESTION;
1990       break;
1991     case '{':
1992       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1993         token->type = OP_OPEN_DUP_NUM;
1994       break;
1995     case '}':
1996       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1997         token->type = OP_CLOSE_DUP_NUM;
1998       break;
1999     case '(':
2000       if (syntax & RE_NO_BK_PARENS)
2001         token->type = OP_OPEN_SUBEXP;
2002       break;
2003     case ')':
2004       if (syntax & RE_NO_BK_PARENS)
2005         token->type = OP_CLOSE_SUBEXP;
2006       break;
2007     case '[':
2008       token->type = OP_OPEN_BRACKET;
2009       break;
2010     case '.':
2011       token->type = OP_PERIOD;
2012       break;
2013     case '^':
2014       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2015           re_string_cur_idx (input) != 0)
2016         {
2017           char prev = re_string_peek_byte (input, -1);
2018           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2019             break;
2020         }
2021       token->type = ANCHOR;
2022       token->opr.ctx_type = LINE_FIRST;
2023       break;
2024     case '$':
2025       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2026           re_string_cur_idx (input) + 1 != re_string_length (input))
2027         {
2028           re_token_t next;
2029           re_string_skip_bytes (input, 1);
2030           peek_token (&next, input, syntax);
2031           re_string_skip_bytes (input, -1);
2032           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2033             break;
2034         }
2035       token->type = ANCHOR;
2036       token->opr.ctx_type = LINE_LAST;
2037       break;
2038     default:
2039       break;
2040     }
2041   return 1;
2042 }
2043
2044 /* Peek a token from INPUT, and return the length of the token.
2045    We must not use this function out of bracket expressions.  */
2046
2047 static int
2048 internal_function
2049 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2050 {
2051   unsigned char c;
2052   if (re_string_eoi (input))
2053     {
2054       token->type = END_OF_RE;
2055       return 0;
2056     }
2057   c = re_string_peek_byte (input, 0);
2058   token->opr.c = c;
2059
2060 #ifdef RE_ENABLE_I18N
2061   if (input->mb_cur_max > 1 &&
2062       !re_string_first_byte (input, re_string_cur_idx (input)))
2063     {
2064       token->type = CHARACTER;
2065       return 1;
2066     }
2067 #endif /* RE_ENABLE_I18N */
2068
2069   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2070       && re_string_cur_idx (input) + 1 < re_string_length (input))
2071     {
2072       /* In this case, '\' escape a character.  */
2073       unsigned char c2;
2074       re_string_skip_bytes (input, 1);
2075       c2 = re_string_peek_byte (input, 0);
2076       token->opr.c = c2;
2077       token->type = CHARACTER;
2078       return 1;
2079     }
2080   if (c == '[') /* '[' is a special char in a bracket exps.  */
2081     {
2082       unsigned char c2;
2083       int token_len;
2084       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2085         c2 = re_string_peek_byte (input, 1);
2086       else
2087         c2 = 0;
2088       token->opr.c = c2;
2089       token_len = 2;
2090       switch (c2)
2091         {
2092         case '.':
2093           token->type = OP_OPEN_COLL_ELEM;
2094           break;
2095         case '=':
2096           token->type = OP_OPEN_EQUIV_CLASS;
2097           break;
2098         case ':':
2099           if (syntax & RE_CHAR_CLASSES)
2100             {
2101               token->type = OP_OPEN_CHAR_CLASS;
2102               break;
2103             }
2104           /* else fall through.  */
2105         default:
2106           token->type = CHARACTER;
2107           token->opr.c = c;
2108           token_len = 1;
2109           break;
2110         }
2111       return token_len;
2112     }
2113   switch (c)
2114     {
2115     case '-':
2116       token->type = OP_CHARSET_RANGE;
2117       break;
2118     case ']':
2119       token->type = OP_CLOSE_BRACKET;
2120       break;
2121     case '^':
2122       token->type = OP_NON_MATCH_LIST;
2123       break;
2124     default:
2125       token->type = CHARACTER;
2126     }
2127   return 1;
2128 }
2129 \f
2130 /* Functions for parser.  */
2131
2132 /* Entry point of the parser.
2133    Parse the regular expression REGEXP and return the structure tree.
2134    If an error occurs, ERR is set by error code, and return NULL.
2135    This function build the following tree, from regular expression <reg_exp>:
2136            CAT
2137            / \
2138           /   \
2139    <reg_exp>  EOR
2140
2141    CAT means concatenation.
2142    EOR means end of regular expression.  */
2143
2144 static bin_tree_t *
2145 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2146        reg_errcode_t *err)
2147 {
2148   re_dfa_t *dfa = preg->buffer;
2149   bin_tree_t *tree, *eor, *root;
2150   re_token_t current_token;
2151   dfa->syntax = syntax;
2152   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2154   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155     return NULL;
2156   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2157   if (tree != NULL)
2158     root = create_tree (dfa, tree, eor, CONCAT);
2159   else
2160     root = eor;
2161   if (BE (eor == NULL || root == NULL, 0))
2162     {
2163       *err = REG_ESPACE;
2164       return NULL;
2165     }
2166   return root;
2167 }
2168
2169 /* This function build the following tree, from regular expression
2170    <branch1>|<branch2>:
2171            ALT
2172            / \
2173           /   \
2174    <branch1> <branch2>
2175
2176    ALT means alternative, which represents the operator '|'.  */
2177
2178 static bin_tree_t *
2179 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2181 {
2182   re_dfa_t *dfa = preg->buffer;
2183   bin_tree_t *tree, *branch = NULL;
2184   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2185   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186     return NULL;
2187
2188   while (token->type == OP_ALT)
2189     {
2190       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2191       if (token->type != OP_ALT && token->type != END_OF_RE
2192           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2193         {
2194           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2195           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2196             return NULL;
2197         }
2198       else
2199         branch = NULL;
2200       tree = create_tree (dfa, tree, branch, OP_ALT);
2201       if (BE (tree == NULL, 0))
2202         {
2203           *err = REG_ESPACE;
2204           return NULL;
2205         }
2206     }
2207   return tree;
2208 }
2209
2210 /* This function build the following tree, from regular expression
2211    <exp1><exp2>:
2212         CAT
2213         / \
2214        /   \
2215    <exp1> <exp2>
2216
2217    CAT means concatenation.  */
2218
2219 static bin_tree_t *
2220 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2222 {
2223   bin_tree_t *tree, *expr;
2224   re_dfa_t *dfa = preg->buffer;
2225   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2226   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227     return NULL;
2228
2229   while (token->type != OP_ALT && token->type != END_OF_RE
2230          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2231     {
2232       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2233       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2234         {
2235           if (tree != NULL)
2236             postorder (tree, free_tree, NULL);
2237           return NULL;
2238         }
2239       if (tree != NULL && expr != NULL)
2240         {
2241           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2242           if (newtree == NULL)
2243             {
2244               postorder (expr, free_tree, NULL);
2245               postorder (tree, free_tree, NULL);
2246               *err = REG_ESPACE;
2247               return NULL;
2248             }
2249           tree = newtree;
2250         }
2251       else if (tree == NULL)
2252         tree = expr;
2253       /* Otherwise expr == NULL, we don't need to create new tree.  */
2254     }
2255   return tree;
2256 }
2257
2258 /* This function build the following tree, from regular expression a*:
2259          *
2260          |
2261          a
2262 */
2263
2264 static bin_tree_t *
2265 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2266                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2267 {
2268   re_dfa_t *dfa = preg->buffer;
2269   bin_tree_t *tree;
2270   switch (token->type)
2271     {
2272     case CHARACTER:
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275         {
2276           *err = REG_ESPACE;
2277           return NULL;
2278         }
2279 #ifdef RE_ENABLE_I18N
2280       if (dfa->mb_cur_max > 1)
2281         {
2282           while (!re_string_eoi (regexp)
2283                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2284             {
2285               bin_tree_t *mbc_remain;
2286               fetch_token (token, regexp, syntax);
2287               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2288               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2289               if (BE (mbc_remain == NULL || tree == NULL, 0))
2290                 {
2291                   *err = REG_ESPACE;
2292                   return NULL;
2293                 }
2294             }
2295         }
2296 #endif
2297       break;
2298     case OP_OPEN_SUBEXP:
2299       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2300       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2301         return NULL;
2302       break;
2303     case OP_OPEN_BRACKET:
2304       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306         return NULL;
2307       break;
2308     case OP_BACK_REF:
2309       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2310         {
2311           *err = REG_ESUBREG;
2312           return NULL;
2313         }
2314       dfa->used_bkref_map |= 1 << token->opr.idx;
2315       tree = create_token_tree (dfa, NULL, NULL, token);
2316       if (BE (tree == NULL, 0))
2317         {
2318           *err = REG_ESPACE;
2319           return NULL;
2320         }
2321       ++dfa->nbackref;
2322       dfa->has_mb_node = 1;
2323       break;
2324     case OP_OPEN_DUP_NUM:
2325       if (syntax & RE_CONTEXT_INVALID_DUP)
2326         {
2327           *err = REG_BADRPT;
2328           return NULL;
2329         }
2330       /* FALLTHROUGH */
2331     case OP_DUP_ASTERISK:
2332     case OP_DUP_PLUS:
2333     case OP_DUP_QUESTION:
2334       if (syntax & RE_CONTEXT_INVALID_OPS)
2335         {
2336           *err = REG_BADRPT;
2337           return NULL;
2338         }
2339       else if (syntax & RE_CONTEXT_INDEP_OPS)
2340         {
2341           fetch_token (token, regexp, syntax);
2342           return parse_expression (regexp, preg, token, syntax, nest, err);
2343         }
2344       /* else fall through  */
2345     case OP_CLOSE_SUBEXP:
2346       if ((token->type == OP_CLOSE_SUBEXP) &&
2347           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348         {
2349           *err = REG_ERPAREN;
2350           return NULL;
2351         }
2352       /* else fall through  */
2353     case OP_CLOSE_DUP_NUM:
2354       /* We treat it as a normal character.  */
2355
2356       /* Then we can these characters as normal characters.  */
2357       token->type = CHARACTER;
2358       /* mb_partial and word_char bits should be initialized already
2359          by peek_token.  */
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362         {
2363           *err = REG_ESPACE;
2364           return NULL;
2365         }
2366       break;
2367     case ANCHOR:
2368       if ((token->opr.ctx_type
2369            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2370           && dfa->word_ops_used == 0)
2371         init_word_char (dfa);
2372       if (token->opr.ctx_type == WORD_DELIM
2373           || token->opr.ctx_type == NOT_WORD_DELIM)
2374         {
2375           bin_tree_t *tree_first, *tree_last;
2376           if (token->opr.ctx_type == WORD_DELIM)
2377             {
2378               token->opr.ctx_type = WORD_FIRST;
2379               tree_first = create_token_tree (dfa, NULL, NULL, token);
2380               token->opr.ctx_type = WORD_LAST;
2381             }
2382           else
2383             {
2384               token->opr.ctx_type = INSIDE_WORD;
2385               tree_first = create_token_tree (dfa, NULL, NULL, token);
2386               token->opr.ctx_type = INSIDE_NOTWORD;
2387             }
2388           tree_last = create_token_tree (dfa, NULL, NULL, token);
2389           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2390           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2391             {
2392               *err = REG_ESPACE;
2393               return NULL;
2394             }
2395         }
2396       else
2397         {
2398           tree = create_token_tree (dfa, NULL, NULL, token);
2399           if (BE (tree == NULL, 0))
2400             {
2401               *err = REG_ESPACE;
2402               return NULL;
2403             }
2404         }
2405       /* We must return here, since ANCHORs can't be followed
2406          by repetition operators.
2407          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2408              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2409       fetch_token (token, regexp, syntax);
2410       return tree;
2411     case OP_PERIOD:
2412       tree = create_token_tree (dfa, NULL, NULL, token);
2413       if (BE (tree == NULL, 0))
2414         {
2415           *err = REG_ESPACE;
2416           return NULL;
2417         }
2418       if (dfa->mb_cur_max > 1)
2419         dfa->has_mb_node = 1;
2420       break;
2421     case OP_WORD:
2422     case OP_NOTWORD:
2423       tree = build_charclass_op (dfa, regexp->trans,
2424                                  (const unsigned char *) "alnum",
2425                                  (const unsigned char *) "_",
2426                                  token->type == OP_NOTWORD, err);
2427       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2428         return NULL;
2429       break;
2430     case OP_SPACE:
2431     case OP_NOTSPACE:
2432       tree = build_charclass_op (dfa, regexp->trans,
2433                                  (const unsigned char *) "space",
2434                                  (const unsigned char *) "",
2435                                  token->type == OP_NOTSPACE, err);
2436       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2437         return NULL;
2438       break;
2439     case OP_ALT:
2440     case END_OF_RE:
2441       return NULL;
2442     case BACK_SLASH:
2443       *err = REG_EESCAPE;
2444       return NULL;
2445     default:
2446       /* Must not happen?  */
2447 #ifdef DEBUG
2448       assert (0);
2449 #endif
2450       return NULL;
2451     }
2452   fetch_token (token, regexp, syntax);
2453
2454   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2455          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2456     {
2457       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2458       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2459         return NULL;
2460       /* In BRE consecutive duplications are not allowed.  */
2461       if ((syntax & RE_CONTEXT_INVALID_DUP)
2462           && (token->type == OP_DUP_ASTERISK
2463               || token->type == OP_OPEN_DUP_NUM))
2464         {
2465           *err = REG_BADRPT;
2466           return NULL;
2467         }
2468     }
2469
2470   return tree;
2471 }
2472
2473 /* This function build the following tree, from regular expression
2474    (<reg_exp>):
2475          SUBEXP
2476             |
2477         <reg_exp>
2478 */
2479
2480 static bin_tree_t *
2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2482                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2483 {
2484   re_dfa_t *dfa = preg->buffer;
2485   bin_tree_t *tree;
2486   size_t cur_nsub;
2487   cur_nsub = preg->re_nsub++;
2488
2489   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2490
2491   /* The subexpression may be a null string.  */
2492   if (token->type == OP_CLOSE_SUBEXP)
2493     tree = NULL;
2494   else
2495     {
2496       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2498         {
2499           if (tree != NULL)
2500             postorder (tree, free_tree, NULL);
2501           *err = REG_EPAREN;
2502         }
2503       if (BE (*err != REG_NOERROR, 0))
2504         return NULL;
2505     }
2506
2507   if (cur_nsub <= '9' - '1')
2508     dfa->completed_bkref_map |= 1 << cur_nsub;
2509
2510   tree = create_tree (dfa, tree, NULL, SUBEXP);
2511   if (BE (tree == NULL, 0))
2512     {
2513       *err = REG_ESPACE;
2514       return NULL;
2515     }
2516   tree->token.opr.idx = cur_nsub;
2517   return tree;
2518 }
2519
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2521
2522 static bin_tree_t *
2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2525 {
2526   bin_tree_t *tree = NULL, *old_tree = NULL;
2527   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528   re_token_t start_token = *token;
2529
2530   if (token->type == OP_OPEN_DUP_NUM)
2531     {
2532       end = 0;
2533       start = fetch_number (regexp, token, syntax);
2534       if (start == REG_MISSING)
2535         {
2536           if (token->type == CHARACTER && token->opr.c == ',')
2537             start = 0; /* We treat "{,m}" as "{0,m}".  */
2538           else
2539             {
2540               *err = REG_BADBR; /* <re>{} is invalid.  */
2541               return NULL;
2542             }
2543         }
2544       if (BE (start != REG_ERROR, 1))
2545         {
2546           /* We treat "{n}" as "{n,n}".  */
2547           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548                  : ((token->type == CHARACTER && token->opr.c == ',')
2549                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2550         }
2551       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2552         {
2553           /* Invalid sequence.  */
2554           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2555             {
2556               if (token->type == END_OF_RE)
2557                 *err = REG_EBRACE;
2558               else
2559                 *err = REG_BADBR;
2560
2561               return NULL;
2562             }
2563
2564           /* If the syntax bit is set, rollback.  */
2565           re_string_set_index (regexp, start_idx);
2566           *token = start_token;
2567           token->type = CHARACTER;
2568           /* mb_partial and word_char bits should be already initialized by
2569              peek_token.  */
2570           return elem;
2571         }
2572
2573       if (BE ((end != REG_MISSING && start > end)
2574               || token->type != OP_CLOSE_DUP_NUM, 0))
2575         {
2576           /* First number greater than second.  */
2577           *err = REG_BADBR;
2578           return NULL;
2579         }
2580
2581       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2582         {
2583           *err = REG_ESIZE;
2584           return NULL;
2585         }
2586     }
2587   else
2588     {
2589       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2591     }
2592
2593   fetch_token (token, regexp, syntax);
2594
2595   if (BE (elem == NULL, 0))
2596     return NULL;
2597   if (BE (start == 0 && end == 0, 0))
2598     {
2599       postorder (elem, free_tree, NULL);
2600       return NULL;
2601     }
2602
2603   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2604   if (BE (start > 0, 0))
2605     {
2606       tree = elem;
2607       for (i = 2; i <= start; ++i)
2608         {
2609           elem = duplicate_tree (elem, dfa);
2610           tree = create_tree (dfa, tree, elem, CONCAT);
2611           if (BE (elem == NULL || tree == NULL, 0))
2612             goto parse_dup_op_espace;
2613         }
2614
2615       if (start == end)
2616         return tree;
2617
2618       /* Duplicate ELEM before it is marked optional.  */
2619       elem = duplicate_tree (elem, dfa);
2620       old_tree = tree;
2621     }
2622   else
2623     old_tree = NULL;
2624
2625   if (elem->token.type == SUBEXP)
2626     {
2627       uintptr_t subidx = elem->token.opr.idx;
2628       postorder (elem, mark_opt_subexp, (void *) subidx);
2629     }
2630
2631   tree = create_tree (dfa, elem, NULL,
2632                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633   if (BE (tree == NULL, 0))
2634     goto parse_dup_op_espace;
2635
2636 /* From gnulib's "intprops.h":
2637    True if the arithmetic type T is signed.  */
2638 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2639
2640   /* This loop is actually executed only when end != REG_MISSING,
2641      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2642      already created the start+1-th copy.  */
2643   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2644     for (i = start + 2; i <= end; ++i)
2645       {
2646         elem = duplicate_tree (elem, dfa);
2647         tree = create_tree (dfa, tree, elem, CONCAT);
2648         if (BE (elem == NULL || tree == NULL, 0))
2649           goto parse_dup_op_espace;
2650
2651         tree = create_tree (dfa, tree, NULL, OP_ALT);
2652         if (BE (tree == NULL, 0))
2653           goto parse_dup_op_espace;
2654       }
2655
2656   if (old_tree)
2657     tree = create_tree (dfa, old_tree, tree, CONCAT);
2658
2659   return tree;
2660
2661  parse_dup_op_espace:
2662   *err = REG_ESPACE;
2663   return NULL;
2664 }
2665
2666 /* Size of the names for collating symbol/equivalence_class/character_class.
2667    I'm not sure, but maybe enough.  */
2668 #define BRACKET_NAME_BUF_SIZE 32
2669
2670 #ifndef _LIBC
2671   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672      Build the range expression which starts from START_ELEM, and ends
2673      at END_ELEM.  The result are written to MBCSET and SBCSET.
2674      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2675      mbcset->range_ends, is a pointer argument since we may
2676      update it.  */
2677
2678 static reg_errcode_t
2679 internal_function
2680 # ifdef RE_ENABLE_I18N
2681 build_range_exp (const reg_syntax_t syntax,
2682                  bitset_t sbcset,
2683                  re_charset_t *mbcset,
2684                  Idx *range_alloc,
2685                  const bracket_elem_t *start_elem,
2686                  const bracket_elem_t *end_elem)
2687 # else /* not RE_ENABLE_I18N */
2688 build_range_exp (const reg_syntax_t syntax,
2689                  bitset_t sbcset,
2690                  const bracket_elem_t *start_elem,
2691                  const bracket_elem_t *end_elem)
2692 # endif /* not RE_ENABLE_I18N */
2693 {
2694   unsigned int start_ch, end_ch;
2695   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2696   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2697           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2698           0))
2699     return REG_ERANGE;
2700
2701   /* We can handle no multi character collating elements without libc
2702      support.  */
2703   if (BE ((start_elem->type == COLL_SYM
2704            && strlen ((char *) start_elem->opr.name) > 1)
2705           || (end_elem->type == COLL_SYM
2706               && strlen ((char *) end_elem->opr.name) > 1), 0))
2707     return REG_ECOLLATE;
2708
2709 # ifdef RE_ENABLE_I18N
2710   {
2711     wchar_t wc;
2712     wint_t start_wc;
2713     wint_t end_wc;
2714     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2715
2716     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718                    : 0));
2719     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721                  : 0));
2722     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723                 ? __btowc (start_ch) : start_elem->opr.wch);
2724     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725               ? __btowc (end_ch) : end_elem->opr.wch);
2726     if (start_wc == WEOF || end_wc == WEOF)
2727       return REG_ECOLLATE;
2728     cmp_buf[0] = start_wc;
2729     cmp_buf[4] = end_wc;
2730
2731     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733       return REG_ERANGE;
2734
2735     /* Got valid collation sequence values, add them as a new entry.
2736        However, for !_LIBC we have no collation elements: if the
2737        character set is single byte, the single byte character set
2738        that we build below suffices.  parse_bracket_exp passes
2739        no MBCSET if dfa->mb_cur_max == 1.  */
2740     if (mbcset)
2741       {
2742         /* Check the space of the arrays.  */
2743         if (BE (*range_alloc == mbcset->nranges, 0))
2744           {
2745             /* There is not enough space, need realloc.  */
2746             wchar_t *new_array_start, *new_array_end;
2747             Idx new_nranges;
2748
2749             /* +1 in case of mbcset->nranges is 0.  */
2750             new_nranges = 2 * mbcset->nranges + 1;
2751             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2752                are NULL if *range_alloc == 0.  */
2753             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2754                                           new_nranges);
2755             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756                                         new_nranges);
2757
2758             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2759               return REG_ESPACE;
2760
2761             mbcset->range_starts = new_array_start;
2762             mbcset->range_ends = new_array_end;
2763             *range_alloc = new_nranges;
2764           }
2765
2766         mbcset->range_starts[mbcset->nranges] = start_wc;
2767         mbcset->range_ends[mbcset->nranges++] = end_wc;
2768       }
2769
2770     /* Build the table for single byte characters.  */
2771     for (wc = 0; wc < SBC_MAX; ++wc)
2772       {
2773         cmp_buf[2] = wc;
2774         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776           bitset_set (sbcset, wc);
2777       }
2778   }
2779 # else /* not RE_ENABLE_I18N */
2780   {
2781     unsigned int ch;
2782     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2784                    : 0));
2785     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2787                  : 0));
2788     if (start_ch > end_ch)
2789       return REG_ERANGE;
2790     /* Build the table for single byte characters.  */
2791     for (ch = 0; ch < SBC_MAX; ++ch)
2792       if (start_ch <= ch  && ch <= end_ch)
2793         bitset_set (sbcset, ch);
2794   }
2795 # endif /* not RE_ENABLE_I18N */
2796   return REG_NOERROR;
2797 }
2798 #endif /* not _LIBC */
2799
2800 #ifndef _LIBC
2801 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802    Build the collating element which is represented by NAME.
2803    The result are written to MBCSET and SBCSET.
2804    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805    pointer argument since we may update it.  */
2806
2807 static reg_errcode_t
2808 internal_function
2809 # ifdef RE_ENABLE_I18N
2810 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2811                         Idx *coll_sym_alloc, const unsigned char *name)
2812 # else /* not RE_ENABLE_I18N */
2813 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2814 # endif /* not RE_ENABLE_I18N */
2815 {
2816   size_t name_len = strlen ((const char *) name);
2817   if (BE (name_len != 1, 0))
2818     return REG_ECOLLATE;
2819   else
2820     {
2821       bitset_set (sbcset, name[0]);
2822       return REG_NOERROR;
2823     }
2824 }
2825 #endif /* not _LIBC */
2826
2827 /* This function parse bracket expression like "[abc]", "[a-c]",
2828    "[[.a-a.]]" etc.  */
2829
2830 static bin_tree_t *
2831 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2832                    reg_syntax_t syntax, reg_errcode_t *err)
2833 {
2834 #ifdef _LIBC
2835   const unsigned char *collseqmb;
2836   const char *collseqwc;
2837   uint32_t nrules;
2838   int32_t table_size;
2839   const int32_t *symb_table;
2840   const unsigned char *extra;
2841
2842   /* Local function for parse_bracket_exp used in _LIBC environment.
2843      Seek the collating symbol entry corresponding to NAME.
2844      Return the index of the symbol in the SYMB_TABLE.  */
2845
2846   auto inline int32_t
2847   __attribute ((always_inline))
2848   seek_collating_symbol_entry (name, name_len)
2849          const unsigned char *name;
2850          size_t name_len;
2851     {
2852       int32_t hash = elem_hash ((const char *) name, name_len);
2853       int32_t elem = hash % table_size;
2854       if (symb_table[2 * elem] != 0)
2855         {
2856           int32_t second = hash % (table_size - 2) + 1;
2857
2858           do
2859             {
2860               /* First compare the hashing value.  */
2861               if (symb_table[2 * elem] == hash
2862                   /* Compare the length of the name.  */
2863                   && name_len == extra[symb_table[2 * elem + 1]]
2864                   /* Compare the name.  */
2865                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2866                              name_len) == 0)
2867                 {
2868                   /* Yep, this is the entry.  */
2869                   break;
2870                 }
2871
2872               /* Next entry.  */
2873               elem += second;
2874             }
2875           while (symb_table[2 * elem] != 0);
2876         }
2877       return elem;
2878     }
2879
2880   /* Local function for parse_bracket_exp used in _LIBC environment.
2881      Look up the collation sequence value of BR_ELEM.
2882      Return the value if succeeded, UINT_MAX otherwise.  */
2883
2884   auto inline unsigned int
2885   __attribute ((always_inline))
2886   lookup_collation_sequence_value (br_elem)
2887          bracket_elem_t *br_elem;
2888     {
2889       if (br_elem->type == SB_CHAR)
2890         {
2891           /*
2892           if (MB_CUR_MAX == 1)
2893           */
2894           if (nrules == 0)
2895             return collseqmb[br_elem->opr.ch];
2896           else
2897             {
2898               wint_t wc = __btowc (br_elem->opr.ch);
2899               return __collseq_table_lookup (collseqwc, wc);
2900             }
2901         }
2902       else if (br_elem->type == MB_CHAR)
2903         {
2904           if (nrules != 0)
2905             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2906         }
2907       else if (br_elem->type == COLL_SYM)
2908         {
2909           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2910           if (nrules != 0)
2911             {
2912               int32_t elem, idx;
2913               elem = seek_collating_symbol_entry (br_elem->opr.name,
2914                                                   sym_name_len);
2915               if (symb_table[2 * elem] != 0)
2916                 {
2917                   /* We found the entry.  */
2918                   idx = symb_table[2 * elem + 1];
2919                   /* Skip the name of collating element name.  */
2920                   idx += 1 + extra[idx];
2921                   /* Skip the byte sequence of the collating element.  */
2922                   idx += 1 + extra[idx];
2923                   /* Adjust for the alignment.  */
2924                   idx = (idx + 3) & ~3;
2925                   /* Skip the multibyte collation sequence value.  */
2926                   idx += sizeof (unsigned int);
2927                   /* Skip the wide char sequence of the collating element.  */
2928                   idx += sizeof (unsigned int) *
2929                     (1 + *(unsigned int *) (extra + idx));
2930                   /* Return the collation sequence value.  */
2931                   return *(unsigned int *) (extra + idx);
2932                 }
2933               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2934                 {
2935                   /* No valid character.  Match it as a single byte
2936                      character.  */
2937                   return collseqmb[br_elem->opr.name[0]];
2938                 }
2939             }
2940           else if (sym_name_len == 1)
2941             return collseqmb[br_elem->opr.name[0]];
2942         }
2943       return UINT_MAX;
2944     }
2945
2946   /* Local function for parse_bracket_exp used in _LIBC environment.
2947      Build the range expression which starts from START_ELEM, and ends
2948      at END_ELEM.  The result are written to MBCSET and SBCSET.
2949      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2950      mbcset->range_ends, is a pointer argument since we may
2951      update it.  */
2952
2953   auto inline reg_errcode_t
2954   __attribute ((always_inline))
2955   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2956          re_charset_t *mbcset;
2957          Idx *range_alloc;
2958          bitset_t sbcset;
2959          bracket_elem_t *start_elem, *end_elem;
2960     {
2961       unsigned int ch;
2962       uint32_t start_collseq;
2963       uint32_t end_collseq;
2964
2965       /* Equivalence Classes and Character Classes can't be a range
2966          start/end.  */
2967       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2968               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2969               0))
2970         return REG_ERANGE;
2971
2972       start_collseq = lookup_collation_sequence_value (start_elem);
2973       end_collseq = lookup_collation_sequence_value (end_elem);
2974       /* Check start/end collation sequence values.  */
2975       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976         return REG_ECOLLATE;
2977       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978         return REG_ERANGE;
2979
2980       /* Got valid collation sequence values, add them as a new entry.
2981          However, if we have no collation elements, and the character set
2982          is single byte, the single byte character set that we
2983          build below suffices. */
2984       if (nrules > 0 || dfa->mb_cur_max > 1)
2985         {
2986           /* Check the space of the arrays.  */
2987           if (BE (*range_alloc == mbcset->nranges, 0))
2988             {
2989               /* There is not enough space, need realloc.  */
2990               uint32_t *new_array_start;
2991               uint32_t *new_array_end;
2992               Idx new_nranges;
2993
2994               /* +1 in case of mbcset->nranges is 0.  */
2995               new_nranges = 2 * mbcset->nranges + 1;
2996               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2997                                             new_nranges);
2998               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999                                           new_nranges);
3000
3001               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002                 return REG_ESPACE;
3003
3004               mbcset->range_starts = new_array_start;
3005               mbcset->range_ends = new_array_end;
3006               *range_alloc = new_nranges;
3007             }
3008
3009           mbcset->range_starts[mbcset->nranges] = start_collseq;
3010           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011         }
3012
3013       /* Build the table for single byte characters.  */
3014       for (ch = 0; ch < SBC_MAX; ch++)
3015         {
3016           uint32_t ch_collseq;
3017           /*
3018           if (MB_CUR_MAX == 1)
3019           */
3020           if (nrules == 0)
3021             ch_collseq = collseqmb[ch];
3022           else
3023             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025             bitset_set (sbcset, ch);
3026         }
3027       return REG_NOERROR;
3028     }
3029
3030   /* Local function for parse_bracket_exp used in _LIBC environment.
3031      Build the collating element which is represented by NAME.
3032      The result are written to MBCSET and SBCSET.
3033      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3034      pointer argument since we may update it.  */
3035
3036   auto inline reg_errcode_t
3037   __attribute ((always_inline))
3038   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3039          re_charset_t *mbcset;
3040          Idx *coll_sym_alloc;
3041          bitset_t sbcset;
3042          const unsigned char *name;
3043     {
3044       int32_t elem, idx;
3045       size_t name_len = strlen ((const char *) name);
3046       if (nrules != 0)
3047         {
3048           elem = seek_collating_symbol_entry (name, name_len);
3049           if (symb_table[2 * elem] != 0)
3050             {
3051               /* We found the entry.  */
3052               idx = symb_table[2 * elem + 1];
3053               /* Skip the name of collating element name.  */
3054               idx += 1 + extra[idx];
3055             }
3056           else if (symb_table[2 * elem] == 0 && name_len == 1)
3057             {
3058               /* No valid character, treat it as a normal
3059                  character.  */
3060               bitset_set (sbcset, name[0]);
3061               return REG_NOERROR;
3062             }
3063           else
3064             return REG_ECOLLATE;
3065
3066           /* Got valid collation sequence, add it as a new entry.  */
3067           /* Check the space of the arrays.  */
3068           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3069             {
3070               /* Not enough, realloc it.  */
3071               /* +1 in case of mbcset->ncoll_syms is 0.  */
3072               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3073               /* Use realloc since mbcset->coll_syms is NULL
3074                  if *alloc == 0.  */
3075               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3076                                                    new_coll_sym_alloc);
3077               if (BE (new_coll_syms == NULL, 0))
3078                 return REG_ESPACE;
3079               mbcset->coll_syms = new_coll_syms;
3080               *coll_sym_alloc = new_coll_sym_alloc;
3081             }
3082           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3083           return REG_NOERROR;
3084         }
3085       else
3086         {
3087           if (BE (name_len != 1, 0))
3088             return REG_ECOLLATE;
3089           else
3090             {
3091               bitset_set (sbcset, name[0]);
3092               return REG_NOERROR;
3093             }
3094         }
3095     }
3096 #endif
3097
3098   re_token_t br_token;
3099   re_bitset_ptr_t sbcset;
3100 #ifdef RE_ENABLE_I18N
3101   re_charset_t *mbcset;
3102   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3103   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3104 #endif /* not RE_ENABLE_I18N */
3105   bool non_match = false;
3106   bin_tree_t *work_tree;
3107   int token_len;
3108   bool first_round = true;
3109 #ifdef _LIBC
3110   collseqmb = (const unsigned char *)
3111     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3112   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3113   if (nrules)
3114     {
3115       /*
3116       if (MB_CUR_MAX > 1)
3117       */
3118       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3119       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3120       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3121                                                   _NL_COLLATE_SYMB_TABLEMB);
3122       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3123                                                    _NL_COLLATE_SYMB_EXTRAMB);
3124     }
3125 #endif
3126   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3127 #ifdef RE_ENABLE_I18N
3128   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3129 #endif /* RE_ENABLE_I18N */
3130 #ifdef RE_ENABLE_I18N
3131   if (BE (sbcset == NULL || mbcset == NULL, 0))
3132 #else
3133   if (BE (sbcset == NULL, 0))
3134 #endif /* RE_ENABLE_I18N */
3135     {
3136       re_free (sbcset);
3137 #ifdef RE_ENABLE_I18N
3138       re_free (mbcset);
3139 #endif
3140       *err = REG_ESPACE;
3141       return NULL;
3142     }
3143
3144   token_len = peek_token_bracket (token, regexp, syntax);
3145   if (BE (token->type == END_OF_RE, 0))
3146     {
3147       *err = REG_BADPAT;
3148       goto parse_bracket_exp_free_return;
3149     }
3150   if (token->type == OP_NON_MATCH_LIST)
3151     {
3152 #ifdef RE_ENABLE_I18N
3153       mbcset->non_match = 1;
3154 #endif /* not RE_ENABLE_I18N */
3155       non_match = true;
3156       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3157         bitset_set (sbcset, '\n');
3158       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3159       token_len = peek_token_bracket (token, regexp, syntax);
3160       if (BE (token->type == END_OF_RE, 0))
3161         {
3162           *err = REG_BADPAT;
3163           goto parse_bracket_exp_free_return;
3164         }
3165     }
3166
3167   /* We treat the first ']' as a normal character.  */
3168   if (token->type == OP_CLOSE_BRACKET)
3169     token->type = CHARACTER;
3170
3171   while (1)
3172     {
3173       bracket_elem_t start_elem, end_elem;
3174       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3175       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3176       reg_errcode_t ret;
3177       int token_len2 = 0;
3178       bool is_range_exp = false;
3179       re_token_t token2;
3180
3181       start_elem.opr.name = start_name_buf;
3182       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3183                                    syntax, first_round);
3184       if (BE (ret != REG_NOERROR, 0))
3185         {
3186           *err = ret;
3187           goto parse_bracket_exp_free_return;
3188         }
3189       first_round = false;
3190
3191       /* Get information about the next token.  We need it in any case.  */
3192       token_len = peek_token_bracket (token, regexp, syntax);
3193
3194       /* Do not check for ranges if we know they are not allowed.  */
3195       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3196         {
3197           if (BE (token->type == END_OF_RE, 0))
3198             {
3199               *err = REG_EBRACK;
3200               goto parse_bracket_exp_free_return;
3201             }
3202           if (token->type == OP_CHARSET_RANGE)
3203             {
3204               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3205               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3206               if (BE (token2.type == END_OF_RE, 0))
3207                 {
3208                   *err = REG_EBRACK;
3209                   goto parse_bracket_exp_free_return;
3210                 }
3211               if (token2.type == OP_CLOSE_BRACKET)
3212                 {
3213                   /* We treat the last '-' as a normal character.  */
3214                   re_string_skip_bytes (regexp, -token_len);
3215                   token->type = CHARACTER;
3216                 }
3217               else
3218                 is_range_exp = true;
3219             }
3220         }
3221
3222       if (is_range_exp == true)
3223         {
3224           end_elem.opr.name = end_name_buf;
3225           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3226                                        dfa, syntax, true);
3227           if (BE (ret != REG_NOERROR, 0))
3228             {
3229               *err = ret;
3230               goto parse_bracket_exp_free_return;
3231             }
3232
3233           token_len = peek_token_bracket (token, regexp, syntax);
3234
3235 #ifdef _LIBC
3236           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3237                                   &start_elem, &end_elem);
3238 #else
3239 # ifdef RE_ENABLE_I18N
3240           *err = build_range_exp (syntax, sbcset,
3241                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3242                                   &range_alloc, &start_elem, &end_elem);
3243 # else
3244           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3245 # endif
3246 #endif /* RE_ENABLE_I18N */
3247           if (BE (*err != REG_NOERROR, 0))
3248             goto parse_bracket_exp_free_return;
3249         }
3250       else
3251         {
3252           switch (start_elem.type)
3253             {
3254             case SB_CHAR:
3255               bitset_set (sbcset, start_elem.opr.ch);
3256               break;
3257 #ifdef RE_ENABLE_I18N
3258             case MB_CHAR:
3259               /* Check whether the array has enough space.  */
3260               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3261                 {
3262                   wchar_t *new_mbchars;
3263                   /* Not enough, realloc it.  */
3264                   /* +1 in case of mbcset->nmbchars is 0.  */
3265                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3266                   /* Use realloc since array is NULL if *alloc == 0.  */
3267                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3268                                             mbchar_alloc);
3269                   if (BE (new_mbchars == NULL, 0))
3270                     goto parse_bracket_exp_espace;
3271                   mbcset->mbchars = new_mbchars;
3272                 }
3273               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3274               break;
3275 #endif /* RE_ENABLE_I18N */
3276             case EQUIV_CLASS:
3277               *err = build_equiv_class (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279                                         mbcset, &equiv_class_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281                                         start_elem.opr.name);
3282               if (BE (*err != REG_NOERROR, 0))
3283                 goto parse_bracket_exp_free_return;
3284               break;
3285             case COLL_SYM:
3286               *err = build_collating_symbol (sbcset,
3287 #ifdef RE_ENABLE_I18N
3288                                              mbcset, &coll_sym_alloc,
3289 #endif /* RE_ENABLE_I18N */
3290                                              start_elem.opr.name);
3291               if (BE (*err != REG_NOERROR, 0))
3292                 goto parse_bracket_exp_free_return;
3293               break;
3294             case CHAR_CLASS:
3295               *err = build_charclass (regexp->trans, sbcset,
3296 #ifdef RE_ENABLE_I18N
3297                                       mbcset, &char_class_alloc,
3298 #endif /* RE_ENABLE_I18N */
3299                                       start_elem.opr.name, syntax);
3300               if (BE (*err != REG_NOERROR, 0))
3301                goto parse_bracket_exp_free_return;
3302               break;
3303             default:
3304               assert (0);
3305               break;
3306             }
3307         }
3308       if (BE (token->type == END_OF_RE, 0))
3309         {
3310           *err = REG_EBRACK;
3311           goto parse_bracket_exp_free_return;
3312         }
3313       if (token->type == OP_CLOSE_BRACKET)
3314         break;
3315     }
3316
3317   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3318
3319   /* If it is non-matching list.  */
3320   if (non_match)
3321     bitset_not (sbcset);
3322
3323 #ifdef RE_ENABLE_I18N
3324   /* Ensure only single byte characters are set.  */
3325   if (dfa->mb_cur_max > 1)
3326     bitset_mask (sbcset, dfa->sb_char);
3327
3328   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330                                                      || mbcset->non_match)))
3331     {
3332       bin_tree_t *mbc_tree;
3333       int sbc_idx;
3334       /* Build a tree for complex bracket.  */
3335       dfa->has_mb_node = 1;
3336       br_token.type = COMPLEX_BRACKET;
3337       br_token.opr.mbcset = mbcset;
3338       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339       if (BE (mbc_tree == NULL, 0))
3340         goto parse_bracket_exp_espace;
3341       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342         if (sbcset[sbc_idx])
3343           break;
3344       /* If there are no bits set in sbcset, there is no point
3345          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3346       if (sbc_idx < BITSET_WORDS)
3347         {
3348           /* Build a tree for simple bracket.  */
3349           br_token.type = SIMPLE_BRACKET;
3350           br_token.opr.sbcset = sbcset;
3351           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352           if (BE (work_tree == NULL, 0))
3353             goto parse_bracket_exp_espace;
3354
3355           /* Then join them by ALT node.  */
3356           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357           if (BE (work_tree == NULL, 0))
3358             goto parse_bracket_exp_espace;
3359         }
3360       else
3361         {
3362           re_free (sbcset);
3363           work_tree = mbc_tree;
3364         }
3365     }
3366   else
3367 #endif /* not RE_ENABLE_I18N */
3368     {
3369 #ifdef RE_ENABLE_I18N
3370       free_charset (mbcset);
3371 #endif
3372       /* Build a tree for simple bracket.  */
3373       br_token.type = SIMPLE_BRACKET;
3374       br_token.opr.sbcset = sbcset;
3375       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376       if (BE (work_tree == NULL, 0))
3377         goto parse_bracket_exp_espace;
3378     }
3379   return work_tree;
3380
3381  parse_bracket_exp_espace:
3382   *err = REG_ESPACE;
3383  parse_bracket_exp_free_return:
3384   re_free (sbcset);
3385 #ifdef RE_ENABLE_I18N
3386   free_charset (mbcset);
3387 #endif /* RE_ENABLE_I18N */
3388   return NULL;
3389 }
3390
3391 /* Parse an element in the bracket expression.  */
3392
3393 static reg_errcode_t
3394 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395                        re_token_t *token, int token_len, re_dfa_t *dfa,
3396                        reg_syntax_t syntax, bool accept_hyphen)
3397 {
3398 #ifdef RE_ENABLE_I18N
3399   int cur_char_size;
3400   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401   if (cur_char_size > 1)
3402     {
3403       elem->type = MB_CHAR;
3404       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405       re_string_skip_bytes (regexp, cur_char_size);
3406       return REG_NOERROR;
3407     }
3408 #endif /* RE_ENABLE_I18N */
3409   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3410   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411       || token->type == OP_OPEN_EQUIV_CLASS)
3412     return parse_bracket_symbol (elem, regexp, token);
3413   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3414     {
3415       /* A '-' must only appear as anything but a range indicator before
3416          the closing bracket.  Everything else is an error.  */
3417       re_token_t token2;
3418       (void) peek_token_bracket (&token2, regexp, syntax);
3419       if (token2.type != OP_CLOSE_BRACKET)
3420         /* The actual error value is not standardized since this whole
3421            case is undefined.  But ERANGE makes good sense.  */
3422         return REG_ERANGE;
3423     }
3424   elem->type = SB_CHAR;
3425   elem->opr.ch = token->opr.c;
3426   return REG_NOERROR;
3427 }
3428
3429 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3430    such as [:<character_class>:], [.<collating_element>.], and
3431    [=<equivalent_class>=].  */
3432
3433 static reg_errcode_t
3434 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435                       re_token_t *token)
3436 {
3437   unsigned char ch, delim = token->opr.c;
3438   int i = 0;
3439   if (re_string_eoi(regexp))
3440     return REG_EBRACK;
3441   for (;; ++i)
3442     {
3443       if (i >= BRACKET_NAME_BUF_SIZE)
3444         return REG_EBRACK;
3445       if (token->type == OP_OPEN_CHAR_CLASS)
3446         ch = re_string_fetch_byte_case (regexp);
3447       else
3448         ch = re_string_fetch_byte (regexp);
3449       if (re_string_eoi(regexp))
3450         return REG_EBRACK;
3451       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3452         break;
3453       elem->opr.name[i] = ch;
3454     }
3455   re_string_skip_bytes (regexp, 1);
3456   elem->opr.name[i] = '\0';
3457   switch (token->type)
3458     {
3459     case OP_OPEN_COLL_ELEM:
3460       elem->type = COLL_SYM;
3461       break;
3462     case OP_OPEN_EQUIV_CLASS:
3463       elem->type = EQUIV_CLASS;
3464       break;
3465     case OP_OPEN_CHAR_CLASS:
3466       elem->type = CHAR_CLASS;
3467       break;
3468     default:
3469       break;
3470     }
3471   return REG_NOERROR;
3472 }
3473
3474   /* Helper function for parse_bracket_exp.
3475      Build the equivalence class which is represented by NAME.
3476      The result are written to MBCSET and SBCSET.
3477      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3478      is a pointer argument since we may update it.  */
3479
3480 static reg_errcode_t
3481 #ifdef RE_ENABLE_I18N
3482 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483                    Idx *equiv_class_alloc, const unsigned char *name)
3484 #else /* not RE_ENABLE_I18N */
3485 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486 #endif /* not RE_ENABLE_I18N */
3487 {
3488 #ifdef _LIBC
3489   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490   if (nrules != 0)
3491     {
3492       const int32_t *table, *indirect;
3493       const unsigned char *weights, *extra, *cp;
3494       unsigned char char_buf[2];
3495       int32_t idx1, idx2;
3496       unsigned int ch;
3497       size_t len;
3498       /* This #include defines a local function!  */
3499 # include <locale/weight.h>
3500       /* Calculate the index for equivalence class.  */
3501       cp = name;
3502       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3503       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504                                                _NL_COLLATE_WEIGHTMB);
3505       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3506                                                    _NL_COLLATE_EXTRAMB);
3507       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3508                                                 _NL_COLLATE_INDIRECTMB);
3509       idx1 = findidx (&cp, -1);
3510       if (BE (idx1 == 0 || *cp != '\0', 0))
3511         /* This isn't a valid character.  */
3512         return REG_ECOLLATE;
3513
3514       /* Build single byte matching table for this equivalence class.  */
3515       len = weights[idx1 & 0xffffff];
3516       for (ch = 0; ch < SBC_MAX; ++ch)
3517         {
3518           char_buf[0] = ch;
3519           cp = char_buf;
3520           idx2 = findidx (&cp, 1);
3521 /*
3522           idx2 = table[ch];
3523 */
3524           if (idx2 == 0)
3525             /* This isn't a valid character.  */
3526             continue;
3527           /* Compare only if the length matches and the collation rule
3528              index is the same.  */
3529           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3530             {
3531               int cnt = 0;
3532
3533               while (cnt <= len &&
3534                      weights[(idx1 & 0xffffff) + 1 + cnt]
3535                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3536                 ++cnt;
3537
3538               if (cnt > len)
3539                 bitset_set (sbcset, ch);
3540             }
3541         }
3542       /* Check whether the array has enough space.  */
3543       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3544         {
3545           /* Not enough, realloc it.  */
3546           /* +1 in case of mbcset->nequiv_classes is 0.  */
3547           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3548           /* Use realloc since the array is NULL if *alloc == 0.  */
3549           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3550                                                    int32_t,
3551                                                    new_equiv_class_alloc);
3552           if (BE (new_equiv_classes == NULL, 0))
3553             return REG_ESPACE;
3554           mbcset->equiv_classes = new_equiv_classes;
3555           *equiv_class_alloc = new_equiv_class_alloc;
3556         }
3557       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3558     }
3559   else
3560 #endif /* _LIBC */
3561     {
3562       if (BE (strlen ((const char *) name) != 1, 0))
3563         return REG_ECOLLATE;
3564       bitset_set (sbcset, *name);
3565     }
3566   return REG_NOERROR;
3567 }
3568
3569   /* Helper function for parse_bracket_exp.
3570      Build the character class which is represented by NAME.
3571      The result are written to MBCSET and SBCSET.
3572      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3573      is a pointer argument since we may update it.  */
3574
3575 static reg_errcode_t
3576 #ifdef RE_ENABLE_I18N
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578                  re_charset_t *mbcset, Idx *char_class_alloc,
3579                  const unsigned char *class_name, reg_syntax_t syntax)
3580 #else /* not RE_ENABLE_I18N */
3581 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3582                  const unsigned char *class_name, reg_syntax_t syntax)
3583 #endif /* not RE_ENABLE_I18N */
3584 {
3585   int i;
3586   const char *name = (const char *) class_name;
3587
3588   /* In case of REG_ICASE "upper" and "lower" match the both of
3589      upper and lower cases.  */
3590   if ((syntax & RE_ICASE)
3591       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3592     name = "alpha";
3593
3594 #ifdef RE_ENABLE_I18N
3595   /* Check the space of the arrays.  */
3596   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3597     {
3598       /* Not enough, realloc it.  */
3599       /* +1 in case of mbcset->nchar_classes is 0.  */
3600       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3601       /* Use realloc since array is NULL if *alloc == 0.  */
3602       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3603                                                new_char_class_alloc);
3604       if (BE (new_char_classes == NULL, 0))
3605         return REG_ESPACE;
3606       mbcset->char_classes = new_char_classes;
3607       *char_class_alloc = new_char_class_alloc;
3608     }
3609   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3610 #endif /* RE_ENABLE_I18N */
3611
3612 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3613   do {                                          \
3614     if (BE (trans != NULL, 0))                  \
3615       {                                         \
3616         for (i = 0; i < SBC_MAX; ++i)           \
3617           if (ctype_func (i))                   \
3618             bitset_set (sbcset, trans[i]);      \
3619       }                                         \
3620     else                                        \
3621       {                                         \
3622         for (i = 0; i < SBC_MAX; ++i)           \
3623           if (ctype_func (i))                   \
3624             bitset_set (sbcset, i);             \
3625       }                                         \
3626   } while (0)
3627
3628   if (strcmp (name, "alnum") == 0)
3629     BUILD_CHARCLASS_LOOP (isalnum);
3630   else if (strcmp (name, "cntrl") == 0)
3631     BUILD_CHARCLASS_LOOP (iscntrl);
3632   else if (strcmp (name, "lower") == 0)
3633     BUILD_CHARCLASS_LOOP (islower);
3634   else if (strcmp (name, "space") == 0)
3635     BUILD_CHARCLASS_LOOP (isspace);
3636   else if (strcmp (name, "alpha") == 0)
3637     BUILD_CHARCLASS_LOOP (isalpha);
3638   else if (strcmp (name, "digit") == 0)
3639     BUILD_CHARCLASS_LOOP (isdigit);
3640   else if (strcmp (name, "print") == 0)
3641     BUILD_CHARCLASS_LOOP (isprint);
3642   else if (strcmp (name, "upper") == 0)
3643     BUILD_CHARCLASS_LOOP (isupper);
3644   else if (strcmp (name, "blank") == 0)
3645     BUILD_CHARCLASS_LOOP (isblank);
3646   else if (strcmp (name, "graph") == 0)
3647     BUILD_CHARCLASS_LOOP (isgraph);
3648   else if (strcmp (name, "punct") == 0)
3649     BUILD_CHARCLASS_LOOP (ispunct);
3650   else if (strcmp (name, "xdigit") == 0)
3651     BUILD_CHARCLASS_LOOP (isxdigit);
3652   else
3653     return REG_ECTYPE;
3654
3655   return REG_NOERROR;
3656 }
3657
3658 static bin_tree_t *
3659 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3660                     const unsigned char *class_name,
3661                     const unsigned char *extra, bool non_match,
3662                     reg_errcode_t *err)
3663 {
3664   re_bitset_ptr_t sbcset;
3665 #ifdef RE_ENABLE_I18N
3666   re_charset_t *mbcset;
3667   Idx alloc = 0;
3668 #endif /* not RE_ENABLE_I18N */
3669   reg_errcode_t ret;
3670   re_token_t br_token;
3671   bin_tree_t *tree;
3672
3673   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3674 #ifdef RE_ENABLE_I18N
3675   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676 #endif /* RE_ENABLE_I18N */
3677
3678 #ifdef RE_ENABLE_I18N
3679   if (BE (sbcset == NULL || mbcset == NULL, 0))
3680 #else /* not RE_ENABLE_I18N */
3681   if (BE (sbcset == NULL, 0))
3682 #endif /* not RE_ENABLE_I18N */
3683     {
3684       *err = REG_ESPACE;
3685       return NULL;
3686     }
3687
3688   if (non_match)
3689     {
3690 #ifdef RE_ENABLE_I18N
3691       mbcset->non_match = 1;
3692 #endif /* not RE_ENABLE_I18N */
3693     }
3694
3695   /* We don't care the syntax in this case.  */
3696   ret = build_charclass (trans, sbcset,
3697 #ifdef RE_ENABLE_I18N
3698                          mbcset, &alloc,
3699 #endif /* RE_ENABLE_I18N */
3700                          class_name, 0);
3701
3702   if (BE (ret != REG_NOERROR, 0))
3703     {
3704       re_free (sbcset);
3705 #ifdef RE_ENABLE_I18N
3706       free_charset (mbcset);
3707 #endif /* RE_ENABLE_I18N */
3708       *err = ret;
3709       return NULL;
3710     }
3711   /* \w match '_' also.  */
3712   for (; *extra; extra++)
3713     bitset_set (sbcset, *extra);
3714
3715   /* If it is non-matching list.  */
3716   if (non_match)
3717     bitset_not (sbcset);
3718
3719 #ifdef RE_ENABLE_I18N
3720   /* Ensure only single byte characters are set.  */
3721   if (dfa->mb_cur_max > 1)
3722     bitset_mask (sbcset, dfa->sb_char);
3723 #endif
3724
3725   /* Build a tree for simple bracket.  */
3726   br_token.type = SIMPLE_BRACKET;
3727   br_token.opr.sbcset = sbcset;
3728   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3729   if (BE (tree == NULL, 0))
3730     goto build_word_op_espace;
3731
3732 #ifdef RE_ENABLE_I18N
3733   if (dfa->mb_cur_max > 1)
3734     {
3735       bin_tree_t *mbc_tree;
3736       /* Build a tree for complex bracket.  */
3737       br_token.type = COMPLEX_BRACKET;
3738       br_token.opr.mbcset = mbcset;
3739       dfa->has_mb_node = 1;
3740       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3741       if (BE (mbc_tree == NULL, 0))
3742         goto build_word_op_espace;
3743       /* Then join them by ALT node.  */
3744       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3745       if (BE (mbc_tree != NULL, 1))
3746         return tree;
3747     }
3748   else
3749     {
3750       free_charset (mbcset);
3751       return tree;
3752     }
3753 #else /* not RE_ENABLE_I18N */
3754   return tree;
3755 #endif /* not RE_ENABLE_I18N */
3756
3757  build_word_op_espace:
3758   re_free (sbcset);
3759 #ifdef RE_ENABLE_I18N
3760   free_charset (mbcset);
3761 #endif /* RE_ENABLE_I18N */
3762   *err = REG_ESPACE;
3763   return NULL;
3764 }
3765
3766 /* This is intended for the expressions like "a{1,3}".
3767    Fetch a number from 'input', and return the number.
3768    Return REG_MISSING if the number field is empty like "{,1}".
3769    Return RE_DUP_MAX + 1 if the number field is too large.
3770    Return REG_ERROR if an error occurred.  */
3771
3772 static Idx
3773 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3774 {
3775   Idx num = REG_MISSING;
3776   unsigned char c;
3777   while (1)
3778     {
3779       fetch_token (token, input, syntax);
3780       c = token->opr.c;
3781       if (BE (token->type == END_OF_RE, 0))
3782         return REG_ERROR;
3783       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3784         break;
3785       num = ((token->type != CHARACTER || c < '0' || '9' < c
3786               || num == REG_ERROR)
3787              ? REG_ERROR
3788              : num == REG_MISSING
3789              ? c - '0'
3790              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3791     }
3792   return num;
3793 }
3794 \f
3795 #ifdef RE_ENABLE_I18N
3796 static void
3797 free_charset (re_charset_t *cset)
3798 {
3799   re_free (cset->mbchars);
3800 # ifdef _LIBC
3801   re_free (cset->coll_syms);
3802   re_free (cset->equiv_classes);
3803   re_free (cset->range_starts);
3804   re_free (cset->range_ends);
3805 # endif
3806   re_free (cset->char_classes);
3807   re_free (cset);
3808 }
3809 #endif /* RE_ENABLE_I18N */
3810 \f
3811 /* Functions for binary tree operation.  */
3812
3813 /* Create a tree node.  */
3814
3815 static bin_tree_t *
3816 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817              re_token_type_t type)
3818 {
3819   re_token_t t;
3820   t.type = type;
3821   return create_token_tree (dfa, left, right, &t);
3822 }
3823
3824 static bin_tree_t *
3825 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3826                    const re_token_t *token)
3827 {
3828   bin_tree_t *tree;
3829   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3830     {
3831       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3832
3833       if (storage == NULL)
3834         return NULL;
3835       storage->next = dfa->str_tree_storage;
3836       dfa->str_tree_storage = storage;
3837       dfa->str_tree_storage_idx = 0;
3838     }
3839   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3840
3841   tree->parent = NULL;
3842   tree->left = left;
3843   tree->right = right;
3844   tree->token = *token;
3845   tree->token.duplicated = 0;
3846   tree->token.opt_subexp = 0;
3847   tree->first = NULL;
3848   tree->next = NULL;
3849   tree->node_idx = REG_MISSING;
3850
3851   if (left != NULL)
3852     left->parent = tree;
3853   if (right != NULL)
3854     right->parent = tree;
3855   return tree;
3856 }
3857
3858 /* Mark the tree SRC as an optional subexpression.
3859    To be called from preorder or postorder.  */
3860
3861 static reg_errcode_t
3862 mark_opt_subexp (void *extra, bin_tree_t *node)
3863 {
3864   Idx idx = (uintptr_t) extra;
3865   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3866     node->token.opt_subexp = 1;
3867
3868   return REG_NOERROR;
3869 }
3870
3871 /* Free the allocated memory inside NODE. */
3872
3873 static void
3874 free_token (re_token_t *node)
3875 {
3876 #ifdef RE_ENABLE_I18N
3877   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3878     free_charset (node->opr.mbcset);
3879   else
3880 #endif /* RE_ENABLE_I18N */
3881     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3882       re_free (node->opr.sbcset);
3883 }
3884
3885 /* Worker function for tree walking.  Free the allocated memory inside NODE
3886    and its children. */
3887
3888 static reg_errcode_t
3889 free_tree (void *extra, bin_tree_t *node)
3890 {
3891   free_token (&node->token);
3892   return REG_NOERROR;
3893 }
3894
3895
3896 /* Duplicate the node SRC, and return new node.  This is a preorder
3897    visit similar to the one implemented by the generic visitor, but
3898    we need more infrastructure to maintain two parallel trees --- so,
3899    it's easier to duplicate.  */
3900
3901 static bin_tree_t *
3902 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3903 {
3904   const bin_tree_t *node;
3905   bin_tree_t *dup_root;
3906   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3907
3908   for (node = root; ; )
3909     {
3910       /* Create a new tree and link it back to the current parent.  */
3911       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3912       if (*p_new == NULL)
3913         return NULL;
3914       (*p_new)->parent = dup_node;
3915       (*p_new)->token.duplicated = 1;
3916       dup_node = *p_new;
3917
3918       /* Go to the left node, or up and to the right.  */
3919       if (node->left)
3920         {
3921           node = node->left;
3922           p_new = &dup_node->left;
3923         }
3924       else
3925         {
3926           const bin_tree_t *prev = NULL;
3927           while (node->right == prev || node->right == NULL)
3928             {
3929               prev = node;
3930               node = node->parent;
3931               dup_node = dup_node->parent;
3932               if (!node)
3933                 return dup_root;
3934             }
3935           node = node->right;
3936           p_new = &dup_node->right;
3937         }
3938     }
3939 }