lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (mbrtowc (&wc, (const char *) buf, p - buf,
 337                            &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           Idx i;
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 362               || cset->nranges || cset->nchar_classes)
 363             {
 364 # ifdef _LIBC
 365               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 366                 {
 367                   /* In this case we want to catch the bytes which are
 368                      the first byte of any collation elements.
 369                      e.g. In da_DK, we want to catch 'a' since "aa"
 370                           is a valid collation element, and don't catch
 371                           'b' since 'b' is the only collation element
 372                           which starts from 'b'.  */
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # else
 380               if (dfa->mb_cur_max > 1)
 381                 for (i = 0; i < SBC_MAX; ++i)
 382                   if (__btowc (i) == WEOF)
 383                     re_set_fastmap (fastmap, icase, i);
 384 # endif /* not _LIBC */
 385             }
 386           for (i = 0; i < cset->nmbchars; ++i)
 387             {
 388               char buf[256];
 389               mbstate_t state;
 390               memset (&state, '\0', sizeof (state));
 391               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 392                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 393               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 394                 {
 395                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 396                       != (size_t) -1)
 397                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 398                 }
 399             }
 400         }
 401 #endif /* RE_ENABLE_I18N */
 402       else if (type == OP_PERIOD
 403 #ifdef RE_ENABLE_I18N
 404                || type == OP_UTF8_PERIOD
 405 #endif /* RE_ENABLE_I18N */
 406                || type == END_OF_RE)
 407         {
 408           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 409           if (type == END_OF_RE)
 410             bufp->can_be_null = 1;
 411           return;
 412         }
 413     }
 414 }
 415 \f
 416 /* Entry point for POSIX code.  */
 417 /* regcomp takes a regular expression as a string and compiles it.
 418
 419    PREG is a regex_t *.  We do not expect any fields to be initialized,
 420    since POSIX says we shouldn't.  Thus, we set
 421
 422      `buffer' to the compiled pattern;
 423      `used' to the length of the compiled pattern;
 424      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 425        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 426        RE_SYNTAX_POSIX_BASIC;
 427      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 428      `fastmap' to an allocated space for the fastmap;
 429      `fastmap_accurate' to zero;
 430      `re_nsub' to the number of subexpressions in PATTERN.
 431
 432    PATTERN is the address of the pattern string.
 433
 434    CFLAGS is a series of bits which affect compilation.
 435
 436      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 437      use POSIX basic syntax.
 438
 439      If REG_NEWLINE is set, then . and [^...] don't match newline.
 440      Also, regexec will try a match beginning after every newline.
 441
 442      If REG_ICASE is set, then we considers upper- and lowercase
 443      versions of letters to be equivalent when matching.
 444
 445      If REG_NOSUB is set, then when PREG is passed to regexec, that
 446      routine will report only success or failure, and nothing about the
 447      registers.
 448
 449    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 450    the return codes and their meanings.)  */
 451
 452 int
 453 regcomp (preg, pattern, cflags)
 454     regex_t *_Restrict_ preg;
 455     const char *_Restrict_ pattern;
 456     int cflags;
 457 {
 458   reg_errcode_t ret;
 459   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 460                          : RE_SYNTAX_POSIX_BASIC);
 461
 462   preg->buffer = NULL;
 463   preg->allocated = 0;
 464   preg->used = 0;
 465
 466   /* Try to allocate space for the fastmap.  */
 467   preg->fastmap = re_malloc (char, SBC_MAX);
 468   if (BE (preg->fastmap == NULL, 0))
 469     return REG_ESPACE;
 470
 471   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 472
 473   /* If REG_NEWLINE is set, newlines are treated differently.  */
 474   if (cflags & REG_NEWLINE)
 475     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 476       syntax &= ~RE_DOT_NEWLINE;
 477       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 478       /* It also changes the matching behavior.  */
 479       preg->newline_anchor = 1;
 480     }
 481   else
 482     preg->newline_anchor = 0;
 483   preg->no_sub = !!(cflags & REG_NOSUB);
 484   preg->translate = NULL;
 485
 486   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 487
 488   /* POSIX doesn't distinguish between an unmatched open-group and an
 489      unmatched close-group: both are REG_EPAREN.  */
 490   if (ret == REG_ERPAREN)
 491     ret = REG_EPAREN;
 492
 493   /* We have already checked preg->fastmap != NULL.  */
 494   if (BE (ret == REG_NOERROR, 1))
 495     /* Compute the fastmap now, since regexec cannot modify the pattern
 496        buffer.  This function never fails in this implementation.  */
 497     (void) re_compile_fastmap (preg);
 498   else
 499     {
 500       /* Some error occurred while compiling the expression.  */
 501       re_free (preg->fastmap);
 502       preg->fastmap = NULL;
 503     }
 504
 505   return (int) ret;
 506 }
 507 #ifdef _LIBC
 508 weak_alias (__regcomp, regcomp)
 509 #endif
 510
 511 /* Returns a message corresponding to an error code, ERRCODE, returned
 512    from either regcomp or regexec.   We don't use PREG here.  */
 513
 514 #ifdef _LIBC
 515 size_t
 516 regerror (errcode, preg, errbuf, errbuf_size)
 517     int errcode;
 518     const regex_t *_Restrict_ preg;
 519     char *_Restrict_ errbuf;
 520     size_t errbuf_size;
 521 #else /* size_t might promote */
 522 size_t
 523 regerror (int errcode, const regex_t *_Restrict_ preg,
 524           char *_Restrict_ errbuf, size_t errbuf_size)
 525 #endif
 526 {
 527   const char *msg;
 528   size_t msg_size;
 529
 530   if (BE (errcode < 0
 531           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 532                                / sizeof (__re_error_msgid_idx[0])), 0))
 533     /* Only error codes returned by the rest of the code should be passed
 534        to this routine.  If we are given anything else, or if other regex
 535        code generates an invalid error code, then the program has a bug.
 536        Dump core so we can fix it.  */
 537     abort ();
 538
 539   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 540
 541   msg_size = strlen (msg) + 1; /* Includes the null.  */
 542
 543   if (BE (errbuf_size != 0, 1))
 544     {
 545       size_t cpy_size = msg_size;
 546       if (BE (msg_size > errbuf_size, 0))
 547         {
 548           cpy_size = errbuf_size - 1;
 549           errbuf[cpy_size] = '\0';
 550         }
 551       memcpy (errbuf, msg, cpy_size);
 552     }
 553
 554   return msg_size;
 555 }
 556 #ifdef _LIBC
 557 weak_alias (__regerror, regerror)
 558 #endif
 559
 560
 561 #ifdef RE_ENABLE_I18N
 562 /* This static array is used for the map to single-byte characters when
 563    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 564    it the same all the time.  UTF-8 is the preferred encoding so this is
 565    a worthwhile optimization.  */
 566 static const bitset_t utf8_sb_map =
 567 {
 568   /* Set the first 128 bits.  */
 569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 570 #  error "bitset_word_t is narrower than 32 bits"
 571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 572   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 574   BITSET_WORD_MAX, BITSET_WORD_MAX,
 575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 576   BITSET_WORD_MAX,
 577 # endif
 578   (BITSET_WORD_MAX
 579    >> (SBC_MAX % BITSET_WORD_BITS == 0
 580        ? 0
 581        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   Idx i, j;
 590
 591   if (dfa->nodes)
 592     for (i = 0; i < dfa->nodes_len; ++i)
 593       free_token (dfa->nodes + i);
 594   re_free (dfa->nexts);
 595   for (i = 0; i < dfa->nodes_len; ++i)
 596     {
 597       if (dfa->eclosures != NULL)
 598         re_node_set_free (dfa->eclosures + i);
 599       if (dfa->inveclosures != NULL)
 600         re_node_set_free (dfa->inveclosures + i);
 601       if (dfa->edests != NULL)
 602         re_node_set_free (dfa->edests + i);
 603     }
 604   re_free (dfa->edests);
 605   re_free (dfa->eclosures);
 606   re_free (dfa->inveclosures);
 607   re_free (dfa->nodes);
 608
 609   if (dfa->state_table)
 610     for (i = 0; i <= dfa->state_hash_mask; ++i)
 611       {
 612         struct re_state_table_entry *entry = dfa->state_table + i;
 613         for (j = 0; j < entry->num; ++j)
 614           {
 615             re_dfastate_t *state = entry->array[j];
 616             free_state (state);
 617           }
 618         re_free (entry->array);
 619       }
 620   re_free (dfa->state_table);
 621 #ifdef RE_ENABLE_I18N
 622   if (dfa->sb_char != utf8_sb_map)
 623     re_free (dfa->sb_char);
 624 #endif
 625   re_free (dfa->subexp_map);
 626 #ifdef DEBUG
 627   re_free (dfa->re_str);
 628 #endif
 629
 630   re_free (dfa);
 631 }
 632
 633
 634 /* Free dynamically allocated space used by PREG.  */
 635
 636 void
 637 regfree (preg)
 638     regex_t *preg;
 639 {
 640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 641   if (BE (dfa != NULL, 1))
 642     free_dfa_content (dfa);
 643   preg->buffer = NULL;
 644   preg->allocated = 0;
 645
 646   re_free (preg->fastmap);
 647   preg->fastmap = NULL;
 648
 649   re_free (preg->translate);
 650   preg->translate = NULL;
 651 }
 652 #ifdef _LIBC
 653 weak_alias (__regfree, regfree)
 654 #endif
 655 \f
 656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 657    them unless specifically requested.  */
 658
 659 #if defined _REGEX_RE_COMP || defined _LIBC
 660
 661 /* BSD has one and only one pattern buffer.  */
 662 static struct re_pattern_buffer re_comp_buf;
 663
 664 char *
 665 # ifdef _LIBC
 666 /* Make these definitions weak in libc, so POSIX programs can redefine
 667    these names if they don't use our functions, and still use
 668    regcomp/regexec above without link errors.  */
 669 weak_function
 670 # endif
 671 re_comp (s)
 672      const char *s;
 673 {
 674   reg_errcode_t ret;
 675   char *fastmap;
 676
 677   if (!s)
 678     {
 679       if (!re_comp_buf.buffer)
 680         return gettext ("No previous regular expression");
 681       return 0;
 682     }
 683
 684   if (re_comp_buf.buffer)
 685     {
 686       fastmap = re_comp_buf.fastmap;
 687       re_comp_buf.fastmap = NULL;
 688       __regfree (&re_comp_buf);
 689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 690       re_comp_buf.fastmap = fastmap;
 691     }
 692
 693   if (re_comp_buf.fastmap == NULL)
 694     {
 695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 696       if (re_comp_buf.fastmap == NULL)
 697         return (char *) gettext (__re_error_msgid
 698                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 699     }
 700
 701   /* Since `re_exec' always passes NULL for the `regs' argument, we
 702      don't need to initialize the pattern buffer fields which affect it.  */
 703
 704   /* Match anchors at newlines.  */
 705   re_comp_buf.newline_anchor = 1;
 706
 707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 708
 709   if (!ret)
 710     return NULL;
 711
 712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 714 }
 715
 716 #ifdef _LIBC
 717 libc_freeres_fn (free_mem)
 718 {
 719   __regfree (&re_comp_buf);
 720 }
 721 #endif
 722
 723 #endif /* _REGEX_RE_COMP */
 724 \f
 725 /* Internal entry point.
 726    Compile the regular expression PATTERN, whose length is LENGTH.
 727    SYNTAX indicate regular expression's syntax.  */
 728
 729 static reg_errcode_t
 730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 731                      reg_syntax_t syntax)
 732 {
 733   reg_errcode_t err = REG_NOERROR;
 734   re_dfa_t *dfa;
 735   re_string_t regexp;
 736
 737   /* Initialize the pattern buffer.  */
 738   preg->fastmap_accurate = 0;
 739   preg->syntax = syntax;
 740   preg->not_bol = preg->not_eol = 0;
 741   preg->used = 0;
 742   preg->re_nsub = 0;
 743   preg->can_be_null = 0;
 744   preg->regs_allocated = REGS_UNALLOCATED;
 745
 746   /* Initialize the dfa.  */
 747   dfa = (re_dfa_t *) preg->buffer;
 748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 749     {
 750       /* If zero allocated, but buffer is non-null, try to realloc
 751          enough space.  This loses if buffer's address is bogus, but
 752          that is the user's responsibility.  If ->buffer is NULL this
 753          is a simple allocation.  */
 754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 755       if (dfa == NULL)
 756         return REG_ESPACE;
 757       preg->allocated = sizeof (re_dfa_t);
 758       preg->buffer = (unsigned char *) dfa;
 759     }
 760   preg->used = sizeof (re_dfa_t);
 761
 762   err = init_dfa (dfa, length);
 763   if (BE (err != REG_NOERROR, 0))
 764     {
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770 #ifdef DEBUG
 771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 772   dfa->re_str = re_malloc (char, length + 1);
 773   strncpy (dfa->re_str, pattern, length + 1);
 774 #endif
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = re_string_construct (&regexp, pattern, length, preg->translate,
 779                              syntax & RE_ICASE, dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     {
 782     re_compile_internal_free_return:
 783       free_workarea_compile (preg);
 784       re_string_destruct (&regexp);
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790
 791   /* Parse the regular expression, and build a structure tree.  */
 792   preg->re_nsub = 0;
 793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 794   if (BE (dfa->str_tree == NULL, 0))
 795     goto re_compile_internal_free_return;
 796
 797   /* Analyze the tree and create the nfa.  */
 798   err = analyze (preg);
 799   if (BE (err != REG_NOERROR, 0))
 800     goto re_compile_internal_free_return;
 801
 802 #ifdef RE_ENABLE_I18N
 803   /* If possible, do searching in single byte encoding to speed things up.  */
 804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 805     optimize_utf8 (dfa);
 806 #endif
 807
 808   /* Then create the initial state of the dfa.  */
 809   err = create_initial_state (dfa);
 810
 811   /* Release work areas.  */
 812   free_workarea_compile (preg);
 813   re_string_destruct (&regexp);
 814
 815   if (BE (err != REG_NOERROR, 0))
 816     {
 817       free_dfa_content (dfa);
 818       preg->buffer = NULL;
 819       preg->allocated = 0;
 820     }
 821
 822   return err;
 823 }
 824
 825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 826    as the initial length of some arrays.  */
 827
 828 static reg_errcode_t
 829 init_dfa (re_dfa_t *dfa, size_t pat_len)
 830 {
 831   __re_size_t table_size;
 832 #ifndef _LIBC
 833   char *codeset_name;
 834 #endif
 835 #ifdef RE_ENABLE_I18N
 836   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 837 #else
 838   size_t max_i18n_object_size = 0;
 839 #endif
 840   size_t max_object_size =
 841     MAX (sizeof (struct re_state_table_entry),
 842          MAX (sizeof (re_token_t),
 843               MAX (sizeof (re_node_set),
 844                    MAX (sizeof (regmatch_t),
 845                         max_i18n_object_size))));
 846
 847   memset (dfa, '\0', sizeof (re_dfa_t));
 848
 849   /* Force allocation of str_tree_storage the first time.  */
 850   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 851
 852   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 853      calculation below, and for similar doubling calculations
 854      elsewhere.  And it's <= rather than <, because some of the
 855      doubling calculations add 1 afterwards.  */
 856   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 857     return REG_ESPACE;
 858
 859   dfa->nodes_alloc = pat_len + 1;
 860   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 861
 862   /*  table_size = 2 ^ ceil(log pat_len) */
 863   for (table_size = 1; ; table_size <<= 1)
 864     if (table_size > pat_len)
 865       break;
 866
 867   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 868   dfa->state_hash_mask = table_size - 1;
 869
 870   dfa->mb_cur_max = MB_CUR_MAX;
 871 #ifdef _LIBC
 872   if (dfa->mb_cur_max == 6
 873       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 874     dfa->is_utf8 = 1;
 875   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 876                        != 0);
 877 #else
 878 # ifdef HAVE_LANGINFO_CODESET
 879   codeset_name = nl_langinfo (CODESET);
 880 # else
 881   codeset_name = getenv ("LC_ALL");
 882   if (codeset_name == NULL || codeset_name[0] == '\0')
 883     codeset_name = getenv ("LC_CTYPE");
 884   if (codeset_name == NULL || codeset_name[0] == '\0')
 885     codeset_name = getenv ("LANG");
 886   if (codeset_name == NULL)
 887     codeset_name = "";
 888   else if (strchr (codeset_name, '.') !=  NULL)
 889     codeset_name = strchr (codeset_name, '.') + 1;
 890 # endif
 891
 892   if (strcasecmp (codeset_name, "UTF-8") == 0
 893       || strcasecmp (codeset_name, "UTF8") == 0)
 894     dfa->is_utf8 = 1;
 895
 896   /* We check exhaustively in the loop below if this charset is a
 897      superset of ASCII.  */
 898   dfa->map_notascii = 0;
 899 #endif
 900
 901 #ifdef RE_ENABLE_I18N
 902   if (dfa->mb_cur_max > 1)
 903     {
 904       if (dfa->is_utf8)
 905         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 906       else
 907         {
 908           int i, j, ch;
 909
 910           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 911           if (BE (dfa->sb_char == NULL, 0))
 912             return REG_ESPACE;
 913
 914           /* Set the bits corresponding to single byte chars.  */
 915           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 916             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 917               {
 918                 wint_t wch = __btowc (ch);
 919                 if (wch != WEOF)
 920                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 921 # ifndef _LIBC
 922                 if (isascii (ch) && wch != ch)
 923                   dfa->map_notascii = 1;
 924 # endif
 925               }
 926         }
 927     }
 928 #endif
 929
 930   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 931     return REG_ESPACE;
 932   return REG_NOERROR;
 933 }
 934
 935 /* Initialize WORD_CHAR table, which indicate which character is
 936    "word".  In this case "word" means that it is the word construction
 937    character used by some operators like "\<", "\>", etc.  */
 938
 939 static void
 940 internal_function
 941 init_word_char (re_dfa_t *dfa)
 942 {
 943   int i, j, ch;
 944   dfa->word_ops_used = 1;
 945   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 946     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 947       if (isalnum (ch) || ch == '_')
 948         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 949 }
 950
 951 /* Free the work area which are only used while compiling.  */
 952
 953 static void
 954 free_workarea_compile (regex_t *preg)
 955 {
 956   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 957   bin_tree_storage_t *storage, *next;
 958   for (storage = dfa->str_tree_storage; storage; storage = next)
 959     {
 960       next = storage->next;
 961       re_free (storage);
 962     }
 963   dfa->str_tree_storage = NULL;
 964   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 965   dfa->str_tree = NULL;
 966   re_free (dfa->org_indices);
 967   dfa->org_indices = NULL;
 968 }
 969
 970 /* Create initial states for all contexts.  */
 971
 972 static reg_errcode_t
 973 create_initial_state (re_dfa_t *dfa)
 974 {
 975   Idx first, i;
 976   reg_errcode_t err;
 977   re_node_set init_nodes;
 978
 979   /* Initial states have the epsilon closure of the node which is
 980      the first node of the regular expression.  */
 981   first = dfa->str_tree->first->node_idx;
 982   dfa->init_node = first;
 983   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 984   if (BE (err != REG_NOERROR, 0))
 985     return err;
 986
 987   /* The back-references which are in initial states can epsilon transit,
 988      since in this case all of the subexpressions can be null.
 989      Then we add epsilon closures of the nodes which are the next nodes of
 990      the back-references.  */
 991   if (dfa->nbackref > 0)
 992     for (i = 0; i < init_nodes.nelem; ++i)
 993       {
 994         Idx node_idx = init_nodes.elems[i];
 995         re_token_type_t type = dfa->nodes[node_idx].type;
 996
 997         Idx clexp_idx;
 998         if (type != OP_BACK_REF)
 999           continue;
1000         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1001           {
1002             re_token_t *clexp_node;
1003             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1004             if (clexp_node->type == OP_CLOSE_SUBEXP
1005                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1006               break;
1007           }
1008         if (clexp_idx == init_nodes.nelem)
1009           continue;
1010
1011         if (type == OP_BACK_REF)
1012           {
1013             Idx dest_idx = dfa->edests[node_idx].elems[0];
1014             if (!re_node_set_contains (&init_nodes, dest_idx))
1015               {
1016                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1017                 i = 0;
1018               }
1019           }
1020       }
1021
1022   /* It must be the first time to invoke acquire_state.  */
1023   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1024   /* We don't check ERR here, since the initial state must not be NULL.  */
1025   if (BE (dfa->init_state == NULL, 0))
1026     return err;
1027   if (dfa->init_state->has_constraint)
1028     {
1029       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1030                                                        CONTEXT_WORD);
1031       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1032                                                      CONTEXT_NEWLINE);
1033       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1034                                                          &init_nodes,
1035                                                          CONTEXT_NEWLINE
1036                                                          | CONTEXT_BEGBUF);
1037       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1038               || dfa->init_state_begbuf == NULL, 0))
1039         return err;
1040     }
1041   else
1042     dfa->init_state_word = dfa->init_state_nl
1043       = dfa->init_state_begbuf = dfa->init_state;
1044
1045   re_node_set_free (&init_nodes);
1046   return REG_NOERROR;
1047 }
1048 \f
1049 #ifdef RE_ENABLE_I18N
1050 /* If it is possible to do searching in single byte encoding instead of UTF-8
1051    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1052    DFA nodes where needed.  */
1053
1054 static void
1055 optimize_utf8 (re_dfa_t *dfa)
1056 {
1057   Idx node;
1058   int i;
1059   bool mb_chars = false;
1060   bool has_period = false;
1061
1062   for (node = 0; node < dfa->nodes_len; ++node)
1063     switch (dfa->nodes[node].type)
1064       {
1065       case CHARACTER:
1066         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1067           mb_chars = true;
1068         break;
1069       case ANCHOR:
1070         switch (dfa->nodes[node].opr.idx)
1071           {
1072           case LINE_FIRST:
1073           case LINE_LAST:
1074           case BUF_FIRST:
1075           case BUF_LAST:
1076             break;
1077           default:
1078             /* Word anchors etc. cannot be handled.  */
1079             return;
1080           }
1081         break;
1082       case OP_PERIOD:
1083         has_period = true;
1084         break;
1085       case OP_BACK_REF:
1086       case OP_ALT:
1087       case END_OF_RE:
1088       case OP_DUP_ASTERISK:
1089       case OP_OPEN_SUBEXP:
1090       case OP_CLOSE_SUBEXP:
1091         break;
1092       case COMPLEX_BRACKET:
1093         return;
1094       case SIMPLE_BRACKET:
1095         /* Just double check.  */
1096         {
1097           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1098                         ? 0
1099                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1100           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1101             {
1102               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1103                 return;
1104               rshift = 0;
1105             }
1106         }
1107         break;
1108       default:
1109         abort ();
1110       }
1111
1112   if (mb_chars || has_period)
1113     for (node = 0; node < dfa->nodes_len; ++node)
1114       {
1115         if (dfa->nodes[node].type == CHARACTER
1116             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1117           dfa->nodes[node].mb_partial = 0;
1118         else if (dfa->nodes[node].type == OP_PERIOD)
1119           dfa->nodes[node].type = OP_UTF8_PERIOD;
1120       }
1121
1122   /* The search can be in single byte locale.  */
1123   dfa->mb_cur_max = 1;
1124   dfa->is_utf8 = 0;
1125   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1126 }
1127 #endif
1128 \f
1129 /* Analyze the structure tree, and calculate "first", "next", "edest",
1130    "eclosure", and "inveclosure".  */
1131
1132 static reg_errcode_t
1133 analyze (regex_t *preg)
1134 {
1135   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1136   reg_errcode_t ret;
1137
1138   /* Allocate arrays.  */
1139   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1140   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1141   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1142   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1143   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1144           || dfa->eclosures == NULL, 0))
1145     return REG_ESPACE;
1146
1147   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1148   if (dfa->subexp_map != NULL)
1149     {
1150       Idx i;
1151       for (i = 0; i < preg->re_nsub; i++)
1152         dfa->subexp_map[i] = i;
1153       preorder (dfa->str_tree, optimize_subexps, dfa);
1154       for (i = 0; i < preg->re_nsub; i++)
1155         if (dfa->subexp_map[i] != i)
1156           break;
1157       if (i == preg->re_nsub)
1158         {
1159           free (dfa->subexp_map);
1160           dfa->subexp_map = NULL;
1161         }
1162     }
1163
1164   ret = postorder (dfa->str_tree, lower_subexps, preg);
1165   if (BE (ret != REG_NOERROR, 0))
1166     return ret;
1167   ret = postorder (dfa->str_tree, calc_first, dfa);
1168   if (BE (ret != REG_NOERROR, 0))
1169     return ret;
1170   preorder (dfa->str_tree, calc_next, dfa);
1171   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1172   if (BE (ret != REG_NOERROR, 0))
1173     return ret;
1174   ret = calc_eclosure (dfa);
1175   if (BE (ret != REG_NOERROR, 0))
1176     return ret;
1177
1178   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1179      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1180   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1181       || dfa->nbackref)
1182     {
1183       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1184       if (BE (dfa->inveclosures == NULL, 0))
1185         return REG_ESPACE;
1186       ret = calc_inveclosure (dfa);
1187     }
1188
1189   return ret;
1190 }
1191
1192 /* Our parse trees are very unbalanced, so we cannot use a stack to
1193    implement parse tree visits.  Instead, we use parent pointers and
1194    some hairy code in these two functions.  */
1195 static reg_errcode_t
1196 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1197            void *extra)
1198 {
1199   bin_tree_t *node, *prev;
1200
1201   for (node = root; ; )
1202     {
1203       /* Descend down the tree, preferably to the left (or to the right
1204          if that's the only child).  */
1205       while (node->left || node->right)
1206         if (node->left)
1207           node = node->left;
1208         else
1209           node = node->right;
1210
1211       do
1212         {
1213           reg_errcode_t err = fn (extra, node);
1214           if (BE (err != REG_NOERROR, 0))
1215             return err;
1216           if (node->parent == NULL)
1217             return REG_NOERROR;
1218           prev = node;
1219           node = node->parent;
1220         }
1221       /* Go up while we have a node that is reached from the right.  */
1222       while (node->right == prev || node->right == NULL);
1223       node = node->right;
1224     }
1225 }
1226
1227 static reg_errcode_t
1228 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1229           void *extra)
1230 {
1231   bin_tree_t *node;
1232
1233   for (node = root; ; )
1234     {
1235       reg_errcode_t err = fn (extra, node);
1236       if (BE (err != REG_NOERROR, 0))
1237         return err;
1238
1239       /* Go to the left node, or up and to the right.  */
1240       if (node->left)
1241         node = node->left;
1242       else
1243         {
1244           bin_tree_t *prev = NULL;
1245           while (node->right == prev || node->right == NULL)
1246             {
1247               prev = node;
1248               node = node->parent;
1249               if (!node)
1250                 return REG_NOERROR;
1251             }
1252           node = node->right;
1253         }
1254     }
1255 }
1256
1257 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1258    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1259    backreferences as well.  Requires a preorder visit.  */
1260 static reg_errcode_t
1261 optimize_subexps (void *extra, bin_tree_t *node)
1262 {
1263   re_dfa_t *dfa = (re_dfa_t *) extra;
1264
1265   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1266     {
1267       int idx = node->token.opr.idx;
1268       node->token.opr.idx = dfa->subexp_map[idx];
1269       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270     }
1271
1272   else if (node->token.type == SUBEXP
1273            && node->left && node->left->token.type == SUBEXP)
1274     {
1275       Idx other_idx = node->left->token.opr.idx;
1276
1277       node->left = node->left->left;
1278       if (node->left)
1279         node->left->parent = node;
1280
1281       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1282       if (other_idx < BITSET_WORD_BITS)
1283         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1284     }
1285
1286   return REG_NOERROR;
1287 }
1288
1289 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1290    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1291 static reg_errcode_t
1292 lower_subexps (void *extra, bin_tree_t *node)
1293 {
1294   regex_t *preg = (regex_t *) extra;
1295   reg_errcode_t err = REG_NOERROR;
1296
1297   if (node->left && node->left->token.type == SUBEXP)
1298     {
1299       node->left = lower_subexp (&err, preg, node->left);
1300       if (node->left)
1301         node->left->parent = node;
1302     }
1303   if (node->right && node->right->token.type == SUBEXP)
1304     {
1305       node->right = lower_subexp (&err, preg, node->right);
1306       if (node->right)
1307         node->right->parent = node;
1308     }
1309
1310   return err;
1311 }
1312
1313 static bin_tree_t *
1314 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1315 {
1316   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1317   bin_tree_t *body = node->left;
1318   bin_tree_t *op, *cls, *tree1, *tree;
1319
1320   if (preg->no_sub
1321       /* We do not optimize empty subexpressions, because otherwise we may
1322          have bad CONCAT nodes with NULL children.  This is obviously not
1323          very common, so we do not lose much.  An example that triggers
1324          this case is the sed "script" /\(\)/x.  */
1325       && node->left != NULL
1326       && (node->token.opr.idx >= BITSET_WORD_BITS
1327           || !(dfa->used_bkref_map
1328                & ((bitset_word_t) 1 << node->token.opr.idx))))
1329     return node->left;
1330
1331   /* Convert the SUBEXP node to the concatenation of an
1332      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1333   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1334   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1335   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1336   tree = create_tree (dfa, op, tree1, CONCAT);
1337   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1338     {
1339       *err = REG_ESPACE;
1340       return NULL;
1341     }
1342
1343   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1344   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1345   return tree;
1346 }
1347
1348 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1349    nodes.  Requires a postorder visit.  */
1350 static reg_errcode_t
1351 calc_first (void *extra, bin_tree_t *node)
1352 {
1353   re_dfa_t *dfa = (re_dfa_t *) extra;
1354   if (node->token.type == CONCAT)
1355     {
1356       node->first = node->left->first;
1357       node->node_idx = node->left->node_idx;
1358     }
1359   else
1360     {
1361       node->first = node;
1362       node->node_idx = re_dfa_add_node (dfa, node->token);
1363       if (BE (node->node_idx == REG_MISSING, 0))
1364         return REG_ESPACE;
1365     }
1366   return REG_NOERROR;
1367 }
1368
1369 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1370 static reg_errcode_t
1371 calc_next (void *extra, bin_tree_t *node)
1372 {
1373   switch (node->token.type)
1374     {
1375     case OP_DUP_ASTERISK:
1376       node->left->next = node;
1377       break;
1378     case CONCAT:
1379       node->left->next = node->right->first;
1380       node->right->next = node->next;
1381       break;
1382     default:
1383       if (node->left)
1384         node->left->next = node->next;
1385       if (node->right)
1386         node->right->next = node->next;
1387       break;
1388     }
1389   return REG_NOERROR;
1390 }
1391
1392 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1393 static reg_errcode_t
1394 link_nfa_nodes (void *extra, bin_tree_t *node)
1395 {
1396   re_dfa_t *dfa = (re_dfa_t *) extra;
1397   Idx idx = node->node_idx;
1398   reg_errcode_t err = REG_NOERROR;
1399
1400   switch (node->token.type)
1401     {
1402     case CONCAT:
1403       break;
1404
1405     case END_OF_RE:
1406       assert (node->next == NULL);
1407       break;
1408
1409     case OP_DUP_ASTERISK:
1410     case OP_ALT:
1411       {
1412         Idx left, right;
1413         dfa->has_plural_match = 1;
1414         if (node->left != NULL)
1415           left = node->left->first->node_idx;
1416         else
1417           left = node->next->node_idx;
1418         if (node->right != NULL)
1419           right = node->right->first->node_idx;
1420         else
1421           right = node->next->node_idx;
1422         assert (REG_VALID_INDEX (left));
1423         assert (REG_VALID_INDEX (right));
1424         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1425       }
1426       break;
1427
1428     case ANCHOR:
1429     case OP_OPEN_SUBEXP:
1430     case OP_CLOSE_SUBEXP:
1431       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1432       break;
1433
1434     case OP_BACK_REF:
1435       dfa->nexts[idx] = node->next->node_idx;
1436       if (node->token.type == OP_BACK_REF)
1437         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1438       break;
1439
1440     default:
1441       assert (!IS_EPSILON_NODE (node->token.type));
1442       dfa->nexts[idx] = node->next->node_idx;
1443       break;
1444     }
1445
1446   return err;
1447 }
1448
1449 /* Duplicate the epsilon closure of the node ROOT_NODE.
1450    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1451    to their own constraint.  */
1452
1453 static reg_errcode_t
1454 internal_function
1455 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1456                         Idx root_node, unsigned int init_constraint)
1457 {
1458   Idx org_node, clone_node;
1459   bool ok;
1460   unsigned int constraint = init_constraint;
1461   for (org_node = top_org_node, clone_node = top_clone_node;;)
1462     {
1463       Idx org_dest, clone_dest;
1464       if (dfa->nodes[org_node].type == OP_BACK_REF)
1465         {
1466           /* If the back reference epsilon-transit, its destination must
1467              also have the constraint.  Then duplicate the epsilon closure
1468              of the destination of the back reference, and store it in
1469              edests of the back reference.  */
1470           org_dest = dfa->nexts[org_node];
1471           re_node_set_empty (dfa->edests + clone_node);
1472           clone_dest = duplicate_node (dfa, org_dest, constraint);
1473           if (BE (clone_dest == REG_MISSING, 0))
1474             return REG_ESPACE;
1475           dfa->nexts[clone_node] = dfa->nexts[org_node];
1476           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1477           if (BE (! ok, 0))
1478             return REG_ESPACE;
1479         }
1480       else if (dfa->edests[org_node].nelem == 0)
1481         {
1482           /* In case of the node can't epsilon-transit, don't duplicate the
1483              destination and store the original destination as the
1484              destination of the node.  */
1485           dfa->nexts[clone_node] = dfa->nexts[org_node];
1486           break;
1487         }
1488       else if (dfa->edests[org_node].nelem == 1)
1489         {
1490           /* In case of the node can epsilon-transit, and it has only one
1491              destination.  */
1492           org_dest = dfa->edests[org_node].elems[0];
1493           re_node_set_empty (dfa->edests + clone_node);
1494           if (dfa->nodes[org_node].type == ANCHOR)
1495             {
1496               /* In case of the node has another constraint, append it.  */
1497               if (org_node == root_node && clone_node != org_node)
1498                 {
1499                   /* ...but if the node is root_node itself, it means the
1500                      epsilon closure have a loop, then tie it to the
1501                      destination of the root_node.  */
1502                   ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1503                   if (BE (! ok, 0))
1504                     return REG_ESPACE;
1505                   break;
1506                 }
1507               constraint |= dfa->nodes[org_node].opr.ctx_type;
1508             }
1509           clone_dest = duplicate_node (dfa, org_dest, constraint);
1510           if (BE (clone_dest == REG_MISSING, 0))
1511             return REG_ESPACE;
1512           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513           if (BE (! ok, 0))
1514             return REG_ESPACE;
1515         }
1516       else /* dfa->edests[org_node].nelem == 2 */
1517         {
1518           /* In case of the node can epsilon-transit, and it has two
1519              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1520           org_dest = dfa->edests[org_node].elems[0];
1521           re_node_set_empty (dfa->edests + clone_node);
1522           /* Search for a duplicated node which satisfies the constraint.  */
1523           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1524           if (clone_dest == REG_MISSING)
1525             {
1526               /* There are no such a duplicated node, create a new one.  */
1527               reg_errcode_t err;
1528               clone_dest = duplicate_node (dfa, org_dest, constraint);
1529               if (BE (clone_dest == REG_MISSING, 0))
1530                 return REG_ESPACE;
1531               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1532               if (BE (! ok, 0))
1533                 return REG_ESPACE;
1534               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1535                                             root_node, constraint);
1536               if (BE (err != REG_NOERROR, 0))
1537                 return err;
1538             }
1539           else
1540             {
1541               /* There are a duplicated node which satisfy the constraint,
1542                  use it to avoid infinite loop.  */
1543               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (! ok, 0))
1545                 return REG_ESPACE;
1546             }
1547
1548           org_dest = dfa->edests[org_node].elems[1];
1549           clone_dest = duplicate_node (dfa, org_dest, constraint);
1550           if (BE (clone_dest == REG_MISSING, 0))
1551             return REG_ESPACE;
1552           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553           if (BE (! ok, 0))
1554             return REG_ESPACE;
1555         }
1556       org_node = org_dest;
1557       clone_node = clone_dest;
1558     }
1559   return REG_NOERROR;
1560 }
1561
1562 /* Search for a node which is duplicated from the node ORG_NODE, and
1563    satisfies the constraint CONSTRAINT.  */
1564
1565 static Idx
1566 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1567                         unsigned int constraint)
1568 {
1569   Idx idx;
1570   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1571     {
1572       if (org_node == dfa->org_indices[idx]
1573           && constraint == dfa->nodes[idx].constraint)
1574         return idx; /* Found.  */
1575     }
1576   return REG_MISSING; /* Not found.  */
1577 }
1578
1579 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1580    Return the index of the new node, or REG_MISSING if insufficient storage is
1581    available.  */
1582
1583 static Idx
1584 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1585 {
1586   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1587   if (BE (dup_idx != REG_MISSING, 1))
1588     {
1589       dfa->nodes[dup_idx].constraint = constraint;
1590       if (dfa->nodes[org_idx].type == ANCHOR)
1591         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1592       dfa->nodes[dup_idx].duplicated = 1;
1593
1594       /* Store the index of the original node.  */
1595       dfa->org_indices[dup_idx] = org_idx;
1596     }
1597   return dup_idx;
1598 }
1599
1600 static reg_errcode_t
1601 calc_inveclosure (re_dfa_t *dfa)
1602 {
1603   Idx src, idx;
1604   bool ok;
1605   for (idx = 0; idx < dfa->nodes_len; ++idx)
1606     re_node_set_init_empty (dfa->inveclosures + idx);
1607
1608   for (src = 0; src < dfa->nodes_len; ++src)
1609     {
1610       Idx *elems = dfa->eclosures[src].elems;
1611       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1612         {
1613           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1614           if (BE (! ok, 0))
1615             return REG_ESPACE;
1616         }
1617     }
1618
1619   return REG_NOERROR;
1620 }
1621
1622 /* Calculate "eclosure" for all the node in DFA.  */
1623
1624 static reg_errcode_t
1625 calc_eclosure (re_dfa_t *dfa)
1626 {
1627   Idx node_idx;
1628   bool incomplete;
1629 #ifdef DEBUG
1630   assert (dfa->nodes_len > 0);
1631 #endif
1632   incomplete = false;
1633   /* For each nodes, calculate epsilon closure.  */
1634   for (node_idx = 0; ; ++node_idx)
1635     {
1636       reg_errcode_t err;
1637       re_node_set eclosure_elem;
1638       if (node_idx == dfa->nodes_len)
1639         {
1640           if (!incomplete)
1641             break;
1642           incomplete = false;
1643           node_idx = 0;
1644         }
1645
1646 #ifdef DEBUG
1647       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1648 #endif
1649
1650       /* If we have already calculated, skip it.  */
1651       if (dfa->eclosures[node_idx].nelem != 0)
1652         continue;
1653       /* Calculate epsilon closure of `node_idx'.  */
1654       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1655       if (BE (err != REG_NOERROR, 0))
1656         return err;
1657
1658       if (dfa->eclosures[node_idx].nelem == 0)
1659         {
1660           incomplete = true;
1661           re_node_set_free (&eclosure_elem);
1662         }
1663     }
1664   return REG_NOERROR;
1665 }
1666
1667 /* Calculate epsilon closure of NODE.  */
1668
1669 static reg_errcode_t
1670 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1671 {
1672   reg_errcode_t err;
1673   unsigned int constraint;
1674   Idx i;
1675   bool incomplete;
1676   bool ok;
1677   re_node_set eclosure;
1678   incomplete = false;
1679   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1680   if (BE (err != REG_NOERROR, 0))
1681     return err;
1682
1683   /* This indicates that we are calculating this node now.
1684      We reference this value to avoid infinite loop.  */
1685   dfa->eclosures[node].nelem = REG_MISSING;
1686
1687   constraint = ((dfa->nodes[node].type == ANCHOR)
1688                 ? dfa->nodes[node].opr.ctx_type : 0);
1689   /* If the current node has constraints, duplicate all nodes.
1690      Since they must inherit the constraints.  */
1691   if (constraint
1692       && dfa->edests[node].nelem
1693       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1694     {
1695       err = duplicate_node_closure (dfa, node, node, node, constraint);
1696       if (BE (err != REG_NOERROR, 0))
1697         return err;
1698     }
1699
1700   /* Expand each epsilon destination nodes.  */
1701   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1702     for (i = 0; i < dfa->edests[node].nelem; ++i)
1703       {
1704         re_node_set eclosure_elem;
1705         Idx edest = dfa->edests[node].elems[i];
1706         /* If calculating the epsilon closure of `edest' is in progress,
1707            return intermediate result.  */
1708         if (dfa->eclosures[edest].nelem == REG_MISSING)
1709           {
1710             incomplete = true;
1711             continue;
1712           }
1713         /* If we haven't calculated the epsilon closure of `edest' yet,
1714            calculate now. Otherwise use calculated epsilon closure.  */
1715         if (dfa->eclosures[edest].nelem == 0)
1716           {
1717             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1718             if (BE (err != REG_NOERROR, 0))
1719               return err;
1720           }
1721         else
1722           eclosure_elem = dfa->eclosures[edest];
1723         /* Merge the epsilon closure of `edest'.  */
1724         re_node_set_merge (&eclosure, &eclosure_elem);
1725         /* If the epsilon closure of `edest' is incomplete,
1726            the epsilon closure of this node is also incomplete.  */
1727         if (dfa->eclosures[edest].nelem == 0)
1728           {
1729             incomplete = true;
1730             re_node_set_free (&eclosure_elem);
1731           }
1732       }
1733
1734   /* Epsilon closures include itself.  */
1735   ok = re_node_set_insert (&eclosure, node);
1736   if (BE (! ok, 0))
1737     return REG_ESPACE;
1738   if (incomplete && !root)
1739     dfa->eclosures[node].nelem = 0;
1740   else
1741     dfa->eclosures[node] = eclosure;
1742   *new_set = eclosure;
1743   return REG_NOERROR;
1744 }
1745 \f
1746 /* Functions for token which are used in the parser.  */
1747
1748 /* Fetch a token from INPUT.
1749    We must not use this function inside bracket expressions.  */
1750
1751 static void
1752 internal_function
1753 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1754 {
1755   re_string_skip_bytes (input, peek_token (result, input, syntax));
1756 }
1757
1758 /* Peek a token from INPUT, and return the length of the token.
1759    We must not use this function inside bracket expressions.  */
1760
1761 static int
1762 internal_function
1763 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   unsigned char c;
1766
1767   if (re_string_eoi (input))
1768     {
1769       token->type = END_OF_RE;
1770       return 0;
1771     }
1772
1773   c = re_string_peek_byte (input, 0);
1774   token->opr.c = c;
1775
1776   token->word_char = 0;
1777 #ifdef RE_ENABLE_I18N
1778   token->mb_partial = 0;
1779   if (input->mb_cur_max > 1 &&
1780       !re_string_first_byte (input, re_string_cur_idx (input)))
1781     {
1782       token->type = CHARACTER;
1783       token->mb_partial = 1;
1784       return 1;
1785     }
1786 #endif
1787   if (c == '\\')
1788     {
1789       unsigned char c2;
1790       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1791         {
1792           token->type = BACK_SLASH;
1793           return 1;
1794         }
1795
1796       c2 = re_string_peek_byte_case (input, 1);
1797       token->opr.c = c2;
1798       token->type = CHARACTER;
1799 #ifdef RE_ENABLE_I18N
1800       if (input->mb_cur_max > 1)
1801         {
1802           wint_t wc = re_string_wchar_at (input,
1803                                           re_string_cur_idx (input) + 1);
1804           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1805         }
1806       else
1807 #endif
1808         token->word_char = IS_WORD_CHAR (c2) != 0;
1809
1810       switch (c2)
1811         {
1812         case '|':
1813           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1814             token->type = OP_ALT;
1815           break;
1816         case '1': case '2': case '3': case '4': case '5':
1817         case '6': case '7': case '8': case '9':
1818           if (!(syntax & RE_NO_BK_REFS))
1819             {
1820               token->type = OP_BACK_REF;
1821               token->opr.idx = c2 - '1';
1822             }
1823           break;
1824         case '<':
1825           if (!(syntax & RE_NO_GNU_OPS))
1826             {
1827               token->type = ANCHOR;
1828               token->opr.ctx_type = WORD_FIRST;
1829             }
1830           break;
1831         case '>':
1832           if (!(syntax & RE_NO_GNU_OPS))
1833             {
1834               token->type = ANCHOR;
1835               token->opr.ctx_type = WORD_LAST;
1836             }
1837           break;
1838         case 'b':
1839           if (!(syntax & RE_NO_GNU_OPS))
1840             {
1841               token->type = ANCHOR;
1842               token->opr.ctx_type = WORD_DELIM;
1843             }
1844           break;
1845         case 'B':
1846           if (!(syntax & RE_NO_GNU_OPS))
1847             {
1848               token->type = ANCHOR;
1849               token->opr.ctx_type = NOT_WORD_DELIM;
1850             }
1851           break;
1852         case 'w':
1853           if (!(syntax & RE_NO_GNU_OPS))
1854             token->type = OP_WORD;
1855           break;
1856         case 'W':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             token->type = OP_NOTWORD;
1859           break;
1860         case 's':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_SPACE;
1863           break;
1864         case 'S':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_NOTSPACE;
1867           break;
1868         case '`':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             {
1871               token->type = ANCHOR;
1872               token->opr.ctx_type = BUF_FIRST;
1873             }
1874           break;
1875         case '\'':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             {
1878               token->type = ANCHOR;
1879               token->opr.ctx_type = BUF_LAST;
1880             }
1881           break;
1882         case '(':
1883           if (!(syntax & RE_NO_BK_PARENS))
1884             token->type = OP_OPEN_SUBEXP;
1885           break;
1886         case ')':
1887           if (!(syntax & RE_NO_BK_PARENS))
1888             token->type = OP_CLOSE_SUBEXP;
1889           break;
1890         case '+':
1891           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892             token->type = OP_DUP_PLUS;
1893           break;
1894         case '?':
1895           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896             token->type = OP_DUP_QUESTION;
1897           break;
1898         case '{':
1899           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900             token->type = OP_OPEN_DUP_NUM;
1901           break;
1902         case '}':
1903           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904             token->type = OP_CLOSE_DUP_NUM;
1905           break;
1906         default:
1907           break;
1908         }
1909       return 2;
1910     }
1911
1912   token->type = CHARACTER;
1913 #ifdef RE_ENABLE_I18N
1914   if (input->mb_cur_max > 1)
1915     {
1916       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1917       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1918     }
1919   else
1920 #endif
1921     token->word_char = IS_WORD_CHAR (token->opr.c);
1922
1923   switch (c)
1924     {
1925     case '\n':
1926       if (syntax & RE_NEWLINE_ALT)
1927         token->type = OP_ALT;
1928       break;
1929     case '|':
1930       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1931         token->type = OP_ALT;
1932       break;
1933     case '*':
1934       token->type = OP_DUP_ASTERISK;
1935       break;
1936     case '+':
1937       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938         token->type = OP_DUP_PLUS;
1939       break;
1940     case '?':
1941       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942         token->type = OP_DUP_QUESTION;
1943       break;
1944     case '{':
1945       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946         token->type = OP_OPEN_DUP_NUM;
1947       break;
1948     case '}':
1949       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950         token->type = OP_CLOSE_DUP_NUM;
1951       break;
1952     case '(':
1953       if (syntax & RE_NO_BK_PARENS)
1954         token->type = OP_OPEN_SUBEXP;
1955       break;
1956     case ')':
1957       if (syntax & RE_NO_BK_PARENS)
1958         token->type = OP_CLOSE_SUBEXP;
1959       break;
1960     case '[':
1961       token->type = OP_OPEN_BRACKET;
1962       break;
1963     case '.':
1964       token->type = OP_PERIOD;
1965       break;
1966     case '^':
1967       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1968           re_string_cur_idx (input) != 0)
1969         {
1970           char prev = re_string_peek_byte (input, -1);
1971           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1972             break;
1973         }
1974       token->type = ANCHOR;
1975       token->opr.ctx_type = LINE_FIRST;
1976       break;
1977     case '$':
1978       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1979           re_string_cur_idx (input) + 1 != re_string_length (input))
1980         {
1981           re_token_t next;
1982           re_string_skip_bytes (input, 1);
1983           peek_token (&next, input, syntax);
1984           re_string_skip_bytes (input, -1);
1985           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1986             break;
1987         }
1988       token->type = ANCHOR;
1989       token->opr.ctx_type = LINE_LAST;
1990       break;
1991     default:
1992       break;
1993     }
1994   return 1;
1995 }
1996
1997 /* Peek a token from INPUT, and return the length of the token.
1998    We must not use this function out of bracket expressions.  */
1999
2000 static int
2001 internal_function
2002 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2003 {
2004   unsigned char c;
2005   if (re_string_eoi (input))
2006     {
2007       token->type = END_OF_RE;
2008       return 0;
2009     }
2010   c = re_string_peek_byte (input, 0);
2011   token->opr.c = c;
2012
2013 #ifdef RE_ENABLE_I18N
2014   if (input->mb_cur_max > 1 &&
2015       !re_string_first_byte (input, re_string_cur_idx (input)))
2016     {
2017       token->type = CHARACTER;
2018       return 1;
2019     }
2020 #endif /* RE_ENABLE_I18N */
2021
2022   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023       && re_string_cur_idx (input) + 1 < re_string_length (input))
2024     {
2025       /* In this case, '\' escape a character.  */
2026       unsigned char c2;
2027       re_string_skip_bytes (input, 1);
2028       c2 = re_string_peek_byte (input, 0);
2029       token->opr.c = c2;
2030       token->type = CHARACTER;
2031       return 1;
2032     }
2033   if (c == '[') /* '[' is a special char in a bracket exps.  */
2034     {
2035       unsigned char c2;
2036       int token_len;
2037       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038         c2 = re_string_peek_byte (input, 1);
2039       else
2040         c2 = 0;
2041       token->opr.c = c2;
2042       token_len = 2;
2043       switch (c2)
2044         {
2045         case '.':
2046           token->type = OP_OPEN_COLL_ELEM;
2047           break;
2048         case '=':
2049           token->type = OP_OPEN_EQUIV_CLASS;
2050           break;
2051         case ':':
2052           if (syntax & RE_CHAR_CLASSES)
2053             {
2054               token->type = OP_OPEN_CHAR_CLASS;
2055               break;
2056             }
2057           /* else fall through.  */
2058         default:
2059           token->type = CHARACTER;
2060           token->opr.c = c;
2061           token_len = 1;
2062           break;
2063         }
2064       return token_len;
2065     }
2066   switch (c)
2067     {
2068     case '-':
2069       token->type = OP_CHARSET_RANGE;
2070       break;
2071     case ']':
2072       token->type = OP_CLOSE_BRACKET;
2073       break;
2074     case '^':
2075       token->type = OP_NON_MATCH_LIST;
2076       break;
2077     default:
2078       token->type = CHARACTER;
2079     }
2080   return 1;
2081 }
2082 \f
2083 /* Functions for parser.  */
2084
2085 /* Entry point of the parser.
2086    Parse the regular expression REGEXP and return the structure tree.
2087    If an error is occured, ERR is set by error code, and return NULL.
2088    This function build the following tree, from regular expression <reg_exp>:
2089            CAT
2090            / \
2091           /   \
2092    <reg_exp>  EOR
2093
2094    CAT means concatenation.
2095    EOR means end of regular expression.  */
2096
2097 static bin_tree_t *
2098 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2099        reg_errcode_t *err)
2100 {
2101   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2102   bin_tree_t *tree, *eor, *root;
2103   re_token_t current_token;
2104   dfa->syntax = syntax;
2105   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2106   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2107   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108     return NULL;
2109   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2110   if (tree != NULL)
2111     root = create_tree (dfa, tree, eor, CONCAT);
2112   else
2113     root = eor;
2114   if (BE (eor == NULL || root == NULL, 0))
2115     {
2116       *err = REG_ESPACE;
2117       return NULL;
2118     }
2119   return root;
2120 }
2121
2122 /* This function build the following tree, from regular expression
2123    <branch1>|<branch2>:
2124            ALT
2125            / \
2126           /   \
2127    <branch1> <branch2>
2128
2129    ALT means alternative, which represents the operator `|'.  */
2130
2131 static bin_tree_t *
2132 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2134 {
2135   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2136   bin_tree_t *tree, *branch = NULL;
2137   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2138   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139     return NULL;
2140
2141   while (token->type == OP_ALT)
2142     {
2143       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2144       if (token->type != OP_ALT && token->type != END_OF_RE
2145           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2146         {
2147           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2148           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2149             return NULL;
2150         }
2151       else
2152         branch = NULL;
2153       tree = create_tree (dfa, tree, branch, OP_ALT);
2154       if (BE (tree == NULL, 0))
2155         {
2156           *err = REG_ESPACE;
2157           return NULL;
2158         }
2159     }
2160   return tree;
2161 }
2162
2163 /* This function build the following tree, from regular expression
2164    <exp1><exp2>:
2165         CAT
2166         / \
2167        /   \
2168    <exp1> <exp2>
2169
2170    CAT means concatenation.  */
2171
2172 static bin_tree_t *
2173 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2175 {
2176   bin_tree_t *tree, *expr;
2177   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2178   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2179   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2180     return NULL;
2181
2182   while (token->type != OP_ALT && token->type != END_OF_RE
2183          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2184     {
2185       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2186       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2187         {
2188           return NULL;
2189         }
2190       if (tree != NULL && expr != NULL)
2191         {
2192           tree = create_tree (dfa, tree, expr, CONCAT);
2193           if (tree == NULL)
2194             {
2195               *err = REG_ESPACE;
2196               return NULL;
2197             }
2198         }
2199       else if (tree == NULL)
2200         tree = expr;
2201       /* Otherwise expr == NULL, we don't need to create new tree.  */
2202     }
2203   return tree;
2204 }
2205
2206 /* This function build the following tree, from regular expression a*:
2207          *
2208          |
2209          a
2210 */
2211
2212 static bin_tree_t *
2213 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2214                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2215 {
2216   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2217   bin_tree_t *tree;
2218   switch (token->type)
2219     {
2220     case CHARACTER:
2221       tree = create_token_tree (dfa, NULL, NULL, token);
2222       if (BE (tree == NULL, 0))
2223         {
2224           *err = REG_ESPACE;
2225           return NULL;
2226         }
2227 #ifdef RE_ENABLE_I18N
2228       if (dfa->mb_cur_max > 1)
2229         {
2230           while (!re_string_eoi (regexp)
2231                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2232             {
2233               bin_tree_t *mbc_remain;
2234               fetch_token (token, regexp, syntax);
2235               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2236               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2237               if (BE (mbc_remain == NULL || tree == NULL, 0))
2238                 {
2239                   *err = REG_ESPACE;
2240                   return NULL;
2241                 }
2242             }
2243         }
2244 #endif
2245       break;
2246     case OP_OPEN_SUBEXP:
2247       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2248       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2249         return NULL;
2250       break;
2251     case OP_OPEN_BRACKET:
2252       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2253       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2254         return NULL;
2255       break;
2256     case OP_BACK_REF:
2257       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2258         {
2259           *err = REG_ESUBREG;
2260           return NULL;
2261         }
2262       dfa->used_bkref_map |= 1 << token->opr.idx;
2263       tree = create_token_tree (dfa, NULL, NULL, token);
2264       if (BE (tree == NULL, 0))
2265         {
2266           *err = REG_ESPACE;
2267           return NULL;
2268         }
2269       ++dfa->nbackref;
2270       dfa->has_mb_node = 1;
2271       break;
2272     case OP_OPEN_DUP_NUM:
2273       if (syntax & RE_CONTEXT_INVALID_DUP)
2274         {
2275           *err = REG_BADRPT;
2276           return NULL;
2277         }
2278       /* FALLTHROUGH */
2279     case OP_DUP_ASTERISK:
2280     case OP_DUP_PLUS:
2281     case OP_DUP_QUESTION:
2282       if (syntax & RE_CONTEXT_INVALID_OPS)
2283         {
2284           *err = REG_BADRPT;
2285           return NULL;
2286         }
2287       else if (syntax & RE_CONTEXT_INDEP_OPS)
2288         {
2289           fetch_token (token, regexp, syntax);
2290           return parse_expression (regexp, preg, token, syntax, nest, err);
2291         }
2292       /* else fall through  */
2293     case OP_CLOSE_SUBEXP:
2294       if ((token->type == OP_CLOSE_SUBEXP) &&
2295           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2296         {
2297           *err = REG_ERPAREN;
2298           return NULL;
2299         }
2300       /* else fall through  */
2301     case OP_CLOSE_DUP_NUM:
2302       /* We treat it as a normal character.  */
2303
2304       /* Then we can these characters as normal characters.  */
2305       token->type = CHARACTER;
2306       /* mb_partial and word_char bits should be initialized already
2307          by peek_token.  */
2308       tree = create_token_tree (dfa, NULL, NULL, token);
2309       if (BE (tree == NULL, 0))
2310         {
2311           *err = REG_ESPACE;
2312           return NULL;
2313         }
2314       break;
2315     case ANCHOR:
2316       if ((token->opr.ctx_type
2317            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2318           && dfa->word_ops_used == 0)
2319         init_word_char (dfa);
2320       if (token->opr.ctx_type == WORD_DELIM
2321           || token->opr.ctx_type == NOT_WORD_DELIM)
2322         {
2323           bin_tree_t *tree_first, *tree_last;
2324           if (token->opr.ctx_type == WORD_DELIM)
2325             {
2326               token->opr.ctx_type = WORD_FIRST;
2327               tree_first = create_token_tree (dfa, NULL, NULL, token);
2328               token->opr.ctx_type = WORD_LAST;
2329             }
2330           else
2331             {
2332               token->opr.ctx_type = INSIDE_WORD;
2333               tree_first = create_token_tree (dfa, NULL, NULL, token);
2334               token->opr.ctx_type = INSIDE_NOTWORD;
2335             }
2336           tree_last = create_token_tree (dfa, NULL, NULL, token);
2337           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2338           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2339             {
2340               *err = REG_ESPACE;
2341               return NULL;
2342             }
2343         }
2344       else
2345         {
2346           tree = create_token_tree (dfa, NULL, NULL, token);
2347           if (BE (tree == NULL, 0))
2348             {
2349               *err = REG_ESPACE;
2350               return NULL;
2351             }
2352         }
2353       /* We must return here, since ANCHORs can't be followed
2354          by repetition operators.
2355          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2357       fetch_token (token, regexp, syntax);
2358       return tree;
2359     case OP_PERIOD:
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362         {
2363           *err = REG_ESPACE;
2364           return NULL;
2365         }
2366       if (dfa->mb_cur_max > 1)
2367         dfa->has_mb_node = 1;
2368       break;
2369     case OP_WORD:
2370     case OP_NOTWORD:
2371       tree = build_charclass_op (dfa, regexp->trans,
2372                                  (const unsigned char *) "alnum",
2373                                  (const unsigned char *) "_",
2374                                  token->type == OP_NOTWORD, err);
2375       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2376         return NULL;
2377       break;
2378     case OP_SPACE:
2379     case OP_NOTSPACE:
2380       tree = build_charclass_op (dfa, regexp->trans,
2381                                  (const unsigned char *) "space",
2382                                  (const unsigned char *) "",
2383                                  token->type == OP_NOTSPACE, err);
2384       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385         return NULL;
2386       break;
2387     case OP_ALT:
2388     case END_OF_RE:
2389       return NULL;
2390     case BACK_SLASH:
2391       *err = REG_EESCAPE;
2392       return NULL;
2393     default:
2394       /* Must not happen?  */
2395 #ifdef DEBUG
2396       assert (0);
2397 #endif
2398       return NULL;
2399     }
2400   fetch_token (token, regexp, syntax);
2401
2402   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2403          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2404     {
2405       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2406       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2407         return NULL;
2408       /* In BRE consecutive duplications are not allowed.  */
2409       if ((syntax & RE_CONTEXT_INVALID_DUP)
2410           && (token->type == OP_DUP_ASTERISK
2411               || token->type == OP_OPEN_DUP_NUM))
2412         {
2413           *err = REG_BADRPT;
2414           return NULL;
2415         }
2416     }
2417
2418   return tree;
2419 }
2420
2421 /* This function build the following tree, from regular expression
2422    (<reg_exp>):
2423          SUBEXP
2424             |
2425         <reg_exp>
2426 */
2427
2428 static bin_tree_t *
2429 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2430                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2431 {
2432   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2433   bin_tree_t *tree;
2434   size_t cur_nsub;
2435   cur_nsub = preg->re_nsub++;
2436
2437   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2438
2439   /* The subexpression may be a null string.  */
2440   if (token->type == OP_CLOSE_SUBEXP)
2441     tree = NULL;
2442   else
2443     {
2444       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2445       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2446         *err = REG_EPAREN;
2447       if (BE (*err != REG_NOERROR, 0))
2448         return NULL;
2449     }
2450
2451   if (cur_nsub <= '9' - '1')
2452     dfa->completed_bkref_map |= 1 << cur_nsub;
2453
2454   tree = create_tree (dfa, tree, NULL, SUBEXP);
2455   if (BE (tree == NULL, 0))
2456     {
2457       *err = REG_ESPACE;
2458       return NULL;
2459     }
2460   tree->token.opr.idx = cur_nsub;
2461   return tree;
2462 }
2463
2464 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2465
2466 static bin_tree_t *
2467 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2468               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2469 {
2470   bin_tree_t *tree = NULL, *old_tree = NULL;
2471   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2472   re_token_t start_token = *token;
2473
2474   if (token->type == OP_OPEN_DUP_NUM)
2475     {
2476       end = 0;
2477       start = fetch_number (regexp, token, syntax);
2478       if (start == REG_MISSING)
2479         {
2480           if (token->type == CHARACTER && token->opr.c == ',')
2481             start = 0; /* We treat "{,m}" as "{0,m}".  */
2482           else
2483             {
2484               *err = REG_BADBR; /* <re>{} is invalid.  */
2485               return NULL;
2486             }
2487         }
2488       if (BE (start != REG_ERROR, 1))
2489         {
2490           /* We treat "{n}" as "{n,n}".  */
2491           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2492                  : ((token->type == CHARACTER && token->opr.c == ',')
2493                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2494         }
2495       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2496         {
2497           /* Invalid sequence.  */
2498           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2499             {
2500               if (token->type == END_OF_RE)
2501                 *err = REG_EBRACE;
2502               else
2503                 *err = REG_BADBR;
2504
2505               return NULL;
2506             }
2507
2508           /* If the syntax bit is set, rollback.  */
2509           re_string_set_index (regexp, start_idx);
2510           *token = start_token;
2511           token->type = CHARACTER;
2512           /* mb_partial and word_char bits should be already initialized by
2513              peek_token.  */
2514           return elem;
2515         }
2516
2517       if (BE (end != REG_MISSING && start > end, 0))
2518         {
2519           /* First number greater than second.  */
2520           *err = REG_BADBR;
2521           return NULL;
2522         }
2523     }
2524   else
2525     {
2526       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2527       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2528     }
2529
2530   fetch_token (token, regexp, syntax);
2531
2532   if (BE (elem == NULL, 0))
2533     return NULL;
2534   if (BE (start == 0 && end == 0, 0))
2535     {
2536       postorder (elem, free_tree, NULL);
2537       return NULL;
2538     }
2539
2540   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2541   if (BE (start > 0, 0))
2542     {
2543       tree = elem;
2544       for (i = 2; i <= start; ++i)
2545         {
2546           elem = duplicate_tree (elem, dfa);
2547           tree = create_tree (dfa, tree, elem, CONCAT);
2548           if (BE (elem == NULL || tree == NULL, 0))
2549             goto parse_dup_op_espace;
2550         }
2551
2552       if (start == end)
2553         return tree;
2554
2555       /* Duplicate ELEM before it is marked optional.  */
2556       elem = duplicate_tree (elem, dfa);
2557       old_tree = tree;
2558     }
2559   else
2560     old_tree = NULL;
2561
2562   if (elem->token.type == SUBEXP)
2563     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2564
2565   tree = create_tree (dfa, elem, NULL,
2566                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2567   if (BE (tree == NULL, 0))
2568     goto parse_dup_op_espace;
2569
2570   /* This loop is actually executed only when end != REG_MISSING,
2571      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2572      already created the start+1-th copy.  */
2573   if ((Idx) -1 < 0 || end != REG_MISSING)
2574     for (i = start + 2; i <= end; ++i)
2575       {
2576         elem = duplicate_tree (elem, dfa);
2577         tree = create_tree (dfa, tree, elem, CONCAT);
2578         if (BE (elem == NULL || tree == NULL, 0))
2579           goto parse_dup_op_espace;
2580
2581         tree = create_tree (dfa, tree, NULL, OP_ALT);
2582         if (BE (tree == NULL, 0))
2583           goto parse_dup_op_espace;
2584       }
2585
2586   if (old_tree)
2587     tree = create_tree (dfa, old_tree, tree, CONCAT);
2588
2589   return tree;
2590
2591  parse_dup_op_espace:
2592   *err = REG_ESPACE;
2593   return NULL;
2594 }
2595
2596 /* Size of the names for collating symbol/equivalence_class/character_class.
2597    I'm not sure, but maybe enough.  */
2598 #define BRACKET_NAME_BUF_SIZE 32
2599
2600 #ifndef _LIBC
2601   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2602      Build the range expression which starts from START_ELEM, and ends
2603      at END_ELEM.  The result are written to MBCSET and SBCSET.
2604      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2605      mbcset->range_ends, is a pointer argument sinse we may
2606      update it.  */
2607
2608 static reg_errcode_t
2609 internal_function
2610 # ifdef RE_ENABLE_I18N
2611 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2612                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2613 # else /* not RE_ENABLE_I18N */
2614 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2615                  bracket_elem_t *end_elem)
2616 # endif /* not RE_ENABLE_I18N */
2617 {
2618   unsigned int start_ch, end_ch;
2619   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2620   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2621           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2622           0))
2623     return REG_ERANGE;
2624
2625   /* We can handle no multi character collating elements without libc
2626      support.  */
2627   if (BE ((start_elem->type == COLL_SYM
2628            && strlen ((char *) start_elem->opr.name) > 1)
2629           || (end_elem->type == COLL_SYM
2630               && strlen ((char *) end_elem->opr.name) > 1), 0))
2631     return REG_ECOLLATE;
2632
2633 # ifdef RE_ENABLE_I18N
2634   {
2635     wchar_t wc;
2636     wint_t start_wc;
2637     wint_t end_wc;
2638     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2639
2640     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2641                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2642                    : 0));
2643     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2644               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2645                  : 0));
2646     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2647                 ? __btowc (start_ch) : start_elem->opr.wch);
2648     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2649               ? __btowc (end_ch) : end_elem->opr.wch);
2650     if (start_wc == WEOF || end_wc == WEOF)
2651       return REG_ECOLLATE;
2652     cmp_buf[0] = start_wc;
2653     cmp_buf[4] = end_wc;
2654     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2655       return REG_ERANGE;
2656
2657     /* Got valid collation sequence values, add them as a new entry.
2658        However, for !_LIBC we have no collation elements: if the
2659        character set is single byte, the single byte character set
2660        that we build below suffices.  parse_bracket_exp passes
2661        no MBCSET if dfa->mb_cur_max == 1.  */
2662     if (mbcset)
2663       {
2664         /* Check the space of the arrays.  */
2665         if (BE (*range_alloc == mbcset->nranges, 0))
2666           {
2667             /* There is not enough space, need realloc.  */
2668             wchar_t *new_array_start, *new_array_end;
2669             Idx new_nranges;
2670
2671             /* +1 in case of mbcset->nranges is 0.  */
2672             new_nranges = 2 * mbcset->nranges + 1;
2673             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2674                are NULL if *range_alloc == 0.  */
2675             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2676                                           new_nranges);
2677             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2678                                         new_nranges);
2679
2680             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2681               return REG_ESPACE;
2682
2683             mbcset->range_starts = new_array_start;
2684             mbcset->range_ends = new_array_end;
2685             *range_alloc = new_nranges;
2686           }
2687
2688         mbcset->range_starts[mbcset->nranges] = start_wc;
2689         mbcset->range_ends[mbcset->nranges++] = end_wc;
2690       }
2691
2692     /* Build the table for single byte characters.  */
2693     for (wc = 0; wc < SBC_MAX; ++wc)
2694       {
2695         cmp_buf[2] = wc;
2696         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2697             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2698           bitset_set (sbcset, wc);
2699       }
2700   }
2701 # else /* not RE_ENABLE_I18N */
2702   {
2703     unsigned int ch;
2704     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2705                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2706                    : 0));
2707     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2708               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2709                  : 0));
2710     if (start_ch > end_ch)
2711       return REG_ERANGE;
2712     /* Build the table for single byte characters.  */
2713     for (ch = 0; ch < SBC_MAX; ++ch)
2714       if (start_ch <= ch  && ch <= end_ch)
2715         bitset_set (sbcset, ch);
2716   }
2717 # endif /* not RE_ENABLE_I18N */
2718   return REG_NOERROR;
2719 }
2720 #endif /* not _LIBC */
2721
2722 #ifndef _LIBC
2723 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2724    Build the collating element which is represented by NAME.
2725    The result are written to MBCSET and SBCSET.
2726    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2727    pointer argument since we may update it.  */
2728
2729 static reg_errcode_t
2730 internal_function
2731 build_collating_symbol (bitset_t sbcset,
2732 # ifdef RE_ENABLE_I18N
2733                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2734 # endif
2735                         const unsigned char *name)
2736 {
2737   size_t name_len = strlen ((const char *) name);
2738   if (BE (name_len != 1, 0))
2739     return REG_ECOLLATE;
2740   else
2741     {
2742       bitset_set (sbcset, name[0]);
2743       return REG_NOERROR;
2744     }
2745 }
2746 #endif /* not _LIBC */
2747
2748 /* This function parse bracket expression like "[abc]", "[a-c]",
2749    "[[.a-a.]]" etc.  */
2750
2751 static bin_tree_t *
2752 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2753                    reg_syntax_t syntax, reg_errcode_t *err)
2754 {
2755 #ifdef _LIBC
2756   const unsigned char *collseqmb;
2757   const char *collseqwc;
2758   uint32_t nrules;
2759   int32_t table_size;
2760   const int32_t *symb_table;
2761   const unsigned char *extra;
2762
2763   /* Local function for parse_bracket_exp used in _LIBC environement.
2764      Seek the collating symbol entry correspondings to NAME.
2765      Return the index of the symbol in the SYMB_TABLE.  */
2766
2767   auto inline int32_t
2768   __attribute ((always_inline))
2769   seek_collating_symbol_entry (name, name_len)
2770          const unsigned char *name;
2771          size_t name_len;
2772     {
2773       int32_t hash = elem_hash ((const char *) name, name_len);
2774       int32_t elem = hash % table_size;
2775       if (symb_table[2 * elem] != 0)
2776         {
2777           int32_t second = hash % (table_size - 2) + 1;
2778
2779           do
2780             {
2781               /* First compare the hashing value.  */
2782               if (symb_table[2 * elem] == hash
2783                   /* Compare the length of the name.  */
2784                   && name_len == extra[symb_table[2 * elem + 1]]
2785                   /* Compare the name.  */
2786                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2787                              name_len) == 0)
2788                 {
2789                   /* Yep, this is the entry.  */
2790                   break;
2791                 }
2792
2793               /* Next entry.  */
2794               elem += second;
2795             }
2796           while (symb_table[2 * elem] != 0);
2797         }
2798       return elem;
2799     }
2800
2801   /* Local function for parse_bracket_exp used in _LIBC environement.
2802      Look up the collation sequence value of BR_ELEM.
2803      Return the value if succeeded, UINT_MAX otherwise.  */
2804
2805   auto inline unsigned int
2806   __attribute ((always_inline))
2807   lookup_collation_sequence_value (br_elem)
2808          bracket_elem_t *br_elem;
2809     {
2810       if (br_elem->type == SB_CHAR)
2811         {
2812           /*
2813           if (MB_CUR_MAX == 1)
2814           */
2815           if (nrules == 0)
2816             return collseqmb[br_elem->opr.ch];
2817           else
2818             {
2819               wint_t wc = __btowc (br_elem->opr.ch);
2820               return __collseq_table_lookup (collseqwc, wc);
2821             }
2822         }
2823       else if (br_elem->type == MB_CHAR)
2824         {
2825           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2826         }
2827       else if (br_elem->type == COLL_SYM)
2828         {
2829           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2830           if (nrules != 0)
2831             {
2832               int32_t elem, idx;
2833               elem = seek_collating_symbol_entry (br_elem->opr.name,
2834                                                   sym_name_len);
2835               if (symb_table[2 * elem] != 0)
2836                 {
2837                   /* We found the entry.  */
2838                   idx = symb_table[2 * elem + 1];
2839                   /* Skip the name of collating element name.  */
2840                   idx += 1 + extra[idx];
2841                   /* Skip the byte sequence of the collating element.  */
2842                   idx += 1 + extra[idx];
2843                   /* Adjust for the alignment.  */
2844                   idx = (idx + 3) & ~3;
2845                   /* Skip the multibyte collation sequence value.  */
2846                   idx += sizeof (unsigned int);
2847                   /* Skip the wide char sequence of the collating element.  */
2848                   idx += sizeof (unsigned int) *
2849                     (1 + *(unsigned int *) (extra + idx));
2850                   /* Return the collation sequence value.  */
2851                   return *(unsigned int *) (extra + idx);
2852                 }
2853               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2854                 {
2855                   /* No valid character.  Match it as a single byte
2856                      character.  */
2857                   return collseqmb[br_elem->opr.name[0]];
2858                 }
2859             }
2860           else if (sym_name_len == 1)
2861             return collseqmb[br_elem->opr.name[0]];
2862         }
2863       return UINT_MAX;
2864     }
2865
2866   /* Local function for parse_bracket_exp used in _LIBC environement.
2867      Build the range expression which starts from START_ELEM, and ends
2868      at END_ELEM.  The result are written to MBCSET and SBCSET.
2869      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2870      mbcset->range_ends, is a pointer argument sinse we may
2871      update it.  */
2872
2873   auto inline reg_errcode_t
2874   __attribute ((always_inline))
2875   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2876          re_charset_t *mbcset;
2877          Idx *range_alloc;
2878          bitset_t sbcset;
2879          bracket_elem_t *start_elem, *end_elem;
2880     {
2881       unsigned int ch;
2882       uint32_t start_collseq;
2883       uint32_t end_collseq;
2884
2885       /* Equivalence Classes and Character Classes can't be a range
2886          start/end.  */
2887       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889               0))
2890         return REG_ERANGE;
2891
2892       start_collseq = lookup_collation_sequence_value (start_elem);
2893       end_collseq = lookup_collation_sequence_value (end_elem);
2894       /* Check start/end collation sequence values.  */
2895       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896         return REG_ECOLLATE;
2897       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898         return REG_ERANGE;
2899
2900       /* Got valid collation sequence values, add them as a new entry.
2901          However, if we have no collation elements, and the character set
2902          is single byte, the single byte character set that we
2903          build below suffices. */
2904       if (nrules > 0 || dfa->mb_cur_max > 1)
2905         {
2906           /* Check the space of the arrays.  */
2907           if (BE (*range_alloc == mbcset->nranges, 0))
2908             {
2909               /* There is not enough space, need realloc.  */
2910               uint32_t *new_array_start;
2911               uint32_t *new_array_end;
2912               Idx new_nranges;
2913
2914               /* +1 in case of mbcset->nranges is 0.  */
2915               new_nranges = 2 * mbcset->nranges + 1;
2916               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917                                             new_nranges);
2918               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919                                           new_nranges);
2920
2921               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922                 return REG_ESPACE;
2923
2924               mbcset->range_starts = new_array_start;
2925               mbcset->range_ends = new_array_end;
2926               *range_alloc = new_nranges;
2927             }
2928
2929           mbcset->range_starts[mbcset->nranges] = start_collseq;
2930           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931         }
2932
2933       /* Build the table for single byte characters.  */
2934       for (ch = 0; ch < SBC_MAX; ch++)
2935         {
2936           uint32_t ch_collseq;
2937           /*
2938           if (MB_CUR_MAX == 1)
2939           */
2940           if (nrules == 0)
2941             ch_collseq = collseqmb[ch];
2942           else
2943             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945             bitset_set (sbcset, ch);
2946         }
2947       return REG_NOERROR;
2948     }
2949
2950   /* Local function for parse_bracket_exp used in _LIBC environement.
2951      Build the collating element which is represented by NAME.
2952      The result are written to MBCSET and SBCSET.
2953      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954      pointer argument sinse we may update it.  */
2955
2956   auto inline reg_errcode_t
2957   __attribute ((always_inline))
2958   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2959          re_charset_t *mbcset;
2960          Idx *coll_sym_alloc;
2961          bitset_t sbcset;
2962          const unsigned char *name;
2963     {
2964       int32_t elem, idx;
2965       size_t name_len = strlen ((const char *) name);
2966       if (nrules != 0)
2967         {
2968           elem = seek_collating_symbol_entry (name, name_len);
2969           if (symb_table[2 * elem] != 0)
2970             {
2971               /* We found the entry.  */
2972               idx = symb_table[2 * elem + 1];
2973               /* Skip the name of collating element name.  */
2974               idx += 1 + extra[idx];
2975             }
2976           else if (symb_table[2 * elem] == 0 && name_len == 1)
2977             {
2978               /* No valid character, treat it as a normal
2979                  character.  */
2980               bitset_set (sbcset, name[0]);
2981               return REG_NOERROR;
2982             }
2983           else
2984             return REG_ECOLLATE;
2985
2986           /* Got valid collation sequence, add it as a new entry.  */
2987           /* Check the space of the arrays.  */
2988           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2989             {
2990               /* Not enough, realloc it.  */
2991               /* +1 in case of mbcset->ncoll_syms is 0.  */
2992               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2993               /* Use realloc since mbcset->coll_syms is NULL
2994                  if *alloc == 0.  */
2995               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2996                                                    new_coll_sym_alloc);
2997               if (BE (new_coll_syms == NULL, 0))
2998                 return REG_ESPACE;
2999               mbcset->coll_syms = new_coll_syms;
3000               *coll_sym_alloc = new_coll_sym_alloc;
3001             }
3002           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3003           return REG_NOERROR;
3004         }
3005       else
3006         {
3007           if (BE (name_len != 1, 0))
3008             return REG_ECOLLATE;
3009           else
3010             {
3011               bitset_set (sbcset, name[0]);
3012               return REG_NOERROR;
3013             }
3014         }
3015     }
3016 #endif
3017
3018   re_token_t br_token;
3019   re_bitset_ptr_t sbcset;
3020 #ifdef RE_ENABLE_I18N
3021   re_charset_t *mbcset;
3022   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3023   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3024 #endif /* not RE_ENABLE_I18N */
3025   bool non_match = false;
3026   bin_tree_t *work_tree;
3027   int token_len;
3028   bool first_round = true;
3029 #ifdef _LIBC
3030   collseqmb = (const unsigned char *)
3031     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3032   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3033   if (nrules)
3034     {
3035       /*
3036       if (MB_CUR_MAX > 1)
3037       */
3038       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3039       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3040       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3041                                                   _NL_COLLATE_SYMB_TABLEMB);
3042       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3043                                                    _NL_COLLATE_SYMB_EXTRAMB);
3044     }
3045 #endif
3046   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3047 #ifdef RE_ENABLE_I18N
3048   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3049 #endif /* RE_ENABLE_I18N */
3050 #ifdef RE_ENABLE_I18N
3051   if (BE (sbcset == NULL || mbcset == NULL, 0))
3052 #else
3053   if (BE (sbcset == NULL, 0))
3054 #endif /* RE_ENABLE_I18N */
3055     {
3056       *err = REG_ESPACE;
3057       return NULL;
3058     }
3059
3060   token_len = peek_token_bracket (token, regexp, syntax);
3061   if (BE (token->type == END_OF_RE, 0))
3062     {
3063       *err = REG_BADPAT;
3064       goto parse_bracket_exp_free_return;
3065     }
3066   if (token->type == OP_NON_MATCH_LIST)
3067     {
3068 #ifdef RE_ENABLE_I18N
3069       mbcset->non_match = 1;
3070 #endif /* not RE_ENABLE_I18N */
3071       non_match = true;
3072       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3073         bitset_set (sbcset, '\0');
3074       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3075       token_len = peek_token_bracket (token, regexp, syntax);
3076       if (BE (token->type == END_OF_RE, 0))
3077         {
3078           *err = REG_BADPAT;
3079           goto parse_bracket_exp_free_return;
3080         }
3081     }
3082
3083   /* We treat the first ']' as a normal character.  */
3084   if (token->type == OP_CLOSE_BRACKET)
3085     token->type = CHARACTER;
3086
3087   while (1)
3088     {
3089       bracket_elem_t start_elem, end_elem;
3090       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3091       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3092       reg_errcode_t ret;
3093       int token_len2 = 0;
3094       bool is_range_exp = false;
3095       re_token_t token2;
3096
3097       start_elem.opr.name = start_name_buf;
3098       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099                                    syntax, first_round);
3100       if (BE (ret != REG_NOERROR, 0))
3101         {
3102           *err = ret;
3103           goto parse_bracket_exp_free_return;
3104         }
3105       first_round = false;
3106
3107       /* Get information about the next token.  We need it in any case.  */
3108       token_len = peek_token_bracket (token, regexp, syntax);
3109
3110       /* Do not check for ranges if we know they are not allowed.  */
3111       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3112         {
3113           if (BE (token->type == END_OF_RE, 0))
3114             {
3115               *err = REG_EBRACK;
3116               goto parse_bracket_exp_free_return;
3117             }
3118           if (token->type == OP_CHARSET_RANGE)
3119             {
3120               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3121               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122               if (BE (token2.type == END_OF_RE, 0))
3123                 {
3124                   *err = REG_EBRACK;
3125                   goto parse_bracket_exp_free_return;
3126                 }
3127               if (token2.type == OP_CLOSE_BRACKET)
3128                 {
3129                   /* We treat the last '-' as a normal character.  */
3130                   re_string_skip_bytes (regexp, -token_len);
3131                   token->type = CHARACTER;
3132                 }
3133               else
3134                 is_range_exp = true;
3135             }
3136         }
3137
3138       if (is_range_exp == true)
3139         {
3140           end_elem.opr.name = end_name_buf;
3141           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3142                                        dfa, syntax, true);
3143           if (BE (ret != REG_NOERROR, 0))
3144             {
3145               *err = ret;
3146               goto parse_bracket_exp_free_return;
3147             }
3148
3149           token_len = peek_token_bracket (token, regexp, syntax);
3150
3151 #ifdef _LIBC
3152           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153                                   &start_elem, &end_elem);
3154 #else
3155 # ifdef RE_ENABLE_I18N
3156           *err = build_range_exp (sbcset,
3157                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3158                                   &range_alloc, &start_elem, &end_elem);
3159 # else
3160           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3161 # endif
3162 #endif /* RE_ENABLE_I18N */
3163           if (BE (*err != REG_NOERROR, 0))
3164             goto parse_bracket_exp_free_return;
3165         }
3166       else
3167         {
3168           switch (start_elem.type)
3169             {
3170             case SB_CHAR:
3171               bitset_set (sbcset, start_elem.opr.ch);
3172               break;
3173 #ifdef RE_ENABLE_I18N
3174             case MB_CHAR:
3175               /* Check whether the array has enough space.  */
3176               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3177                 {
3178                   wchar_t *new_mbchars;
3179                   /* Not enough, realloc it.  */
3180                   /* +1 in case of mbcset->nmbchars is 0.  */
3181                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182                   /* Use realloc since array is NULL if *alloc == 0.  */
3183                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3184                                             mbchar_alloc);
3185                   if (BE (new_mbchars == NULL, 0))
3186                     goto parse_bracket_exp_espace;
3187                   mbcset->mbchars = new_mbchars;
3188                 }
3189               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3190               break;
3191 #endif /* RE_ENABLE_I18N */
3192             case EQUIV_CLASS:
3193               *err = build_equiv_class (sbcset,
3194 #ifdef RE_ENABLE_I18N
3195                                         mbcset, &equiv_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197                                         start_elem.opr.name);
3198               if (BE (*err != REG_NOERROR, 0))
3199                 goto parse_bracket_exp_free_return;
3200               break;
3201             case COLL_SYM:
3202               *err = build_collating_symbol (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204                                              mbcset, &coll_sym_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206                                              start_elem.opr.name);
3207               if (BE (*err != REG_NOERROR, 0))
3208                 goto parse_bracket_exp_free_return;
3209               break;
3210             case CHAR_CLASS:
3211               *err = build_charclass (regexp->trans, sbcset,
3212 #ifdef RE_ENABLE_I18N
3213                                       mbcset, &char_class_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215                                       start_elem.opr.name, syntax);
3216               if (BE (*err != REG_NOERROR, 0))
3217                goto parse_bracket_exp_free_return;
3218               break;
3219             default:
3220               assert (0);
3221               break;
3222             }
3223         }
3224       if (BE (token->type == END_OF_RE, 0))
3225         {
3226           *err = REG_EBRACK;
3227           goto parse_bracket_exp_free_return;
3228         }
3229       if (token->type == OP_CLOSE_BRACKET)
3230         break;
3231     }
3232
3233   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3234
3235   /* If it is non-matching list.  */
3236   if (non_match)
3237     bitset_not (sbcset);
3238
3239 #ifdef RE_ENABLE_I18N
3240   /* Ensure only single byte characters are set.  */
3241   if (dfa->mb_cur_max > 1)
3242     bitset_mask (sbcset, dfa->sb_char);
3243
3244   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246                                                      || mbcset->non_match)))
3247     {
3248       bin_tree_t *mbc_tree;
3249       int sbc_idx;
3250       /* Build a tree for complex bracket.  */
3251       dfa->has_mb_node = 1;
3252       br_token.type = COMPLEX_BRACKET;
3253       br_token.opr.mbcset = mbcset;
3254       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255       if (BE (mbc_tree == NULL, 0))
3256         goto parse_bracket_exp_espace;
3257       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258         if (sbcset[sbc_idx])
3259           break;
3260       /* If there are no bits set in sbcset, there is no point
3261          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3262       if (sbc_idx < BITSET_WORDS)
3263         {
3264           /* Build a tree for simple bracket.  */
3265           br_token.type = SIMPLE_BRACKET;
3266           br_token.opr.sbcset = sbcset;
3267           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268           if (BE (work_tree == NULL, 0))
3269             goto parse_bracket_exp_espace;
3270
3271           /* Then join them by ALT node.  */
3272           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273           if (BE (work_tree == NULL, 0))
3274             goto parse_bracket_exp_espace;
3275         }
3276       else
3277         {
3278           re_free (sbcset);
3279           work_tree = mbc_tree;
3280         }
3281     }
3282   else
3283 #endif /* not RE_ENABLE_I18N */
3284     {
3285 #ifdef RE_ENABLE_I18N
3286       free_charset (mbcset);
3287 #endif
3288       /* Build a tree for simple bracket.  */
3289       br_token.type = SIMPLE_BRACKET;
3290       br_token.opr.sbcset = sbcset;
3291       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292       if (BE (work_tree == NULL, 0))
3293         goto parse_bracket_exp_espace;
3294     }
3295   return work_tree;
3296
3297  parse_bracket_exp_espace:
3298   *err = REG_ESPACE;
3299  parse_bracket_exp_free_return:
3300   re_free (sbcset);
3301 #ifdef RE_ENABLE_I18N
3302   free_charset (mbcset);
3303 #endif /* RE_ENABLE_I18N */
3304   return NULL;
3305 }
3306
3307 /* Parse an element in the bracket expression.  */
3308
3309 static reg_errcode_t
3310 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311                        re_token_t *token, int token_len, re_dfa_t *dfa,
3312                        reg_syntax_t syntax, bool accept_hyphen)
3313 {
3314 #ifdef RE_ENABLE_I18N
3315   int cur_char_size;
3316   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317   if (cur_char_size > 1)
3318     {
3319       elem->type = MB_CHAR;
3320       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321       re_string_skip_bytes (regexp, cur_char_size);
3322       return REG_NOERROR;
3323     }
3324 #endif /* RE_ENABLE_I18N */
3325   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3326   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327       || token->type == OP_OPEN_EQUIV_CLASS)
3328     return parse_bracket_symbol (elem, regexp, token);
3329   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3330     {
3331       /* A '-' must only appear as anything but a range indicator before
3332          the closing bracket.  Everything else is an error.  */
3333       re_token_t token2;
3334       (void) peek_token_bracket (&token2, regexp, syntax);
3335       if (token2.type != OP_CLOSE_BRACKET)
3336         /* The actual error value is not standardized since this whole
3337            case is undefined.  But ERANGE makes good sense.  */
3338         return REG_ERANGE;
3339     }
3340   elem->type = SB_CHAR;
3341   elem->opr.ch = token->opr.c;
3342   return REG_NOERROR;
3343 }
3344
3345 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3346    such as [:<character_class>:], [.<collating_element>.], and
3347    [=<equivalent_class>=].  */
3348
3349 static reg_errcode_t
3350 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351                       re_token_t *token)
3352 {
3353   unsigned char ch, delim = token->opr.c;
3354   int i = 0;
3355   if (re_string_eoi(regexp))
3356     return REG_EBRACK;
3357   for (;; ++i)
3358     {
3359       if (i >= BRACKET_NAME_BUF_SIZE)
3360         return REG_EBRACK;
3361       if (token->type == OP_OPEN_CHAR_CLASS)
3362         ch = re_string_fetch_byte_case (regexp);
3363       else
3364         ch = re_string_fetch_byte (regexp);
3365       if (re_string_eoi(regexp))
3366         return REG_EBRACK;
3367       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368         break;
3369       elem->opr.name[i] = ch;
3370     }
3371   re_string_skip_bytes (regexp, 1);
3372   elem->opr.name[i] = '\0';
3373   switch (token->type)
3374     {
3375     case OP_OPEN_COLL_ELEM:
3376       elem->type = COLL_SYM;
3377       break;
3378     case OP_OPEN_EQUIV_CLASS:
3379       elem->type = EQUIV_CLASS;
3380       break;
3381     case OP_OPEN_CHAR_CLASS:
3382       elem->type = CHAR_CLASS;
3383       break;
3384     default:
3385       break;
3386     }
3387   return REG_NOERROR;
3388 }
3389
3390   /* Helper function for parse_bracket_exp.
3391      Build the equivalence class which is represented by NAME.
3392      The result are written to MBCSET and SBCSET.
3393      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394      is a pointer argument sinse we may update it.  */
3395
3396 static reg_errcode_t
3397 #ifdef RE_ENABLE_I18N
3398 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399                    Idx *equiv_class_alloc, const unsigned char *name)
3400 #else /* not RE_ENABLE_I18N */
3401 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3402 #endif /* not RE_ENABLE_I18N */
3403 {
3404 #ifdef _LIBC
3405   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3406   if (nrules != 0)
3407     {
3408       const int32_t *table, *indirect;
3409       const unsigned char *weights, *extra, *cp;
3410       unsigned char char_buf[2];
3411       int32_t idx1, idx2;
3412       unsigned int ch;
3413       size_t len;
3414       /* This #include defines a local function!  */
3415 # include <locale/weight.h>
3416       /* Calculate the index for equivalence class.  */
3417       cp = name;
3418       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420                                                _NL_COLLATE_WEIGHTMB);
3421       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422                                                    _NL_COLLATE_EXTRAMB);
3423       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424                                                 _NL_COLLATE_INDIRECTMB);
3425       idx1 = findidx (&cp);
3426       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3427         /* This isn't a valid character.  */
3428         return REG_ECOLLATE;
3429
3430       /* Build single byte matcing table for this equivalence class.  */
3431       char_buf[1] = (unsigned char) '\0';
3432       len = weights[idx1];
3433       for (ch = 0; ch < SBC_MAX; ++ch)
3434         {
3435           char_buf[0] = ch;
3436           cp = char_buf;
3437           idx2 = findidx (&cp);
3438 /*
3439           idx2 = table[ch];
3440 */
3441           if (idx2 == 0)
3442             /* This isn't a valid character.  */
3443             continue;
3444           if (len == weights[idx2])
3445             {
3446               int cnt = 0;
3447               while (cnt <= len &&
3448                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3449                 ++cnt;
3450
3451               if (cnt > len)
3452                 bitset_set (sbcset, ch);
3453             }
3454         }
3455       /* Check whether the array has enough space.  */
3456       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3457         {
3458           /* Not enough, realloc it.  */
3459           /* +1 in case of mbcset->nequiv_classes is 0.  */
3460           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3461           /* Use realloc since the array is NULL if *alloc == 0.  */
3462           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3463                                                    int32_t,
3464                                                    new_equiv_class_alloc);
3465           if (BE (new_equiv_classes == NULL, 0))
3466             return REG_ESPACE;
3467           mbcset->equiv_classes = new_equiv_classes;
3468           *equiv_class_alloc = new_equiv_class_alloc;
3469         }
3470       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3471     }
3472   else
3473 #endif /* _LIBC */
3474     {
3475       if (BE (strlen ((const char *) name) != 1, 0))
3476         return REG_ECOLLATE;
3477       bitset_set (sbcset, *name);
3478     }
3479   return REG_NOERROR;
3480 }
3481
3482   /* Helper function for parse_bracket_exp.
3483      Build the character class which is represented by NAME.
3484      The result are written to MBCSET and SBCSET.
3485      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3486      is a pointer argument sinse we may update it.  */
3487
3488 static reg_errcode_t
3489 #ifdef RE_ENABLE_I18N
3490 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3491                  re_charset_t *mbcset, Idx *char_class_alloc,
3492                  const unsigned char *class_name, reg_syntax_t syntax)
3493 #else /* not RE_ENABLE_I18N */
3494 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3495                  const unsigned char *class_name, reg_syntax_t syntax)
3496 #endif /* not RE_ENABLE_I18N */
3497 {
3498   int i;
3499   const char *name = (const char *) class_name;
3500
3501   /* In case of REG_ICASE "upper" and "lower" match the both of
3502      upper and lower cases.  */
3503   if ((syntax & RE_ICASE)
3504       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3505     name = "alpha";
3506
3507 #ifdef RE_ENABLE_I18N
3508   /* Check the space of the arrays.  */
3509   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3510     {
3511       /* Not enough, realloc it.  */
3512       /* +1 in case of mbcset->nchar_classes is 0.  */
3513       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3514       /* Use realloc since array is NULL if *alloc == 0.  */
3515       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3516                                                new_char_class_alloc);
3517       if (BE (new_char_classes == NULL, 0))
3518         return REG_ESPACE;
3519       mbcset->char_classes = new_char_classes;
3520       *char_class_alloc = new_char_class_alloc;
3521     }
3522   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3523 #endif /* RE_ENABLE_I18N */
3524
3525 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3526   do {                                          \
3527     if (BE (trans != NULL, 0))                  \
3528       {                                         \
3529         for (i = 0; i < SBC_MAX; ++i)           \
3530           if (ctype_func (i))                   \
3531             bitset_set (sbcset, trans[i]);      \
3532       }                                         \
3533     else                                        \
3534       {                                         \
3535         for (i = 0; i < SBC_MAX; ++i)           \
3536           if (ctype_func (i))                   \
3537             bitset_set (sbcset, i);             \
3538       }                                         \
3539   } while (0)
3540
3541   if (strcmp (name, "alnum") == 0)
3542     BUILD_CHARCLASS_LOOP (isalnum);
3543   else if (strcmp (name, "cntrl") == 0)
3544     BUILD_CHARCLASS_LOOP (iscntrl);
3545   else if (strcmp (name, "lower") == 0)
3546     BUILD_CHARCLASS_LOOP (islower);
3547   else if (strcmp (name, "space") == 0)
3548     BUILD_CHARCLASS_LOOP (isspace);
3549   else if (strcmp (name, "alpha") == 0)
3550     BUILD_CHARCLASS_LOOP (isalpha);
3551   else if (strcmp (name, "digit") == 0)
3552     BUILD_CHARCLASS_LOOP (isdigit);
3553   else if (strcmp (name, "print") == 0)
3554     BUILD_CHARCLASS_LOOP (isprint);
3555   else if (strcmp (name, "upper") == 0)
3556     BUILD_CHARCLASS_LOOP (isupper);
3557   else if (strcmp (name, "blank") == 0)
3558     BUILD_CHARCLASS_LOOP (isblank);
3559   else if (strcmp (name, "graph") == 0)
3560     BUILD_CHARCLASS_LOOP (isgraph);
3561   else if (strcmp (name, "punct") == 0)
3562     BUILD_CHARCLASS_LOOP (ispunct);
3563   else if (strcmp (name, "xdigit") == 0)
3564     BUILD_CHARCLASS_LOOP (isxdigit);
3565   else
3566     return REG_ECTYPE;
3567
3568   return REG_NOERROR;
3569 }
3570
3571 static bin_tree_t *
3572 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3573                     const unsigned char *class_name,
3574                     const unsigned char *extra, bool non_match,
3575                     reg_errcode_t *err)
3576 {
3577   re_bitset_ptr_t sbcset;
3578 #ifdef RE_ENABLE_I18N
3579   re_charset_t *mbcset;
3580   Idx alloc = 0;
3581 #endif /* not RE_ENABLE_I18N */
3582   reg_errcode_t ret;
3583   re_token_t br_token;
3584   bin_tree_t *tree;
3585
3586   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3587 #ifdef RE_ENABLE_I18N
3588   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3589 #endif /* RE_ENABLE_I18N */
3590
3591 #ifdef RE_ENABLE_I18N
3592   if (BE (sbcset == NULL || mbcset == NULL, 0))
3593 #else /* not RE_ENABLE_I18N */
3594   if (BE (sbcset == NULL, 0))
3595 #endif /* not RE_ENABLE_I18N */
3596     {
3597       *err = REG_ESPACE;
3598       return NULL;
3599     }
3600
3601   if (non_match)
3602     {
3603 #ifdef RE_ENABLE_I18N
3604       /*
3605       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3606         bitset_set(cset->sbcset, '\0');
3607       */
3608       mbcset->non_match = 1;
3609 #endif /* not RE_ENABLE_I18N */
3610     }
3611
3612   /* We don't care the syntax in this case.  */
3613   ret = build_charclass (trans, sbcset,
3614 #ifdef RE_ENABLE_I18N
3615                          mbcset, &alloc,
3616 #endif /* RE_ENABLE_I18N */
3617                          class_name, 0);
3618
3619   if (BE (ret != REG_NOERROR, 0))
3620     {
3621       re_free (sbcset);
3622 #ifdef RE_ENABLE_I18N
3623       free_charset (mbcset);
3624 #endif /* RE_ENABLE_I18N */
3625       *err = ret;
3626       return NULL;
3627     }
3628   /* \w match '_' also.  */
3629   for (; *extra; extra++)
3630     bitset_set (sbcset, *extra);
3631
3632   /* If it is non-matching list.  */
3633   if (non_match)
3634     bitset_not (sbcset);
3635
3636 #ifdef RE_ENABLE_I18N
3637   /* Ensure only single byte characters are set.  */
3638   if (dfa->mb_cur_max > 1)
3639     bitset_mask (sbcset, dfa->sb_char);
3640 #endif
3641
3642   /* Build a tree for simple bracket.  */
3643   br_token.type = SIMPLE_BRACKET;
3644   br_token.opr.sbcset = sbcset;
3645   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3646   if (BE (tree == NULL, 0))
3647     goto build_word_op_espace;
3648
3649 #ifdef RE_ENABLE_I18N
3650   if (dfa->mb_cur_max > 1)
3651     {
3652       bin_tree_t *mbc_tree;
3653       /* Build a tree for complex bracket.  */
3654       br_token.type = COMPLEX_BRACKET;
3655       br_token.opr.mbcset = mbcset;
3656       dfa->has_mb_node = 1;
3657       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3658       if (BE (mbc_tree == NULL, 0))
3659         goto build_word_op_espace;
3660       /* Then join them by ALT node.  */
3661       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3662       if (BE (mbc_tree != NULL, 1))
3663         return tree;
3664     }
3665   else
3666     {
3667       free_charset (mbcset);
3668       return tree;
3669     }
3670 #else /* not RE_ENABLE_I18N */
3671   return tree;
3672 #endif /* not RE_ENABLE_I18N */
3673
3674  build_word_op_espace:
3675   re_free (sbcset);
3676 #ifdef RE_ENABLE_I18N
3677   free_charset (mbcset);
3678 #endif /* RE_ENABLE_I18N */
3679   *err = REG_ESPACE;
3680   return NULL;
3681 }
3682
3683 /* This is intended for the expressions like "a{1,3}".
3684    Fetch a number from `input', and return the number.
3685    Return REG_MISSING if the number field is empty like "{,1}".
3686    Return REG_ERROR if an error occurred.  */
3687
3688 static Idx
3689 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3690 {
3691   Idx num = REG_MISSING;
3692   unsigned char c;
3693   while (1)
3694     {
3695       fetch_token (token, input, syntax);
3696       c = token->opr.c;
3697       if (BE (token->type == END_OF_RE, 0))
3698         return REG_ERROR;
3699       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3700         break;
3701       num = ((token->type != CHARACTER || c < '0' || '9' < c
3702               || num == REG_ERROR)
3703              ? REG_ERROR
3704              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3705       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3706     }
3707   return num;
3708 }
3709 \f
3710 #ifdef RE_ENABLE_I18N
3711 static void
3712 free_charset (re_charset_t *cset)
3713 {
3714   re_free (cset->mbchars);
3715 # ifdef _LIBC
3716   re_free (cset->coll_syms);
3717   re_free (cset->equiv_classes);
3718   re_free (cset->range_starts);
3719   re_free (cset->range_ends);
3720 # endif
3721   re_free (cset->char_classes);
3722   re_free (cset);
3723 }
3724 #endif /* RE_ENABLE_I18N */
3725 \f
3726 /* Functions for binary tree operation.  */
3727
3728 /* Create a tree node.  */
3729
3730 static bin_tree_t *
3731 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3732              re_token_type_t type)
3733 {
3734   re_token_t t;
3735   t.type = type;
3736   return create_token_tree (dfa, left, right, &t);
3737 }
3738
3739 static bin_tree_t *
3740 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3741                    const re_token_t *token)
3742 {
3743   bin_tree_t *tree;
3744   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3745     {
3746       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3747
3748       if (storage == NULL)
3749         return NULL;
3750       storage->next = dfa->str_tree_storage;
3751       dfa->str_tree_storage = storage;
3752       dfa->str_tree_storage_idx = 0;
3753     }
3754   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3755
3756   tree->parent = NULL;
3757   tree->left = left;
3758   tree->right = right;
3759   tree->token = *token;
3760   tree->token.duplicated = 0;
3761   tree->token.opt_subexp = 0;
3762   tree->first = NULL;
3763   tree->next = NULL;
3764   tree->node_idx = REG_MISSING;
3765
3766   if (left != NULL)
3767     left->parent = tree;
3768   if (right != NULL)
3769     right->parent = tree;
3770   return tree;
3771 }
3772
3773 /* Mark the tree SRC as an optional subexpression.
3774    To be called from preorder or postorder.  */
3775
3776 static reg_errcode_t
3777 mark_opt_subexp (void *extra, bin_tree_t *node)
3778 {
3779   Idx idx = (Idx) (long) extra;
3780   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3781     node->token.opt_subexp = 1;
3782
3783   return REG_NOERROR;
3784 }
3785
3786 /* Free the allocated memory inside NODE. */
3787
3788 static void
3789 free_token (re_token_t *node)
3790 {
3791 #ifdef RE_ENABLE_I18N
3792   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3793     free_charset (node->opr.mbcset);
3794   else
3795 #endif /* RE_ENABLE_I18N */
3796     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3797       re_free (node->opr.sbcset);
3798 }
3799
3800 /* Worker function for tree walking.  Free the allocated memory inside NODE
3801    and its children. */
3802
3803 static reg_errcode_t
3804 free_tree (void *extra, bin_tree_t *node)
3805 {
3806   free_token (&node->token);
3807   return REG_NOERROR;
3808 }
3809
3810
3811 /* Duplicate the node SRC, and return new node.  This is a preorder
3812    visit similar to the one implemented by the generic visitor, but
3813    we need more infrastructure to maintain two parallel trees --- so,
3814    it's easier to duplicate.  */
3815
3816 static bin_tree_t *
3817 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3818 {
3819   const bin_tree_t *node;
3820   bin_tree_t *dup_root;
3821   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3822
3823   for (node = root; ; )
3824     {
3825       /* Create a new tree and link it back to the current parent.  */
3826       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3827       if (*p_new == NULL)
3828         return NULL;
3829       (*p_new)->parent = dup_node;
3830       (*p_new)->token.duplicated = 1;
3831       dup_node = *p_new;
3832
3833       /* Go to the left node, or up and to the right.  */
3834       if (node->left)
3835         {
3836           node = node->left;
3837           p_new = &dup_node->left;
3838         }
3839       else
3840         {
3841           const bin_tree_t *prev = NULL;
3842           while (node->right == prev || node->right == NULL)
3843             {
3844               prev = node;
3845               node = node->parent;
3846               dup_node = dup_node->parent;
3847               if (!node)
3848                 return dup_root;
3849             }
3850           node = node->right;
3851           p_new = &dup_node->right;
3852         }
3853     }
3854 }