lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
  18
  19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  20                                           size_t length, reg_syntax_t syntax);
  21 static void re_compile_fastmap_iter (regex_t *bufp,
  22                                      const re_dfastate_t *init_state,
  23                                      char *fastmap);
  24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  25 #ifdef RE_ENABLE_I18N
  26 static void free_charset (re_charset_t *cset);
  27 #endif /* RE_ENABLE_I18N */
  28 static void free_workarea_compile (regex_t *preg);
  29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  30 #ifdef RE_ENABLE_I18N
  31 static void optimize_utf8 (re_dfa_t *dfa);
  32 #endif
  33 static reg_errcode_t analyze (regex_t *preg);
  34 static reg_errcode_t preorder (bin_tree_t *root,
  35                                reg_errcode_t (fn (void *, bin_tree_t *)),
  36                                void *extra);
  37 static reg_errcode_t postorder (bin_tree_t *root,
  38                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                 void *extra);
  40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  43                                  bin_tree_t *node);
  44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  49                                    unsigned int constraint);
  50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  52                                          Idx node, bool root);
  53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  54 static Idx fetch_number (re_string_t *input, re_token_t *token,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax) internal_function;
  58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  59                           reg_syntax_t syntax, reg_errcode_t *err);
  60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  61                                   re_token_t *token, reg_syntax_t syntax,
  62                                   Idx nest, reg_errcode_t *err);
  63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  64                                  re_token_t *token, reg_syntax_t syntax,
  65                                  Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  67                                      re_token_t *token, reg_syntax_t syntax,
  68                                      Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  70                                   re_token_t *token, reg_syntax_t syntax,
  71                                   Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  73                                  re_dfa_t *dfa, re_token_t *token,
  74                                  reg_syntax_t syntax, reg_errcode_t *err);
  75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  76                                       re_token_t *token, reg_syntax_t syntax,
  77                                       reg_errcode_t *err);
  78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  79                                             re_string_t *regexp,
  80                                             re_token_t *token, int token_len,
  81                                             re_dfa_t *dfa,
  82                                             reg_syntax_t syntax,
  83                                             bool accept_hyphen);
  84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  85                                           re_string_t *regexp,
  86                                           re_token_t *token);
  87 #ifdef RE_ENABLE_I18N
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const unsigned char *class_name,
  97                                       reg_syntax_t syntax);
  98 #else  /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 100                                         const unsigned char *name);
 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                       bitset_t sbcset,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #endif /* not RE_ENABLE_I18N */
 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 107                                        RE_TRANSLATE_TYPE trans,
 108                                        const unsigned char *class_name,
 109                                        const unsigned char *extra,
 110                                        bool non_match, reg_errcode_t *err);
 111 static bin_tree_t *create_tree (re_dfa_t *dfa,
 112                                 bin_tree_t *left, bin_tree_t *right,
 113                                 re_token_type_t type);
 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 115                                       bin_tree_t *left, bin_tree_t *right,
 116                                       const re_token_t *token);
 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 118 static void free_token (re_token_t *node);
 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 121 \f
 122 /* This table gives an error message for each of the error codes listed
 123    in regex.h.  Obviously the order here has to be same as there.
 124    POSIX doesn't require that we do anything for REG_NOERROR,
 125    but why not be nice?  */
 126
 127 static const char __re_error_msgid[] =
 128   {
 129 #define REG_NOERROR_IDX 0
 130     gettext_noop ("Success")    /* REG_NOERROR */
 131     "\0"
 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 133     gettext_noop ("No match")   /* REG_NOMATCH */
 134     "\0"
 135 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 137     "\0"
 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 140     "\0"
 141 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 143     "\0"
 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 146     "\0"
 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 149     "\0"
 150 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 151     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 152     "\0"
 153 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 155     "\0"
 156 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 158     "\0"
 159 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 161     "\0"
 162 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 163     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 164     "\0"
 165 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 167     "\0"
 168 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 170     "\0"
 171 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 173     "\0"
 174 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 176     "\0"
 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 179   };
 180
 181 static const size_t __re_error_msgid_idx[] =
 182   {
 183     REG_NOERROR_IDX,
 184     REG_NOMATCH_IDX,
 185     REG_BADPAT_IDX,
 186     REG_ECOLLATE_IDX,
 187     REG_ECTYPE_IDX,
 188     REG_EESCAPE_IDX,
 189     REG_ESUBREG_IDX,
 190     REG_EBRACK_IDX,
 191     REG_EPAREN_IDX,
 192     REG_EBRACE_IDX,
 193     REG_BADBR_IDX,
 194     REG_ERANGE_IDX,
 195     REG_ESPACE_IDX,
 196     REG_BADRPT_IDX,
 197     REG_EEND_IDX,
 198     REG_ESIZE_IDX,
 199     REG_ERPAREN_IDX
 200   };
 201 \f
 202 /* Entry points for GNU code.  */
 203
 204 /* re_compile_pattern is the GNU regular expression compiler: it
 205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 206    Returns 0 if the pattern was valid, otherwise an error string.
 207
 208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 209    are set in BUFP on entry.  */
 210
 211 #ifdef _LIBC
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 #else /* size_t might promote */
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 #endif
 222 {
 223   reg_errcode_t ret;
 224
 225   /* And GNU code determines whether or not to get register information
 226      by passing null for the REGS argument to re_match, etc., not by
 227      setting no_sub, unless RE_NO_SUB is set.  */
 228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 229
 230   /* Match anchors at newline.  */
 231   bufp->newline_anchor = 1;
 232
 233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 234
 235   if (!ret)
 236     return NULL;
 237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 238 }
 239 #ifdef _LIBC
 240 weak_alias (__re_compile_pattern, re_compile_pattern)
 241 #endif
 242
 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 244    also be assigned to arbitrarily: each pattern buffer stores its own
 245    syntax, so it can be changed between regex compilations.  */
 246 /* This has no initializer because initialized variables in Emacs
 247    become read-only after dumping.  */
 248 reg_syntax_t re_syntax_options;
 249
 250
 251 /* Specify the precise syntax of regexps for compilation.  This provides
 252    for compatibility for various utilities which historically have
 253    different, incompatible syntaxes.
 254
 255    The argument SYNTAX is a bit mask comprised of the various bits
 256    defined in regex.h.  We return the old syntax.  */
 257
 258 reg_syntax_t
 259 re_set_syntax (syntax)
 260     reg_syntax_t syntax;
 261 {
 262   reg_syntax_t ret = re_syntax_options;
 263
 264   re_syntax_options = syntax;
 265   return ret;
 266 }
 267 #ifdef _LIBC
 268 weak_alias (__re_set_syntax, re_set_syntax)
 269 #endif
 270
 271 int
 272 re_compile_fastmap (bufp)
 273     struct re_pattern_buffer *bufp;
 274 {
 275   re_dfa_t *dfa = bufp->buffer;
 276   char *fastmap = bufp->fastmap;
 277
 278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 280   if (dfa->init_state != dfa->init_state_word)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 282   if (dfa->init_state != dfa->init_state_nl)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 284   if (dfa->init_state != dfa->init_state_begbuf)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 286   bufp->fastmap_accurate = 1;
 287   return 0;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 291 #endif
 292
 293 static inline void
 294 __attribute ((always_inline))
 295 re_set_fastmap (char *fastmap, bool icase, int ch)
 296 {
 297   fastmap[ch] = 1;
 298   if (icase)
 299     fastmap[tolower (ch)] = 1;
 300 }
 301
 302 /* Helper function for re_compile_fastmap.
 303    Compile fastmap for the initial_state INIT_STATE.  */
 304
 305 static void
 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 307                          char *fastmap)
 308 {
 309   re_dfa_t *dfa = bufp->buffer;
 310   Idx node_cnt;
 311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 313     {
 314       Idx node = init_state->nodes.elems[node_cnt];
 315       re_token_type_t type = dfa->nodes[node].type;
 316
 317       if (type == CHARACTER)
 318         {
 319           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 320 #ifdef RE_ENABLE_I18N
 321           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 322             {
 323               unsigned char buf[MB_LEN_MAX];
 324               unsigned char *p;
 325               wchar_t wc;
 326               mbstate_t state;
 327
 328               p = buf;
 329               *p++ = dfa->nodes[node].opr.c;
 330               while (++node < dfa->nodes_len
 331                      && dfa->nodes[node].type == CHARACTER
 332                      && dfa->nodes[node].mb_partial)
 333                 *p++ = dfa->nodes[node].opr.c;
 334               memset (&state, '\0', sizeof (state));
 335               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 336                              &state) == p - buf
 337                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 338                       != (size_t) -1))
 339                 re_set_fastmap (fastmap, false, buf[0]);
 340             }
 341 #endif
 342         }
 343       else if (type == SIMPLE_BRACKET)
 344         {
 345           int i, ch;
 346           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 347             {
 348               int j;
 349               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 350               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 351                 if (w & ((bitset_word_t) 1 << j))
 352                   re_set_fastmap (fastmap, icase, ch);
 353             }
 354         }
 355 #ifdef RE_ENABLE_I18N
 356       else if (type == COMPLEX_BRACKET)
 357         {
 358           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 359           Idx i;
 360
 361 # ifdef _LIBC
 362           /* See if we have to try all bytes which start multiple collation
 363              elements.
 364              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 365                   collation element, and don't catch 'b' since 'b' is
 366                   the only collation element which starts from 'b' (and
 367                   it is caught by SIMPLE_BRACKET).  */
 368               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 369                   && (cset->ncoll_syms || cset->nranges))
 370                 {
 371                   const int32_t *table = (const int32_t *)
 372                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 373                   for (i = 0; i < SBC_MAX; ++i)
 374                     if (table[i] < 0)
 375                       re_set_fastmap (fastmap, icase, i);
 376                 }
 377 # endif /* _LIBC */
 378
 379           /* See if we have to start the match at all multibyte characters,
 380              i.e. where we would not find an invalid sequence.  This only
 381              applies to multibyte character sets; for single byte character
 382              sets, the SIMPLE_BRACKET again suffices.  */
 383           if (dfa->mb_cur_max > 1
 384               && (cset->nchar_classes || cset->non_match || cset->nranges
 385 # ifdef _LIBC
 386                   || cset->nequiv_classes
 387 # endif /* _LIBC */
 388                  ))
 389             {
 390               unsigned char c = 0;
 391               do
 392                 {
 393                   mbstate_t mbs;
 394                   memset (&mbs, 0, sizeof (mbs));
 395                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 396                     re_set_fastmap (fastmap, false, (int) c);
 397                 }
 398               while (++c != 0);
 399             }
 400
 401           else
 402             {
 403               /* ... Else catch all bytes which can start the mbchars.  */
 404               for (i = 0; i < cset->nmbchars; ++i)
 405                 {
 406                   char buf[256];
 407                   mbstate_t state;
 408                   memset (&state, '\0', sizeof (state));
 409                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 410                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 411                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 412                     {
 413                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 414                           != (size_t) -1)
 415                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 416                     }
 417                 }
 418             }
 419         }
 420 #endif /* RE_ENABLE_I18N */
 421       else if (type == OP_PERIOD
 422 #ifdef RE_ENABLE_I18N
 423                || type == OP_UTF8_PERIOD
 424 #endif /* RE_ENABLE_I18N */
 425                || type == END_OF_RE)
 426         {
 427           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 428           if (type == END_OF_RE)
 429             bufp->can_be_null = 1;
 430           return;
 431         }
 432     }
 433 }
 434 \f
 435 /* Entry point for POSIX code.  */
 436 /* regcomp takes a regular expression as a string and compiles it.
 437
 438    PREG is a regex_t *.  We do not expect any fields to be initialized,
 439    since POSIX says we shouldn't.  Thus, we set
 440
 441      'buffer' to the compiled pattern;
 442      'used' to the length of the compiled pattern;
 443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 445        RE_SYNTAX_POSIX_BASIC;
 446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 447      'fastmap' to an allocated space for the fastmap;
 448      'fastmap_accurate' to zero;
 449      're_nsub' to the number of subexpressions in PATTERN.
 450
 451    PATTERN is the address of the pattern string.
 452
 453    CFLAGS is a series of bits which affect compilation.
 454
 455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 456      use POSIX basic syntax.
 457
 458      If REG_NEWLINE is set, then . and [^...] don't match newline.
 459      Also, regexec will try a match beginning after every newline.
 460
 461      If REG_ICASE is set, then we considers upper- and lowercase
 462      versions of letters to be equivalent when matching.
 463
 464      If REG_NOSUB is set, then when PREG is passed to regexec, that
 465      routine will report only success or failure, and nothing about the
 466      registers.
 467
 468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 469    the return codes and their meanings.)  */
 470
 471 int
 472 regcomp (preg, pattern, cflags)
 473     regex_t *_Restrict_ preg;
 474     const char *_Restrict_ pattern;
 475     int cflags;
 476 {
 477   reg_errcode_t ret;
 478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 479                          : RE_SYNTAX_POSIX_BASIC);
 480
 481   preg->buffer = NULL;
 482   preg->allocated = 0;
 483   preg->used = 0;
 484
 485   /* Try to allocate space for the fastmap.  */
 486   preg->fastmap = re_malloc (char, SBC_MAX);
 487   if (BE (preg->fastmap == NULL, 0))
 488     return REG_ESPACE;
 489
 490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 491
 492   /* If REG_NEWLINE is set, newlines are treated differently.  */
 493   if (cflags & REG_NEWLINE)
 494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 495       syntax &= ~RE_DOT_NEWLINE;
 496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 497       /* It also changes the matching behavior.  */
 498       preg->newline_anchor = 1;
 499     }
 500   else
 501     preg->newline_anchor = 0;
 502   preg->no_sub = !!(cflags & REG_NOSUB);
 503   preg->translate = NULL;
 504
 505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 506
 507   /* POSIX doesn't distinguish between an unmatched open-group and an
 508      unmatched close-group: both are REG_EPAREN.  */
 509   if (ret == REG_ERPAREN)
 510     ret = REG_EPAREN;
 511
 512   /* We have already checked preg->fastmap != NULL.  */
 513   if (BE (ret == REG_NOERROR, 1))
 514     /* Compute the fastmap now, since regexec cannot modify the pattern
 515        buffer.  This function never fails in this implementation.  */
 516     (void) re_compile_fastmap (preg);
 517   else
 518     {
 519       /* Some error occurred while compiling the expression.  */
 520       re_free (preg->fastmap);
 521       preg->fastmap = NULL;
 522     }
 523
 524   return (int) ret;
 525 }
 526 #ifdef _LIBC
 527 weak_alias (__regcomp, regcomp)
 528 #endif
 529
 530 /* Returns a message corresponding to an error code, ERRCODE, returned
 531    from either regcomp or regexec.   We don't use PREG here.  */
 532
 533 #ifdef _LIBC
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *_Restrict_ preg;
 538     char *_Restrict_ errbuf;
 539     size_t errbuf_size;
 540 #else /* size_t might promote */
 541 size_t
 542 regerror (int errcode, const regex_t *_Restrict_ preg,
 543           char *_Restrict_ errbuf, size_t errbuf_size)
 544 #endif
 545 {
 546   const char *msg;
 547   size_t msg_size;
 548
 549   if (BE (errcode < 0
 550           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 551                                / sizeof (__re_error_msgid_idx[0])), 0))
 552     /* Only error codes returned by the rest of the code should be passed
 553        to this routine.  If we are given anything else, or if other regex
 554        code generates an invalid error code, then the program has a bug.
 555        Dump core so we can fix it.  */
 556     abort ();
 557
 558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 559
 560   msg_size = strlen (msg) + 1; /* Includes the null.  */
 561
 562   if (BE (errbuf_size != 0, 1))
 563     {
 564       size_t cpy_size = msg_size;
 565       if (BE (msg_size > errbuf_size, 0))
 566         {
 567           cpy_size = errbuf_size - 1;
 568           errbuf[cpy_size] = '\0';
 569         }
 570       memcpy (errbuf, msg, cpy_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.  */
 585 static const bitset_t utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # ifdef __GNUC__
 589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590 # else
 591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #   error "bitset_word_t is narrower than 32 bits"
 593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 #  endif
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 # endif
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   const char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   dfa->word_ops_used = 1;
 954   int i = 0;
 955   int j;
 956   int ch = 0;
 957   if (BE (dfa->map_notascii == 0, 1))
 958     {
 959       bitset_word_t bits0 = 0x00000000;
 960       bitset_word_t bits1 = 0x03ff0000;
 961       bitset_word_t bits2 = 0x87fffffe;
 962       bitset_word_t bits3 = 0x07fffffe;
 963       if (BITSET_WORD_BITS == 64)
 964         {
 965           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 966           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 967           i = 2;
 968         }
 969       else if (BITSET_WORD_BITS == 32)
 970         {
 971           dfa->word_char[0] = bits0;
 972           dfa->word_char[1] = bits1;
 973           dfa->word_char[2] = bits2;
 974           dfa->word_char[3] = bits3;
 975           i = 4;
 976         }
 977       else
 978         goto general_case;
 979       ch = 128;
 980
 981       if (BE (dfa->is_utf8, 1))
 982         {
 983           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 984           return;
 985         }
 986     }
 987
 988  general_case:
 989   for (; i < BITSET_WORDS; ++i)
 990     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 991       if (isalnum (ch) || ch == '_')
 992         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 993 }
 994
 995 /* Free the work area which are only used while compiling.  */
 996
 997 static void
 998 free_workarea_compile (regex_t *preg)
 999 {
1000   re_dfa_t *dfa = preg->buffer;
1001   bin_tree_storage_t *storage, *next;
1002   for (storage = dfa->str_tree_storage; storage; storage = next)
1003     {
1004       next = storage->next;
1005       re_free (storage);
1006     }
1007   dfa->str_tree_storage = NULL;
1008   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1009   dfa->str_tree = NULL;
1010   re_free (dfa->org_indices);
1011   dfa->org_indices = NULL;
1012 }
1013
1014 /* Create initial states for all contexts.  */
1015
1016 static reg_errcode_t
1017 create_initial_state (re_dfa_t *dfa)
1018 {
1019   Idx first, i;
1020   reg_errcode_t err;
1021   re_node_set init_nodes;
1022
1023   /* Initial states have the epsilon closure of the node which is
1024      the first node of the regular expression.  */
1025   first = dfa->str_tree->first->node_idx;
1026   dfa->init_node = first;
1027   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1028   if (BE (err != REG_NOERROR, 0))
1029     return err;
1030
1031   /* The back-references which are in initial states can epsilon transit,
1032      since in this case all of the subexpressions can be null.
1033      Then we add epsilon closures of the nodes which are the next nodes of
1034      the back-references.  */
1035   if (dfa->nbackref > 0)
1036     for (i = 0; i < init_nodes.nelem; ++i)
1037       {
1038         Idx node_idx = init_nodes.elems[i];
1039         re_token_type_t type = dfa->nodes[node_idx].type;
1040
1041         Idx clexp_idx;
1042         if (type != OP_BACK_REF)
1043           continue;
1044         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1045           {
1046             re_token_t *clexp_node;
1047             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1048             if (clexp_node->type == OP_CLOSE_SUBEXP
1049                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1050               break;
1051           }
1052         if (clexp_idx == init_nodes.nelem)
1053           continue;
1054
1055         if (type == OP_BACK_REF)
1056           {
1057             Idx dest_idx = dfa->edests[node_idx].elems[0];
1058             if (!re_node_set_contains (&init_nodes, dest_idx))
1059               {
1060                 reg_errcode_t merge_err
1061                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1062                 if (merge_err != REG_NOERROR)
1063                   return merge_err;
1064                 i = 0;
1065               }
1066           }
1067       }
1068
1069   /* It must be the first time to invoke acquire_state.  */
1070   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1071   /* We don't check ERR here, since the initial state must not be NULL.  */
1072   if (BE (dfa->init_state == NULL, 0))
1073     return err;
1074   if (dfa->init_state->has_constraint)
1075     {
1076       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1077                                                        CONTEXT_WORD);
1078       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1079                                                      CONTEXT_NEWLINE);
1080       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1081                                                          &init_nodes,
1082                                                          CONTEXT_NEWLINE
1083                                                          | CONTEXT_BEGBUF);
1084       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1085               || dfa->init_state_begbuf == NULL, 0))
1086         return err;
1087     }
1088   else
1089     dfa->init_state_word = dfa->init_state_nl
1090       = dfa->init_state_begbuf = dfa->init_state;
1091
1092   re_node_set_free (&init_nodes);
1093   return REG_NOERROR;
1094 }
1095 \f
1096 #ifdef RE_ENABLE_I18N
1097 /* If it is possible to do searching in single byte encoding instead of UTF-8
1098    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1099    DFA nodes where needed.  */
1100
1101 static void
1102 optimize_utf8 (re_dfa_t *dfa)
1103 {
1104   Idx node;
1105   int i;
1106   bool mb_chars = false;
1107   bool has_period = false;
1108
1109   for (node = 0; node < dfa->nodes_len; ++node)
1110     switch (dfa->nodes[node].type)
1111       {
1112       case CHARACTER:
1113         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1114           mb_chars = true;
1115         break;
1116       case ANCHOR:
1117         switch (dfa->nodes[node].opr.ctx_type)
1118           {
1119           case LINE_FIRST:
1120           case LINE_LAST:
1121           case BUF_FIRST:
1122           case BUF_LAST:
1123             break;
1124           default:
1125             /* Word anchors etc. cannot be handled.  It's okay to test
1126                opr.ctx_type since constraints (for all DFA nodes) are
1127                created by ORing one or more opr.ctx_type values.  */
1128             return;
1129           }
1130         break;
1131       case OP_PERIOD:
1132         has_period = true;
1133         break;
1134       case OP_BACK_REF:
1135       case OP_ALT:
1136       case END_OF_RE:
1137       case OP_DUP_ASTERISK:
1138       case OP_OPEN_SUBEXP:
1139       case OP_CLOSE_SUBEXP:
1140         break;
1141       case COMPLEX_BRACKET:
1142         return;
1143       case SIMPLE_BRACKET:
1144         /* Just double check.  */
1145         {
1146           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1147                         ? 0
1148                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1149           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1150             {
1151               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1152                 return;
1153               rshift = 0;
1154             }
1155         }
1156         break;
1157       default:
1158         abort ();
1159       }
1160
1161   if (mb_chars || has_period)
1162     for (node = 0; node < dfa->nodes_len; ++node)
1163       {
1164         if (dfa->nodes[node].type == CHARACTER
1165             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1166           dfa->nodes[node].mb_partial = 0;
1167         else if (dfa->nodes[node].type == OP_PERIOD)
1168           dfa->nodes[node].type = OP_UTF8_PERIOD;
1169       }
1170
1171   /* The search can be in single byte locale.  */
1172   dfa->mb_cur_max = 1;
1173   dfa->is_utf8 = 0;
1174   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1175 }
1176 #endif
1177 \f
1178 /* Analyze the structure tree, and calculate "first", "next", "edest",
1179    "eclosure", and "inveclosure".  */
1180
1181 static reg_errcode_t
1182 analyze (regex_t *preg)
1183 {
1184   re_dfa_t *dfa = preg->buffer;
1185   reg_errcode_t ret;
1186
1187   /* Allocate arrays.  */
1188   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1189   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1190   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1191   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1192   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1193           || dfa->eclosures == NULL, 0))
1194     return REG_ESPACE;
1195
1196   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1197   if (dfa->subexp_map != NULL)
1198     {
1199       Idx i;
1200       for (i = 0; i < preg->re_nsub; i++)
1201         dfa->subexp_map[i] = i;
1202       preorder (dfa->str_tree, optimize_subexps, dfa);
1203       for (i = 0; i < preg->re_nsub; i++)
1204         if (dfa->subexp_map[i] != i)
1205           break;
1206       if (i == preg->re_nsub)
1207         {
1208           free (dfa->subexp_map);
1209           dfa->subexp_map = NULL;
1210         }
1211     }
1212
1213   ret = postorder (dfa->str_tree, lower_subexps, preg);
1214   if (BE (ret != REG_NOERROR, 0))
1215     return ret;
1216   ret = postorder (dfa->str_tree, calc_first, dfa);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   preorder (dfa->str_tree, calc_next, dfa);
1220   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1221   if (BE (ret != REG_NOERROR, 0))
1222     return ret;
1223   ret = calc_eclosure (dfa);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226
1227   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1228      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1229   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1230       || dfa->nbackref)
1231     {
1232       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1233       if (BE (dfa->inveclosures == NULL, 0))
1234         return REG_ESPACE;
1235       ret = calc_inveclosure (dfa);
1236     }
1237
1238   return ret;
1239 }
1240
1241 /* Our parse trees are very unbalanced, so we cannot use a stack to
1242    implement parse tree visits.  Instead, we use parent pointers and
1243    some hairy code in these two functions.  */
1244 static reg_errcode_t
1245 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246            void *extra)
1247 {
1248   bin_tree_t *node, *prev;
1249
1250   for (node = root; ; )
1251     {
1252       /* Descend down the tree, preferably to the left (or to the right
1253          if that's the only child).  */
1254       while (node->left || node->right)
1255         if (node->left)
1256           node = node->left;
1257         else
1258           node = node->right;
1259
1260       do
1261         {
1262           reg_errcode_t err = fn (extra, node);
1263           if (BE (err != REG_NOERROR, 0))
1264             return err;
1265           if (node->parent == NULL)
1266             return REG_NOERROR;
1267           prev = node;
1268           node = node->parent;
1269         }
1270       /* Go up while we have a node that is reached from the right.  */
1271       while (node->right == prev || node->right == NULL);
1272       node = node->right;
1273     }
1274 }
1275
1276 static reg_errcode_t
1277 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1278           void *extra)
1279 {
1280   bin_tree_t *node;
1281
1282   for (node = root; ; )
1283     {
1284       reg_errcode_t err = fn (extra, node);
1285       if (BE (err != REG_NOERROR, 0))
1286         return err;
1287
1288       /* Go to the left node, or up and to the right.  */
1289       if (node->left)
1290         node = node->left;
1291       else
1292         {
1293           bin_tree_t *prev = NULL;
1294           while (node->right == prev || node->right == NULL)
1295             {
1296               prev = node;
1297               node = node->parent;
1298               if (!node)
1299                 return REG_NOERROR;
1300             }
1301           node = node->right;
1302         }
1303     }
1304 }
1305
1306 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1307    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1308    backreferences as well.  Requires a preorder visit.  */
1309 static reg_errcode_t
1310 optimize_subexps (void *extra, bin_tree_t *node)
1311 {
1312   re_dfa_t *dfa = (re_dfa_t *) extra;
1313
1314   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1315     {
1316       int idx = node->token.opr.idx;
1317       node->token.opr.idx = dfa->subexp_map[idx];
1318       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1319     }
1320
1321   else if (node->token.type == SUBEXP
1322            && node->left && node->left->token.type == SUBEXP)
1323     {
1324       Idx other_idx = node->left->token.opr.idx;
1325
1326       node->left = node->left->left;
1327       if (node->left)
1328         node->left->parent = node;
1329
1330       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1331       if (other_idx < BITSET_WORD_BITS)
1332         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1333     }
1334
1335   return REG_NOERROR;
1336 }
1337
1338 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1339    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1340 static reg_errcode_t
1341 lower_subexps (void *extra, bin_tree_t *node)
1342 {
1343   regex_t *preg = (regex_t *) extra;
1344   reg_errcode_t err = REG_NOERROR;
1345
1346   if (node->left && node->left->token.type == SUBEXP)
1347     {
1348       node->left = lower_subexp (&err, preg, node->left);
1349       if (node->left)
1350         node->left->parent = node;
1351     }
1352   if (node->right && node->right->token.type == SUBEXP)
1353     {
1354       node->right = lower_subexp (&err, preg, node->right);
1355       if (node->right)
1356         node->right->parent = node;
1357     }
1358
1359   return err;
1360 }
1361
1362 static bin_tree_t *
1363 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1364 {
1365   re_dfa_t *dfa = preg->buffer;
1366   bin_tree_t *body = node->left;
1367   bin_tree_t *op, *cls, *tree1, *tree;
1368
1369   if (preg->no_sub
1370       /* We do not optimize empty subexpressions, because otherwise we may
1371          have bad CONCAT nodes with NULL children.  This is obviously not
1372          very common, so we do not lose much.  An example that triggers
1373          this case is the sed "script" /\(\)/x.  */
1374       && node->left != NULL
1375       && (node->token.opr.idx >= BITSET_WORD_BITS
1376           || !(dfa->used_bkref_map
1377                & ((bitset_word_t) 1 << node->token.opr.idx))))
1378     return node->left;
1379
1380   /* Convert the SUBEXP node to the concatenation of an
1381      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1382   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1383   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1384   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1385   tree = create_tree (dfa, op, tree1, CONCAT);
1386   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1387     {
1388       *err = REG_ESPACE;
1389       return NULL;
1390     }
1391
1392   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1393   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1394   return tree;
1395 }
1396
1397 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1398    nodes.  Requires a postorder visit.  */
1399 static reg_errcode_t
1400 calc_first (void *extra, bin_tree_t *node)
1401 {
1402   re_dfa_t *dfa = (re_dfa_t *) extra;
1403   if (node->token.type == CONCAT)
1404     {
1405       node->first = node->left->first;
1406       node->node_idx = node->left->node_idx;
1407     }
1408   else
1409     {
1410       node->first = node;
1411       node->node_idx = re_dfa_add_node (dfa, node->token);
1412       if (BE (node->node_idx == REG_MISSING, 0))
1413         return REG_ESPACE;
1414       if (node->token.type == ANCHOR)
1415         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1416     }
1417   return REG_NOERROR;
1418 }
1419
1420 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1421 static reg_errcode_t
1422 calc_next (void *extra, bin_tree_t *node)
1423 {
1424   switch (node->token.type)
1425     {
1426     case OP_DUP_ASTERISK:
1427       node->left->next = node;
1428       break;
1429     case CONCAT:
1430       node->left->next = node->right->first;
1431       node->right->next = node->next;
1432       break;
1433     default:
1434       if (node->left)
1435         node->left->next = node->next;
1436       if (node->right)
1437         node->right->next = node->next;
1438       break;
1439     }
1440   return REG_NOERROR;
1441 }
1442
1443 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1444 static reg_errcode_t
1445 link_nfa_nodes (void *extra, bin_tree_t *node)
1446 {
1447   re_dfa_t *dfa = (re_dfa_t *) extra;
1448   Idx idx = node->node_idx;
1449   reg_errcode_t err = REG_NOERROR;
1450
1451   switch (node->token.type)
1452     {
1453     case CONCAT:
1454       break;
1455
1456     case END_OF_RE:
1457       assert (node->next == NULL);
1458       break;
1459
1460     case OP_DUP_ASTERISK:
1461     case OP_ALT:
1462       {
1463         Idx left, right;
1464         dfa->has_plural_match = 1;
1465         if (node->left != NULL)
1466           left = node->left->first->node_idx;
1467         else
1468           left = node->next->node_idx;
1469         if (node->right != NULL)
1470           right = node->right->first->node_idx;
1471         else
1472           right = node->next->node_idx;
1473         assert (REG_VALID_INDEX (left));
1474         assert (REG_VALID_INDEX (right));
1475         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1476       }
1477       break;
1478
1479     case ANCHOR:
1480     case OP_OPEN_SUBEXP:
1481     case OP_CLOSE_SUBEXP:
1482       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1483       break;
1484
1485     case OP_BACK_REF:
1486       dfa->nexts[idx] = node->next->node_idx;
1487       if (node->token.type == OP_BACK_REF)
1488         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1489       break;
1490
1491     default:
1492       assert (!IS_EPSILON_NODE (node->token.type));
1493       dfa->nexts[idx] = node->next->node_idx;
1494       break;
1495     }
1496
1497   return err;
1498 }
1499
1500 /* Duplicate the epsilon closure of the node ROOT_NODE.
1501    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1502    to their own constraint.  */
1503
1504 static reg_errcode_t
1505 internal_function
1506 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1507                         Idx root_node, unsigned int init_constraint)
1508 {
1509   Idx org_node, clone_node;
1510   bool ok;
1511   unsigned int constraint = init_constraint;
1512   for (org_node = top_org_node, clone_node = top_clone_node;;)
1513     {
1514       Idx org_dest, clone_dest;
1515       if (dfa->nodes[org_node].type == OP_BACK_REF)
1516         {
1517           /* If the back reference epsilon-transit, its destination must
1518              also have the constraint.  Then duplicate the epsilon closure
1519              of the destination of the back reference, and store it in
1520              edests of the back reference.  */
1521           org_dest = dfa->nexts[org_node];
1522           re_node_set_empty (dfa->edests + clone_node);
1523           clone_dest = duplicate_node (dfa, org_dest, constraint);
1524           if (BE (clone_dest == REG_MISSING, 0))
1525             return REG_ESPACE;
1526           dfa->nexts[clone_node] = dfa->nexts[org_node];
1527           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528           if (BE (! ok, 0))
1529             return REG_ESPACE;
1530         }
1531       else if (dfa->edests[org_node].nelem == 0)
1532         {
1533           /* In case of the node can't epsilon-transit, don't duplicate the
1534              destination and store the original destination as the
1535              destination of the node.  */
1536           dfa->nexts[clone_node] = dfa->nexts[org_node];
1537           break;
1538         }
1539       else if (dfa->edests[org_node].nelem == 1)
1540         {
1541           /* In case of the node can epsilon-transit, and it has only one
1542              destination.  */
1543           org_dest = dfa->edests[org_node].elems[0];
1544           re_node_set_empty (dfa->edests + clone_node);
1545           /* If the node is root_node itself, it means the epsilon closure
1546              has a loop.  Then tie it to the destination of the root_node.  */
1547           if (org_node == root_node && clone_node != org_node)
1548             {
1549               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1550               if (BE (! ok, 0))
1551                 return REG_ESPACE;
1552               break;
1553             }
1554           /* In case the node has another constraint, append it.  */
1555           constraint |= dfa->nodes[org_node].constraint;
1556           clone_dest = duplicate_node (dfa, org_dest, constraint);
1557           if (BE (clone_dest == REG_MISSING, 0))
1558             return REG_ESPACE;
1559           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560           if (BE (! ok, 0))
1561             return REG_ESPACE;
1562         }
1563       else /* dfa->edests[org_node].nelem == 2 */
1564         {
1565           /* In case of the node can epsilon-transit, and it has two
1566              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1567           org_dest = dfa->edests[org_node].elems[0];
1568           re_node_set_empty (dfa->edests + clone_node);
1569           /* Search for a duplicated node which satisfies the constraint.  */
1570           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1571           if (clone_dest == REG_MISSING)
1572             {
1573               /* There is no such duplicated node, create a new one.  */
1574               reg_errcode_t err;
1575               clone_dest = duplicate_node (dfa, org_dest, constraint);
1576               if (BE (clone_dest == REG_MISSING, 0))
1577                 return REG_ESPACE;
1578               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579               if (BE (! ok, 0))
1580                 return REG_ESPACE;
1581               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1582                                             root_node, constraint);
1583               if (BE (err != REG_NOERROR, 0))
1584                 return err;
1585             }
1586           else
1587             {
1588               /* There is a duplicated node which satisfies the constraint,
1589                  use it to avoid infinite loop.  */
1590               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591               if (BE (! ok, 0))
1592                 return REG_ESPACE;
1593             }
1594
1595           org_dest = dfa->edests[org_node].elems[1];
1596           clone_dest = duplicate_node (dfa, org_dest, constraint);
1597           if (BE (clone_dest == REG_MISSING, 0))
1598             return REG_ESPACE;
1599           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1600           if (BE (! ok, 0))
1601             return REG_ESPACE;
1602         }
1603       org_node = org_dest;
1604       clone_node = clone_dest;
1605     }
1606   return REG_NOERROR;
1607 }
1608
1609 /* Search for a node which is duplicated from the node ORG_NODE, and
1610    satisfies the constraint CONSTRAINT.  */
1611
1612 static Idx
1613 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1614                         unsigned int constraint)
1615 {
1616   Idx idx;
1617   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1618     {
1619       if (org_node == dfa->org_indices[idx]
1620           && constraint == dfa->nodes[idx].constraint)
1621         return idx; /* Found.  */
1622     }
1623   return REG_MISSING; /* Not found.  */
1624 }
1625
1626 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1627    Return the index of the new node, or REG_MISSING if insufficient storage is
1628    available.  */
1629
1630 static Idx
1631 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1632 {
1633   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1634   if (BE (dup_idx != REG_MISSING, 1))
1635     {
1636       dfa->nodes[dup_idx].constraint = constraint;
1637       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1638       dfa->nodes[dup_idx].duplicated = 1;
1639
1640       /* Store the index of the original node.  */
1641       dfa->org_indices[dup_idx] = org_idx;
1642     }
1643   return dup_idx;
1644 }
1645
1646 static reg_errcode_t
1647 calc_inveclosure (re_dfa_t *dfa)
1648 {
1649   Idx src, idx;
1650   bool ok;
1651   for (idx = 0; idx < dfa->nodes_len; ++idx)
1652     re_node_set_init_empty (dfa->inveclosures + idx);
1653
1654   for (src = 0; src < dfa->nodes_len; ++src)
1655     {
1656       Idx *elems = dfa->eclosures[src].elems;
1657       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1658         {
1659           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1660           if (BE (! ok, 0))
1661             return REG_ESPACE;
1662         }
1663     }
1664
1665   return REG_NOERROR;
1666 }
1667
1668 /* Calculate "eclosure" for all the node in DFA.  */
1669
1670 static reg_errcode_t
1671 calc_eclosure (re_dfa_t *dfa)
1672 {
1673   Idx node_idx;
1674   bool incomplete;
1675 #ifdef DEBUG
1676   assert (dfa->nodes_len > 0);
1677 #endif
1678   incomplete = false;
1679   /* For each nodes, calculate epsilon closure.  */
1680   for (node_idx = 0; ; ++node_idx)
1681     {
1682       reg_errcode_t err;
1683       re_node_set eclosure_elem;
1684       if (node_idx == dfa->nodes_len)
1685         {
1686           if (!incomplete)
1687             break;
1688           incomplete = false;
1689           node_idx = 0;
1690         }
1691
1692 #ifdef DEBUG
1693       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1694 #endif
1695
1696       /* If we have already calculated, skip it.  */
1697       if (dfa->eclosures[node_idx].nelem != 0)
1698         continue;
1699       /* Calculate epsilon closure of 'node_idx'.  */
1700       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1701       if (BE (err != REG_NOERROR, 0))
1702         return err;
1703
1704       if (dfa->eclosures[node_idx].nelem == 0)
1705         {
1706           incomplete = true;
1707           re_node_set_free (&eclosure_elem);
1708         }
1709     }
1710   return REG_NOERROR;
1711 }
1712
1713 /* Calculate epsilon closure of NODE.  */
1714
1715 static reg_errcode_t
1716 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1717 {
1718   reg_errcode_t err;
1719   Idx i;
1720   re_node_set eclosure;
1721   bool ok;
1722   bool incomplete = false;
1723   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1724   if (BE (err != REG_NOERROR, 0))
1725     return err;
1726
1727   /* This indicates that we are calculating this node now.
1728      We reference this value to avoid infinite loop.  */
1729   dfa->eclosures[node].nelem = REG_MISSING;
1730
1731   /* If the current node has constraints, duplicate all nodes
1732      since they must inherit the constraints.  */
1733   if (dfa->nodes[node].constraint
1734       && dfa->edests[node].nelem
1735       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1736     {
1737       err = duplicate_node_closure (dfa, node, node, node,
1738                                     dfa->nodes[node].constraint);
1739       if (BE (err != REG_NOERROR, 0))
1740         return err;
1741     }
1742
1743   /* Expand each epsilon destination nodes.  */
1744   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1745     for (i = 0; i < dfa->edests[node].nelem; ++i)
1746       {
1747         re_node_set eclosure_elem;
1748         Idx edest = dfa->edests[node].elems[i];
1749         /* If calculating the epsilon closure of 'edest' is in progress,
1750            return intermediate result.  */
1751         if (dfa->eclosures[edest].nelem == REG_MISSING)
1752           {
1753             incomplete = true;
1754             continue;
1755           }
1756         /* If we haven't calculated the epsilon closure of 'edest' yet,
1757            calculate now. Otherwise use calculated epsilon closure.  */
1758         if (dfa->eclosures[edest].nelem == 0)
1759           {
1760             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1761             if (BE (err != REG_NOERROR, 0))
1762               return err;
1763           }
1764         else
1765           eclosure_elem = dfa->eclosures[edest];
1766         /* Merge the epsilon closure of 'edest'.  */
1767         err = re_node_set_merge (&eclosure, &eclosure_elem);
1768         if (BE (err != REG_NOERROR, 0))
1769           return err;
1770         /* If the epsilon closure of 'edest' is incomplete,
1771            the epsilon closure of this node is also incomplete.  */
1772         if (dfa->eclosures[edest].nelem == 0)
1773           {
1774             incomplete = true;
1775             re_node_set_free (&eclosure_elem);
1776           }
1777       }
1778
1779   /* An epsilon closure includes itself.  */
1780   ok = re_node_set_insert (&eclosure, node);
1781   if (BE (! ok, 0))
1782     return REG_ESPACE;
1783   if (incomplete && !root)
1784     dfa->eclosures[node].nelem = 0;
1785   else
1786     dfa->eclosures[node] = eclosure;
1787   *new_set = eclosure;
1788   return REG_NOERROR;
1789 }
1790 \f
1791 /* Functions for token which are used in the parser.  */
1792
1793 /* Fetch a token from INPUT.
1794    We must not use this function inside bracket expressions.  */
1795
1796 static void
1797 internal_function
1798 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1799 {
1800   re_string_skip_bytes (input, peek_token (result, input, syntax));
1801 }
1802
1803 /* Peek a token from INPUT, and return the length of the token.
1804    We must not use this function inside bracket expressions.  */
1805
1806 static int
1807 internal_function
1808 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1809 {
1810   unsigned char c;
1811
1812   if (re_string_eoi (input))
1813     {
1814       token->type = END_OF_RE;
1815       return 0;
1816     }
1817
1818   c = re_string_peek_byte (input, 0);
1819   token->opr.c = c;
1820
1821   token->word_char = 0;
1822 #ifdef RE_ENABLE_I18N
1823   token->mb_partial = 0;
1824   if (input->mb_cur_max > 1 &&
1825       !re_string_first_byte (input, re_string_cur_idx (input)))
1826     {
1827       token->type = CHARACTER;
1828       token->mb_partial = 1;
1829       return 1;
1830     }
1831 #endif
1832   if (c == '\\')
1833     {
1834       unsigned char c2;
1835       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1836         {
1837           token->type = BACK_SLASH;
1838           return 1;
1839         }
1840
1841       c2 = re_string_peek_byte_case (input, 1);
1842       token->opr.c = c2;
1843       token->type = CHARACTER;
1844 #ifdef RE_ENABLE_I18N
1845       if (input->mb_cur_max > 1)
1846         {
1847           wint_t wc = re_string_wchar_at (input,
1848                                           re_string_cur_idx (input) + 1);
1849           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1850         }
1851       else
1852 #endif
1853         token->word_char = IS_WORD_CHAR (c2) != 0;
1854
1855       switch (c2)
1856         {
1857         case '|':
1858           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1859             token->type = OP_ALT;
1860           break;
1861         case '1': case '2': case '3': case '4': case '5':
1862         case '6': case '7': case '8': case '9':
1863           if (!(syntax & RE_NO_BK_REFS))
1864             {
1865               token->type = OP_BACK_REF;
1866               token->opr.idx = c2 - '1';
1867             }
1868           break;
1869         case '<':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             {
1872               token->type = ANCHOR;
1873               token->opr.ctx_type = WORD_FIRST;
1874             }
1875           break;
1876         case '>':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             {
1879               token->type = ANCHOR;
1880               token->opr.ctx_type = WORD_LAST;
1881             }
1882           break;
1883         case 'b':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = WORD_DELIM;
1888             }
1889           break;
1890         case 'B':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = NOT_WORD_DELIM;
1895             }
1896           break;
1897         case 'w':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             token->type = OP_WORD;
1900           break;
1901         case 'W':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             token->type = OP_NOTWORD;
1904           break;
1905         case 's':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_SPACE;
1908           break;
1909         case 'S':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_NOTSPACE;
1912           break;
1913         case '`':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             {
1916               token->type = ANCHOR;
1917               token->opr.ctx_type = BUF_FIRST;
1918             }
1919           break;
1920         case '\'':
1921           if (!(syntax & RE_NO_GNU_OPS))
1922             {
1923               token->type = ANCHOR;
1924               token->opr.ctx_type = BUF_LAST;
1925             }
1926           break;
1927         case '(':
1928           if (!(syntax & RE_NO_BK_PARENS))
1929             token->type = OP_OPEN_SUBEXP;
1930           break;
1931         case ')':
1932           if (!(syntax & RE_NO_BK_PARENS))
1933             token->type = OP_CLOSE_SUBEXP;
1934           break;
1935         case '+':
1936           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937             token->type = OP_DUP_PLUS;
1938           break;
1939         case '?':
1940           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941             token->type = OP_DUP_QUESTION;
1942           break;
1943         case '{':
1944           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945             token->type = OP_OPEN_DUP_NUM;
1946           break;
1947         case '}':
1948           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949             token->type = OP_CLOSE_DUP_NUM;
1950           break;
1951         default:
1952           break;
1953         }
1954       return 2;
1955     }
1956
1957   token->type = CHARACTER;
1958 #ifdef RE_ENABLE_I18N
1959   if (input->mb_cur_max > 1)
1960     {
1961       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1962       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1963     }
1964   else
1965 #endif
1966     token->word_char = IS_WORD_CHAR (token->opr.c);
1967
1968   switch (c)
1969     {
1970     case '\n':
1971       if (syntax & RE_NEWLINE_ALT)
1972         token->type = OP_ALT;
1973       break;
1974     case '|':
1975       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1976         token->type = OP_ALT;
1977       break;
1978     case '*':
1979       token->type = OP_DUP_ASTERISK;
1980       break;
1981     case '+':
1982       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983         token->type = OP_DUP_PLUS;
1984       break;
1985     case '?':
1986       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987         token->type = OP_DUP_QUESTION;
1988       break;
1989     case '{':
1990       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991         token->type = OP_OPEN_DUP_NUM;
1992       break;
1993     case '}':
1994       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995         token->type = OP_CLOSE_DUP_NUM;
1996       break;
1997     case '(':
1998       if (syntax & RE_NO_BK_PARENS)
1999         token->type = OP_OPEN_SUBEXP;
2000       break;
2001     case ')':
2002       if (syntax & RE_NO_BK_PARENS)
2003         token->type = OP_CLOSE_SUBEXP;
2004       break;
2005     case '[':
2006       token->type = OP_OPEN_BRACKET;
2007       break;
2008     case '.':
2009       token->type = OP_PERIOD;
2010       break;
2011     case '^':
2012       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2013           re_string_cur_idx (input) != 0)
2014         {
2015           char prev = re_string_peek_byte (input, -1);
2016           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2017             break;
2018         }
2019       token->type = ANCHOR;
2020       token->opr.ctx_type = LINE_FIRST;
2021       break;
2022     case '$':
2023       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2024           re_string_cur_idx (input) + 1 != re_string_length (input))
2025         {
2026           re_token_t next;
2027           re_string_skip_bytes (input, 1);
2028           peek_token (&next, input, syntax);
2029           re_string_skip_bytes (input, -1);
2030           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2031             break;
2032         }
2033       token->type = ANCHOR;
2034       token->opr.ctx_type = LINE_LAST;
2035       break;
2036     default:
2037       break;
2038     }
2039   return 1;
2040 }
2041
2042 /* Peek a token from INPUT, and return the length of the token.
2043    We must not use this function out of bracket expressions.  */
2044
2045 static int
2046 internal_function
2047 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2048 {
2049   unsigned char c;
2050   if (re_string_eoi (input))
2051     {
2052       token->type = END_OF_RE;
2053       return 0;
2054     }
2055   c = re_string_peek_byte (input, 0);
2056   token->opr.c = c;
2057
2058 #ifdef RE_ENABLE_I18N
2059   if (input->mb_cur_max > 1 &&
2060       !re_string_first_byte (input, re_string_cur_idx (input)))
2061     {
2062       token->type = CHARACTER;
2063       return 1;
2064     }
2065 #endif /* RE_ENABLE_I18N */
2066
2067   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2068       && re_string_cur_idx (input) + 1 < re_string_length (input))
2069     {
2070       /* In this case, '\' escape a character.  */
2071       unsigned char c2;
2072       re_string_skip_bytes (input, 1);
2073       c2 = re_string_peek_byte (input, 0);
2074       token->opr.c = c2;
2075       token->type = CHARACTER;
2076       return 1;
2077     }
2078   if (c == '[') /* '[' is a special char in a bracket exps.  */
2079     {
2080       unsigned char c2;
2081       int token_len;
2082       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2083         c2 = re_string_peek_byte (input, 1);
2084       else
2085         c2 = 0;
2086       token->opr.c = c2;
2087       token_len = 2;
2088       switch (c2)
2089         {
2090         case '.':
2091           token->type = OP_OPEN_COLL_ELEM;
2092           break;
2093         case '=':
2094           token->type = OP_OPEN_EQUIV_CLASS;
2095           break;
2096         case ':':
2097           if (syntax & RE_CHAR_CLASSES)
2098             {
2099               token->type = OP_OPEN_CHAR_CLASS;
2100               break;
2101             }
2102           /* else fall through.  */
2103         default:
2104           token->type = CHARACTER;
2105           token->opr.c = c;
2106           token_len = 1;
2107           break;
2108         }
2109       return token_len;
2110     }
2111   switch (c)
2112     {
2113     case '-':
2114       token->type = OP_CHARSET_RANGE;
2115       break;
2116     case ']':
2117       token->type = OP_CLOSE_BRACKET;
2118       break;
2119     case '^':
2120       token->type = OP_NON_MATCH_LIST;
2121       break;
2122     default:
2123       token->type = CHARACTER;
2124     }
2125   return 1;
2126 }
2127 \f
2128 /* Functions for parser.  */
2129
2130 /* Entry point of the parser.
2131    Parse the regular expression REGEXP and return the structure tree.
2132    If an error occurs, ERR is set by error code, and return NULL.
2133    This function build the following tree, from regular expression <reg_exp>:
2134            CAT
2135            / \
2136           /   \
2137    <reg_exp>  EOR
2138
2139    CAT means concatenation.
2140    EOR means end of regular expression.  */
2141
2142 static bin_tree_t *
2143 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2144        reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = preg->buffer;
2147   bin_tree_t *tree, *eor, *root;
2148   re_token_t current_token;
2149   dfa->syntax = syntax;
2150   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2151   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2152   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2153     return NULL;
2154   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2155   if (tree != NULL)
2156     root = create_tree (dfa, tree, eor, CONCAT);
2157   else
2158     root = eor;
2159   if (BE (eor == NULL || root == NULL, 0))
2160     {
2161       *err = REG_ESPACE;
2162       return NULL;
2163     }
2164   return root;
2165 }
2166
2167 /* This function build the following tree, from regular expression
2168    <branch1>|<branch2>:
2169            ALT
2170            / \
2171           /   \
2172    <branch1> <branch2>
2173
2174    ALT means alternative, which represents the operator '|'.  */
2175
2176 static bin_tree_t *
2177 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2179 {
2180   re_dfa_t *dfa = preg->buffer;
2181   bin_tree_t *tree, *branch = NULL;
2182   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2183   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184     return NULL;
2185
2186   while (token->type == OP_ALT)
2187     {
2188       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2189       if (token->type != OP_ALT && token->type != END_OF_RE
2190           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2191         {
2192           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2193           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2194             return NULL;
2195         }
2196       else
2197         branch = NULL;
2198       tree = create_tree (dfa, tree, branch, OP_ALT);
2199       if (BE (tree == NULL, 0))
2200         {
2201           *err = REG_ESPACE;
2202           return NULL;
2203         }
2204     }
2205   return tree;
2206 }
2207
2208 /* This function build the following tree, from regular expression
2209    <exp1><exp2>:
2210         CAT
2211         / \
2212        /   \
2213    <exp1> <exp2>
2214
2215    CAT means concatenation.  */
2216
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221   bin_tree_t *tree, *expr;
2222   re_dfa_t *dfa = preg->buffer;
2223   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2225     return NULL;
2226
2227   while (token->type != OP_ALT && token->type != END_OF_RE
2228          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2229     {
2230       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2232         {
2233           if (tree != NULL)
2234             postorder (tree, free_tree, NULL);
2235           return NULL;
2236         }
2237       if (tree != NULL && expr != NULL)
2238         {
2239           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240           if (newtree == NULL)
2241             {
2242               postorder (expr, free_tree, NULL);
2243               postorder (tree, free_tree, NULL);
2244               *err = REG_ESPACE;
2245               return NULL;
2246             }
2247           tree = newtree;
2248         }
2249       else if (tree == NULL)
2250         tree = expr;
2251       /* Otherwise expr == NULL, we don't need to create new tree.  */
2252     }
2253   return tree;
2254 }
2255
2256 /* This function build the following tree, from regular expression a*:
2257          *
2258          |
2259          a
2260 */
2261
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2265 {
2266   re_dfa_t *dfa = preg->buffer;
2267   bin_tree_t *tree;
2268   switch (token->type)
2269     {
2270     case CHARACTER:
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (BE (tree == NULL, 0))
2273         {
2274           *err = REG_ESPACE;
2275           return NULL;
2276         }
2277 #ifdef RE_ENABLE_I18N
2278       if (dfa->mb_cur_max > 1)
2279         {
2280           while (!re_string_eoi (regexp)
2281                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2282             {
2283               bin_tree_t *mbc_remain;
2284               fetch_token (token, regexp, syntax);
2285               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287               if (BE (mbc_remain == NULL || tree == NULL, 0))
2288                 {
2289                   *err = REG_ESPACE;
2290                   return NULL;
2291                 }
2292             }
2293         }
2294 #endif
2295       break;
2296     case OP_OPEN_SUBEXP:
2297       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2298       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2299         return NULL;
2300       break;
2301     case OP_OPEN_BRACKET:
2302       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304         return NULL;
2305       break;
2306     case OP_BACK_REF:
2307       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2308         {
2309           *err = REG_ESUBREG;
2310           return NULL;
2311         }
2312       dfa->used_bkref_map |= 1 << token->opr.idx;
2313       tree = create_token_tree (dfa, NULL, NULL, token);
2314       if (BE (tree == NULL, 0))
2315         {
2316           *err = REG_ESPACE;
2317           return NULL;
2318         }
2319       ++dfa->nbackref;
2320       dfa->has_mb_node = 1;
2321       break;
2322     case OP_OPEN_DUP_NUM:
2323       if (syntax & RE_CONTEXT_INVALID_DUP)
2324         {
2325           *err = REG_BADRPT;
2326           return NULL;
2327         }
2328       /* FALLTHROUGH */
2329     case OP_DUP_ASTERISK:
2330     case OP_DUP_PLUS:
2331     case OP_DUP_QUESTION:
2332       if (syntax & RE_CONTEXT_INVALID_OPS)
2333         {
2334           *err = REG_BADRPT;
2335           return NULL;
2336         }
2337       else if (syntax & RE_CONTEXT_INDEP_OPS)
2338         {
2339           fetch_token (token, regexp, syntax);
2340           return parse_expression (regexp, preg, token, syntax, nest, err);
2341         }
2342       /* else fall through  */
2343     case OP_CLOSE_SUBEXP:
2344       if ((token->type == OP_CLOSE_SUBEXP) &&
2345           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346         {
2347           *err = REG_ERPAREN;
2348           return NULL;
2349         }
2350       /* else fall through  */
2351     case OP_CLOSE_DUP_NUM:
2352       /* We treat it as a normal character.  */
2353
2354       /* Then we can these characters as normal characters.  */
2355       token->type = CHARACTER;
2356       /* mb_partial and word_char bits should be initialized already
2357          by peek_token.  */
2358       tree = create_token_tree (dfa, NULL, NULL, token);
2359       if (BE (tree == NULL, 0))
2360         {
2361           *err = REG_ESPACE;
2362           return NULL;
2363         }
2364       break;
2365     case ANCHOR:
2366       if ((token->opr.ctx_type
2367            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2368           && dfa->word_ops_used == 0)
2369         init_word_char (dfa);
2370       if (token->opr.ctx_type == WORD_DELIM
2371           || token->opr.ctx_type == NOT_WORD_DELIM)
2372         {
2373           bin_tree_t *tree_first, *tree_last;
2374           if (token->opr.ctx_type == WORD_DELIM)
2375             {
2376               token->opr.ctx_type = WORD_FIRST;
2377               tree_first = create_token_tree (dfa, NULL, NULL, token);
2378               token->opr.ctx_type = WORD_LAST;
2379             }
2380           else
2381             {
2382               token->opr.ctx_type = INSIDE_WORD;
2383               tree_first = create_token_tree (dfa, NULL, NULL, token);
2384               token->opr.ctx_type = INSIDE_NOTWORD;
2385             }
2386           tree_last = create_token_tree (dfa, NULL, NULL, token);
2387           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2388           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2389             {
2390               *err = REG_ESPACE;
2391               return NULL;
2392             }
2393         }
2394       else
2395         {
2396           tree = create_token_tree (dfa, NULL, NULL, token);
2397           if (BE (tree == NULL, 0))
2398             {
2399               *err = REG_ESPACE;
2400               return NULL;
2401             }
2402         }
2403       /* We must return here, since ANCHORs can't be followed
2404          by repetition operators.
2405          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2406              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2407       fetch_token (token, regexp, syntax);
2408       return tree;
2409     case OP_PERIOD:
2410       tree = create_token_tree (dfa, NULL, NULL, token);
2411       if (BE (tree == NULL, 0))
2412         {
2413           *err = REG_ESPACE;
2414           return NULL;
2415         }
2416       if (dfa->mb_cur_max > 1)
2417         dfa->has_mb_node = 1;
2418       break;
2419     case OP_WORD:
2420     case OP_NOTWORD:
2421       tree = build_charclass_op (dfa, regexp->trans,
2422                                  (const unsigned char *) "alnum",
2423                                  (const unsigned char *) "_",
2424                                  token->type == OP_NOTWORD, err);
2425       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426         return NULL;
2427       break;
2428     case OP_SPACE:
2429     case OP_NOTSPACE:
2430       tree = build_charclass_op (dfa, regexp->trans,
2431                                  (const unsigned char *) "space",
2432                                  (const unsigned char *) "",
2433                                  token->type == OP_NOTSPACE, err);
2434       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2435         return NULL;
2436       break;
2437     case OP_ALT:
2438     case END_OF_RE:
2439       return NULL;
2440     case BACK_SLASH:
2441       *err = REG_EESCAPE;
2442       return NULL;
2443     default:
2444       /* Must not happen?  */
2445 #ifdef DEBUG
2446       assert (0);
2447 #endif
2448       return NULL;
2449     }
2450   fetch_token (token, regexp, syntax);
2451
2452   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2453          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2454     {
2455       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2456       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2457         return NULL;
2458       /* In BRE consecutive duplications are not allowed.  */
2459       if ((syntax & RE_CONTEXT_INVALID_DUP)
2460           && (token->type == OP_DUP_ASTERISK
2461               || token->type == OP_OPEN_DUP_NUM))
2462         {
2463           *err = REG_BADRPT;
2464           return NULL;
2465         }
2466     }
2467
2468   return tree;
2469 }
2470
2471 /* This function build the following tree, from regular expression
2472    (<reg_exp>):
2473          SUBEXP
2474             |
2475         <reg_exp>
2476 */
2477
2478 static bin_tree_t *
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481 {
2482   re_dfa_t *dfa = preg->buffer;
2483   bin_tree_t *tree;
2484   size_t cur_nsub;
2485   cur_nsub = preg->re_nsub++;
2486
2487   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489   /* The subexpression may be a null string.  */
2490   if (token->type == OP_CLOSE_SUBEXP)
2491     tree = NULL;
2492   else
2493     {
2494       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2496         {
2497           if (tree != NULL)
2498             postorder (tree, free_tree, NULL);
2499           *err = REG_EPAREN;
2500         }
2501       if (BE (*err != REG_NOERROR, 0))
2502         return NULL;
2503     }
2504
2505   if (cur_nsub <= '9' - '1')
2506     dfa->completed_bkref_map |= 1 << cur_nsub;
2507
2508   tree = create_tree (dfa, tree, NULL, SUBEXP);
2509   if (BE (tree == NULL, 0))
2510     {
2511       *err = REG_ESPACE;
2512       return NULL;
2513     }
2514   tree->token.opr.idx = cur_nsub;
2515   return tree;
2516 }
2517
2518 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2519
2520 static bin_tree_t *
2521 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2522               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2523 {
2524   bin_tree_t *tree = NULL, *old_tree = NULL;
2525   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2526   re_token_t start_token = *token;
2527
2528   if (token->type == OP_OPEN_DUP_NUM)
2529     {
2530       end = 0;
2531       start = fetch_number (regexp, token, syntax);
2532       if (start == REG_MISSING)
2533         {
2534           if (token->type == CHARACTER && token->opr.c == ',')
2535             start = 0; /* We treat "{,m}" as "{0,m}".  */
2536           else
2537             {
2538               *err = REG_BADBR; /* <re>{} is invalid.  */
2539               return NULL;
2540             }
2541         }
2542       if (BE (start != REG_ERROR, 1))
2543         {
2544           /* We treat "{n}" as "{n,n}".  */
2545           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2546                  : ((token->type == CHARACTER && token->opr.c == ',')
2547                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2548         }
2549       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2550         {
2551           /* Invalid sequence.  */
2552           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2553             {
2554               if (token->type == END_OF_RE)
2555                 *err = REG_EBRACE;
2556               else
2557                 *err = REG_BADBR;
2558
2559               return NULL;
2560             }
2561
2562           /* If the syntax bit is set, rollback.  */
2563           re_string_set_index (regexp, start_idx);
2564           *token = start_token;
2565           token->type = CHARACTER;
2566           /* mb_partial and word_char bits should be already initialized by
2567              peek_token.  */
2568           return elem;
2569         }
2570
2571       if (BE ((end != REG_MISSING && start > end)
2572               || token->type != OP_CLOSE_DUP_NUM, 0))
2573         {
2574           /* First number greater than second.  */
2575           *err = REG_BADBR;
2576           return NULL;
2577         }
2578
2579       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2580         {
2581           *err = REG_ESIZE;
2582           return NULL;
2583         }
2584     }
2585   else
2586     {
2587       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2589     }
2590
2591   fetch_token (token, regexp, syntax);
2592
2593   if (BE (elem == NULL, 0))
2594     return NULL;
2595   if (BE (start == 0 && end == 0, 0))
2596     {
2597       postorder (elem, free_tree, NULL);
2598       return NULL;
2599     }
2600
2601   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2602   if (BE (start > 0, 0))
2603     {
2604       tree = elem;
2605       for (i = 2; i <= start; ++i)
2606         {
2607           elem = duplicate_tree (elem, dfa);
2608           tree = create_tree (dfa, tree, elem, CONCAT);
2609           if (BE (elem == NULL || tree == NULL, 0))
2610             goto parse_dup_op_espace;
2611         }
2612
2613       if (start == end)
2614         return tree;
2615
2616       /* Duplicate ELEM before it is marked optional.  */
2617       elem = duplicate_tree (elem, dfa);
2618       old_tree = tree;
2619     }
2620   else
2621     old_tree = NULL;
2622
2623   if (elem->token.type == SUBEXP)
2624     {
2625       uintptr_t subidx = elem->token.opr.idx;
2626       postorder (elem, mark_opt_subexp, (void *) subidx);
2627     }
2628
2629   tree = create_tree (dfa, elem, NULL,
2630                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2631   if (BE (tree == NULL, 0))
2632     goto parse_dup_op_espace;
2633
2634 /* From gnulib's "intprops.h":
2635    True if the arithmetic type T is signed.  */
2636 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2637
2638   /* This loop is actually executed only when end != REG_MISSING,
2639      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2640      already created the start+1-th copy.  */
2641   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2642     for (i = start + 2; i <= end; ++i)
2643       {
2644         elem = duplicate_tree (elem, dfa);
2645         tree = create_tree (dfa, tree, elem, CONCAT);
2646         if (BE (elem == NULL || tree == NULL, 0))
2647           goto parse_dup_op_espace;
2648
2649         tree = create_tree (dfa, tree, NULL, OP_ALT);
2650         if (BE (tree == NULL, 0))
2651           goto parse_dup_op_espace;
2652       }
2653
2654   if (old_tree)
2655     tree = create_tree (dfa, old_tree, tree, CONCAT);
2656
2657   return tree;
2658
2659  parse_dup_op_espace:
2660   *err = REG_ESPACE;
2661   return NULL;
2662 }
2663
2664 /* Size of the names for collating symbol/equivalence_class/character_class.
2665    I'm not sure, but maybe enough.  */
2666 #define BRACKET_NAME_BUF_SIZE 32
2667
2668 #ifndef _LIBC
2669   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2670      Build the range expression which starts from START_ELEM, and ends
2671      at END_ELEM.  The result are written to MBCSET and SBCSET.
2672      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2673      mbcset->range_ends, is a pointer argument since we may
2674      update it.  */
2675
2676 static reg_errcode_t
2677 internal_function
2678 # ifdef RE_ENABLE_I18N
2679 build_range_exp (const reg_syntax_t syntax,
2680                  bitset_t sbcset,
2681                  re_charset_t *mbcset,
2682                  Idx *range_alloc,
2683                  const bracket_elem_t *start_elem,
2684                  const bracket_elem_t *end_elem)
2685 # else /* not RE_ENABLE_I18N */
2686 build_range_exp (const reg_syntax_t syntax,
2687                  bitset_t sbcset,
2688                  const bracket_elem_t *start_elem,
2689                  const bracket_elem_t *end_elem)
2690 # endif /* not RE_ENABLE_I18N */
2691 {
2692   unsigned int start_ch, end_ch;
2693   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2694   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2695           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2696           0))
2697     return REG_ERANGE;
2698
2699   /* We can handle no multi character collating elements without libc
2700      support.  */
2701   if (BE ((start_elem->type == COLL_SYM
2702            && strlen ((char *) start_elem->opr.name) > 1)
2703           || (end_elem->type == COLL_SYM
2704               && strlen ((char *) end_elem->opr.name) > 1), 0))
2705     return REG_ECOLLATE;
2706
2707 # ifdef RE_ENABLE_I18N
2708   {
2709     wchar_t wc;
2710     wint_t start_wc;
2711     wint_t end_wc;
2712     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2713
2714     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2715                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2716                    : 0));
2717     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2718               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2719                  : 0));
2720     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2721                 ? __btowc (start_ch) : start_elem->opr.wch);
2722     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2723               ? __btowc (end_ch) : end_elem->opr.wch);
2724     if (start_wc == WEOF || end_wc == WEOF)
2725       return REG_ECOLLATE;
2726     cmp_buf[0] = start_wc;
2727     cmp_buf[4] = end_wc;
2728
2729     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2730             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2731       return REG_ERANGE;
2732
2733     /* Got valid collation sequence values, add them as a new entry.
2734        However, for !_LIBC we have no collation elements: if the
2735        character set is single byte, the single byte character set
2736        that we build below suffices.  parse_bracket_exp passes
2737        no MBCSET if dfa->mb_cur_max == 1.  */
2738     if (mbcset)
2739       {
2740         /* Check the space of the arrays.  */
2741         if (BE (*range_alloc == mbcset->nranges, 0))
2742           {
2743             /* There is not enough space, need realloc.  */
2744             wchar_t *new_array_start, *new_array_end;
2745             Idx new_nranges;
2746
2747             /* +1 in case of mbcset->nranges is 0.  */
2748             new_nranges = 2 * mbcset->nranges + 1;
2749             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2750                are NULL if *range_alloc == 0.  */
2751             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2752                                           new_nranges);
2753             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2754                                         new_nranges);
2755
2756             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2757               return REG_ESPACE;
2758
2759             mbcset->range_starts = new_array_start;
2760             mbcset->range_ends = new_array_end;
2761             *range_alloc = new_nranges;
2762           }
2763
2764         mbcset->range_starts[mbcset->nranges] = start_wc;
2765         mbcset->range_ends[mbcset->nranges++] = end_wc;
2766       }
2767
2768     /* Build the table for single byte characters.  */
2769     for (wc = 0; wc < SBC_MAX; ++wc)
2770       {
2771         cmp_buf[2] = wc;
2772         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2773             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2774           bitset_set (sbcset, wc);
2775       }
2776   }
2777 # else /* not RE_ENABLE_I18N */
2778   {
2779     unsigned int ch;
2780     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2781                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2782                    : 0));
2783     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2784               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2785                  : 0));
2786     if (start_ch > end_ch)
2787       return REG_ERANGE;
2788     /* Build the table for single byte characters.  */
2789     for (ch = 0; ch < SBC_MAX; ++ch)
2790       if (start_ch <= ch  && ch <= end_ch)
2791         bitset_set (sbcset, ch);
2792   }
2793 # endif /* not RE_ENABLE_I18N */
2794   return REG_NOERROR;
2795 }
2796 #endif /* not _LIBC */
2797
2798 #ifndef _LIBC
2799 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2800    Build the collating element which is represented by NAME.
2801    The result are written to MBCSET and SBCSET.
2802    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2803    pointer argument since we may update it.  */
2804
2805 static reg_errcode_t
2806 internal_function
2807 # ifdef RE_ENABLE_I18N
2808 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2809                         Idx *coll_sym_alloc, const unsigned char *name)
2810 # else /* not RE_ENABLE_I18N */
2811 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2812 # endif /* not RE_ENABLE_I18N */
2813 {
2814   size_t name_len = strlen ((const char *) name);
2815   if (BE (name_len != 1, 0))
2816     return REG_ECOLLATE;
2817   else
2818     {
2819       bitset_set (sbcset, name[0]);
2820       return REG_NOERROR;
2821     }
2822 }
2823 #endif /* not _LIBC */
2824
2825 /* This function parse bracket expression like "[abc]", "[a-c]",
2826    "[[.a-a.]]" etc.  */
2827
2828 static bin_tree_t *
2829 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2830                    reg_syntax_t syntax, reg_errcode_t *err)
2831 {
2832 #ifdef _LIBC
2833   const unsigned char *collseqmb;
2834   const char *collseqwc;
2835   uint32_t nrules;
2836   int32_t table_size;
2837   const int32_t *symb_table;
2838   const unsigned char *extra;
2839
2840   /* Local function for parse_bracket_exp used in _LIBC environment.
2841      Seek the collating symbol entry corresponding to NAME.
2842      Return the index of the symbol in the SYMB_TABLE.  */
2843
2844   auto inline int32_t
2845   __attribute ((always_inline))
2846   seek_collating_symbol_entry (name, name_len)
2847          const unsigned char *name;
2848          size_t name_len;
2849     {
2850       int32_t hash = elem_hash ((const char *) name, name_len);
2851       int32_t elem = hash % table_size;
2852       if (symb_table[2 * elem] != 0)
2853         {
2854           int32_t second = hash % (table_size - 2) + 1;
2855
2856           do
2857             {
2858               /* First compare the hashing value.  */
2859               if (symb_table[2 * elem] == hash
2860                   /* Compare the length of the name.  */
2861                   && name_len == extra[symb_table[2 * elem + 1]]
2862                   /* Compare the name.  */
2863                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2864                              name_len) == 0)
2865                 {
2866                   /* Yep, this is the entry.  */
2867                   break;
2868                 }
2869
2870               /* Next entry.  */
2871               elem += second;
2872             }
2873           while (symb_table[2 * elem] != 0);
2874         }
2875       return elem;
2876     }
2877
2878   /* Local function for parse_bracket_exp used in _LIBC environment.
2879      Look up the collation sequence value of BR_ELEM.
2880      Return the value if succeeded, UINT_MAX otherwise.  */
2881
2882   auto inline unsigned int
2883   __attribute ((always_inline))
2884   lookup_collation_sequence_value (br_elem)
2885          bracket_elem_t *br_elem;
2886     {
2887       if (br_elem->type == SB_CHAR)
2888         {
2889           /*
2890           if (MB_CUR_MAX == 1)
2891           */
2892           if (nrules == 0)
2893             return collseqmb[br_elem->opr.ch];
2894           else
2895             {
2896               wint_t wc = __btowc (br_elem->opr.ch);
2897               return __collseq_table_lookup (collseqwc, wc);
2898             }
2899         }
2900       else if (br_elem->type == MB_CHAR)
2901         {
2902           if (nrules != 0)
2903             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2904         }
2905       else if (br_elem->type == COLL_SYM)
2906         {
2907           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2908           if (nrules != 0)
2909             {
2910               int32_t elem, idx;
2911               elem = seek_collating_symbol_entry (br_elem->opr.name,
2912                                                   sym_name_len);
2913               if (symb_table[2 * elem] != 0)
2914                 {
2915                   /* We found the entry.  */
2916                   idx = symb_table[2 * elem + 1];
2917                   /* Skip the name of collating element name.  */
2918                   idx += 1 + extra[idx];
2919                   /* Skip the byte sequence of the collating element.  */
2920                   idx += 1 + extra[idx];
2921                   /* Adjust for the alignment.  */
2922                   idx = (idx + 3) & ~3;
2923                   /* Skip the multibyte collation sequence value.  */
2924                   idx += sizeof (unsigned int);
2925                   /* Skip the wide char sequence of the collating element.  */
2926                   idx += sizeof (unsigned int) *
2927                     (1 + *(unsigned int *) (extra + idx));
2928                   /* Return the collation sequence value.  */
2929                   return *(unsigned int *) (extra + idx);
2930                 }
2931               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2932                 {
2933                   /* No valid character.  Match it as a single byte
2934                      character.  */
2935                   return collseqmb[br_elem->opr.name[0]];
2936                 }
2937             }
2938           else if (sym_name_len == 1)
2939             return collseqmb[br_elem->opr.name[0]];
2940         }
2941       return UINT_MAX;
2942     }
2943
2944   /* Local function for parse_bracket_exp used in _LIBC environment.
2945      Build the range expression which starts from START_ELEM, and ends
2946      at END_ELEM.  The result are written to MBCSET and SBCSET.
2947      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2948      mbcset->range_ends, is a pointer argument since we may
2949      update it.  */
2950
2951   auto inline reg_errcode_t
2952   __attribute ((always_inline))
2953   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2954          re_charset_t *mbcset;
2955          Idx *range_alloc;
2956          bitset_t sbcset;
2957          bracket_elem_t *start_elem, *end_elem;
2958     {
2959       unsigned int ch;
2960       uint32_t start_collseq;
2961       uint32_t end_collseq;
2962
2963       /* Equivalence Classes and Character Classes can't be a range
2964          start/end.  */
2965       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2966               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2967               0))
2968         return REG_ERANGE;
2969
2970       start_collseq = lookup_collation_sequence_value (start_elem);
2971       end_collseq = lookup_collation_sequence_value (end_elem);
2972       /* Check start/end collation sequence values.  */
2973       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2974         return REG_ECOLLATE;
2975       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2976         return REG_ERANGE;
2977
2978       /* Got valid collation sequence values, add them as a new entry.
2979          However, if we have no collation elements, and the character set
2980          is single byte, the single byte character set that we
2981          build below suffices. */
2982       if (nrules > 0 || dfa->mb_cur_max > 1)
2983         {
2984           /* Check the space of the arrays.  */
2985           if (BE (*range_alloc == mbcset->nranges, 0))
2986             {
2987               /* There is not enough space, need realloc.  */
2988               uint32_t *new_array_start;
2989               uint32_t *new_array_end;
2990               Idx new_nranges;
2991
2992               /* +1 in case of mbcset->nranges is 0.  */
2993               new_nranges = 2 * mbcset->nranges + 1;
2994               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2995                                             new_nranges);
2996               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2997                                           new_nranges);
2998
2999               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3000                 return REG_ESPACE;
3001
3002               mbcset->range_starts = new_array_start;
3003               mbcset->range_ends = new_array_end;
3004               *range_alloc = new_nranges;
3005             }
3006
3007           mbcset->range_starts[mbcset->nranges] = start_collseq;
3008           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3009         }
3010
3011       /* Build the table for single byte characters.  */
3012       for (ch = 0; ch < SBC_MAX; ch++)
3013         {
3014           uint32_t ch_collseq;
3015           /*
3016           if (MB_CUR_MAX == 1)
3017           */
3018           if (nrules == 0)
3019             ch_collseq = collseqmb[ch];
3020           else
3021             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3022           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3023             bitset_set (sbcset, ch);
3024         }
3025       return REG_NOERROR;
3026     }
3027
3028   /* Local function for parse_bracket_exp used in _LIBC environment.
3029      Build the collating element which is represented by NAME.
3030      The result are written to MBCSET and SBCSET.
3031      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3032      pointer argument since we may update it.  */
3033
3034   auto inline reg_errcode_t
3035   __attribute ((always_inline))
3036   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3037          re_charset_t *mbcset;
3038          Idx *coll_sym_alloc;
3039          bitset_t sbcset;
3040          const unsigned char *name;
3041     {
3042       int32_t elem, idx;
3043       size_t name_len = strlen ((const char *) name);
3044       if (nrules != 0)
3045         {
3046           elem = seek_collating_symbol_entry (name, name_len);
3047           if (symb_table[2 * elem] != 0)
3048             {
3049               /* We found the entry.  */
3050               idx = symb_table[2 * elem + 1];
3051               /* Skip the name of collating element name.  */
3052               idx += 1 + extra[idx];
3053             }
3054           else if (symb_table[2 * elem] == 0 && name_len == 1)
3055             {
3056               /* No valid character, treat it as a normal
3057                  character.  */
3058               bitset_set (sbcset, name[0]);
3059               return REG_NOERROR;
3060             }
3061           else
3062             return REG_ECOLLATE;
3063
3064           /* Got valid collation sequence, add it as a new entry.  */
3065           /* Check the space of the arrays.  */
3066           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3067             {
3068               /* Not enough, realloc it.  */
3069               /* +1 in case of mbcset->ncoll_syms is 0.  */
3070               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3071               /* Use realloc since mbcset->coll_syms is NULL
3072                  if *alloc == 0.  */
3073               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3074                                                    new_coll_sym_alloc);
3075               if (BE (new_coll_syms == NULL, 0))
3076                 return REG_ESPACE;
3077               mbcset->coll_syms = new_coll_syms;
3078               *coll_sym_alloc = new_coll_sym_alloc;
3079             }
3080           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3081           return REG_NOERROR;
3082         }
3083       else
3084         {
3085           if (BE (name_len != 1, 0))
3086             return REG_ECOLLATE;
3087           else
3088             {
3089               bitset_set (sbcset, name[0]);
3090               return REG_NOERROR;
3091             }
3092         }
3093     }
3094 #endif
3095
3096   re_token_t br_token;
3097   re_bitset_ptr_t sbcset;
3098 #ifdef RE_ENABLE_I18N
3099   re_charset_t *mbcset;
3100   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3101   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3102 #endif /* not RE_ENABLE_I18N */
3103   bool non_match = false;
3104   bin_tree_t *work_tree;
3105   int token_len;
3106   bool first_round = true;
3107 #ifdef _LIBC
3108   collseqmb = (const unsigned char *)
3109     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3110   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3111   if (nrules)
3112     {
3113       /*
3114       if (MB_CUR_MAX > 1)
3115       */
3116       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3117       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3118       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3119                                                   _NL_COLLATE_SYMB_TABLEMB);
3120       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3121                                                    _NL_COLLATE_SYMB_EXTRAMB);
3122     }
3123 #endif
3124   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3125 #ifdef RE_ENABLE_I18N
3126   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3127 #endif /* RE_ENABLE_I18N */
3128 #ifdef RE_ENABLE_I18N
3129   if (BE (sbcset == NULL || mbcset == NULL, 0))
3130 #else
3131   if (BE (sbcset == NULL, 0))
3132 #endif /* RE_ENABLE_I18N */
3133     {
3134       re_free (sbcset);
3135 #ifdef RE_ENABLE_I18N
3136       re_free (mbcset);
3137 #endif
3138       *err = REG_ESPACE;
3139       return NULL;
3140     }
3141
3142   token_len = peek_token_bracket (token, regexp, syntax);
3143   if (BE (token->type == END_OF_RE, 0))
3144     {
3145       *err = REG_BADPAT;
3146       goto parse_bracket_exp_free_return;
3147     }
3148   if (token->type == OP_NON_MATCH_LIST)
3149     {
3150 #ifdef RE_ENABLE_I18N
3151       mbcset->non_match = 1;
3152 #endif /* not RE_ENABLE_I18N */
3153       non_match = true;
3154       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3155         bitset_set (sbcset, '\n');
3156       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3157       token_len = peek_token_bracket (token, regexp, syntax);
3158       if (BE (token->type == END_OF_RE, 0))
3159         {
3160           *err = REG_BADPAT;
3161           goto parse_bracket_exp_free_return;
3162         }
3163     }
3164
3165   /* We treat the first ']' as a normal character.  */
3166   if (token->type == OP_CLOSE_BRACKET)
3167     token->type = CHARACTER;
3168
3169   while (1)
3170     {
3171       bracket_elem_t start_elem, end_elem;
3172       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3173       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3174       reg_errcode_t ret;
3175       int token_len2 = 0;
3176       bool is_range_exp = false;
3177       re_token_t token2;
3178
3179       start_elem.opr.name = start_name_buf;
3180       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3181                                    syntax, first_round);
3182       if (BE (ret != REG_NOERROR, 0))
3183         {
3184           *err = ret;
3185           goto parse_bracket_exp_free_return;
3186         }
3187       first_round = false;
3188
3189       /* Get information about the next token.  We need it in any case.  */
3190       token_len = peek_token_bracket (token, regexp, syntax);
3191
3192       /* Do not check for ranges if we know they are not allowed.  */
3193       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3194         {
3195           if (BE (token->type == END_OF_RE, 0))
3196             {
3197               *err = REG_EBRACK;
3198               goto parse_bracket_exp_free_return;
3199             }
3200           if (token->type == OP_CHARSET_RANGE)
3201             {
3202               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3203               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3204               if (BE (token2.type == END_OF_RE, 0))
3205                 {
3206                   *err = REG_EBRACK;
3207                   goto parse_bracket_exp_free_return;
3208                 }
3209               if (token2.type == OP_CLOSE_BRACKET)
3210                 {
3211                   /* We treat the last '-' as a normal character.  */
3212                   re_string_skip_bytes (regexp, -token_len);
3213                   token->type = CHARACTER;
3214                 }
3215               else
3216                 is_range_exp = true;
3217             }
3218         }
3219
3220       if (is_range_exp == true)
3221         {
3222           end_elem.opr.name = end_name_buf;
3223           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3224                                        dfa, syntax, true);
3225           if (BE (ret != REG_NOERROR, 0))
3226             {
3227               *err = ret;
3228               goto parse_bracket_exp_free_return;
3229             }
3230
3231           token_len = peek_token_bracket (token, regexp, syntax);
3232
3233 #ifdef _LIBC
3234           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3235                                   &start_elem, &end_elem);
3236 #else
3237 # ifdef RE_ENABLE_I18N
3238           *err = build_range_exp (syntax, sbcset,
3239                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3240                                   &range_alloc, &start_elem, &end_elem);
3241 # else
3242           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3243 # endif
3244 #endif /* RE_ENABLE_I18N */
3245           if (BE (*err != REG_NOERROR, 0))
3246             goto parse_bracket_exp_free_return;
3247         }
3248       else
3249         {
3250           switch (start_elem.type)
3251             {
3252             case SB_CHAR:
3253               bitset_set (sbcset, start_elem.opr.ch);
3254               break;
3255 #ifdef RE_ENABLE_I18N
3256             case MB_CHAR:
3257               /* Check whether the array has enough space.  */
3258               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3259                 {
3260                   wchar_t *new_mbchars;
3261                   /* Not enough, realloc it.  */
3262                   /* +1 in case of mbcset->nmbchars is 0.  */
3263                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3264                   /* Use realloc since array is NULL if *alloc == 0.  */
3265                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3266                                             mbchar_alloc);
3267                   if (BE (new_mbchars == NULL, 0))
3268                     goto parse_bracket_exp_espace;
3269                   mbcset->mbchars = new_mbchars;
3270                 }
3271               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3272               break;
3273 #endif /* RE_ENABLE_I18N */
3274             case EQUIV_CLASS:
3275               *err = build_equiv_class (sbcset,
3276 #ifdef RE_ENABLE_I18N
3277                                         mbcset, &equiv_class_alloc,
3278 #endif /* RE_ENABLE_I18N */
3279                                         start_elem.opr.name);
3280               if (BE (*err != REG_NOERROR, 0))
3281                 goto parse_bracket_exp_free_return;
3282               break;
3283             case COLL_SYM:
3284               *err = build_collating_symbol (sbcset,
3285 #ifdef RE_ENABLE_I18N
3286                                              mbcset, &coll_sym_alloc,
3287 #endif /* RE_ENABLE_I18N */
3288                                              start_elem.opr.name);
3289               if (BE (*err != REG_NOERROR, 0))
3290                 goto parse_bracket_exp_free_return;
3291               break;
3292             case CHAR_CLASS:
3293               *err = build_charclass (regexp->trans, sbcset,
3294 #ifdef RE_ENABLE_I18N
3295                                       mbcset, &char_class_alloc,
3296 #endif /* RE_ENABLE_I18N */
3297                                       start_elem.opr.name, syntax);
3298               if (BE (*err != REG_NOERROR, 0))
3299                goto parse_bracket_exp_free_return;
3300               break;
3301             default:
3302               assert (0);
3303               break;
3304             }
3305         }
3306       if (BE (token->type == END_OF_RE, 0))
3307         {
3308           *err = REG_EBRACK;
3309           goto parse_bracket_exp_free_return;
3310         }
3311       if (token->type == OP_CLOSE_BRACKET)
3312         break;
3313     }
3314
3315   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3316
3317   /* If it is non-matching list.  */
3318   if (non_match)
3319     bitset_not (sbcset);
3320
3321 #ifdef RE_ENABLE_I18N
3322   /* Ensure only single byte characters are set.  */
3323   if (dfa->mb_cur_max > 1)
3324     bitset_mask (sbcset, dfa->sb_char);
3325
3326   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3327       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3328                                                      || mbcset->non_match)))
3329     {
3330       bin_tree_t *mbc_tree;
3331       int sbc_idx;
3332       /* Build a tree for complex bracket.  */
3333       dfa->has_mb_node = 1;
3334       br_token.type = COMPLEX_BRACKET;
3335       br_token.opr.mbcset = mbcset;
3336       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337       if (BE (mbc_tree == NULL, 0))
3338         goto parse_bracket_exp_espace;
3339       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3340         if (sbcset[sbc_idx])
3341           break;
3342       /* If there are no bits set in sbcset, there is no point
3343          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3344       if (sbc_idx < BITSET_WORDS)
3345         {
3346           /* Build a tree for simple bracket.  */
3347           br_token.type = SIMPLE_BRACKET;
3348           br_token.opr.sbcset = sbcset;
3349           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3350           if (BE (work_tree == NULL, 0))
3351             goto parse_bracket_exp_espace;
3352
3353           /* Then join them by ALT node.  */
3354           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3355           if (BE (work_tree == NULL, 0))
3356             goto parse_bracket_exp_espace;
3357         }
3358       else
3359         {
3360           re_free (sbcset);
3361           work_tree = mbc_tree;
3362         }
3363     }
3364   else
3365 #endif /* not RE_ENABLE_I18N */
3366     {
3367 #ifdef RE_ENABLE_I18N
3368       free_charset (mbcset);
3369 #endif
3370       /* Build a tree for simple bracket.  */
3371       br_token.type = SIMPLE_BRACKET;
3372       br_token.opr.sbcset = sbcset;
3373       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3374       if (BE (work_tree == NULL, 0))
3375         goto parse_bracket_exp_espace;
3376     }
3377   return work_tree;
3378
3379  parse_bracket_exp_espace:
3380   *err = REG_ESPACE;
3381  parse_bracket_exp_free_return:
3382   re_free (sbcset);
3383 #ifdef RE_ENABLE_I18N
3384   free_charset (mbcset);
3385 #endif /* RE_ENABLE_I18N */
3386   return NULL;
3387 }
3388
3389 /* Parse an element in the bracket expression.  */
3390
3391 static reg_errcode_t
3392 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3393                        re_token_t *token, int token_len, re_dfa_t *dfa,
3394                        reg_syntax_t syntax, bool accept_hyphen)
3395 {
3396 #ifdef RE_ENABLE_I18N
3397   int cur_char_size;
3398   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3399   if (cur_char_size > 1)
3400     {
3401       elem->type = MB_CHAR;
3402       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3403       re_string_skip_bytes (regexp, cur_char_size);
3404       return REG_NOERROR;
3405     }
3406 #endif /* RE_ENABLE_I18N */
3407   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3408   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3409       || token->type == OP_OPEN_EQUIV_CLASS)
3410     return parse_bracket_symbol (elem, regexp, token);
3411   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3412     {
3413       /* A '-' must only appear as anything but a range indicator before
3414          the closing bracket.  Everything else is an error.  */
3415       re_token_t token2;
3416       (void) peek_token_bracket (&token2, regexp, syntax);
3417       if (token2.type != OP_CLOSE_BRACKET)
3418         /* The actual error value is not standardized since this whole
3419            case is undefined.  But ERANGE makes good sense.  */
3420         return REG_ERANGE;
3421     }
3422   elem->type = SB_CHAR;
3423   elem->opr.ch = token->opr.c;
3424   return REG_NOERROR;
3425 }
3426
3427 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3428    such as [:<character_class>:], [.<collating_element>.], and
3429    [=<equivalent_class>=].  */
3430
3431 static reg_errcode_t
3432 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3433                       re_token_t *token)
3434 {
3435   unsigned char ch, delim = token->opr.c;
3436   int i = 0;
3437   if (re_string_eoi(regexp))
3438     return REG_EBRACK;
3439   for (;; ++i)
3440     {
3441       if (i >= BRACKET_NAME_BUF_SIZE)
3442         return REG_EBRACK;
3443       if (token->type == OP_OPEN_CHAR_CLASS)
3444         ch = re_string_fetch_byte_case (regexp);
3445       else
3446         ch = re_string_fetch_byte (regexp);
3447       if (re_string_eoi(regexp))
3448         return REG_EBRACK;
3449       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3450         break;
3451       elem->opr.name[i] = ch;
3452     }
3453   re_string_skip_bytes (regexp, 1);
3454   elem->opr.name[i] = '\0';
3455   switch (token->type)
3456     {
3457     case OP_OPEN_COLL_ELEM:
3458       elem->type = COLL_SYM;
3459       break;
3460     case OP_OPEN_EQUIV_CLASS:
3461       elem->type = EQUIV_CLASS;
3462       break;
3463     case OP_OPEN_CHAR_CLASS:
3464       elem->type = CHAR_CLASS;
3465       break;
3466     default:
3467       break;
3468     }
3469   return REG_NOERROR;
3470 }
3471
3472   /* Helper function for parse_bracket_exp.
3473      Build the equivalence class which is represented by NAME.
3474      The result are written to MBCSET and SBCSET.
3475      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3476      is a pointer argument since we may update it.  */
3477
3478 static reg_errcode_t
3479 #ifdef RE_ENABLE_I18N
3480 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3481                    Idx *equiv_class_alloc, const unsigned char *name)
3482 #else /* not RE_ENABLE_I18N */
3483 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3484 #endif /* not RE_ENABLE_I18N */
3485 {
3486 #ifdef _LIBC
3487   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3488   if (nrules != 0)
3489     {
3490       const int32_t *table, *indirect;
3491       const unsigned char *weights, *extra, *cp;
3492       unsigned char char_buf[2];
3493       int32_t idx1, idx2;
3494       unsigned int ch;
3495       size_t len;
3496       /* This #include defines a local function!  */
3497 # include <locale/weight.h>
3498       /* Calculate the index for equivalence class.  */
3499       cp = name;
3500       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3501       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502                                                _NL_COLLATE_WEIGHTMB);
3503       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504                                                    _NL_COLLATE_EXTRAMB);
3505       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3506                                                 _NL_COLLATE_INDIRECTMB);
3507       idx1 = findidx (&cp, -1);
3508       if (BE (idx1 == 0 || *cp != '\0', 0))
3509         /* This isn't a valid character.  */
3510         return REG_ECOLLATE;
3511
3512       /* Build single byte matching table for this equivalence class.  */
3513       len = weights[idx1 & 0xffffff];
3514       for (ch = 0; ch < SBC_MAX; ++ch)
3515         {
3516           char_buf[0] = ch;
3517           cp = char_buf;
3518           idx2 = findidx (&cp, 1);
3519 /*
3520           idx2 = table[ch];
3521 */
3522           if (idx2 == 0)
3523             /* This isn't a valid character.  */
3524             continue;
3525           /* Compare only if the length matches and the collation rule
3526              index is the same.  */
3527           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3528             {
3529               int cnt = 0;
3530
3531               while (cnt <= len &&
3532                      weights[(idx1 & 0xffffff) + 1 + cnt]
3533                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3534                 ++cnt;
3535
3536               if (cnt > len)
3537                 bitset_set (sbcset, ch);
3538             }
3539         }
3540       /* Check whether the array has enough space.  */
3541       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3542         {
3543           /* Not enough, realloc it.  */
3544           /* +1 in case of mbcset->nequiv_classes is 0.  */
3545           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3546           /* Use realloc since the array is NULL if *alloc == 0.  */
3547           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3548                                                    int32_t,
3549                                                    new_equiv_class_alloc);
3550           if (BE (new_equiv_classes == NULL, 0))
3551             return REG_ESPACE;
3552           mbcset->equiv_classes = new_equiv_classes;
3553           *equiv_class_alloc = new_equiv_class_alloc;
3554         }
3555       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3556     }
3557   else
3558 #endif /* _LIBC */
3559     {
3560       if (BE (strlen ((const char *) name) != 1, 0))
3561         return REG_ECOLLATE;
3562       bitset_set (sbcset, *name);
3563     }
3564   return REG_NOERROR;
3565 }
3566
3567   /* Helper function for parse_bracket_exp.
3568      Build the character class which is represented by NAME.
3569      The result are written to MBCSET and SBCSET.
3570      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3571      is a pointer argument since we may update it.  */
3572
3573 static reg_errcode_t
3574 #ifdef RE_ENABLE_I18N
3575 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3576                  re_charset_t *mbcset, Idx *char_class_alloc,
3577                  const unsigned char *class_name, reg_syntax_t syntax)
3578 #else /* not RE_ENABLE_I18N */
3579 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3580                  const unsigned char *class_name, reg_syntax_t syntax)
3581 #endif /* not RE_ENABLE_I18N */
3582 {
3583   int i;
3584   const char *name = (const char *) class_name;
3585
3586   /* In case of REG_ICASE "upper" and "lower" match the both of
3587      upper and lower cases.  */
3588   if ((syntax & RE_ICASE)
3589       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3590     name = "alpha";
3591
3592 #ifdef RE_ENABLE_I18N
3593   /* Check the space of the arrays.  */
3594   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3595     {
3596       /* Not enough, realloc it.  */
3597       /* +1 in case of mbcset->nchar_classes is 0.  */
3598       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3599       /* Use realloc since array is NULL if *alloc == 0.  */
3600       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3601                                                new_char_class_alloc);
3602       if (BE (new_char_classes == NULL, 0))
3603         return REG_ESPACE;
3604       mbcset->char_classes = new_char_classes;
3605       *char_class_alloc = new_char_class_alloc;
3606     }
3607   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3608 #endif /* RE_ENABLE_I18N */
3609
3610 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3611   do {                                          \
3612     if (BE (trans != NULL, 0))                  \
3613       {                                         \
3614         for (i = 0; i < SBC_MAX; ++i)           \
3615           if (ctype_func (i))                   \
3616             bitset_set (sbcset, trans[i]);      \
3617       }                                         \
3618     else                                        \
3619       {                                         \
3620         for (i = 0; i < SBC_MAX; ++i)           \
3621           if (ctype_func (i))                   \
3622             bitset_set (sbcset, i);             \
3623       }                                         \
3624   } while (0)
3625
3626   if (strcmp (name, "alnum") == 0)
3627     BUILD_CHARCLASS_LOOP (isalnum);
3628   else if (strcmp (name, "cntrl") == 0)
3629     BUILD_CHARCLASS_LOOP (iscntrl);
3630   else if (strcmp (name, "lower") == 0)
3631     BUILD_CHARCLASS_LOOP (islower);
3632   else if (strcmp (name, "space") == 0)
3633     BUILD_CHARCLASS_LOOP (isspace);
3634   else if (strcmp (name, "alpha") == 0)
3635     BUILD_CHARCLASS_LOOP (isalpha);
3636   else if (strcmp (name, "digit") == 0)
3637     BUILD_CHARCLASS_LOOP (isdigit);
3638   else if (strcmp (name, "print") == 0)
3639     BUILD_CHARCLASS_LOOP (isprint);
3640   else if (strcmp (name, "upper") == 0)
3641     BUILD_CHARCLASS_LOOP (isupper);
3642   else if (strcmp (name, "blank") == 0)
3643     BUILD_CHARCLASS_LOOP (isblank);
3644   else if (strcmp (name, "graph") == 0)
3645     BUILD_CHARCLASS_LOOP (isgraph);
3646   else if (strcmp (name, "punct") == 0)
3647     BUILD_CHARCLASS_LOOP (ispunct);
3648   else if (strcmp (name, "xdigit") == 0)
3649     BUILD_CHARCLASS_LOOP (isxdigit);
3650   else
3651     return REG_ECTYPE;
3652
3653   return REG_NOERROR;
3654 }
3655
3656 static bin_tree_t *
3657 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3658                     const unsigned char *class_name,
3659                     const unsigned char *extra, bool non_match,
3660                     reg_errcode_t *err)
3661 {
3662   re_bitset_ptr_t sbcset;
3663 #ifdef RE_ENABLE_I18N
3664   re_charset_t *mbcset;
3665   Idx alloc = 0;
3666 #endif /* not RE_ENABLE_I18N */
3667   reg_errcode_t ret;
3668   re_token_t br_token;
3669   bin_tree_t *tree;
3670
3671   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3672 #ifdef RE_ENABLE_I18N
3673   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3674 #endif /* RE_ENABLE_I18N */
3675
3676 #ifdef RE_ENABLE_I18N
3677   if (BE (sbcset == NULL || mbcset == NULL, 0))
3678 #else /* not RE_ENABLE_I18N */
3679   if (BE (sbcset == NULL, 0))
3680 #endif /* not RE_ENABLE_I18N */
3681     {
3682       *err = REG_ESPACE;
3683       return NULL;
3684     }
3685
3686   if (non_match)
3687     {
3688 #ifdef RE_ENABLE_I18N
3689       mbcset->non_match = 1;
3690 #endif /* not RE_ENABLE_I18N */
3691     }
3692
3693   /* We don't care the syntax in this case.  */
3694   ret = build_charclass (trans, sbcset,
3695 #ifdef RE_ENABLE_I18N
3696                          mbcset, &alloc,
3697 #endif /* RE_ENABLE_I18N */
3698                          class_name, 0);
3699
3700   if (BE (ret != REG_NOERROR, 0))
3701     {
3702       re_free (sbcset);
3703 #ifdef RE_ENABLE_I18N
3704       free_charset (mbcset);
3705 #endif /* RE_ENABLE_I18N */
3706       *err = ret;
3707       return NULL;
3708     }
3709   /* \w match '_' also.  */
3710   for (; *extra; extra++)
3711     bitset_set (sbcset, *extra);
3712
3713   /* If it is non-matching list.  */
3714   if (non_match)
3715     bitset_not (sbcset);
3716
3717 #ifdef RE_ENABLE_I18N
3718   /* Ensure only single byte characters are set.  */
3719   if (dfa->mb_cur_max > 1)
3720     bitset_mask (sbcset, dfa->sb_char);
3721 #endif
3722
3723   /* Build a tree for simple bracket.  */
3724   br_token.type = SIMPLE_BRACKET;
3725   br_token.opr.sbcset = sbcset;
3726   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3727   if (BE (tree == NULL, 0))
3728     goto build_word_op_espace;
3729
3730 #ifdef RE_ENABLE_I18N
3731   if (dfa->mb_cur_max > 1)
3732     {
3733       bin_tree_t *mbc_tree;
3734       /* Build a tree for complex bracket.  */
3735       br_token.type = COMPLEX_BRACKET;
3736       br_token.opr.mbcset = mbcset;
3737       dfa->has_mb_node = 1;
3738       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3739       if (BE (mbc_tree == NULL, 0))
3740         goto build_word_op_espace;
3741       /* Then join them by ALT node.  */
3742       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3743       if (BE (mbc_tree != NULL, 1))
3744         return tree;
3745     }
3746   else
3747     {
3748       free_charset (mbcset);
3749       return tree;
3750     }
3751 #else /* not RE_ENABLE_I18N */
3752   return tree;
3753 #endif /* not RE_ENABLE_I18N */
3754
3755  build_word_op_espace:
3756   re_free (sbcset);
3757 #ifdef RE_ENABLE_I18N
3758   free_charset (mbcset);
3759 #endif /* RE_ENABLE_I18N */
3760   *err = REG_ESPACE;
3761   return NULL;
3762 }
3763
3764 /* This is intended for the expressions like "a{1,3}".
3765    Fetch a number from 'input', and return the number.
3766    Return REG_MISSING if the number field is empty like "{,1}".
3767    Return RE_DUP_MAX + 1 if the number field is too large.
3768    Return REG_ERROR if an error occurred.  */
3769
3770 static Idx
3771 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3772 {
3773   Idx num = REG_MISSING;
3774   unsigned char c;
3775   while (1)
3776     {
3777       fetch_token (token, input, syntax);
3778       c = token->opr.c;
3779       if (BE (token->type == END_OF_RE, 0))
3780         return REG_ERROR;
3781       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3782         break;
3783       num = ((token->type != CHARACTER || c < '0' || '9' < c
3784               || num == REG_ERROR)
3785              ? REG_ERROR
3786              : num == REG_MISSING
3787              ? c - '0'
3788              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3789     }
3790   return num;
3791 }
3792 \f
3793 #ifdef RE_ENABLE_I18N
3794 static void
3795 free_charset (re_charset_t *cset)
3796 {
3797   re_free (cset->mbchars);
3798 # ifdef _LIBC
3799   re_free (cset->coll_syms);
3800   re_free (cset->equiv_classes);
3801   re_free (cset->range_starts);
3802   re_free (cset->range_ends);
3803 # endif
3804   re_free (cset->char_classes);
3805   re_free (cset);
3806 }
3807 #endif /* RE_ENABLE_I18N */
3808 \f
3809 /* Functions for binary tree operation.  */
3810
3811 /* Create a tree node.  */
3812
3813 static bin_tree_t *
3814 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815              re_token_type_t type)
3816 {
3817   re_token_t t;
3818   t.type = type;
3819   return create_token_tree (dfa, left, right, &t);
3820 }
3821
3822 static bin_tree_t *
3823 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824                    const re_token_t *token)
3825 {
3826   bin_tree_t *tree;
3827   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3828     {
3829       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3830
3831       if (storage == NULL)
3832         return NULL;
3833       storage->next = dfa->str_tree_storage;
3834       dfa->str_tree_storage = storage;
3835       dfa->str_tree_storage_idx = 0;
3836     }
3837   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3838
3839   tree->parent = NULL;
3840   tree->left = left;
3841   tree->right = right;
3842   tree->token = *token;
3843   tree->token.duplicated = 0;
3844   tree->token.opt_subexp = 0;
3845   tree->first = NULL;
3846   tree->next = NULL;
3847   tree->node_idx = REG_MISSING;
3848
3849   if (left != NULL)
3850     left->parent = tree;
3851   if (right != NULL)
3852     right->parent = tree;
3853   return tree;
3854 }
3855
3856 /* Mark the tree SRC as an optional subexpression.
3857    To be called from preorder or postorder.  */
3858
3859 static reg_errcode_t
3860 mark_opt_subexp (void *extra, bin_tree_t *node)
3861 {
3862   Idx idx = (uintptr_t) extra;
3863   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3864     node->token.opt_subexp = 1;
3865
3866   return REG_NOERROR;
3867 }
3868
3869 /* Free the allocated memory inside NODE. */
3870
3871 static void
3872 free_token (re_token_t *node)
3873 {
3874 #ifdef RE_ENABLE_I18N
3875   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3876     free_charset (node->opr.mbcset);
3877   else
3878 #endif /* RE_ENABLE_I18N */
3879     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3880       re_free (node->opr.sbcset);
3881 }
3882
3883 /* Worker function for tree walking.  Free the allocated memory inside NODE
3884    and its children. */
3885
3886 static reg_errcode_t
3887 free_tree (void *extra, bin_tree_t *node)
3888 {
3889   free_token (&node->token);
3890   return REG_NOERROR;
3891 }
3892
3893
3894 /* Duplicate the node SRC, and return new node.  This is a preorder
3895    visit similar to the one implemented by the generic visitor, but
3896    we need more infrastructure to maintain two parallel trees --- so,
3897    it's easier to duplicate.  */
3898
3899 static bin_tree_t *
3900 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3901 {
3902   const bin_tree_t *node;
3903   bin_tree_t *dup_root;
3904   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3905
3906   for (node = root; ; )
3907     {
3908       /* Create a new tree and link it back to the current parent.  */
3909       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910       if (*p_new == NULL)
3911         return NULL;
3912       (*p_new)->parent = dup_node;
3913       (*p_new)->token.duplicated = 1;
3914       dup_node = *p_new;
3915
3916       /* Go to the left node, or up and to the right.  */
3917       if (node->left)
3918         {
3919           node = node->left;
3920           p_new = &dup_node->left;
3921         }
3922       else
3923         {
3924           const bin_tree_t *prev = NULL;
3925           while (node->right == prev || node->right == NULL)
3926             {
3927               prev = node;
3928               node = node->parent;
3929               dup_node = dup_node->parent;
3930               if (!node)
3931                 return dup_root;
3932             }
3933           node = node->right;
3934           p_new = &dup_node->right;
3935         }
3936     }
3937 }