lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
  18
     /* Forward declarations of the file-local (static) helpers, so that
        they can be called before their definitions appear (for example
        re_compile_internal is used by re_compile_pattern further down,
        ahead of its own definition).  */
  19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  20                                           size_t length, reg_syntax_t syntax);
  21 static void re_compile_fastmap_iter (regex_t *bufp,
  22                                      const re_dfastate_t *init_state,
  23                                      char *fastmap);
  24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  25 #ifdef RE_ENABLE_I18N
  26 static void free_charset (re_charset_t *cset);
  27 #endif /* RE_ENABLE_I18N */
  28 static void free_workarea_compile (regex_t *preg);
  29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  30 #ifdef RE_ENABLE_I18N
  31 static void optimize_utf8 (re_dfa_t *dfa);
  32 #endif
  33 static reg_errcode_t analyze (regex_t *preg);
     /* preorder and postorder each take a callback FN plus an opaque EXTRA
        pointer.  The functions declared after them with the parameter
        list (void *extra, bin_tree_t *node) have the matching callback
        signature.  */
  34 static reg_errcode_t preorder (bin_tree_t *root,
  35                                reg_errcode_t (fn (void *, bin_tree_t *)),
  36                                void *extra);
  37 static reg_errcode_t postorder (bin_tree_t *root,
  38                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                 void *extra);
  40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  43                                  bin_tree_t *node);
  44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  49                                    unsigned int constraint);
  50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  52                                          Idx node, bool root);
  53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  54 static Idx fetch_number (re_string_t *input, re_token_t *token,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax) internal_function;
     /* The parse_* functions that return a bin_tree_t * report their
        status through the trailing reg_errcode_t *ERR parameter.  */
  58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  59                           reg_syntax_t syntax, reg_errcode_t *err);
  60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  61                                   re_token_t *token, reg_syntax_t syntax,
  62                                   Idx nest, reg_errcode_t *err);
  63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  64                                  re_token_t *token, reg_syntax_t syntax,
  65                                  Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  67                                      re_token_t *token, reg_syntax_t syntax,
  68                                      Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  70                                   re_token_t *token, reg_syntax_t syntax,
  71                                   Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  73                                  re_dfa_t *dfa, re_token_t *token,
  74                                  reg_syntax_t syntax, reg_errcode_t *err);
  75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  76                                       re_token_t *token, reg_syntax_t syntax,
  77                                       reg_errcode_t *err);
  78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  79                                             re_string_t *regexp,
  80                                             re_token_t *token, int token_len,
  81                                             re_dfa_t *dfa,
  82                                             reg_syntax_t syntax,
  83                                             bool accept_hyphen);
  84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  85                                           re_string_t *regexp,
  86                                           re_token_t *token);
     /* build_equiv_class and build_charclass have two signatures: with
        RE_ENABLE_I18N they additionally take the multibyte charset MBCSET
        and an Idx * "alloc" argument (presumably the capacity of the
        corresponding array in MBCSET -- confirm at the definitions).  */
  87 #ifdef RE_ENABLE_I18N
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const unsigned char *class_name,
  97                                       reg_syntax_t syntax);
  98 #else  /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 100                                         const unsigned char *name);
 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                       bitset_t sbcset,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #endif /* not RE_ENABLE_I18N */
 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 107                                        RE_TRANSLATE_TYPE trans,
 108                                        const unsigned char *class_name,
 109                                        const unsigned char *extra,
 110                                        bool non_match, reg_errcode_t *err);
 111 static bin_tree_t *create_tree (re_dfa_t *dfa,
 112                                 bin_tree_t *left, bin_tree_t *right,
 113                                 re_token_type_t type);
 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 115                                       bin_tree_t *left, bin_tree_t *right,
 116                                       const re_token_t *token);
 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 118 static void free_token (re_token_t *node);
 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 121 \f
 122 /* This table gives an error message for each of the error codes listed
 123    in regex.h.  Obviously the order here has to be same as there.
 124    POSIX doesn't require that we do anything for REG_NOERROR,
 125    but why not be nice?

        Layout: all messages are packed back to back into one char array.
        Adjacent string literals are concatenated, and an explicit "\0"
        separates each message from the next (the last message is
        terminated by the NUL that ends the whole literal).

        Each REG_xxx_IDX macro is the byte offset of its message inside
        the array: the offset of the previous message plus sizeof of the
        previous message literal, which already counts that message's
        terminating NUL.  When a message text is edited, the sizeof
        operand in the following #define must be edited identically.  */
 126
 127 static const char __re_error_msgid[] =
 128   {
 129 #define REG_NOERROR_IDX 0
 130     gettext_noop ("Success")    /* REG_NOERROR */
 131     "\0"
 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 133     gettext_noop ("No match")   /* REG_NOMATCH */
 134     "\0"
 135 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 137     "\0"
 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 140     "\0"
 141 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 143     "\0"
 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 146     "\0"
 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 149     "\0"
 150 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 151     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 152     "\0"
 153 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 155     "\0"
 156 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 158     "\0"
 159 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 161     "\0"
 162 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 163     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 164     "\0"
 165 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 167     "\0"
 168 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 170     "\0"
 171 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 173     "\0"
 174 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 176     "\0"
 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 179   };
 180
     /* Maps an error code, used directly as the array index, to the offset
        of its message in __re_error_msgid.  Callers in this file look a
        message up as __re_error_msgid + __re_error_msgid_idx[code]; the
        number of entries is also the range check applied by regerror.  */
 181 static const size_t __re_error_msgid_idx[] =
 182   {
 183     REG_NOERROR_IDX,
 184     REG_NOMATCH_IDX,
 185     REG_BADPAT_IDX,
 186     REG_ECOLLATE_IDX,
 187     REG_ECTYPE_IDX,
 188     REG_EESCAPE_IDX,
 189     REG_ESUBREG_IDX,
 190     REG_EBRACK_IDX,
 191     REG_EPAREN_IDX,
 192     REG_EBRACE_IDX,
 193     REG_BADBR_IDX,
 194     REG_ERANGE_IDX,
 195     REG_ESPACE_IDX,
 196     REG_BADRPT_IDX,
 197     REG_EEND_IDX,
 198     REG_ESIZE_IDX,
 199     REG_ERPAREN_IDX
 200   };
 201 \f
 202 /* Entry points for GNU code.  */
 203
 204 /* re_compile_pattern is the GNU regular expression compiler: it
 205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 206    Returns 0 if the pattern was valid, otherwise an error string.
 207
 208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 209    are set in BUFP on entry.  */
 210
 211 #ifdef _LIBC
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 #else /* size_t might promote */
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 #endif
 222 {
 223   reg_errcode_t ret;
 224
 225   /* And GNU code determines whether or not to get register information
 226      by passing null for the REGS argument to re_match, etc., not by
 227      setting no_sub, unless RE_NO_SUB is set.  */
 228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 229
 230   /* Match anchors at newline.  */
 231   bufp->newline_anchor = 1;
 232
 233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 234
 235   if (!ret)
 236     return NULL;
 237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 238 }
 239 #ifdef _LIBC
 240 weak_alias (__re_compile_pattern, re_compile_pattern)
 241 #endif
 242
 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 244    also be assigned to arbitrarily: each pattern buffer stores its own
 245    syntax, so it can be changed between regex compilations.  */
 246 /* This has no initializer because initialized variables in Emacs
 247    become read-only after dumping.  */
 248 reg_syntax_t re_syntax_options;
 249
 250
 251 /* Specify the precise syntax of regexps for compilation.  This provides
 252    for compatibility for various utilities which historically have
 253    different, incompatible syntaxes.
 254
 255    The argument SYNTAX is a bit mask comprised of the various bits
 256    defined in regex.h.  We return the old syntax.  */
 257
 258 reg_syntax_t
 259 re_set_syntax (syntax)
 260     reg_syntax_t syntax;
 261 {
 262   reg_syntax_t ret = re_syntax_options;
 263
 264   re_syntax_options = syntax;
 265   return ret;
 266 }
 267 #ifdef _LIBC
 268 weak_alias (__re_set_syntax, re_set_syntax)
 269 #endif
 270
 271 int
 272 re_compile_fastmap (bufp)
 273     struct re_pattern_buffer *bufp;
 274 {
 275   re_dfa_t *dfa = bufp->buffer;
 276   char *fastmap = bufp->fastmap;
 277
 278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 280   if (dfa->init_state != dfa->init_state_word)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 282   if (dfa->init_state != dfa->init_state_nl)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 284   if (dfa->init_state != dfa->init_state_begbuf)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 286   bufp->fastmap_accurate = 1;
 287   return 0;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 291 #endif
 292
 293 static inline void
 294 __attribute ((always_inline))
 295 re_set_fastmap (char *fastmap, bool icase, int ch)
 296 {
 297   fastmap[ch] = 1;
 298   if (icase)
 299     fastmap[tolower (ch)] = 1;
 300 }
 301
 302 /* Helper function for re_compile_fastmap.
 303    Compile fastmap for the initial_state INIT_STATE.

        Walks the nodes of INIT_STATE and, depending on each node's type,
        marks entries of FASTMAP through re_set_fastmap.  If a node is
        OP_PERIOD, OP_UTF8_PERIOD (I18N builds) or END_OF_RE, every
        entry of FASTMAP is set and the walk stops immediately; for
        END_OF_RE, BUFP->can_be_null is set as well.  Node types not
        handled below leave FASTMAP untouched.  */
 304
 305 static void
 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 307                          char *fastmap)
 308 {
 309   re_dfa_t *dfa = bufp->buffer;
 310   Idx node_cnt;
       /* tolower-based folding in re_set_fastmap is requested only when
          the locale is single-byte; the multibyte paths below fold with
          towlower instead.  */
 311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 313     {
 314       Idx node = init_state->nodes.elems[node_cnt];
 315       re_token_type_t type = dfa->nodes[node].type;
 316
 317       if (type == CHARACTER)
 318         {
 319           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 320 #ifdef RE_ENABLE_I18N
               /* Multibyte, case-insensitive: gather this byte plus the
                  directly following CHARACTER nodes flagged mb_partial
                  (presumably the remaining bytes of the same multibyte
                  character -- confirm where mb_partial is set), convert
                  to a wide character, lowercase it, convert back and mark
                  the first byte of the result.  NODE is advanced by the
                  gathering loop but is not used again in this
                  iteration.  */
 321           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 322             {
 323               unsigned char buf[MB_LEN_MAX];
 324               unsigned char *p;
 325               wchar_t wc;
 326               mbstate_t state;
 327
 328               p = buf;
 329               *p++ = dfa->nodes[node].opr.c;
 330               while (++node < dfa->nodes_len
 331                      && dfa->nodes[node].type == CHARACTER
 332                      && dfa->nodes[node].mb_partial)
 333                 *p++ = dfa->nodes[node].opr.c;
 334               memset (&state, '\0', sizeof (state));
                   /* Only proceed if the gathered bytes decode as exactly
                      one complete character.  */
 335               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 336                              &state) == p - buf
 337                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 338                       != (size_t) -1))
 339                 re_set_fastmap (fastmap, false, buf[0]);
 340             }
 341 #endif
 342         }
 343       else if (type == SIMPLE_BRACKET)
 344         {
               /* CH counts bit positions across the whole bitset: bit J of
                  word I corresponds to byte I * BITSET_WORD_BITS + J.  */
 345           int i, ch;
 346           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 347             {
 348               int j;
 349               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 350               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 351                 if (w & ((bitset_word_t) 1 << j))
 352                   re_set_fastmap (fastmap, icase, ch);
 353             }
 354         }
 355 #ifdef RE_ENABLE_I18N
 356       else if (type == COMPLEX_BRACKET)
 357         {
 358           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 359           Idx i;
 360
 361 # ifdef _LIBC
 362           /* See if we have to try all bytes which start multiple collation
 363              elements.
 364              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 365                   collation element, and don't catch 'b' since 'b' is
 366                   the only collation element which starts from 'b' (and
 367                   it is caught by SIMPLE_BRACKET).  */
 368               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 369                   && (cset->ncoll_syms || cset->nranges))
 370                 {
                       /* Bytes whose collation table entry is negative are
                          the ones marked here.  */
 371                   const int32_t *table = (const int32_t *)
 372                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 373                   for (i = 0; i < SBC_MAX; ++i)
 374                     if (table[i] < 0)
 375                       re_set_fastmap (fastmap, icase, i);
 376                 }
 377 # endif /* _LIBC */
 378
 379           /* See if we have to start the match at all multibyte characters,
 380              i.e. where we would not find an invalid sequence.  This only
 381              applies to multibyte character sets; for single byte character
 382              sets, the SIMPLE_BRACKET again suffices.  */
 383           if (dfa->mb_cur_max > 1
 384               && (cset->nchar_classes || cset->non_match || cset->nranges
 385 # ifdef _LIBC
 386                   || cset->nequiv_classes
 387 # endif /* _LIBC */
 388                  ))
 389             {
                   /* Probe every byte value from a fresh conversion state:
                      a result of (size_t) -2 from mbrtowc means the byte is
                      an incomplete, i.e. leading, part of a multibyte
                      character.  C is an unsigned char, so the loop ends
                      when ++c wraps around to 0.  */
 390               unsigned char c = 0;
 391               do
 392                 {
 393                   mbstate_t mbs;
 394                   memset (&mbs, 0, sizeof (mbs));
 395                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 396                     re_set_fastmap (fastmap, false, (int) c);
 397                 }
 398               while (++c != 0);
 399             }
 400
 401           else
 402             {
 403               /* ... Else catch all bytes which can start the mbchars.  */
 404               for (i = 0; i < cset->nmbchars; ++i)
 405                 {
 406                   char buf[256];
 407                   mbstate_t state;
 408                   memset (&state, '\0', sizeof (state));
                       /* Mark the first byte of the character's encoding,
                          and, for case-insensitive multibyte matching, the
                          first byte of its lowercase form too.  */
 409                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 410                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 411                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 412                     {
 413                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 414                           != (size_t) -1)
 415                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 416                     }
 417                 }
 418             }
 419         }
 420 #endif /* RE_ENABLE_I18N */
 421       else if (type == OP_PERIOD
 422 #ifdef RE_ENABLE_I18N
 423                || type == OP_UTF8_PERIOD
 424 #endif /* RE_ENABLE_I18N */
 425                || type == END_OF_RE)
 426         {
               /* Nothing can be excluded: set the whole map and stop
                  looking at the remaining nodes.  */
 427           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 428           if (type == END_OF_RE)
 429             bufp->can_be_null = 1;
 430           return;
 431         }
 432     }
 433 }
 434 \f
 435 /* Entry point for POSIX code.  */
 436 /* regcomp takes a regular expression as a string and compiles it.
 437
 438    PREG is a regex_t *.  We do not expect any fields to be initialized,
 439    since POSIX says we shouldn't.  Thus, we set
 440
 441      'buffer' to the compiled pattern;
 442      'used' to the length of the compiled pattern;
 443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 445        RE_SYNTAX_POSIX_BASIC;
 446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 447      'fastmap' to an allocated space for the fastmap;
 448      'fastmap_accurate' to zero;
 449      're_nsub' to the number of subexpressions in PATTERN.
 450
 451    PATTERN is the address of the pattern string.
 452
 453    CFLAGS is a series of bits which affect compilation.
 454
 455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 456      use POSIX basic syntax.
 457
 458      If REG_NEWLINE is set, then . and [^...] don't match newline.
 459      Also, regexec will try a match beginning after every newline.
 460
 461      If REG_ICASE is set, then we considers upper- and lowercase
 462      versions of letters to be equivalent when matching.
 463
 464      If REG_NOSUB is set, then when PREG is passed to regexec, that
 465      routine will report only success or failure, and nothing about the
 466      registers.
 467
 468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 469    the return codes and their meanings.)  */
 470
 471 int
 472 regcomp (preg, pattern, cflags)
 473     regex_t *_Restrict_ preg;
 474     const char *_Restrict_ pattern;
 475     int cflags;
 476 {
 477   reg_errcode_t ret;
 478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 479                          : RE_SYNTAX_POSIX_BASIC);
 480
 481   preg->buffer = NULL;
 482   preg->allocated = 0;
 483   preg->used = 0;
 484
 485   /* Try to allocate space for the fastmap.  */
 486   preg->fastmap = re_malloc (char, SBC_MAX);
 487   if (BE (preg->fastmap == NULL, 0))
 488     return REG_ESPACE;
 489
 490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 491
 492   /* If REG_NEWLINE is set, newlines are treated differently.  */
 493   if (cflags & REG_NEWLINE)
 494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 495       syntax &= ~RE_DOT_NEWLINE;
 496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 497       /* It also changes the matching behavior.  */
 498       preg->newline_anchor = 1;
 499     }
 500   else
 501     preg->newline_anchor = 0;
 502   preg->no_sub = !!(cflags & REG_NOSUB);
 503   preg->translate = NULL;
 504
 505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 506
 507   /* POSIX doesn't distinguish between an unmatched open-group and an
 508      unmatched close-group: both are REG_EPAREN.  */
 509   if (ret == REG_ERPAREN)
 510     ret = REG_EPAREN;
 511
 512   /* We have already checked preg->fastmap != NULL.  */
 513   if (BE (ret == REG_NOERROR, 1))
 514     /* Compute the fastmap now, since regexec cannot modify the pattern
 515        buffer.  This function never fails in this implementation.  */
 516     (void) re_compile_fastmap (preg);
 517   else
 518     {
 519       /* Some error occurred while compiling the expression.  */
 520       re_free (preg->fastmap);
 521       preg->fastmap = NULL;
 522     }
 523
 524   return (int) ret;
 525 }
 526 #ifdef _LIBC
 527 weak_alias (__regcomp, regcomp)
 528 #endif
 529
 530 /* Returns a message corresponding to an error code, ERRCODE, returned
 531    from either regcomp or regexec.   We don't use PREG here.  */
 532
 533 #ifdef _LIBC
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *_Restrict_ preg;
 538     char *_Restrict_ errbuf;
 539     size_t errbuf_size;
 540 #else /* size_t might promote */
 541 size_t
 542 regerror (int errcode, const regex_t *_Restrict_ preg,
 543           char *_Restrict_ errbuf, size_t errbuf_size)
 544 #endif
 545 {
 546   const char *msg;
 547   size_t msg_size;
 548
 549   if (BE (errcode < 0
 550           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 551                                / sizeof (__re_error_msgid_idx[0])), 0))
 552     /* Only error codes returned by the rest of the code should be passed
 553        to this routine.  If we are given anything else, or if other regex
 554        code generates an invalid error code, then the program has a bug.
 555        Dump core so we can fix it.  */
 556     abort ();
 557
 558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 559
 560   msg_size = strlen (msg) + 1; /* Includes the null.  */
 561
 562   if (BE (errbuf_size != 0, 1))
 563     {
 564       size_t cpy_size = msg_size;
 565       if (BE (msg_size > errbuf_size, 0))
 566         {
 567           cpy_size = errbuf_size - 1;
 568           errbuf[cpy_size] = '\0';
 569         }
 570       memcpy (errbuf, msg, cpy_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.

        Because this object is shared and not heap-allocated,
        free_dfa_content compares dfa->sb_char against it and skips the
        free when they are the same.  */
 585 static const bitset_t utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # ifdef __GNUC__
       /* GNU C range designator: fill words 0 through
          0x80 / BITSET_WORD_BITS - 1 with all ones.  */
 589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590 # else
       /* Portable fallback: emit 3, 2, 1 or 0 all-ones words depending on
          how BITSET_WORD_BITS compares with ASCII_CHARS, followed by the
          final word below.  A bitset_word_t too narrow for four words to
          reach ASCII_CHARS bits is rejected at compile time.  */
 591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #   error "bitset_word_t is narrower than 32 bits"
 593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 #  endif
       /* Final word: all ones, shifted right when SBC_MAX is not a
          multiple of BITSET_WORD_BITS so that only the low bits stay set.
          Elements without an initializer are zero.  */
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 # endif
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
     /* Releases whatever the BSD pattern buffer re_comp_buf still owns by
        handing it to __regfree.  NOTE(review): presumably libc_freeres_fn
        registers this function to run when libc releases its resources --
        confirm against the macro's definition.  */
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   const char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   dfa->word_ops_used = 1;
 954   int i = 0;
 955   int j;
 956   int ch = 0;
 957   if (BE (dfa->map_notascii == 0, 1))
 958     {
 959       bitset_word_t bits0 = 0x00000000;
 960       bitset_word_t bits1 = 0x03ff0000;
 961       bitset_word_t bits2 = 0x87fffffe;
 962       bitset_word_t bits3 = 0x07fffffe;
 963       if (BITSET_WORD_BITS == 64)
 964         {
 965           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 966           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 967           i = 2;
 968         }
 969       else if (BITSET_WORD_BITS == 32)
 970         {
 971           dfa->word_char[0] = bits0;
 972           dfa->word_char[1] = bits1;
 973           dfa->word_char[2] = bits2;
 974           dfa->word_char[3] = bits3;
 975           i = 4;
 976         }
 977       else
 978         goto general_case;
 979       ch = 128;
 980
 981       if (BE (dfa->is_utf8, 1))
 982         {
 983           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 984           return;
 985         }
 986     }
 987
 988  general_case:
 989   for (; i < BITSET_WORDS; ++i)
 990     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 991       if (isalnum (ch) || ch == '_')
 992         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 993 }
 994
 995 /* Free the work area which are only used while compiling.  */
 996
 997 static void
 998 free_workarea_compile (regex_t *preg)
 999 {
1000   re_dfa_t *dfa = preg->buffer;
1001   bin_tree_storage_t *storage, *next;
1002   for (storage = dfa->str_tree_storage; storage; storage = next)
1003     {
1004       next = storage->next;
1005       re_free (storage);
1006     }
1007   dfa->str_tree_storage = NULL;
1008   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1009   dfa->str_tree = NULL;
1010   re_free (dfa->org_indices);
1011   dfa->org_indices = NULL;
1012 }
1013
1014 /* Create initial states for all contexts.  */
1015
1016 static reg_errcode_t
1017 create_initial_state (re_dfa_t *dfa)
1018 {
1019   Idx first, i;
1020   reg_errcode_t err;
1021   re_node_set init_nodes;
1022
1023   /* Initial states have the epsilon closure of the node which is
1024      the first node of the regular expression.  */
1025   first = dfa->str_tree->first->node_idx;
1026   dfa->init_node = first;
1027   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1028   if (BE (err != REG_NOERROR, 0))
1029     return err;
1030
1031   /* The back-references which are in initial states can epsilon transit,
1032      since in this case all of the subexpressions can be null.
1033      Then we add epsilon closures of the nodes which are the next nodes of
1034      the back-references.  */
1035   if (dfa->nbackref > 0)
1036     for (i = 0; i < init_nodes.nelem; ++i)
1037       {
1038         Idx node_idx = init_nodes.elems[i];
1039         re_token_type_t type = dfa->nodes[node_idx].type;
1040
1041         Idx clexp_idx;
1042         if (type != OP_BACK_REF)
1043           continue;
1044         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1045           {
1046             re_token_t *clexp_node;
1047             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1048             if (clexp_node->type == OP_CLOSE_SUBEXP
1049                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1050               break;
1051           }
1052         if (clexp_idx == init_nodes.nelem)
1053           continue;
1054
1055         if (type == OP_BACK_REF)
1056           {
1057             Idx dest_idx = dfa->edests[node_idx].elems[0];
1058             if (!re_node_set_contains (&init_nodes, dest_idx))
1059               {
1060                 reg_errcode_t merge_err
1061                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1062                 if (merge_err != REG_NOERROR)
1063                   return merge_err;
1064                 i = 0;
1065               }
1066           }
1067       }
1068
1069   /* It must be the first time to invoke acquire_state.  */
1070   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1071   /* We don't check ERR here, since the initial state must not be NULL.  */
1072   if (BE (dfa->init_state == NULL, 0))
1073     return err;
1074   if (dfa->init_state->has_constraint)
1075     {
1076       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1077                                                        CONTEXT_WORD);
1078       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1079                                                      CONTEXT_NEWLINE);
1080       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1081                                                          &init_nodes,
1082                                                          CONTEXT_NEWLINE
1083                                                          | CONTEXT_BEGBUF);
1084       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1085               || dfa->init_state_begbuf == NULL, 0))
1086         return err;
1087     }
1088   else
1089     dfa->init_state_word = dfa->init_state_nl
1090       = dfa->init_state_begbuf = dfa->init_state;
1091
1092   re_node_set_free (&init_nodes);
1093   return REG_NOERROR;
1094 }
1095 \f
1096 #ifdef RE_ENABLE_I18N
1097 /* If it is possible to do searching in single byte encoding instead of UTF-8
1098    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1099    DFA nodes where needed.  */
1100
1101 static void
1102 optimize_utf8 (re_dfa_t *dfa)
1103 {
1104   Idx node;
1105   int i;
1106   bool mb_chars = false;
1107   bool has_period = false;
1108
1109   for (node = 0; node < dfa->nodes_len; ++node)
1110     switch (dfa->nodes[node].type)
1111       {
1112       case CHARACTER:
1113         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1114           mb_chars = true;
1115         break;
1116       case ANCHOR:
1117         switch (dfa->nodes[node].opr.ctx_type)
1118           {
1119           case LINE_FIRST:
1120           case LINE_LAST:
1121           case BUF_FIRST:
1122           case BUF_LAST:
1123             break;
1124           default:
1125             /* Word anchors etc. cannot be handled.  It's okay to test
1126                opr.ctx_type since constraints (for all DFA nodes) are
1127                created by ORing one or more opr.ctx_type values.  */
1128             return;
1129           }
1130         break;
1131       case OP_PERIOD:
1132         has_period = true;
1133         break;
1134       case OP_BACK_REF:
1135       case OP_ALT:
1136       case END_OF_RE:
1137       case OP_DUP_ASTERISK:
1138       case OP_OPEN_SUBEXP:
1139       case OP_CLOSE_SUBEXP:
1140         break;
1141       case COMPLEX_BRACKET:
1142         return;
1143       case SIMPLE_BRACKET:
1144         /* Just double check.  */
1145         {
1146           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1147                         ? 0
1148                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1149           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1150             {
1151               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1152                 return;
1153               rshift = 0;
1154             }
1155         }
1156         break;
1157       default:
1158         abort ();
1159       }
1160
1161   if (mb_chars || has_period)
1162     for (node = 0; node < dfa->nodes_len; ++node)
1163       {
1164         if (dfa->nodes[node].type == CHARACTER
1165             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1166           dfa->nodes[node].mb_partial = 0;
1167         else if (dfa->nodes[node].type == OP_PERIOD)
1168           dfa->nodes[node].type = OP_UTF8_PERIOD;
1169       }
1170
1171   /* The search can be in single byte locale.  */
1172   dfa->mb_cur_max = 1;
1173   dfa->is_utf8 = 0;
1174   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1175 }
1176 #endif
1177 \f
1178 /* Analyze the structure tree, and calculate "first", "next", "edest",
1179    "eclosure", and "inveclosure".  */
1180
1181 static reg_errcode_t
1182 analyze (regex_t *preg)
1183 {
1184   re_dfa_t *dfa = preg->buffer;
1185   reg_errcode_t ret;
1186
1187   /* Allocate arrays.  */
1188   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1189   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1190   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1191   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1192   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1193           || dfa->eclosures == NULL, 0))
1194     return REG_ESPACE;
1195
1196   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1197   if (dfa->subexp_map != NULL)
1198     {
1199       Idx i;
1200       for (i = 0; i < preg->re_nsub; i++)
1201         dfa->subexp_map[i] = i;
1202       preorder (dfa->str_tree, optimize_subexps, dfa);
1203       for (i = 0; i < preg->re_nsub; i++)
1204         if (dfa->subexp_map[i] != i)
1205           break;
1206       if (i == preg->re_nsub)
1207         {
1208           free (dfa->subexp_map);
1209           dfa->subexp_map = NULL;
1210         }
1211     }
1212
1213   ret = postorder (dfa->str_tree, lower_subexps, preg);
1214   if (BE (ret != REG_NOERROR, 0))
1215     return ret;
1216   ret = postorder (dfa->str_tree, calc_first, dfa);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   preorder (dfa->str_tree, calc_next, dfa);
1220   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1221   if (BE (ret != REG_NOERROR, 0))
1222     return ret;
1223   ret = calc_eclosure (dfa);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226
1227   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1228      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1229   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1230       || dfa->nbackref)
1231     {
1232       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1233       if (BE (dfa->inveclosures == NULL, 0))
1234         return REG_ESPACE;
1235       ret = calc_inveclosure (dfa);
1236     }
1237
1238   return ret;
1239 }
1240
1241 /* Our parse trees are very unbalanced, so we cannot use a stack to
1242    implement parse tree visits.  Instead, we use parent pointers and
1243    some hairy code in these two functions.  */
1244 static reg_errcode_t
1245 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246            void *extra)
1247 {
1248   bin_tree_t *node, *prev;
1249
1250   for (node = root; ; )
1251     {
1252       /* Descend down the tree, preferably to the left (or to the right
1253          if that's the only child).  */
1254       while (node->left || node->right)
1255         if (node->left)
1256           node = node->left;
1257         else
1258           node = node->right;
1259
1260       do
1261         {
1262           reg_errcode_t err = fn (extra, node);
1263           if (BE (err != REG_NOERROR, 0))
1264             return err;
1265           if (node->parent == NULL)
1266             return REG_NOERROR;
1267           prev = node;
1268           node = node->parent;
1269         }
1270       /* Go up while we have a node that is reached from the right.  */
1271       while (node->right == prev || node->right == NULL);
1272       node = node->right;
1273     }
1274 }
1275
1276 static reg_errcode_t
1277 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1278           void *extra)
1279 {
1280   bin_tree_t *node;
1281
1282   for (node = root; ; )
1283     {
1284       reg_errcode_t err = fn (extra, node);
1285       if (BE (err != REG_NOERROR, 0))
1286         return err;
1287
1288       /* Go to the left node, or up and to the right.  */
1289       if (node->left)
1290         node = node->left;
1291       else
1292         {
1293           bin_tree_t *prev = NULL;
1294           while (node->right == prev || node->right == NULL)
1295             {
1296               prev = node;
1297               node = node->parent;
1298               if (!node)
1299                 return REG_NOERROR;
1300             }
1301           node = node->right;
1302         }
1303     }
1304 }
1305
1306 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1307    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1308    backreferences as well.  Requires a preorder visit.  */
1309 static reg_errcode_t
1310 optimize_subexps (void *extra, bin_tree_t *node)
1311 {
1312   re_dfa_t *dfa = (re_dfa_t *) extra;
1313
1314   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1315     {
1316       int idx = node->token.opr.idx;
1317       node->token.opr.idx = dfa->subexp_map[idx];
1318       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1319     }
1320
1321   else if (node->token.type == SUBEXP
1322            && node->left && node->left->token.type == SUBEXP)
1323     {
1324       Idx other_idx = node->left->token.opr.idx;
1325
1326       node->left = node->left->left;
1327       if (node->left)
1328         node->left->parent = node;
1329
1330       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1331       if (other_idx < BITSET_WORD_BITS)
1332         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1333     }
1334
1335   return REG_NOERROR;
1336 }
1337
1338 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1339    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1340 static reg_errcode_t
1341 lower_subexps (void *extra, bin_tree_t *node)
1342 {
1343   regex_t *preg = (regex_t *) extra;
1344   reg_errcode_t err = REG_NOERROR;
1345
1346   if (node->left && node->left->token.type == SUBEXP)
1347     {
1348       node->left = lower_subexp (&err, preg, node->left);
1349       if (node->left)
1350         node->left->parent = node;
1351     }
1352   if (node->right && node->right->token.type == SUBEXP)
1353     {
1354       node->right = lower_subexp (&err, preg, node->right);
1355       if (node->right)
1356         node->right->parent = node;
1357     }
1358
1359   return err;
1360 }
1361
1362 static bin_tree_t *
1363 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1364 {
1365   re_dfa_t *dfa = preg->buffer;
1366   bin_tree_t *body = node->left;
1367   bin_tree_t *op, *cls, *tree1, *tree;
1368
1369   if (preg->no_sub
1370       /* We do not optimize empty subexpressions, because otherwise we may
1371          have bad CONCAT nodes with NULL children.  This is obviously not
1372          very common, so we do not lose much.  An example that triggers
1373          this case is the sed "script" /\(\)/x.  */
1374       && node->left != NULL
1375       && (node->token.opr.idx >= BITSET_WORD_BITS
1376           || !(dfa->used_bkref_map
1377                & ((bitset_word_t) 1 << node->token.opr.idx))))
1378     return node->left;
1379
1380   /* Convert the SUBEXP node to the concatenation of an
1381      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1382   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1383   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1384   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1385   tree = create_tree (dfa, op, tree1, CONCAT);
1386   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1387     {
1388       *err = REG_ESPACE;
1389       return NULL;
1390     }
1391
1392   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1393   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1394   return tree;
1395 }
1396
1397 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1398    nodes.  Requires a postorder visit.  */
1399 static reg_errcode_t
1400 calc_first (void *extra, bin_tree_t *node)
1401 {
1402   re_dfa_t *dfa = (re_dfa_t *) extra;
1403   if (node->token.type == CONCAT)
1404     {
1405       node->first = node->left->first;
1406       node->node_idx = node->left->node_idx;
1407     }
1408   else
1409     {
1410       node->first = node;
1411       node->node_idx = re_dfa_add_node (dfa, node->token);
1412       if (BE (node->node_idx == REG_MISSING, 0))
1413         return REG_ESPACE;
1414       if (node->token.type == ANCHOR)
1415         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1416     }
1417   return REG_NOERROR;
1418 }
1419
1420 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1421 static reg_errcode_t
1422 calc_next (void *extra, bin_tree_t *node)
1423 {
1424   switch (node->token.type)
1425     {
1426     case OP_DUP_ASTERISK:
1427       node->left->next = node;
1428       break;
1429     case CONCAT:
1430       node->left->next = node->right->first;
1431       node->right->next = node->next;
1432       break;
1433     default:
1434       if (node->left)
1435         node->left->next = node->next;
1436       if (node->right)
1437         node->right->next = node->next;
1438       break;
1439     }
1440   return REG_NOERROR;
1441 }
1442
1443 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1444 static reg_errcode_t
1445 link_nfa_nodes (void *extra, bin_tree_t *node)
1446 {
1447   re_dfa_t *dfa = (re_dfa_t *) extra;
1448   Idx idx = node->node_idx;
1449   reg_errcode_t err = REG_NOERROR;
1450
1451   switch (node->token.type)
1452     {
1453     case CONCAT:
1454       break;
1455
1456     case END_OF_RE:
1457       assert (node->next == NULL);
1458       break;
1459
1460     case OP_DUP_ASTERISK:
1461     case OP_ALT:
1462       {
1463         Idx left, right;
1464         dfa->has_plural_match = 1;
1465         if (node->left != NULL)
1466           left = node->left->first->node_idx;
1467         else
1468           left = node->next->node_idx;
1469         if (node->right != NULL)
1470           right = node->right->first->node_idx;
1471         else
1472           right = node->next->node_idx;
1473         assert (REG_VALID_INDEX (left));
1474         assert (REG_VALID_INDEX (right));
1475         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1476       }
1477       break;
1478
1479     case ANCHOR:
1480     case OP_OPEN_SUBEXP:
1481     case OP_CLOSE_SUBEXP:
1482       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1483       break;
1484
1485     case OP_BACK_REF:
1486       dfa->nexts[idx] = node->next->node_idx;
1487       if (node->token.type == OP_BACK_REF)
1488         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1489       break;
1490
1491     default:
1492       assert (!IS_EPSILON_NODE (node->token.type));
1493       dfa->nexts[idx] = node->next->node_idx;
1494       break;
1495     }
1496
1497   return err;
1498 }
1499
/* Duplicate the epsilon closure of the node ROOT_NODE.
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
   to their own constraint.

   The walk starts at the pair TOP_ORG_NODE (original) and TOP_CLONE_NODE
   (its copy).  At each step the destination of the original is cloned
   with duplicate_node and recorded in the clone's edests, then the walk
   moves on to that destination.  It stops at the first node that has no
   epsilon destination, or when a single-destination node turns out to be
   ROOT_NODE again.  For a node with two destinations the first one is
   handled by a recursive call and the loop continues along the second.

   Return REG_NOERROR on success, REG_ESPACE if a node cannot be
   duplicated or inserted, or the error from the recursive call.  */

static reg_errcode_t
internal_function
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
                        Idx root_node, unsigned int init_constraint)
{
  Idx org_node, clone_node;
  bool ok;
  /* Constraint accumulated so far along the path being duplicated.  */
  unsigned int constraint = init_constraint;
  for (org_node = top_org_node, clone_node = top_clone_node;;)
    {
      Idx org_dest, clone_dest;
      if (dfa->nodes[org_node].type == OP_BACK_REF)
        {
          /* If the back reference epsilon-transit, its destination must
             also have the constraint.  Then duplicate the epsilon closure
             of the destination of the back reference, and store it in
             edests of the back reference.  */
          org_dest = dfa->nexts[org_node];
          re_node_set_empty (dfa->edests + clone_node);
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (BE (clone_dest == REG_MISSING, 0))
            return REG_ESPACE;
          dfa->nexts[clone_node] = dfa->nexts[org_node];
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (BE (! ok, 0))
            return REG_ESPACE;
        }
      else if (dfa->edests[org_node].nelem == 0)
        {
          /* In case of the node can't epsilon-transit, don't duplicate the
             destination and store the original destination as the
             destination of the node.  */
          dfa->nexts[clone_node] = dfa->nexts[org_node];
          break;
        }
      else if (dfa->edests[org_node].nelem == 1)
        {
          /* In case of the node can epsilon-transit, and it has only one
             destination.  */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
          /* If the node is root_node itself, it means the epsilon closure
             has a loop.  Then tie it to the destination of the root_node.  */
          if (org_node == root_node && clone_node != org_node)
            {
              ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
              if (BE (! ok, 0))
                return REG_ESPACE;
              break;
            }
          /* In case the node has another constraint, append it.  */
          constraint |= dfa->nodes[org_node].constraint;
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (BE (clone_dest == REG_MISSING, 0))
            return REG_ESPACE;
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (BE (! ok, 0))
            return REG_ESPACE;
        }
      else /* dfa->edests[org_node].nelem == 2 */
        {
          /* In case of the node can epsilon-transit, and it has two
             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
          /* Search for a duplicated node which satisfies the constraint.  */
          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
          if (clone_dest == REG_MISSING)
            {
              /* There is no such duplicated node, create a new one.  */
              reg_errcode_t err;
              clone_dest = duplicate_node (dfa, org_dest, constraint);
              if (BE (clone_dest == REG_MISSING, 0))
                return REG_ESPACE;
              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (BE (! ok, 0))
                return REG_ESPACE;
              /* Duplicate everything reachable through the first
                 destination before going on with the second.  */
              err = duplicate_node_closure (dfa, org_dest, clone_dest,
                                            root_node, constraint);
              if (BE (err != REG_NOERROR, 0))
                return err;
            }
          else
            {
              /* There is a duplicated node which satisfies the constraint,
                 use it to avoid infinite loop.  */
              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (BE (! ok, 0))
                return REG_ESPACE;
            }

          /* The second destination is always cloned afresh; the loop
             continues from it.  */
          org_dest = dfa->edests[org_node].elems[1];
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (BE (clone_dest == REG_MISSING, 0))
            return REG_ESPACE;
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (BE (! ok, 0))
            return REG_ESPACE;
        }
      /* Step to the destination pair handled last.  */
      org_node = org_dest;
      clone_node = clone_dest;
    }
  return REG_NOERROR;
}
1608
1609 /* Search for a node which is duplicated from the node ORG_NODE, and
1610    satisfies the constraint CONSTRAINT.  */
1611
1612 static Idx
1613 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1614                         unsigned int constraint)
1615 {
1616   Idx idx;
1617   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1618     {
1619       if (org_node == dfa->org_indices[idx]
1620           && constraint == dfa->nodes[idx].constraint)
1621         return idx; /* Found.  */
1622     }
1623   return REG_MISSING; /* Not found.  */
1624 }
1625
1626 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1627    Return the index of the new node, or REG_MISSING if insufficient storage is
1628    available.  */
1629
1630 static Idx
1631 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1632 {
1633   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1634   if (BE (dup_idx != REG_MISSING, 1))
1635     {
1636       dfa->nodes[dup_idx].constraint = constraint;
1637       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1638       dfa->nodes[dup_idx].duplicated = 1;
1639
1640       /* Store the index of the original node.  */
1641       dfa->org_indices[dup_idx] = org_idx;
1642     }
1643   return dup_idx;
1644 }
1645
1646 static reg_errcode_t
1647 calc_inveclosure (re_dfa_t *dfa)
1648 {
1649   Idx src, idx;
1650   bool ok;
1651   for (idx = 0; idx < dfa->nodes_len; ++idx)
1652     re_node_set_init_empty (dfa->inveclosures + idx);
1653
1654   for (src = 0; src < dfa->nodes_len; ++src)
1655     {
1656       Idx *elems = dfa->eclosures[src].elems;
1657       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1658         {
1659           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1660           if (BE (! ok, 0))
1661             return REG_ESPACE;
1662         }
1663     }
1664
1665   return REG_NOERROR;
1666 }
1667
/* Calculate "eclosure" for all the nodes in DFA.

   Nodes are visited in index order, and a node whose eclosures entry
   already has a nonzero NELEM is skipped.  If calc_eclosure_iter returns
   without having filled in the entry of the node it was asked about, the
   pass is marked incomplete; once the end of the node array is reached
   the scan then starts over from node 0.  This repeats until a full pass
   leaves nothing incomplete.

   Return REG_NOERROR, or the error reported by calc_eclosure_iter.  */

static reg_errcode_t
calc_eclosure (re_dfa_t *dfa)
{
  Idx node_idx;
  bool incomplete;
#ifdef DEBUG
  assert (dfa->nodes_len > 0);
#endif
  incomplete = false;
  /* For each nodes, calculate epsilon closure.  */
  for (node_idx = 0; ; ++node_idx)
    {
      reg_errcode_t err;
      re_node_set eclosure_elem;
      /* nodes_len is re-read on every iteration.
         NOTE(review): presumably because calc_eclosure_iter can add
         duplicated nodes to the DFA -- confirm.  */
      if (node_idx == dfa->nodes_len)
        {
          if (!incomplete)
            break;
          /* Something was left unfinished: run another pass, starting
             with node 0 in this very iteration.  */
          incomplete = false;
          node_idx = 0;
        }

#ifdef DEBUG
      assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
#endif

      /* If we have already calculated, skip it.  */
      if (dfa->eclosures[node_idx].nelem != 0)
        continue;
      /* Calculate epsilon closure of 'node_idx'.  */
      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
      if (BE (err != REG_NOERROR, 0))
        return err;

      /* The entry is still empty: discard the returned set and remember
         that another pass is needed.  */
      if (dfa->eclosures[node_idx].nelem == 0)
        {
          incomplete = true;
          re_node_set_free (&eclosure_elem);
        }
    }
  return REG_NOERROR;
}
1712
1713 /* Calculate epsilon closure of NODE.  */
1714
1715 static reg_errcode_t
1716 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1717 {
1718   reg_errcode_t err;
1719   Idx i;
1720   re_node_set eclosure;
1721   bool ok;
1722   bool incomplete = false;
1723   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1724   if (BE (err != REG_NOERROR, 0))
1725     return err;
1726
1727   /* This indicates that we are calculating this node now.
1728      We reference this value to avoid infinite loop.  */
1729   dfa->eclosures[node].nelem = REG_MISSING;
1730
1731   /* If the current node has constraints, duplicate all nodes
1732      since they must inherit the constraints.  */
1733   if (dfa->nodes[node].constraint
1734       && dfa->edests[node].nelem
1735       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1736     {
1737       err = duplicate_node_closure (dfa, node, node, node,
1738                                     dfa->nodes[node].constraint);
1739       if (BE (err != REG_NOERROR, 0))
1740         return err;
1741     }
1742
1743   /* Expand each epsilon destination nodes.  */
1744   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1745     for (i = 0; i < dfa->edests[node].nelem; ++i)
1746       {
1747         re_node_set eclosure_elem;
1748         Idx edest = dfa->edests[node].elems[i];
1749         /* If calculating the epsilon closure of 'edest' is in progress,
1750            return intermediate result.  */
1751         if (dfa->eclosures[edest].nelem == REG_MISSING)
1752           {
1753             incomplete = true;
1754             continue;
1755           }
1756         /* If we haven't calculated the epsilon closure of 'edest' yet,
1757            calculate now. Otherwise use calculated epsilon closure.  */
1758         if (dfa->eclosures[edest].nelem == 0)
1759           {
1760             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1761             if (BE (err != REG_NOERROR, 0))
1762               return err;
1763           }
1764         else
1765           eclosure_elem = dfa->eclosures[edest];
1766         /* Merge the epsilon closure of 'edest'.  */
1767         err = re_node_set_merge (&eclosure, &eclosure_elem);
1768         if (BE (err != REG_NOERROR, 0))
1769           return err;
1770         /* If the epsilon closure of 'edest' is incomplete,
1771            the epsilon closure of this node is also incomplete.  */
1772         if (dfa->eclosures[edest].nelem == 0)
1773           {
1774             incomplete = true;
1775             re_node_set_free (&eclosure_elem);
1776           }
1777       }
1778
1779   /* An epsilon closure includes itself.  */
1780   ok = re_node_set_insert (&eclosure, node);
1781   if (BE (! ok, 0))
1782     return REG_ESPACE;
1783   if (incomplete && !root)
1784     dfa->eclosures[node].nelem = 0;
1785   else
1786     dfa->eclosures[node] = eclosure;
1787   *new_set = eclosure;
1788   return REG_NOERROR;
1789 }
1790 \f
1791 /* Functions for token which are used in the parser.  */
1792
1793 /* Fetch a token from INPUT.
1794    We must not use this function inside bracket expressions.  */
1795
1796 static void
1797 internal_function
1798 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1799 {
1800   re_string_skip_bytes (input, peek_token (result, input, syntax));
1801 }
1802
1803 /* Peek a token from INPUT, and return the length of the token.
1804    We must not use this function inside bracket expressions.  */
1805
1806 static int
1807 internal_function
1808 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1809 {
1810   unsigned char c;
1811
1812   if (re_string_eoi (input))
1813     {
1814       token->type = END_OF_RE;
1815       return 0;
1816     }
1817
1818   c = re_string_peek_byte (input, 0);
1819   token->opr.c = c;
1820
1821   token->word_char = 0;
1822 #ifdef RE_ENABLE_I18N
1823   token->mb_partial = 0;
1824   if (input->mb_cur_max > 1 &&
1825       !re_string_first_byte (input, re_string_cur_idx (input)))
1826     {
1827       token->type = CHARACTER;
1828       token->mb_partial = 1;
1829       return 1;
1830     }
1831 #endif
1832   if (c == '\\')
1833     {
1834       unsigned char c2;
1835       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1836         {
1837           token->type = BACK_SLASH;
1838           return 1;
1839         }
1840
1841       c2 = re_string_peek_byte_case (input, 1);
1842       token->opr.c = c2;
1843       token->type = CHARACTER;
1844 #ifdef RE_ENABLE_I18N
1845       if (input->mb_cur_max > 1)
1846         {
1847           wint_t wc = re_string_wchar_at (input,
1848                                           re_string_cur_idx (input) + 1);
1849           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1850         }
1851       else
1852 #endif
1853         token->word_char = IS_WORD_CHAR (c2) != 0;
1854
1855       switch (c2)
1856         {
1857         case '|':
1858           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1859             token->type = OP_ALT;
1860           break;
1861         case '1': case '2': case '3': case '4': case '5':
1862         case '6': case '7': case '8': case '9':
1863           if (!(syntax & RE_NO_BK_REFS))
1864             {
1865               token->type = OP_BACK_REF;
1866               token->opr.idx = c2 - '1';
1867             }
1868           break;
1869         case '<':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             {
1872               token->type = ANCHOR;
1873               token->opr.ctx_type = WORD_FIRST;
1874             }
1875           break;
1876         case '>':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             {
1879               token->type = ANCHOR;
1880               token->opr.ctx_type = WORD_LAST;
1881             }
1882           break;
1883         case 'b':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = WORD_DELIM;
1888             }
1889           break;
1890         case 'B':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = NOT_WORD_DELIM;
1895             }
1896           break;
1897         case 'w':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             token->type = OP_WORD;
1900           break;
1901         case 'W':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             token->type = OP_NOTWORD;
1904           break;
1905         case 's':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_SPACE;
1908           break;
1909         case 'S':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_NOTSPACE;
1912           break;
1913         case '`':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             {
1916               token->type = ANCHOR;
1917               token->opr.ctx_type = BUF_FIRST;
1918             }
1919           break;
1920         case '\'':
1921           if (!(syntax & RE_NO_GNU_OPS))
1922             {
1923               token->type = ANCHOR;
1924               token->opr.ctx_type = BUF_LAST;
1925             }
1926           break;
1927         case '(':
1928           if (!(syntax & RE_NO_BK_PARENS))
1929             token->type = OP_OPEN_SUBEXP;
1930           break;
1931         case ')':
1932           if (!(syntax & RE_NO_BK_PARENS))
1933             token->type = OP_CLOSE_SUBEXP;
1934           break;
1935         case '+':
1936           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937             token->type = OP_DUP_PLUS;
1938           break;
1939         case '?':
1940           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941             token->type = OP_DUP_QUESTION;
1942           break;
1943         case '{':
1944           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945             token->type = OP_OPEN_DUP_NUM;
1946           break;
1947         case '}':
1948           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949             token->type = OP_CLOSE_DUP_NUM;
1950           break;
1951         default:
1952           break;
1953         }
1954       return 2;
1955     }
1956
1957   token->type = CHARACTER;
1958 #ifdef RE_ENABLE_I18N
1959   if (input->mb_cur_max > 1)
1960     {
1961       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1962       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1963     }
1964   else
1965 #endif
1966     token->word_char = IS_WORD_CHAR (token->opr.c);
1967
1968   switch (c)
1969     {
1970     case '\n':
1971       if (syntax & RE_NEWLINE_ALT)
1972         token->type = OP_ALT;
1973       break;
1974     case '|':
1975       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1976         token->type = OP_ALT;
1977       break;
1978     case '*':
1979       token->type = OP_DUP_ASTERISK;
1980       break;
1981     case '+':
1982       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983         token->type = OP_DUP_PLUS;
1984       break;
1985     case '?':
1986       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987         token->type = OP_DUP_QUESTION;
1988       break;
1989     case '{':
1990       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991         token->type = OP_OPEN_DUP_NUM;
1992       break;
1993     case '}':
1994       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995         token->type = OP_CLOSE_DUP_NUM;
1996       break;
1997     case '(':
1998       if (syntax & RE_NO_BK_PARENS)
1999         token->type = OP_OPEN_SUBEXP;
2000       break;
2001     case ')':
2002       if (syntax & RE_NO_BK_PARENS)
2003         token->type = OP_CLOSE_SUBEXP;
2004       break;
2005     case '[':
2006       token->type = OP_OPEN_BRACKET;
2007       break;
2008     case '.':
2009       token->type = OP_PERIOD;
2010       break;
2011     case '^':
2012       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2013           re_string_cur_idx (input) != 0)
2014         {
2015           char prev = re_string_peek_byte (input, -1);
2016           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2017             break;
2018         }
2019       token->type = ANCHOR;
2020       token->opr.ctx_type = LINE_FIRST;
2021       break;
2022     case '$':
2023       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2024           re_string_cur_idx (input) + 1 != re_string_length (input))
2025         {
2026           re_token_t next;
2027           re_string_skip_bytes (input, 1);
2028           peek_token (&next, input, syntax);
2029           re_string_skip_bytes (input, -1);
2030           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2031             break;
2032         }
2033       token->type = ANCHOR;
2034       token->opr.ctx_type = LINE_LAST;
2035       break;
2036     default:
2037       break;
2038     }
2039   return 1;
2040 }
2041
2042 /* Peek a token from INPUT, and return the length of the token.
2043    We must not use this function out of bracket expressions.  */
2044
2045 static int
2046 internal_function
2047 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2048 {
2049   unsigned char c;
2050   if (re_string_eoi (input))
2051     {
2052       token->type = END_OF_RE;
2053       return 0;
2054     }
2055   c = re_string_peek_byte (input, 0);
2056   token->opr.c = c;
2057
2058 #ifdef RE_ENABLE_I18N
2059   if (input->mb_cur_max > 1 &&
2060       !re_string_first_byte (input, re_string_cur_idx (input)))
2061     {
2062       token->type = CHARACTER;
2063       return 1;
2064     }
2065 #endif /* RE_ENABLE_I18N */
2066
2067   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2068       && re_string_cur_idx (input) + 1 < re_string_length (input))
2069     {
2070       /* In this case, '\' escape a character.  */
2071       unsigned char c2;
2072       re_string_skip_bytes (input, 1);
2073       c2 = re_string_peek_byte (input, 0);
2074       token->opr.c = c2;
2075       token->type = CHARACTER;
2076       return 1;
2077     }
2078   if (c == '[') /* '[' is a special char in a bracket exps.  */
2079     {
2080       unsigned char c2;
2081       int token_len;
2082       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2083         c2 = re_string_peek_byte (input, 1);
2084       else
2085         c2 = 0;
2086       token->opr.c = c2;
2087       token_len = 2;
2088       switch (c2)
2089         {
2090         case '.':
2091           token->type = OP_OPEN_COLL_ELEM;
2092           break;
2093         case '=':
2094           token->type = OP_OPEN_EQUIV_CLASS;
2095           break;
2096         case ':':
2097           if (syntax & RE_CHAR_CLASSES)
2098             {
2099               token->type = OP_OPEN_CHAR_CLASS;
2100               break;
2101             }
2102           /* else fall through.  */
2103         default:
2104           token->type = CHARACTER;
2105           token->opr.c = c;
2106           token_len = 1;
2107           break;
2108         }
2109       return token_len;
2110     }
2111   switch (c)
2112     {
2113     case '-':
2114       token->type = OP_CHARSET_RANGE;
2115       break;
2116     case ']':
2117       token->type = OP_CLOSE_BRACKET;
2118       break;
2119     case '^':
2120       token->type = OP_NON_MATCH_LIST;
2121       break;
2122     default:
2123       token->type = CHARACTER;
2124     }
2125   return 1;
2126 }
2127 \f
2128 /* Functions for parser.  */
2129
2130 /* Entry point of the parser.
2131    Parse the regular expression REGEXP and return the structure tree.
2132    If an error occurs, ERR is set by error code, and return NULL.
2133    This function build the following tree, from regular expression <reg_exp>:
2134            CAT
2135            / \
2136           /   \
2137    <reg_exp>  EOR
2138
2139    CAT means concatenation.
2140    EOR means end of regular expression.  */
2141
2142 static bin_tree_t *
2143 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2144        reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = preg->buffer;
2147   bin_tree_t *tree, *eor, *root;
2148   re_token_t current_token;
2149   dfa->syntax = syntax;
2150   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2151   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2152   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2153     return NULL;
2154   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2155   if (tree != NULL)
2156     root = create_tree (dfa, tree, eor, CONCAT);
2157   else
2158     root = eor;
2159   if (BE (eor == NULL || root == NULL, 0))
2160     {
2161       *err = REG_ESPACE;
2162       return NULL;
2163     }
2164   return root;
2165 }
2166
2167 /* This function build the following tree, from regular expression
2168    <branch1>|<branch2>:
2169            ALT
2170            / \
2171           /   \
2172    <branch1> <branch2>
2173
2174    ALT means alternative, which represents the operator '|'.  */
2175
2176 static bin_tree_t *
2177 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2179 {
2180   re_dfa_t *dfa = preg->buffer;
2181   bin_tree_t *tree, *branch = NULL;
2182   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2183   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184     return NULL;
2185
2186   while (token->type == OP_ALT)
2187     {
2188       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2189       if (token->type != OP_ALT && token->type != END_OF_RE
2190           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2191         {
2192           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2193           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2194             return NULL;
2195         }
2196       else
2197         branch = NULL;
2198       tree = create_tree (dfa, tree, branch, OP_ALT);
2199       if (BE (tree == NULL, 0))
2200         {
2201           *err = REG_ESPACE;
2202           return NULL;
2203         }
2204     }
2205   return tree;
2206 }
2207
2208 /* This function build the following tree, from regular expression
2209    <exp1><exp2>:
2210         CAT
2211         / \
2212        /   \
2213    <exp1> <exp2>
2214
2215    CAT means concatenation.  */
2216
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221   bin_tree_t *tree, *expr;
2222   re_dfa_t *dfa = preg->buffer;
2223   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2225     return NULL;
2226
2227   while (token->type != OP_ALT && token->type != END_OF_RE
2228          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2229     {
2230       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2232         {
2233           if (tree != NULL)
2234             postorder (tree, free_tree, NULL);
2235           return NULL;
2236         }
2237       if (tree != NULL && expr != NULL)
2238         {
2239           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240           if (newtree == NULL)
2241             {
2242               postorder (expr, free_tree, NULL);
2243               postorder (tree, free_tree, NULL);
2244               *err = REG_ESPACE;
2245               return NULL;
2246             }
2247           tree = newtree;
2248         }
2249       else if (tree == NULL)
2250         tree = expr;
2251       /* Otherwise expr == NULL, we don't need to create new tree.  */
2252     }
2253   return tree;
2254 }
2255
2256 /* This function build the following tree, from regular expression a*:
2257          *
2258          |
2259          a
2260 */
2261
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2265 {
2266   re_dfa_t *dfa = preg->buffer;
2267   bin_tree_t *tree;
2268   switch (token->type)
2269     {
2270     case CHARACTER:
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (BE (tree == NULL, 0))
2273         {
2274           *err = REG_ESPACE;
2275           return NULL;
2276         }
2277 #ifdef RE_ENABLE_I18N
2278       if (dfa->mb_cur_max > 1)
2279         {
2280           while (!re_string_eoi (regexp)
2281                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2282             {
2283               bin_tree_t *mbc_remain;
2284               fetch_token (token, regexp, syntax);
2285               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287               if (BE (mbc_remain == NULL || tree == NULL, 0))
2288                 {
2289                   *err = REG_ESPACE;
2290                   return NULL;
2291                 }
2292             }
2293         }
2294 #endif
2295       break;
2296     case OP_OPEN_SUBEXP:
2297       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2298       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2299         return NULL;
2300       break;
2301     case OP_OPEN_BRACKET:
2302       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304         return NULL;
2305       break;
2306     case OP_BACK_REF:
2307       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2308         {
2309           *err = REG_ESUBREG;
2310           return NULL;
2311         }
2312       dfa->used_bkref_map |= 1 << token->opr.idx;
2313       tree = create_token_tree (dfa, NULL, NULL, token);
2314       if (BE (tree == NULL, 0))
2315         {
2316           *err = REG_ESPACE;
2317           return NULL;
2318         }
2319       ++dfa->nbackref;
2320       dfa->has_mb_node = 1;
2321       break;
2322     case OP_OPEN_DUP_NUM:
2323       if (syntax & RE_CONTEXT_INVALID_DUP)
2324         {
2325           *err = REG_BADRPT;
2326           return NULL;
2327         }
2328       /* FALLTHROUGH */
2329     case OP_DUP_ASTERISK:
2330     case OP_DUP_PLUS:
2331     case OP_DUP_QUESTION:
2332       if (syntax & RE_CONTEXT_INVALID_OPS)
2333         {
2334           *err = REG_BADRPT;
2335           return NULL;
2336         }
2337       else if (syntax & RE_CONTEXT_INDEP_OPS)
2338         {
2339           fetch_token (token, regexp, syntax);
2340           return parse_expression (regexp, preg, token, syntax, nest, err);
2341         }
2342       /* else fall through  */
2343     case OP_CLOSE_SUBEXP:
2344       if ((token->type == OP_CLOSE_SUBEXP) &&
2345           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346         {
2347           *err = REG_ERPAREN;
2348           return NULL;
2349         }
2350       /* else fall through  */
2351     case OP_CLOSE_DUP_NUM:
2352       /* We treat it as a normal character.  */
2353
2354       /* Then we can these characters as normal characters.  */
2355       token->type = CHARACTER;
2356       /* mb_partial and word_char bits should be initialized already
2357          by peek_token.  */
2358       tree = create_token_tree (dfa, NULL, NULL, token);
2359       if (BE (tree == NULL, 0))
2360         {
2361           *err = REG_ESPACE;
2362           return NULL;
2363         }
2364       break;
2365     case ANCHOR:
2366       if ((token->opr.ctx_type
2367            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2368           && dfa->word_ops_used == 0)
2369         init_word_char (dfa);
2370       if (token->opr.ctx_type == WORD_DELIM
2371           || token->opr.ctx_type == NOT_WORD_DELIM)
2372         {
2373           bin_tree_t *tree_first, *tree_last;
2374           if (token->opr.ctx_type == WORD_DELIM)
2375             {
2376               token->opr.ctx_type = WORD_FIRST;
2377               tree_first = create_token_tree (dfa, NULL, NULL, token);
2378               token->opr.ctx_type = WORD_LAST;
2379             }
2380           else
2381             {
2382               token->opr.ctx_type = INSIDE_WORD;
2383               tree_first = create_token_tree (dfa, NULL, NULL, token);
2384               token->opr.ctx_type = INSIDE_NOTWORD;
2385             }
2386           tree_last = create_token_tree (dfa, NULL, NULL, token);
2387           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2388           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2389             {
2390               *err = REG_ESPACE;
2391               return NULL;
2392             }
2393         }
2394       else
2395         {
2396           tree = create_token_tree (dfa, NULL, NULL, token);
2397           if (BE (tree == NULL, 0))
2398             {
2399               *err = REG_ESPACE;
2400               return NULL;
2401             }
2402         }
2403       /* We must return here, since ANCHORs can't be followed
2404          by repetition operators.
2405          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2406              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2407       fetch_token (token, regexp, syntax);
2408       return tree;
2409     case OP_PERIOD:
2410       tree = create_token_tree (dfa, NULL, NULL, token);
2411       if (BE (tree == NULL, 0))
2412         {
2413           *err = REG_ESPACE;
2414           return NULL;
2415         }
2416       if (dfa->mb_cur_max > 1)
2417         dfa->has_mb_node = 1;
2418       break;
2419     case OP_WORD:
2420     case OP_NOTWORD:
2421       tree = build_charclass_op (dfa, regexp->trans,
2422                                  (const unsigned char *) "alnum",
2423                                  (const unsigned char *) "_",
2424                                  token->type == OP_NOTWORD, err);
2425       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426         return NULL;
2427       break;
2428     case OP_SPACE:
2429     case OP_NOTSPACE:
2430       tree = build_charclass_op (dfa, regexp->trans,
2431                                  (const unsigned char *) "space",
2432                                  (const unsigned char *) "",
2433                                  token->type == OP_NOTSPACE, err);
2434       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2435         return NULL;
2436       break;
2437     case OP_ALT:
2438     case END_OF_RE:
2439       return NULL;
2440     case BACK_SLASH:
2441       *err = REG_EESCAPE;
2442       return NULL;
2443     default:
2444       /* Must not happen?  */
2445 #ifdef DEBUG
2446       assert (0);
2447 #endif
2448       return NULL;
2449     }
2450   fetch_token (token, regexp, syntax);
2451
2452   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2453          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2454     {
2455       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2456       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2457         return NULL;
2458       /* In BRE consecutive duplications are not allowed.  */
2459       if ((syntax & RE_CONTEXT_INVALID_DUP)
2460           && (token->type == OP_DUP_ASTERISK
2461               || token->type == OP_OPEN_DUP_NUM))
2462         {
2463           *err = REG_BADRPT;
2464           return NULL;
2465         }
2466     }
2467
2468   return tree;
2469 }
2470
2471 /* This function build the following tree, from regular expression
2472    (<reg_exp>):
2473          SUBEXP
2474             |
2475         <reg_exp>
2476 */
2477
2478 static bin_tree_t *
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481 {
2482   re_dfa_t *dfa = preg->buffer;
2483   bin_tree_t *tree;
2484   size_t cur_nsub;
2485   cur_nsub = preg->re_nsub++;
2486
2487   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489   /* The subexpression may be a null string.  */
2490   if (token->type == OP_CLOSE_SUBEXP)
2491     tree = NULL;
2492   else
2493     {
2494       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2496         {
2497           if (tree != NULL)
2498             postorder (tree, free_tree, NULL);
2499           *err = REG_EPAREN;
2500         }
2501       if (BE (*err != REG_NOERROR, 0))
2502         return NULL;
2503     }
2504
2505   if (cur_nsub <= '9' - '1')
2506     dfa->completed_bkref_map |= 1 << cur_nsub;
2507
2508   tree = create_tree (dfa, tree, NULL, SUBEXP);
2509   if (BE (tree == NULL, 0))
2510     {
2511       *err = REG_ESPACE;
2512       return NULL;
2513     }
2514   tree->token.opr.idx = cur_nsub;
2515   return tree;
2516 }
2517
2518 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2519
2520 static bin_tree_t *
2521 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2522               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2523 {
2524   bin_tree_t *tree = NULL, *old_tree = NULL;
2525   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2526   re_token_t start_token = *token;
2527
2528   if (token->type == OP_OPEN_DUP_NUM)
2529     {
2530       end = 0;
2531       start = fetch_number (regexp, token, syntax);
2532       if (start == REG_MISSING)
2533         {
2534           if (token->type == CHARACTER && token->opr.c == ',')
2535             start = 0; /* We treat "{,m}" as "{0,m}".  */
2536           else
2537             {
2538               *err = REG_BADBR; /* <re>{} is invalid.  */
2539               return NULL;
2540             }
2541         }
2542       if (BE (start != REG_ERROR, 1))
2543         {
2544           /* We treat "{n}" as "{n,n}".  */
2545           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2546                  : ((token->type == CHARACTER && token->opr.c == ',')
2547                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2548         }
2549       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2550         {
2551           /* Invalid sequence.  */
2552           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2553             {
2554               if (token->type == END_OF_RE)
2555                 *err = REG_EBRACE;
2556               else
2557                 *err = REG_BADBR;
2558
2559               return NULL;
2560             }
2561
2562           /* If the syntax bit is set, rollback.  */
2563           re_string_set_index (regexp, start_idx);
2564           *token = start_token;
2565           token->type = CHARACTER;
2566           /* mb_partial and word_char bits should be already initialized by
2567              peek_token.  */
2568           return elem;
2569         }
2570
2571       if (BE ((end != REG_MISSING && start > end)
2572               || token->type != OP_CLOSE_DUP_NUM, 0))
2573         {
2574           /* First number greater than second.  */
2575           *err = REG_BADBR;
2576           return NULL;
2577         }
2578
2579       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2580         {
2581           *err = REG_ESIZE;
2582           return NULL;
2583         }
2584     }
2585   else
2586     {
2587       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2589     }
2590
2591   fetch_token (token, regexp, syntax);
2592
2593   if (BE (elem == NULL, 0))
2594     return NULL;
2595   if (BE (start == 0 && end == 0, 0))
2596     {
2597       postorder (elem, free_tree, NULL);
2598       return NULL;
2599     }
2600
2601   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2602   if (BE (start > 0, 0))
2603     {
2604       tree = elem;
2605       for (i = 2; i <= start; ++i)
2606         {
2607           elem = duplicate_tree (elem, dfa);
2608           tree = create_tree (dfa, tree, elem, CONCAT);
2609           if (BE (elem == NULL || tree == NULL, 0))
2610             goto parse_dup_op_espace;
2611         }
2612
2613       if (start == end)
2614         return tree;
2615
2616       /* Duplicate ELEM before it is marked optional.  */
2617       elem = duplicate_tree (elem, dfa);
2618       old_tree = tree;
2619     }
2620   else
2621     old_tree = NULL;
2622
2623   if (elem->token.type == SUBEXP)
2624     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2625
2626   tree = create_tree (dfa, elem, NULL,
2627                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2628   if (BE (tree == NULL, 0))
2629     goto parse_dup_op_espace;
2630
2631 /* From gnulib's "intprops.h":
2632    True if the arithmetic type T is signed.  */
2633 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2634
2635   /* This loop is actually executed only when end != REG_MISSING,
2636      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2637      already created the start+1-th copy.  */
2638   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2639     for (i = start + 2; i <= end; ++i)
2640       {
2641         elem = duplicate_tree (elem, dfa);
2642         tree = create_tree (dfa, tree, elem, CONCAT);
2643         if (BE (elem == NULL || tree == NULL, 0))
2644           goto parse_dup_op_espace;
2645
2646         tree = create_tree (dfa, tree, NULL, OP_ALT);
2647         if (BE (tree == NULL, 0))
2648           goto parse_dup_op_espace;
2649       }
2650
2651   if (old_tree)
2652     tree = create_tree (dfa, old_tree, tree, CONCAT);
2653
2654   return tree;
2655
2656  parse_dup_op_espace:
2657   *err = REG_ESPACE;
2658   return NULL;
2659 }
2660
2661 /* Size of the names for collating symbol/equivalence_class/character_class.
2662    I'm not sure, but maybe enough.  */
2663 #define BRACKET_NAME_BUF_SIZE 32
2664
2665 #ifndef _LIBC
2666   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2667      Build the range expression which starts from START_ELEM, and ends
2668      at END_ELEM.  The result are written to MBCSET and SBCSET.
2669      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2670      mbcset->range_ends, is a pointer argument since we may
2671      update it.  */
2672
2673 static reg_errcode_t
2674 internal_function
2675 # ifdef RE_ENABLE_I18N
2676 build_range_exp (const reg_syntax_t syntax,
2677                  bitset_t sbcset,
2678                  re_charset_t *mbcset,
2679                  Idx *range_alloc,
2680                  const bracket_elem_t *start_elem,
2681                  const bracket_elem_t *end_elem)
2682 # else /* not RE_ENABLE_I18N */
2683 build_range_exp (const reg_syntax_t syntax,
2684                  bitset_t sbcset,
2685                  const bracket_elem_t *start_elem,
2686                  const bracket_elem_t *end_elem)
2687 # endif /* not RE_ENABLE_I18N */
2688 {
2689   unsigned int start_ch, end_ch;
2690   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2691   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2692           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2693           0))
2694     return REG_ERANGE;
2695
2696   /* We can handle no multi character collating elements without libc
2697      support.  */
2698   if (BE ((start_elem->type == COLL_SYM
2699            && strlen ((char *) start_elem->opr.name) > 1)
2700           || (end_elem->type == COLL_SYM
2701               && strlen ((char *) end_elem->opr.name) > 1), 0))
2702     return REG_ECOLLATE;
2703
2704 # ifdef RE_ENABLE_I18N
2705   {
2706     wchar_t wc;
2707     wint_t start_wc;
2708     wint_t end_wc;
2709     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2710
2711     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2712                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2713                    : 0));
2714     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2715               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2716                  : 0));
2717     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2718                 ? __btowc (start_ch) : start_elem->opr.wch);
2719     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2720               ? __btowc (end_ch) : end_elem->opr.wch);
2721     if (start_wc == WEOF || end_wc == WEOF)
2722       return REG_ECOLLATE;
2723     cmp_buf[0] = start_wc;
2724     cmp_buf[4] = end_wc;
2725
2726     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2727             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2728       return REG_ERANGE;
2729
2730     /* Got valid collation sequence values, add them as a new entry.
2731        However, for !_LIBC we have no collation elements: if the
2732        character set is single byte, the single byte character set
2733        that we build below suffices.  parse_bracket_exp passes
2734        no MBCSET if dfa->mb_cur_max == 1.  */
2735     if (mbcset)
2736       {
2737         /* Check the space of the arrays.  */
2738         if (BE (*range_alloc == mbcset->nranges, 0))
2739           {
2740             /* There is not enough space, need realloc.  */
2741             wchar_t *new_array_start, *new_array_end;
2742             Idx new_nranges;
2743
2744             /* +1 in case of mbcset->nranges is 0.  */
2745             new_nranges = 2 * mbcset->nranges + 1;
2746             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2747                are NULL if *range_alloc == 0.  */
2748             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2749                                           new_nranges);
2750             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2751                                         new_nranges);
2752
2753             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2754               return REG_ESPACE;
2755
2756             mbcset->range_starts = new_array_start;
2757             mbcset->range_ends = new_array_end;
2758             *range_alloc = new_nranges;
2759           }
2760
2761         mbcset->range_starts[mbcset->nranges] = start_wc;
2762         mbcset->range_ends[mbcset->nranges++] = end_wc;
2763       }
2764
2765     /* Build the table for single byte characters.  */
2766     for (wc = 0; wc < SBC_MAX; ++wc)
2767       {
2768         cmp_buf[2] = wc;
2769         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2770             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2771           bitset_set (sbcset, wc);
2772       }
2773   }
2774 # else /* not RE_ENABLE_I18N */
2775   {
2776     unsigned int ch;
2777     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2778                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2779                    : 0));
2780     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2781               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2782                  : 0));
2783     if (start_ch > end_ch)
2784       return REG_ERANGE;
2785     /* Build the table for single byte characters.  */
2786     for (ch = 0; ch < SBC_MAX; ++ch)
2787       if (start_ch <= ch  && ch <= end_ch)
2788         bitset_set (sbcset, ch);
2789   }
2790 # endif /* not RE_ENABLE_I18N */
2791   return REG_NOERROR;
2792 }
2793 #endif /* not _LIBC */
2794
2795 #ifndef _LIBC
2796 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2797    Build the collating element which is represented by NAME.
2798    The result are written to MBCSET and SBCSET.
2799    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2800    pointer argument since we may update it.  */
2801
2802 static reg_errcode_t
2803 internal_function
2804 # ifdef RE_ENABLE_I18N
2805 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2806                         Idx *coll_sym_alloc, const unsigned char *name)
2807 # else /* not RE_ENABLE_I18N */
2808 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2809 # endif /* not RE_ENABLE_I18N */
2810 {
2811   size_t name_len = strlen ((const char *) name);
2812   if (BE (name_len != 1, 0))
2813     return REG_ECOLLATE;
2814   else
2815     {
2816       bitset_set (sbcset, name[0]);
2817       return REG_NOERROR;
2818     }
2819 }
2820 #endif /* not _LIBC */
2821
2822 /* This function parse bracket expression like "[abc]", "[a-c]",
2823    "[[.a-a.]]" etc.  */
2824
2825 static bin_tree_t *
2826 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2827                    reg_syntax_t syntax, reg_errcode_t *err)
2828 {
2829 #ifdef _LIBC
2830   const unsigned char *collseqmb;
2831   const char *collseqwc;
2832   uint32_t nrules;
2833   int32_t table_size;
2834   const int32_t *symb_table;
2835   const unsigned char *extra;
2836
2837   /* Local function for parse_bracket_exp used in _LIBC environment.
2838      Seek the collating symbol entry corresponding to NAME.
2839      Return the index of the symbol in the SYMB_TABLE.  */
2840
2841   auto inline int32_t
2842   __attribute ((always_inline))
2843   seek_collating_symbol_entry (name, name_len)
2844          const unsigned char *name;
2845          size_t name_len;
2846     {
2847       int32_t hash = elem_hash ((const char *) name, name_len);
2848       int32_t elem = hash % table_size;
2849       if (symb_table[2 * elem] != 0)
2850         {
2851           int32_t second = hash % (table_size - 2) + 1;
2852
2853           do
2854             {
2855               /* First compare the hashing value.  */
2856               if (symb_table[2 * elem] == hash
2857                   /* Compare the length of the name.  */
2858                   && name_len == extra[symb_table[2 * elem + 1]]
2859                   /* Compare the name.  */
2860                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2861                              name_len) == 0)
2862                 {
2863                   /* Yep, this is the entry.  */
2864                   break;
2865                 }
2866
2867               /* Next entry.  */
2868               elem += second;
2869             }
2870           while (symb_table[2 * elem] != 0);
2871         }
2872       return elem;
2873     }
2874
2875   /* Local function for parse_bracket_exp used in _LIBC environment.
2876      Look up the collation sequence value of BR_ELEM.
2877      Return the value if succeeded, UINT_MAX otherwise.  */
2878
2879   auto inline unsigned int
2880   __attribute ((always_inline))
2881   lookup_collation_sequence_value (br_elem)
2882          bracket_elem_t *br_elem;
2883     {
2884       if (br_elem->type == SB_CHAR)
2885         {
2886           /*
2887           if (MB_CUR_MAX == 1)
2888           */
2889           if (nrules == 0)
2890             return collseqmb[br_elem->opr.ch];
2891           else
2892             {
2893               wint_t wc = __btowc (br_elem->opr.ch);
2894               return __collseq_table_lookup (collseqwc, wc);
2895             }
2896         }
2897       else if (br_elem->type == MB_CHAR)
2898         {
2899           if (nrules != 0)
2900             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2901         }
2902       else if (br_elem->type == COLL_SYM)
2903         {
2904           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2905           if (nrules != 0)
2906             {
2907               int32_t elem, idx;
2908               elem = seek_collating_symbol_entry (br_elem->opr.name,
2909                                                   sym_name_len);
2910               if (symb_table[2 * elem] != 0)
2911                 {
2912                   /* We found the entry.  */
2913                   idx = symb_table[2 * elem + 1];
2914                   /* Skip the name of collating element name.  */
2915                   idx += 1 + extra[idx];
2916                   /* Skip the byte sequence of the collating element.  */
2917                   idx += 1 + extra[idx];
2918                   /* Adjust for the alignment.  */
2919                   idx = (idx + 3) & ~3;
2920                   /* Skip the multibyte collation sequence value.  */
2921                   idx += sizeof (unsigned int);
2922                   /* Skip the wide char sequence of the collating element.  */
2923                   idx += sizeof (unsigned int) *
2924                     (1 + *(unsigned int *) (extra + idx));
2925                   /* Return the collation sequence value.  */
2926                   return *(unsigned int *) (extra + idx);
2927                 }
2928               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2929                 {
2930                   /* No valid character.  Match it as a single byte
2931                      character.  */
2932                   return collseqmb[br_elem->opr.name[0]];
2933                 }
2934             }
2935           else if (sym_name_len == 1)
2936             return collseqmb[br_elem->opr.name[0]];
2937         }
2938       return UINT_MAX;
2939     }
2940
2941   /* Local function for parse_bracket_exp used in _LIBC environment.
2942      Build the range expression which starts from START_ELEM, and ends
2943      at END_ELEM.  The result are written to MBCSET and SBCSET.
2944      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2945      mbcset->range_ends, is a pointer argument since we may
2946      update it.  */
2947
2948   auto inline reg_errcode_t
2949   __attribute ((always_inline))
2950   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2951          re_charset_t *mbcset;
2952          Idx *range_alloc;
2953          bitset_t sbcset;
2954          bracket_elem_t *start_elem, *end_elem;
2955     {
2956       unsigned int ch;
2957       uint32_t start_collseq;
2958       uint32_t end_collseq;
2959
2960       /* Equivalence Classes and Character Classes can't be a range
2961          start/end.  */
2962       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2963               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2964               0))
2965         return REG_ERANGE;
2966
2967       start_collseq = lookup_collation_sequence_value (start_elem);
2968       end_collseq = lookup_collation_sequence_value (end_elem);
2969       /* Check start/end collation sequence values.  */
2970       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2971         return REG_ECOLLATE;
2972       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2973         return REG_ERANGE;
2974
2975       /* Got valid collation sequence values, add them as a new entry.
2976          However, if we have no collation elements, and the character set
2977          is single byte, the single byte character set that we
2978          build below suffices. */
2979       if (nrules > 0 || dfa->mb_cur_max > 1)
2980         {
2981           /* Check the space of the arrays.  */
2982           if (BE (*range_alloc == mbcset->nranges, 0))
2983             {
2984               /* There is not enough space, need realloc.  */
2985               uint32_t *new_array_start;
2986               uint32_t *new_array_end;
2987               Idx new_nranges;
2988
2989               /* +1 in case of mbcset->nranges is 0.  */
2990               new_nranges = 2 * mbcset->nranges + 1;
2991               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992                                             new_nranges);
2993               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994                                           new_nranges);
2995
2996               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2997                 return REG_ESPACE;
2998
2999               mbcset->range_starts = new_array_start;
3000               mbcset->range_ends = new_array_end;
3001               *range_alloc = new_nranges;
3002             }
3003
3004           mbcset->range_starts[mbcset->nranges] = start_collseq;
3005           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006         }
3007
3008       /* Build the table for single byte characters.  */
3009       for (ch = 0; ch < SBC_MAX; ch++)
3010         {
3011           uint32_t ch_collseq;
3012           /*
3013           if (MB_CUR_MAX == 1)
3014           */
3015           if (nrules == 0)
3016             ch_collseq = collseqmb[ch];
3017           else
3018             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3019           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3020             bitset_set (sbcset, ch);
3021         }
3022       return REG_NOERROR;
3023     }
3024
3025   /* Local function for parse_bracket_exp used in _LIBC environment.
3026      Build the collating element which is represented by NAME.
3027      The result are written to MBCSET and SBCSET.
3028      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3029      pointer argument since we may update it.  */
3030
3031   auto inline reg_errcode_t
3032   __attribute ((always_inline))
3033   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3034          re_charset_t *mbcset;
3035          Idx *coll_sym_alloc;
3036          bitset_t sbcset;
3037          const unsigned char *name;
3038     {
3039       int32_t elem, idx;
3040       size_t name_len = strlen ((const char *) name);
3041       if (nrules != 0)
3042         {
3043           elem = seek_collating_symbol_entry (name, name_len);
3044           if (symb_table[2 * elem] != 0)
3045             {
3046               /* We found the entry.  */
3047               idx = symb_table[2 * elem + 1];
3048               /* Skip the name of collating element name.  */
3049               idx += 1 + extra[idx];
3050             }
3051           else if (symb_table[2 * elem] == 0 && name_len == 1)
3052             {
3053               /* No valid character, treat it as a normal
3054                  character.  */
3055               bitset_set (sbcset, name[0]);
3056               return REG_NOERROR;
3057             }
3058           else
3059             return REG_ECOLLATE;
3060
3061           /* Got valid collation sequence, add it as a new entry.  */
3062           /* Check the space of the arrays.  */
3063           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3064             {
3065               /* Not enough, realloc it.  */
3066               /* +1 in case of mbcset->ncoll_syms is 0.  */
3067               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3068               /* Use realloc since mbcset->coll_syms is NULL
3069                  if *alloc == 0.  */
3070               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3071                                                    new_coll_sym_alloc);
3072               if (BE (new_coll_syms == NULL, 0))
3073                 return REG_ESPACE;
3074               mbcset->coll_syms = new_coll_syms;
3075               *coll_sym_alloc = new_coll_sym_alloc;
3076             }
3077           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3078           return REG_NOERROR;
3079         }
3080       else
3081         {
3082           if (BE (name_len != 1, 0))
3083             return REG_ECOLLATE;
3084           else
3085             {
3086               bitset_set (sbcset, name[0]);
3087               return REG_NOERROR;
3088             }
3089         }
3090     }
3091 #endif
3092
3093   re_token_t br_token;
3094   re_bitset_ptr_t sbcset;
3095 #ifdef RE_ENABLE_I18N
3096   re_charset_t *mbcset;
3097   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3098   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3099 #endif /* not RE_ENABLE_I18N */
3100   bool non_match = false;
3101   bin_tree_t *work_tree;
3102   int token_len;
3103   bool first_round = true;
3104 #ifdef _LIBC
3105   collseqmb = (const unsigned char *)
3106     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3107   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3108   if (nrules)
3109     {
3110       /*
3111       if (MB_CUR_MAX > 1)
3112       */
3113       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3114       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3115       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3116                                                   _NL_COLLATE_SYMB_TABLEMB);
3117       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3118                                                    _NL_COLLATE_SYMB_EXTRAMB);
3119     }
3120 #endif
3121   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3122 #ifdef RE_ENABLE_I18N
3123   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3124 #endif /* RE_ENABLE_I18N */
3125 #ifdef RE_ENABLE_I18N
3126   if (BE (sbcset == NULL || mbcset == NULL, 0))
3127 #else
3128   if (BE (sbcset == NULL, 0))
3129 #endif /* RE_ENABLE_I18N */
3130     {
3131       re_free (sbcset);
3132 #ifdef RE_ENABLE_I18N
3133       re_free (mbcset);
3134 #endif
3135       *err = REG_ESPACE;
3136       return NULL;
3137     }
3138
3139   token_len = peek_token_bracket (token, regexp, syntax);
3140   if (BE (token->type == END_OF_RE, 0))
3141     {
3142       *err = REG_BADPAT;
3143       goto parse_bracket_exp_free_return;
3144     }
3145   if (token->type == OP_NON_MATCH_LIST)
3146     {
3147 #ifdef RE_ENABLE_I18N
3148       mbcset->non_match = 1;
3149 #endif /* not RE_ENABLE_I18N */
3150       non_match = true;
3151       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3152         bitset_set (sbcset, '\n');
3153       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3154       token_len = peek_token_bracket (token, regexp, syntax);
3155       if (BE (token->type == END_OF_RE, 0))
3156         {
3157           *err = REG_BADPAT;
3158           goto parse_bracket_exp_free_return;
3159         }
3160     }
3161
3162   /* We treat the first ']' as a normal character.  */
3163   if (token->type == OP_CLOSE_BRACKET)
3164     token->type = CHARACTER;
3165
3166   while (1)
3167     {
3168       bracket_elem_t start_elem, end_elem;
3169       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3170       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3171       reg_errcode_t ret;
3172       int token_len2 = 0;
3173       bool is_range_exp = false;
3174       re_token_t token2;
3175
3176       start_elem.opr.name = start_name_buf;
3177       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3178                                    syntax, first_round);
3179       if (BE (ret != REG_NOERROR, 0))
3180         {
3181           *err = ret;
3182           goto parse_bracket_exp_free_return;
3183         }
3184       first_round = false;
3185
3186       /* Get information about the next token.  We need it in any case.  */
3187       token_len = peek_token_bracket (token, regexp, syntax);
3188
3189       /* Do not check for ranges if we know they are not allowed.  */
3190       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3191         {
3192           if (BE (token->type == END_OF_RE, 0))
3193             {
3194               *err = REG_EBRACK;
3195               goto parse_bracket_exp_free_return;
3196             }
3197           if (token->type == OP_CHARSET_RANGE)
3198             {
3199               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3200               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3201               if (BE (token2.type == END_OF_RE, 0))
3202                 {
3203                   *err = REG_EBRACK;
3204                   goto parse_bracket_exp_free_return;
3205                 }
3206               if (token2.type == OP_CLOSE_BRACKET)
3207                 {
3208                   /* We treat the last '-' as a normal character.  */
3209                   re_string_skip_bytes (regexp, -token_len);
3210                   token->type = CHARACTER;
3211                 }
3212               else
3213                 is_range_exp = true;
3214             }
3215         }
3216
3217       if (is_range_exp == true)
3218         {
3219           end_elem.opr.name = end_name_buf;
3220           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3221                                        dfa, syntax, true);
3222           if (BE (ret != REG_NOERROR, 0))
3223             {
3224               *err = ret;
3225               goto parse_bracket_exp_free_return;
3226             }
3227
3228           token_len = peek_token_bracket (token, regexp, syntax);
3229
3230 #ifdef _LIBC
3231           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3232                                   &start_elem, &end_elem);
3233 #else
3234 # ifdef RE_ENABLE_I18N
3235           *err = build_range_exp (syntax, sbcset,
3236                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3237                                   &range_alloc, &start_elem, &end_elem);
3238 # else
3239           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3240 # endif
3241 #endif /* RE_ENABLE_I18N */
3242           if (BE (*err != REG_NOERROR, 0))
3243             goto parse_bracket_exp_free_return;
3244         }
3245       else
3246         {
3247           switch (start_elem.type)
3248             {
3249             case SB_CHAR:
3250               bitset_set (sbcset, start_elem.opr.ch);
3251               break;
3252 #ifdef RE_ENABLE_I18N
3253             case MB_CHAR:
3254               /* Check whether the array has enough space.  */
3255               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3256                 {
3257                   wchar_t *new_mbchars;
3258                   /* Not enough, realloc it.  */
3259                   /* +1 in case of mbcset->nmbchars is 0.  */
3260                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3261                   /* Use realloc since array is NULL if *alloc == 0.  */
3262                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3263                                             mbchar_alloc);
3264                   if (BE (new_mbchars == NULL, 0))
3265                     goto parse_bracket_exp_espace;
3266                   mbcset->mbchars = new_mbchars;
3267                 }
3268               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3269               break;
3270 #endif /* RE_ENABLE_I18N */
3271             case EQUIV_CLASS:
3272               *err = build_equiv_class (sbcset,
3273 #ifdef RE_ENABLE_I18N
3274                                         mbcset, &equiv_class_alloc,
3275 #endif /* RE_ENABLE_I18N */
3276                                         start_elem.opr.name);
3277               if (BE (*err != REG_NOERROR, 0))
3278                 goto parse_bracket_exp_free_return;
3279               break;
3280             case COLL_SYM:
3281               *err = build_collating_symbol (sbcset,
3282 #ifdef RE_ENABLE_I18N
3283                                              mbcset, &coll_sym_alloc,
3284 #endif /* RE_ENABLE_I18N */
3285                                              start_elem.opr.name);
3286               if (BE (*err != REG_NOERROR, 0))
3287                 goto parse_bracket_exp_free_return;
3288               break;
3289             case CHAR_CLASS:
3290               *err = build_charclass (regexp->trans, sbcset,
3291 #ifdef RE_ENABLE_I18N
3292                                       mbcset, &char_class_alloc,
3293 #endif /* RE_ENABLE_I18N */
3294                                       start_elem.opr.name, syntax);
3295               if (BE (*err != REG_NOERROR, 0))
3296                goto parse_bracket_exp_free_return;
3297               break;
3298             default:
3299               assert (0);
3300               break;
3301             }
3302         }
3303       if (BE (token->type == END_OF_RE, 0))
3304         {
3305           *err = REG_EBRACK;
3306           goto parse_bracket_exp_free_return;
3307         }
3308       if (token->type == OP_CLOSE_BRACKET)
3309         break;
3310     }
3311
3312   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3313
3314   /* If it is non-matching list.  */
3315   if (non_match)
3316     bitset_not (sbcset);
3317
3318 #ifdef RE_ENABLE_I18N
3319   /* Ensure only single byte characters are set.  */
3320   if (dfa->mb_cur_max > 1)
3321     bitset_mask (sbcset, dfa->sb_char);
3322
3323   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3324       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3325                                                      || mbcset->non_match)))
3326     {
3327       bin_tree_t *mbc_tree;
3328       int sbc_idx;
3329       /* Build a tree for complex bracket.  */
3330       dfa->has_mb_node = 1;
3331       br_token.type = COMPLEX_BRACKET;
3332       br_token.opr.mbcset = mbcset;
3333       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3334       if (BE (mbc_tree == NULL, 0))
3335         goto parse_bracket_exp_espace;
3336       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3337         if (sbcset[sbc_idx])
3338           break;
3339       /* If there are no bits set in sbcset, there is no point
3340          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3341       if (sbc_idx < BITSET_WORDS)
3342         {
3343           /* Build a tree for simple bracket.  */
3344           br_token.type = SIMPLE_BRACKET;
3345           br_token.opr.sbcset = sbcset;
3346           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3347           if (BE (work_tree == NULL, 0))
3348             goto parse_bracket_exp_espace;
3349
3350           /* Then join them by ALT node.  */
3351           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3352           if (BE (work_tree == NULL, 0))
3353             goto parse_bracket_exp_espace;
3354         }
3355       else
3356         {
3357           re_free (sbcset);
3358           work_tree = mbc_tree;
3359         }
3360     }
3361   else
3362 #endif /* not RE_ENABLE_I18N */
3363     {
3364 #ifdef RE_ENABLE_I18N
3365       free_charset (mbcset);
3366 #endif
3367       /* Build a tree for simple bracket.  */
3368       br_token.type = SIMPLE_BRACKET;
3369       br_token.opr.sbcset = sbcset;
3370       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3371       if (BE (work_tree == NULL, 0))
3372         goto parse_bracket_exp_espace;
3373     }
3374   return work_tree;
3375
3376  parse_bracket_exp_espace:
3377   *err = REG_ESPACE;
3378  parse_bracket_exp_free_return:
3379   re_free (sbcset);
3380 #ifdef RE_ENABLE_I18N
3381   free_charset (mbcset);
3382 #endif /* RE_ENABLE_I18N */
3383   return NULL;
3384 }
3385
3386 /* Parse an element in the bracket expression.  */
3387
3388 static reg_errcode_t
3389 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3390                        re_token_t *token, int token_len, re_dfa_t *dfa,
3391                        reg_syntax_t syntax, bool accept_hyphen)
3392 {
3393 #ifdef RE_ENABLE_I18N
3394   int cur_char_size;
3395   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3396   if (cur_char_size > 1)
3397     {
3398       elem->type = MB_CHAR;
3399       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3400       re_string_skip_bytes (regexp, cur_char_size);
3401       return REG_NOERROR;
3402     }
3403 #endif /* RE_ENABLE_I18N */
3404   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3405   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3406       || token->type == OP_OPEN_EQUIV_CLASS)
3407     return parse_bracket_symbol (elem, regexp, token);
3408   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3409     {
3410       /* A '-' must only appear as anything but a range indicator before
3411          the closing bracket.  Everything else is an error.  */
3412       re_token_t token2;
3413       (void) peek_token_bracket (&token2, regexp, syntax);
3414       if (token2.type != OP_CLOSE_BRACKET)
3415         /* The actual error value is not standardized since this whole
3416            case is undefined.  But ERANGE makes good sense.  */
3417         return REG_ERANGE;
3418     }
3419   elem->type = SB_CHAR;
3420   elem->opr.ch = token->opr.c;
3421   return REG_NOERROR;
3422 }
3423
3424 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3425    such as [:<character_class>:], [.<collating_element>.], and
3426    [=<equivalent_class>=].  */
3427
3428 static reg_errcode_t
3429 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3430                       re_token_t *token)
3431 {
3432   unsigned char ch, delim = token->opr.c;
3433   int i = 0;
3434   if (re_string_eoi(regexp))
3435     return REG_EBRACK;
3436   for (;; ++i)
3437     {
3438       if (i >= BRACKET_NAME_BUF_SIZE)
3439         return REG_EBRACK;
3440       if (token->type == OP_OPEN_CHAR_CLASS)
3441         ch = re_string_fetch_byte_case (regexp);
3442       else
3443         ch = re_string_fetch_byte (regexp);
3444       if (re_string_eoi(regexp))
3445         return REG_EBRACK;
3446       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3447         break;
3448       elem->opr.name[i] = ch;
3449     }
3450   re_string_skip_bytes (regexp, 1);
3451   elem->opr.name[i] = '\0';
3452   switch (token->type)
3453     {
3454     case OP_OPEN_COLL_ELEM:
3455       elem->type = COLL_SYM;
3456       break;
3457     case OP_OPEN_EQUIV_CLASS:
3458       elem->type = EQUIV_CLASS;
3459       break;
3460     case OP_OPEN_CHAR_CLASS:
3461       elem->type = CHAR_CLASS;
3462       break;
3463     default:
3464       break;
3465     }
3466   return REG_NOERROR;
3467 }
3468
3469   /* Helper function for parse_bracket_exp.
3470      Build the equivalence class which is represented by NAME.
3471      The result are written to MBCSET and SBCSET.
3472      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3473      is a pointer argument since we may update it.  */
3474
3475 static reg_errcode_t
3476 #ifdef RE_ENABLE_I18N
3477 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3478                    Idx *equiv_class_alloc, const unsigned char *name)
3479 #else /* not RE_ENABLE_I18N */
3480 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3481 #endif /* not RE_ENABLE_I18N */
3482 {
3483 #ifdef _LIBC
3484   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3485   if (nrules != 0)
3486     {
3487       const int32_t *table, *indirect;
3488       const unsigned char *weights, *extra, *cp;
3489       unsigned char char_buf[2];
3490       int32_t idx1, idx2;
3491       unsigned int ch;
3492       size_t len;
3493       /* This #include defines a local function!  */
3494 # include <locale/weight.h>
3495       /* Calculate the index for equivalence class.  */
3496       cp = name;
3497       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3498       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499                                                _NL_COLLATE_WEIGHTMB);
3500       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501                                                    _NL_COLLATE_EXTRAMB);
3502       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3503                                                 _NL_COLLATE_INDIRECTMB);
3504       idx1 = findidx (&cp, -1);
3505       if (BE (idx1 == 0 || *cp != '\0', 0))
3506         /* This isn't a valid character.  */
3507         return REG_ECOLLATE;
3508
3509       /* Build single byte matching table for this equivalence class.  */
3510       len = weights[idx1 & 0xffffff];
3511       for (ch = 0; ch < SBC_MAX; ++ch)
3512         {
3513           char_buf[0] = ch;
3514           cp = char_buf;
3515           idx2 = findidx (&cp, 1);
3516 /*
3517           idx2 = table[ch];
3518 */
3519           if (idx2 == 0)
3520             /* This isn't a valid character.  */
3521             continue;
3522           /* Compare only if the length matches and the collation rule
3523              index is the same.  */
3524           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3525             {
3526               int cnt = 0;
3527
3528               while (cnt <= len &&
3529                      weights[(idx1 & 0xffffff) + 1 + cnt]
3530                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3531                 ++cnt;
3532
3533               if (cnt > len)
3534                 bitset_set (sbcset, ch);
3535             }
3536         }
3537       /* Check whether the array has enough space.  */
3538       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3539         {
3540           /* Not enough, realloc it.  */
3541           /* +1 in case of mbcset->nequiv_classes is 0.  */
3542           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3543           /* Use realloc since the array is NULL if *alloc == 0.  */
3544           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3545                                                    int32_t,
3546                                                    new_equiv_class_alloc);
3547           if (BE (new_equiv_classes == NULL, 0))
3548             return REG_ESPACE;
3549           mbcset->equiv_classes = new_equiv_classes;
3550           *equiv_class_alloc = new_equiv_class_alloc;
3551         }
3552       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3553     }
3554   else
3555 #endif /* _LIBC */
3556     {
3557       if (BE (strlen ((const char *) name) != 1, 0))
3558         return REG_ECOLLATE;
3559       bitset_set (sbcset, *name);
3560     }
3561   return REG_NOERROR;
3562 }
3563
3564   /* Helper function for parse_bracket_exp.
3565      Build the character class which is represented by NAME.
3566      The result are written to MBCSET and SBCSET.
3567      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3568      is a pointer argument since we may update it.  */
3569
3570 static reg_errcode_t
3571 #ifdef RE_ENABLE_I18N
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573                  re_charset_t *mbcset, Idx *char_class_alloc,
3574                  const unsigned char *class_name, reg_syntax_t syntax)
3575 #else /* not RE_ENABLE_I18N */
3576 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577                  const unsigned char *class_name, reg_syntax_t syntax)
3578 #endif /* not RE_ENABLE_I18N */
3579 {
3580   int i;
3581   const char *name = (const char *) class_name;
3582
3583   /* In case of REG_ICASE "upper" and "lower" match the both of
3584      upper and lower cases.  */
3585   if ((syntax & RE_ICASE)
3586       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3587     name = "alpha";
3588
3589 #ifdef RE_ENABLE_I18N
3590   /* Check the space of the arrays.  */
3591   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3592     {
3593       /* Not enough, realloc it.  */
3594       /* +1 in case of mbcset->nchar_classes is 0.  */
3595       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3596       /* Use realloc since array is NULL if *alloc == 0.  */
3597       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3598                                                new_char_class_alloc);
3599       if (BE (new_char_classes == NULL, 0))
3600         return REG_ESPACE;
3601       mbcset->char_classes = new_char_classes;
3602       *char_class_alloc = new_char_class_alloc;
3603     }
3604   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3605 #endif /* RE_ENABLE_I18N */
3606
3607 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3608   do {                                          \
3609     if (BE (trans != NULL, 0))                  \
3610       {                                         \
3611         for (i = 0; i < SBC_MAX; ++i)           \
3612           if (ctype_func (i))                   \
3613             bitset_set (sbcset, trans[i]);      \
3614       }                                         \
3615     else                                        \
3616       {                                         \
3617         for (i = 0; i < SBC_MAX; ++i)           \
3618           if (ctype_func (i))                   \
3619             bitset_set (sbcset, i);             \
3620       }                                         \
3621   } while (0)
3622
3623   if (strcmp (name, "alnum") == 0)
3624     BUILD_CHARCLASS_LOOP (isalnum);
3625   else if (strcmp (name, "cntrl") == 0)
3626     BUILD_CHARCLASS_LOOP (iscntrl);
3627   else if (strcmp (name, "lower") == 0)
3628     BUILD_CHARCLASS_LOOP (islower);
3629   else if (strcmp (name, "space") == 0)
3630     BUILD_CHARCLASS_LOOP (isspace);
3631   else if (strcmp (name, "alpha") == 0)
3632     BUILD_CHARCLASS_LOOP (isalpha);
3633   else if (strcmp (name, "digit") == 0)
3634     BUILD_CHARCLASS_LOOP (isdigit);
3635   else if (strcmp (name, "print") == 0)
3636     BUILD_CHARCLASS_LOOP (isprint);
3637   else if (strcmp (name, "upper") == 0)
3638     BUILD_CHARCLASS_LOOP (isupper);
3639   else if (strcmp (name, "blank") == 0)
3640     BUILD_CHARCLASS_LOOP (isblank);
3641   else if (strcmp (name, "graph") == 0)
3642     BUILD_CHARCLASS_LOOP (isgraph);
3643   else if (strcmp (name, "punct") == 0)
3644     BUILD_CHARCLASS_LOOP (ispunct);
3645   else if (strcmp (name, "xdigit") == 0)
3646     BUILD_CHARCLASS_LOOP (isxdigit);
3647   else
3648     return REG_ECTYPE;
3649
3650   return REG_NOERROR;
3651 }
3652
3653 static bin_tree_t *
3654 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3655                     const unsigned char *class_name,
3656                     const unsigned char *extra, bool non_match,
3657                     reg_errcode_t *err)
3658 {
3659   re_bitset_ptr_t sbcset;
3660 #ifdef RE_ENABLE_I18N
3661   re_charset_t *mbcset;
3662   Idx alloc = 0;
3663 #endif /* not RE_ENABLE_I18N */
3664   reg_errcode_t ret;
3665   re_token_t br_token;
3666   bin_tree_t *tree;
3667
3668   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3669 #ifdef RE_ENABLE_I18N
3670   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3671 #endif /* RE_ENABLE_I18N */
3672
3673 #ifdef RE_ENABLE_I18N
3674   if (BE (sbcset == NULL || mbcset == NULL, 0))
3675 #else /* not RE_ENABLE_I18N */
3676   if (BE (sbcset == NULL, 0))
3677 #endif /* not RE_ENABLE_I18N */
3678     {
3679       *err = REG_ESPACE;
3680       return NULL;
3681     }
3682
3683   if (non_match)
3684     {
3685 #ifdef RE_ENABLE_I18N
3686       mbcset->non_match = 1;
3687 #endif /* not RE_ENABLE_I18N */
3688     }
3689
3690   /* We don't care the syntax in this case.  */
3691   ret = build_charclass (trans, sbcset,
3692 #ifdef RE_ENABLE_I18N
3693                          mbcset, &alloc,
3694 #endif /* RE_ENABLE_I18N */
3695                          class_name, 0);
3696
3697   if (BE (ret != REG_NOERROR, 0))
3698     {
3699       re_free (sbcset);
3700 #ifdef RE_ENABLE_I18N
3701       free_charset (mbcset);
3702 #endif /* RE_ENABLE_I18N */
3703       *err = ret;
3704       return NULL;
3705     }
3706   /* \w match '_' also.  */
3707   for (; *extra; extra++)
3708     bitset_set (sbcset, *extra);
3709
3710   /* If it is non-matching list.  */
3711   if (non_match)
3712     bitset_not (sbcset);
3713
3714 #ifdef RE_ENABLE_I18N
3715   /* Ensure only single byte characters are set.  */
3716   if (dfa->mb_cur_max > 1)
3717     bitset_mask (sbcset, dfa->sb_char);
3718 #endif
3719
3720   /* Build a tree for simple bracket.  */
3721   br_token.type = SIMPLE_BRACKET;
3722   br_token.opr.sbcset = sbcset;
3723   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724   if (BE (tree == NULL, 0))
3725     goto build_word_op_espace;
3726
3727 #ifdef RE_ENABLE_I18N
3728   if (dfa->mb_cur_max > 1)
3729     {
3730       bin_tree_t *mbc_tree;
3731       /* Build a tree for complex bracket.  */
3732       br_token.type = COMPLEX_BRACKET;
3733       br_token.opr.mbcset = mbcset;
3734       dfa->has_mb_node = 1;
3735       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3736       if (BE (mbc_tree == NULL, 0))
3737         goto build_word_op_espace;
3738       /* Then join them by ALT node.  */
3739       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3740       if (BE (mbc_tree != NULL, 1))
3741         return tree;
3742     }
3743   else
3744     {
3745       free_charset (mbcset);
3746       return tree;
3747     }
3748 #else /* not RE_ENABLE_I18N */
3749   return tree;
3750 #endif /* not RE_ENABLE_I18N */
3751
3752  build_word_op_espace:
3753   re_free (sbcset);
3754 #ifdef RE_ENABLE_I18N
3755   free_charset (mbcset);
3756 #endif /* RE_ENABLE_I18N */
3757   *err = REG_ESPACE;
3758   return NULL;
3759 }
3760
3761 /* This is intended for the expressions like "a{1,3}".
3762    Fetch a number from 'input', and return the number.
3763    Return REG_MISSING if the number field is empty like "{,1}".
3764    Return RE_DUP_MAX + 1 if the number field is too large.
3765    Return REG_ERROR if an error occurred.  */
3766
3767 static Idx
3768 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3769 {
3770   Idx num = REG_MISSING;
3771   unsigned char c;
3772   while (1)
3773     {
3774       fetch_token (token, input, syntax);
3775       c = token->opr.c;
3776       if (BE (token->type == END_OF_RE, 0))
3777         return REG_ERROR;
3778       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3779         break;
3780       num = ((token->type != CHARACTER || c < '0' || '9' < c
3781               || num == REG_ERROR)
3782              ? REG_ERROR
3783              : num == REG_MISSING
3784              ? c - '0'
3785              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3786     }
3787   return num;
3788 }
3789 \f
#ifdef RE_ENABLE_I18N
/* Release the arrays owned by the multibyte character set CSET and
   then CSET itself.  CSET must not be used afterwards.  */

static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  /* These members are only released in the _LIBC build.  */
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  /* The container goes last, once none of its members is needed.  */
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3805 \f
3806 /* Functions for binary tree operation.  */
3807
3808 /* Create a tree node.  */
3809
3810 static bin_tree_t *
3811 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3812              re_token_type_t type)
3813 {
3814   re_token_t t;
3815   t.type = type;
3816   return create_token_tree (dfa, left, right, &t);
3817 }
3818
3819 static bin_tree_t *
3820 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3821                    const re_token_t *token)
3822 {
3823   bin_tree_t *tree;
3824   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3825     {
3826       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3827
3828       if (storage == NULL)
3829         return NULL;
3830       storage->next = dfa->str_tree_storage;
3831       dfa->str_tree_storage = storage;
3832       dfa->str_tree_storage_idx = 0;
3833     }
3834   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3835
3836   tree->parent = NULL;
3837   tree->left = left;
3838   tree->right = right;
3839   tree->token = *token;
3840   tree->token.duplicated = 0;
3841   tree->token.opt_subexp = 0;
3842   tree->first = NULL;
3843   tree->next = NULL;
3844   tree->node_idx = REG_MISSING;
3845
3846   if (left != NULL)
3847     left->parent = tree;
3848   if (right != NULL)
3849     right->parent = tree;
3850   return tree;
3851 }
3852
3853 /* Mark the tree SRC as an optional subexpression.
3854    To be called from preorder or postorder.  */
3855
3856 static reg_errcode_t
3857 mark_opt_subexp (void *extra, bin_tree_t *node)
3858 {
3859   Idx idx = (Idx) (long) extra;
3860   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3861     node->token.opt_subexp = 1;
3862
3863   return REG_NOERROR;
3864 }
3865
3866 /* Free the allocated memory inside NODE. */
3867
3868 static void
3869 free_token (re_token_t *node)
3870 {
3871 #ifdef RE_ENABLE_I18N
3872   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3873     free_charset (node->opr.mbcset);
3874   else
3875 #endif /* RE_ENABLE_I18N */
3876     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3877       re_free (node->opr.sbcset);
3878 }
3879
3880 /* Worker function for tree walking.  Free the allocated memory inside NODE
3881    and its children. */
3882
3883 static reg_errcode_t
3884 free_tree (void *extra, bin_tree_t *node)
3885 {
3886   free_token (&node->token);
3887   return REG_NOERROR;
3888 }
3889
3890
3891 /* Duplicate the node SRC, and return new node.  This is a preorder
3892    visit similar to the one implemented by the generic visitor, but
3893    we need more infrastructure to maintain two parallel trees --- so,
3894    it's easier to duplicate.  */
3895
3896 static bin_tree_t *
3897 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3898 {
3899   const bin_tree_t *node;
3900   bin_tree_t *dup_root;
3901   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3902
3903   for (node = root; ; )
3904     {
3905       /* Create a new tree and link it back to the current parent.  */
3906       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3907       if (*p_new == NULL)
3908         return NULL;
3909       (*p_new)->parent = dup_node;
3910       (*p_new)->token.duplicated = 1;
3911       dup_node = *p_new;
3912
3913       /* Go to the left node, or up and to the right.  */
3914       if (node->left)
3915         {
3916           node = node->left;
3917           p_new = &dup_node->left;
3918         }
3919       else
3920         {
3921           const bin_tree_t *prev = NULL;
3922           while (node->right == prev || node->right == NULL)
3923             {
3924               prev = node;
3925               node = node->parent;
3926               dup_node = dup_node->parent;
3927               if (!node)
3928                 return dup_root;
3929             }
3930           node = node->right;
3931           p_new = &dup_node->right;
3932         }
3933     }
3934 }