lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
  18
  19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  20                                           size_t length, reg_syntax_t syntax);
  21 static void re_compile_fastmap_iter (regex_t *bufp,
  22                                      const re_dfastate_t *init_state,
  23                                      char *fastmap);
  24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  25 #ifdef RE_ENABLE_I18N
  26 static void free_charset (re_charset_t *cset);
  27 #endif /* RE_ENABLE_I18N */
  28 static void free_workarea_compile (regex_t *preg);
  29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  30 #ifdef RE_ENABLE_I18N
  31 static void optimize_utf8 (re_dfa_t *dfa);
  32 #endif
  33 static reg_errcode_t analyze (regex_t *preg);
  34 static reg_errcode_t preorder (bin_tree_t *root,
  35                                reg_errcode_t (fn (void *, bin_tree_t *)),
  36                                void *extra);
  37 static reg_errcode_t postorder (bin_tree_t *root,
  38                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                 void *extra);
  40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  43                                  bin_tree_t *node);
  44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  49                                    unsigned int constraint);
  50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  52                                          Idx node, bool root);
  53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  54 static Idx fetch_number (re_string_t *input, re_token_t *token,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax) internal_function;
  58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  59                           reg_syntax_t syntax, reg_errcode_t *err);
  60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  61                                   re_token_t *token, reg_syntax_t syntax,
  62                                   Idx nest, reg_errcode_t *err);
  63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  64                                  re_token_t *token, reg_syntax_t syntax,
  65                                  Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  67                                      re_token_t *token, reg_syntax_t syntax,
  68                                      Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  70                                   re_token_t *token, reg_syntax_t syntax,
  71                                   Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  73                                  re_dfa_t *dfa, re_token_t *token,
  74                                  reg_syntax_t syntax, reg_errcode_t *err);
  75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  76                                       re_token_t *token, reg_syntax_t syntax,
  77                                       reg_errcode_t *err);
  78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  79                                             re_string_t *regexp,
  80                                             re_token_t *token, int token_len,
  81                                             re_dfa_t *dfa,
  82                                             reg_syntax_t syntax,
  83                                             bool accept_hyphen);
  84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  85                                           re_string_t *regexp,
  86                                           re_token_t *token);
  87 #ifdef RE_ENABLE_I18N
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const unsigned char *class_name,
  97                                       reg_syntax_t syntax);
  98 #else  /* not RE_ENABLE_I18N */
  99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 100                                         const unsigned char *name);
 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                       bitset_t sbcset,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #endif /* not RE_ENABLE_I18N */
 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 107                                        RE_TRANSLATE_TYPE trans,
 108                                        const unsigned char *class_name,
 109                                        const unsigned char *extra,
 110                                        bool non_match, reg_errcode_t *err);
 111 static bin_tree_t *create_tree (re_dfa_t *dfa,
 112                                 bin_tree_t *left, bin_tree_t *right,
 113                                 re_token_type_t type);
 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 115                                       bin_tree_t *left, bin_tree_t *right,
 116                                       const re_token_t *token);
 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 118 static void free_token (re_token_t *node);
 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 121 \f
 122 /* This table gives an error message for each of the error codes listed
 123    in regex.h.  Obviously the order here has to be same as there.
 124    POSIX doesn't require that we do anything for REG_NOERROR,
 125    but why not be nice?  */
 126
 127 static const char __re_error_msgid[] =
 128   {
 129 #define REG_NOERROR_IDX 0
 130     gettext_noop ("Success")    /* REG_NOERROR */
 131     "\0"
 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 133     gettext_noop ("No match")   /* REG_NOMATCH */
 134     "\0"
 135 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 137     "\0"
 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 140     "\0"
 141 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 143     "\0"
 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 146     "\0"
 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 149     "\0"
 150 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 151     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 152     "\0"
 153 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 155     "\0"
 156 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 158     "\0"
 159 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 161     "\0"
 162 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 163     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 164     "\0"
 165 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 167     "\0"
 168 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 170     "\0"
 171 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 173     "\0"
 174 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 176     "\0"
 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 179   };
 180
 181 static const size_t __re_error_msgid_idx[] =
 182   {
 183     REG_NOERROR_IDX,
 184     REG_NOMATCH_IDX,
 185     REG_BADPAT_IDX,
 186     REG_ECOLLATE_IDX,
 187     REG_ECTYPE_IDX,
 188     REG_EESCAPE_IDX,
 189     REG_ESUBREG_IDX,
 190     REG_EBRACK_IDX,
 191     REG_EPAREN_IDX,
 192     REG_EBRACE_IDX,
 193     REG_BADBR_IDX,
 194     REG_ERANGE_IDX,
 195     REG_ESPACE_IDX,
 196     REG_BADRPT_IDX,
 197     REG_EEND_IDX,
 198     REG_ESIZE_IDX,
 199     REG_ERPAREN_IDX
 200   };
 201 \f
 202 /* Entry points for GNU code.  */
 203
 204 /* re_compile_pattern is the GNU regular expression compiler: it
 205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 206    Returns 0 if the pattern was valid, otherwise an error string.
 207
 208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 209    are set in BUFP on entry.  */
 210
 211 #ifdef _LIBC
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 #else /* size_t might promote */
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 #endif
 222 {
 223   reg_errcode_t ret;
 224
 225   /* And GNU code determines whether or not to get register information
 226      by passing null for the REGS argument to re_match, etc., not by
 227      setting no_sub, unless RE_NO_SUB is set.  */
 228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 229
 230   /* Match anchors at newline.  */
 231   bufp->newline_anchor = 1;
 232
 233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 234
 235   if (!ret)
 236     return NULL;
 237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 238 }
 239 #ifdef _LIBC
 240 weak_alias (__re_compile_pattern, re_compile_pattern)
 241 #endif
 242
 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 244    also be assigned to arbitrarily: each pattern buffer stores its own
 245    syntax, so it can be changed between regex compilations.  */
 246 /* This has no initializer because initialized variables in Emacs
 247    become read-only after dumping.  */
 248 reg_syntax_t re_syntax_options;
 249
 250
 251 /* Specify the precise syntax of regexps for compilation.  This provides
 252    for compatibility for various utilities which historically have
 253    different, incompatible syntaxes.
 254
 255    The argument SYNTAX is a bit mask comprised of the various bits
 256    defined in regex.h.  We return the old syntax.  */
 257
 258 reg_syntax_t
 259 re_set_syntax (syntax)
 260     reg_syntax_t syntax;
 261 {
 262   reg_syntax_t ret = re_syntax_options;
 263
 264   re_syntax_options = syntax;
 265   return ret;
 266 }
 267 #ifdef _LIBC
 268 weak_alias (__re_set_syntax, re_set_syntax)
 269 #endif
 270
 271 int
 272 re_compile_fastmap (bufp)
 273     struct re_pattern_buffer *bufp;
 274 {
 275   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 276   char *fastmap = bufp->fastmap;
 277
 278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 280   if (dfa->init_state != dfa->init_state_word)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 282   if (dfa->init_state != dfa->init_state_nl)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 284   if (dfa->init_state != dfa->init_state_begbuf)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 286   bufp->fastmap_accurate = 1;
 287   return 0;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 291 #endif
 292
 293 static inline void
 294 __attribute ((always_inline))
 295 re_set_fastmap (char *fastmap, bool icase, int ch)
 296 {
 297   fastmap[ch] = 1;
 298   if (icase)
 299     fastmap[tolower (ch)] = 1;
 300 }
 301
 302 /* Helper function for re_compile_fastmap.
 303    Compile fastmap for the initial_state INIT_STATE.  */
 304
 305 static void
 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 307                          char *fastmap)
 308 {
 309   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 310   Idx node_cnt;
 311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 313     {
 314       Idx node = init_state->nodes.elems[node_cnt];
 315       re_token_type_t type = dfa->nodes[node].type;
 316
 317       if (type == CHARACTER)
 318         {
 319           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 320 #ifdef RE_ENABLE_I18N
 321           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 322             {
 323               unsigned char buf[MB_LEN_MAX];
 324               unsigned char *p;
 325               wchar_t wc;
 326               mbstate_t state;
 327
 328               p = buf;
 329               *p++ = dfa->nodes[node].opr.c;
 330               while (++node < dfa->nodes_len
 331                      && dfa->nodes[node].type == CHARACTER
 332                      && dfa->nodes[node].mb_partial)
 333                 *p++ = dfa->nodes[node].opr.c;
 334               memset (&state, '\0', sizeof (state));
 335               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 336                              &state) == p - buf
 337                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 338                       != (size_t) -1))
 339                 re_set_fastmap (fastmap, false, buf[0]);
 340             }
 341 #endif
 342         }
 343       else if (type == SIMPLE_BRACKET)
 344         {
 345           int i, ch;
 346           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 347             {
 348               int j;
 349               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 350               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 351                 if (w & ((bitset_word_t) 1 << j))
 352                   re_set_fastmap (fastmap, icase, ch);
 353             }
 354         }
 355 #ifdef RE_ENABLE_I18N
 356       else if (type == COMPLEX_BRACKET)
 357         {
 358           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 359           Idx i;
 360
 361 # ifdef _LIBC
 362           /* See if we have to try all bytes which start multiple collation
 363              elements.
 364              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 365                   collation element, and don't catch 'b' since 'b' is
 366                   the only collation element which starts from 'b' (and
 367                   it is caught by SIMPLE_BRACKET).  */
 368               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 369                   && (cset->ncoll_syms || cset->nranges))
 370                 {
 371                   const int32_t *table = (const int32_t *)
 372                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 373                   for (i = 0; i < SBC_MAX; ++i)
 374                     if (table[i] < 0)
 375                       re_set_fastmap (fastmap, icase, i);
 376                 }
 377 # endif /* _LIBC */
 378
 379           /* See if we have to start the match at all multibyte characters,
 380              i.e. where we would not find an invalid sequence.  This only
 381              applies to multibyte character sets; for single byte character
 382              sets, the SIMPLE_BRACKET again suffices.  */
 383           if (dfa->mb_cur_max > 1
 384               && (cset->nchar_classes || cset->non_match || cset->nranges
 385 # ifdef _LIBC
 386                   || cset->nequiv_classes
 387 # endif /* _LIBC */
 388                  ))
 389             {
 390               unsigned char c = 0;
 391               do
 392                 {
 393                   mbstate_t mbs;
 394                   memset (&mbs, 0, sizeof (mbs));
 395                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 396                     re_set_fastmap (fastmap, false, (int) c);
 397                 }
 398               while (++c != 0);
 399             }
 400
 401           else
 402             {
 403               /* ... Else catch all bytes which can start the mbchars.  */
 404               for (i = 0; i < cset->nmbchars; ++i)
 405                 {
 406                   char buf[256];
 407                   mbstate_t state;
 408                   memset (&state, '\0', sizeof (state));
 409                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 410                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 411                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 412                     {
 413                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 414                           != (size_t) -1)
 415                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 416                     }
 417                 }
 418             }
 419         }
 420 #endif /* RE_ENABLE_I18N */
 421       else if (type == OP_PERIOD
 422 #ifdef RE_ENABLE_I18N
 423                || type == OP_UTF8_PERIOD
 424 #endif /* RE_ENABLE_I18N */
 425                || type == END_OF_RE)
 426         {
 427           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 428           if (type == END_OF_RE)
 429             bufp->can_be_null = 1;
 430           return;
 431         }
 432     }
 433 }
 434 \f
 435 /* Entry point for POSIX code.  */
 436 /* regcomp takes a regular expression as a string and compiles it.
 437
 438    PREG is a regex_t *.  We do not expect any fields to be initialized,
 439    since POSIX says we shouldn't.  Thus, we set
 440
 441      'buffer' to the compiled pattern;
 442      'used' to the length of the compiled pattern;
 443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 445        RE_SYNTAX_POSIX_BASIC;
 446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 447      'fastmap' to an allocated space for the fastmap;
 448      'fastmap_accurate' to zero;
 449      're_nsub' to the number of subexpressions in PATTERN.
 450
 451    PATTERN is the address of the pattern string.
 452
 453    CFLAGS is a series of bits which affect compilation.
 454
 455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 456      use POSIX basic syntax.
 457
 458      If REG_NEWLINE is set, then . and [^...] don't match newline.
 459      Also, regexec will try a match beginning after every newline.
 460
 461      If REG_ICASE is set, then we considers upper- and lowercase
 462      versions of letters to be equivalent when matching.
 463
 464      If REG_NOSUB is set, then when PREG is passed to regexec, that
 465      routine will report only success or failure, and nothing about the
 466      registers.
 467
 468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 469    the return codes and their meanings.)  */
 470
 471 int
 472 regcomp (preg, pattern, cflags)
 473     regex_t *_Restrict_ preg;
 474     const char *_Restrict_ pattern;
 475     int cflags;
 476 {
 477   reg_errcode_t ret;
 478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 479                          : RE_SYNTAX_POSIX_BASIC);
 480
 481   preg->buffer = NULL;
 482   preg->allocated = 0;
 483   preg->used = 0;
 484
 485   /* Try to allocate space for the fastmap.  */
 486   preg->fastmap = re_malloc (char, SBC_MAX);
 487   if (BE (preg->fastmap == NULL, 0))
 488     return REG_ESPACE;
 489
 490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 491
 492   /* If REG_NEWLINE is set, newlines are treated differently.  */
 493   if (cflags & REG_NEWLINE)
 494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 495       syntax &= ~RE_DOT_NEWLINE;
 496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 497       /* It also changes the matching behavior.  */
 498       preg->newline_anchor = 1;
 499     }
 500   else
 501     preg->newline_anchor = 0;
 502   preg->no_sub = !!(cflags & REG_NOSUB);
 503   preg->translate = NULL;
 504
 505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 506
 507   /* POSIX doesn't distinguish between an unmatched open-group and an
 508      unmatched close-group: both are REG_EPAREN.  */
 509   if (ret == REG_ERPAREN)
 510     ret = REG_EPAREN;
 511
 512   /* We have already checked preg->fastmap != NULL.  */
 513   if (BE (ret == REG_NOERROR, 1))
 514     /* Compute the fastmap now, since regexec cannot modify the pattern
 515        buffer.  This function never fails in this implementation.  */
 516     (void) re_compile_fastmap (preg);
 517   else
 518     {
 519       /* Some error occurred while compiling the expression.  */
 520       re_free (preg->fastmap);
 521       preg->fastmap = NULL;
 522     }
 523
 524   return (int) ret;
 525 }
 526 #ifdef _LIBC
 527 weak_alias (__regcomp, regcomp)
 528 #endif
 529
 530 /* Returns a message corresponding to an error code, ERRCODE, returned
 531    from either regcomp or regexec.   We don't use PREG here.  */
 532
 533 #ifdef _LIBC
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *_Restrict_ preg;
 538     char *_Restrict_ errbuf;
 539     size_t errbuf_size;
 540 #else /* size_t might promote */
 541 size_t
 542 regerror (int errcode, const regex_t *_Restrict_ preg,
 543           char *_Restrict_ errbuf, size_t errbuf_size)
 544 #endif
 545 {
 546   const char *msg;
 547   size_t msg_size;
 548
 549   if (BE (errcode < 0
 550           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 551                                / sizeof (__re_error_msgid_idx[0])), 0))
 552     /* Only error codes returned by the rest of the code should be passed
 553        to this routine.  If we are given anything else, or if other regex
 554        code generates an invalid error code, then the program has a bug.
 555        Dump core so we can fix it.  */
 556     abort ();
 557
 558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 559
 560   msg_size = strlen (msg) + 1; /* Includes the null.  */
 561
 562   if (BE (errbuf_size != 0, 1))
 563     {
 564       size_t cpy_size = msg_size;
 565       if (BE (msg_size > errbuf_size, 0))
 566         {
 567           cpy_size = errbuf_size - 1;
 568           errbuf[cpy_size] = '\0';
 569         }
 570       memcpy (errbuf, msg, cpy_size);
 571     }
 572
 573   return msg_size;
 574 }
 575 #ifdef _LIBC
 576 weak_alias (__regerror, regerror)
 577 #endif
 578
 579
 580 #ifdef RE_ENABLE_I18N
 581 /* This static array is used for the map to single-byte characters when
 582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 583    it the same all the time.  UTF-8 is the preferred encoding so this is
 584    a worthwhile optimization.  */
 585 static const bitset_t utf8_sb_map =
 586 {
 587   /* Set the first 128 bits.  */
 588 # ifdef __GNUC__
 589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590 # else
 591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #   error "bitset_word_t is narrower than 32 bits"
 593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 #  endif
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 # endif
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = (re_dfa_t *) preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = (unsigned char *) dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   dfa->word_ops_used = 1;
 954   int i = 0;
 955   int j;
 956   int ch = 0;
 957   if (BE (dfa->map_notascii == 0, 1))
 958     {
 959       if (BITSET_WORD_BITS == 64)
 960         {
 961           dfa->word_char[0] = UINT64_C (0x03ff000000000000);
 962           dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
 963           i = 2;
 964         }
 965       else if (BITSET_WORD_BITS == 32)
 966         {
 967           dfa->word_char[0] = UINT32_C (0x00000000);
 968           dfa->word_char[1] = UINT32_C (0x03ff0000);
 969           dfa->word_char[2] = UINT32_C (0x87fffffe);
 970           dfa->word_char[3] = UINT32_C (0x07fffffe);
 971           i = 4;
 972         }
 973       else
 974         goto general_case;
 975       ch = 128;
 976
 977       if (BE (dfa->is_utf8, 1))
 978         {
 979           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 980           return;
 981         }
 982     }
 983
 984  general_case:
 985   for (; i < BITSET_WORDS; ++i)
 986     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 987       if (isalnum (ch) || ch == '_')
 988         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 989 }
 990
 991 /* Free the work area which are only used while compiling.  */
 992
 993 static void
 994 free_workarea_compile (regex_t *preg)
 995 {
 996   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 997   bin_tree_storage_t *storage, *next;
 998   for (storage = dfa->str_tree_storage; storage; storage = next)
 999     {
1000       next = storage->next;
1001       re_free (storage);
1002     }
1003   dfa->str_tree_storage = NULL;
1004   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1005   dfa->str_tree = NULL;
1006   re_free (dfa->org_indices);
1007   dfa->org_indices = NULL;
1008 }
1009
1010 /* Create initial states for all contexts.  */
1011
1012 static reg_errcode_t
1013 create_initial_state (re_dfa_t *dfa)
1014 {
1015   Idx first, i;
1016   reg_errcode_t err;
1017   re_node_set init_nodes;
1018
1019   /* Initial states have the epsilon closure of the node which is
1020      the first node of the regular expression.  */
1021   first = dfa->str_tree->first->node_idx;
1022   dfa->init_node = first;
1023   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1024   if (BE (err != REG_NOERROR, 0))
1025     return err;
1026
1027   /* The back-references which are in initial states can epsilon transit,
1028      since in this case all of the subexpressions can be null.
1029      Then we add epsilon closures of the nodes which are the next nodes of
1030      the back-references.  */
1031   if (dfa->nbackref > 0)
1032     for (i = 0; i < init_nodes.nelem; ++i)
1033       {
1034         Idx node_idx = init_nodes.elems[i];
1035         re_token_type_t type = dfa->nodes[node_idx].type;
1036
1037         Idx clexp_idx;
1038         if (type != OP_BACK_REF)
1039           continue;
1040         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1041           {
1042             re_token_t *clexp_node;
1043             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1044             if (clexp_node->type == OP_CLOSE_SUBEXP
1045                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1046               break;
1047           }
1048         if (clexp_idx == init_nodes.nelem)
1049           continue;
1050
1051         if (type == OP_BACK_REF)
1052           {
1053             Idx dest_idx = dfa->edests[node_idx].elems[0];
1054             if (!re_node_set_contains (&init_nodes, dest_idx))
1055               {
1056                 reg_errcode_t merge_err
1057                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1058                 if (merge_err != REG_NOERROR)
1059                   return merge_err;
1060                 i = 0;
1061               }
1062           }
1063       }
1064
1065   /* It must be the first time to invoke acquire_state.  */
1066   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1067   /* We don't check ERR here, since the initial state must not be NULL.  */
1068   if (BE (dfa->init_state == NULL, 0))
1069     return err;
1070   if (dfa->init_state->has_constraint)
1071     {
1072       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1073                                                        CONTEXT_WORD);
1074       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1075                                                      CONTEXT_NEWLINE);
1076       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1077                                                          &init_nodes,
1078                                                          CONTEXT_NEWLINE
1079                                                          | CONTEXT_BEGBUF);
1080       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1081               || dfa->init_state_begbuf == NULL, 0))
1082         return err;
1083     }
1084   else
1085     dfa->init_state_word = dfa->init_state_nl
1086       = dfa->init_state_begbuf = dfa->init_state;
1087
1088   re_node_set_free (&init_nodes);
1089   return REG_NOERROR;
1090 }
1091 \f
1092 #ifdef RE_ENABLE_I18N
1093 /* If it is possible to do searching in single byte encoding instead of UTF-8
1094    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1095    DFA nodes where needed.  */
1096
1097 static void
1098 optimize_utf8 (re_dfa_t *dfa)
1099 {
1100   Idx node;
1101   int i;
1102   bool mb_chars = false;
1103   bool has_period = false;
1104
1105   for (node = 0; node < dfa->nodes_len; ++node)
1106     switch (dfa->nodes[node].type)
1107       {
1108       case CHARACTER:
1109         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1110           mb_chars = true;
1111         break;
1112       case ANCHOR:
1113         switch (dfa->nodes[node].opr.ctx_type)
1114           {
1115           case LINE_FIRST:
1116           case LINE_LAST:
1117           case BUF_FIRST:
1118           case BUF_LAST:
1119             break;
1120           default:
1121             /* Word anchors etc. cannot be handled.  It's okay to test
1122                opr.ctx_type since constraints (for all DFA nodes) are
1123                created by ORing one or more opr.ctx_type values.  */
1124             return;
1125           }
1126         break;
1127       case OP_PERIOD:
1128         has_period = true;
1129         break;
1130       case OP_BACK_REF:
1131       case OP_ALT:
1132       case END_OF_RE:
1133       case OP_DUP_ASTERISK:
1134       case OP_OPEN_SUBEXP:
1135       case OP_CLOSE_SUBEXP:
1136         break;
1137       case COMPLEX_BRACKET:
1138         return;
1139       case SIMPLE_BRACKET:
1140         /* Just double check.  */
1141         {
1142           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1143                         ? 0
1144                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1145           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1146             {
1147               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1148                 return;
1149               rshift = 0;
1150             }
1151         }
1152         break;
1153       default:
1154         abort ();
1155       }
1156
1157   if (mb_chars || has_period)
1158     for (node = 0; node < dfa->nodes_len; ++node)
1159       {
1160         if (dfa->nodes[node].type == CHARACTER
1161             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1162           dfa->nodes[node].mb_partial = 0;
1163         else if (dfa->nodes[node].type == OP_PERIOD)
1164           dfa->nodes[node].type = OP_UTF8_PERIOD;
1165       }
1166
1167   /* The search can be in single byte locale.  */
1168   dfa->mb_cur_max = 1;
1169   dfa->is_utf8 = 0;
1170   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1171 }
1172 #endif
1173 \f
1174 /* Analyze the structure tree, and calculate "first", "next", "edest",
1175    "eclosure", and "inveclosure".  */
1176
1177 static reg_errcode_t
1178 analyze (regex_t *preg)
1179 {
1180   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1181   reg_errcode_t ret;
1182
1183   /* Allocate arrays.  */
1184   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1185   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1186   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1187   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1188   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1189           || dfa->eclosures == NULL, 0))
1190     return REG_ESPACE;
1191
1192   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1193   if (dfa->subexp_map != NULL)
1194     {
1195       Idx i;
1196       for (i = 0; i < preg->re_nsub; i++)
1197         dfa->subexp_map[i] = i;
1198       preorder (dfa->str_tree, optimize_subexps, dfa);
1199       for (i = 0; i < preg->re_nsub; i++)
1200         if (dfa->subexp_map[i] != i)
1201           break;
1202       if (i == preg->re_nsub)
1203         {
1204           free (dfa->subexp_map);
1205           dfa->subexp_map = NULL;
1206         }
1207     }
1208
1209   ret = postorder (dfa->str_tree, lower_subexps, preg);
1210   if (BE (ret != REG_NOERROR, 0))
1211     return ret;
1212   ret = postorder (dfa->str_tree, calc_first, dfa);
1213   if (BE (ret != REG_NOERROR, 0))
1214     return ret;
1215   preorder (dfa->str_tree, calc_next, dfa);
1216   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   ret = calc_eclosure (dfa);
1220   if (BE (ret != REG_NOERROR, 0))
1221     return ret;
1222
1223   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1224      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1225   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1226       || dfa->nbackref)
1227     {
1228       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1229       if (BE (dfa->inveclosures == NULL, 0))
1230         return REG_ESPACE;
1231       ret = calc_inveclosure (dfa);
1232     }
1233
1234   return ret;
1235 }
1236
1237 /* Our parse trees are very unbalanced, so we cannot use a stack to
1238    implement parse tree visits.  Instead, we use parent pointers and
1239    some hairy code in these two functions.  */
1240 static reg_errcode_t
1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242            void *extra)
1243 {
1244   bin_tree_t *node, *prev;
1245
1246   for (node = root; ; )
1247     {
1248       /* Descend down the tree, preferably to the left (or to the right
1249          if that's the only child).  */
1250       while (node->left || node->right)
1251         if (node->left)
1252           node = node->left;
1253         else
1254           node = node->right;
1255
1256       do
1257         {
1258           reg_errcode_t err = fn (extra, node);
1259           if (BE (err != REG_NOERROR, 0))
1260             return err;
1261           if (node->parent == NULL)
1262             return REG_NOERROR;
1263           prev = node;
1264           node = node->parent;
1265         }
1266       /* Go up while we have a node that is reached from the right.  */
1267       while (node->right == prev || node->right == NULL);
1268       node = node->right;
1269     }
1270 }
1271
1272 static reg_errcode_t
1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1274           void *extra)
1275 {
1276   bin_tree_t *node;
1277
1278   for (node = root; ; )
1279     {
1280       reg_errcode_t err = fn (extra, node);
1281       if (BE (err != REG_NOERROR, 0))
1282         return err;
1283
1284       /* Go to the left node, or up and to the right.  */
1285       if (node->left)
1286         node = node->left;
1287       else
1288         {
1289           bin_tree_t *prev = NULL;
1290           while (node->right == prev || node->right == NULL)
1291             {
1292               prev = node;
1293               node = node->parent;
1294               if (!node)
1295                 return REG_NOERROR;
1296             }
1297           node = node->right;
1298         }
1299     }
1300 }
1301
1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1303    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1304    backreferences as well.  Requires a preorder visit.  */
1305 static reg_errcode_t
1306 optimize_subexps (void *extra, bin_tree_t *node)
1307 {
1308   re_dfa_t *dfa = (re_dfa_t *) extra;
1309
1310   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1311     {
1312       int idx = node->token.opr.idx;
1313       node->token.opr.idx = dfa->subexp_map[idx];
1314       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1315     }
1316
1317   else if (node->token.type == SUBEXP
1318            && node->left && node->left->token.type == SUBEXP)
1319     {
1320       Idx other_idx = node->left->token.opr.idx;
1321
1322       node->left = node->left->left;
1323       if (node->left)
1324         node->left->parent = node;
1325
1326       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1327       if (other_idx < BITSET_WORD_BITS)
1328         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1329     }
1330
1331   return REG_NOERROR;
1332 }
1333
1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1335    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1336 static reg_errcode_t
1337 lower_subexps (void *extra, bin_tree_t *node)
1338 {
1339   regex_t *preg = (regex_t *) extra;
1340   reg_errcode_t err = REG_NOERROR;
1341
1342   if (node->left && node->left->token.type == SUBEXP)
1343     {
1344       node->left = lower_subexp (&err, preg, node->left);
1345       if (node->left)
1346         node->left->parent = node;
1347     }
1348   if (node->right && node->right->token.type == SUBEXP)
1349     {
1350       node->right = lower_subexp (&err, preg, node->right);
1351       if (node->right)
1352         node->right->parent = node;
1353     }
1354
1355   return err;
1356 }
1357
1358 static bin_tree_t *
1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1360 {
1361   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1362   bin_tree_t *body = node->left;
1363   bin_tree_t *op, *cls, *tree1, *tree;
1364
1365   if (preg->no_sub
1366       /* We do not optimize empty subexpressions, because otherwise we may
1367          have bad CONCAT nodes with NULL children.  This is obviously not
1368          very common, so we do not lose much.  An example that triggers
1369          this case is the sed "script" /\(\)/x.  */
1370       && node->left != NULL
1371       && (node->token.opr.idx >= BITSET_WORD_BITS
1372           || !(dfa->used_bkref_map
1373                & ((bitset_word_t) 1 << node->token.opr.idx))))
1374     return node->left;
1375
1376   /* Convert the SUBEXP node to the concatenation of an
1377      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1378   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1379   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1380   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1381   tree = create_tree (dfa, op, tree1, CONCAT);
1382   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1383     {
1384       *err = REG_ESPACE;
1385       return NULL;
1386     }
1387
1388   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1389   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1390   return tree;
1391 }
1392
1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1394    nodes.  Requires a postorder visit.  */
1395 static reg_errcode_t
1396 calc_first (void *extra, bin_tree_t *node)
1397 {
1398   re_dfa_t *dfa = (re_dfa_t *) extra;
1399   if (node->token.type == CONCAT)
1400     {
1401       node->first = node->left->first;
1402       node->node_idx = node->left->node_idx;
1403     }
1404   else
1405     {
1406       node->first = node;
1407       node->node_idx = re_dfa_add_node (dfa, node->token);
1408       if (BE (node->node_idx == REG_MISSING, 0))
1409         return REG_ESPACE;
1410       if (node->token.type == ANCHOR)
1411         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1412     }
1413   return REG_NOERROR;
1414 }
1415
1416 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1417 static reg_errcode_t
1418 calc_next (void *extra, bin_tree_t *node)
1419 {
1420   switch (node->token.type)
1421     {
1422     case OP_DUP_ASTERISK:
1423       node->left->next = node;
1424       break;
1425     case CONCAT:
1426       node->left->next = node->right->first;
1427       node->right->next = node->next;
1428       break;
1429     default:
1430       if (node->left)
1431         node->left->next = node->next;
1432       if (node->right)
1433         node->right->next = node->next;
1434       break;
1435     }
1436   return REG_NOERROR;
1437 }
1438
1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1440 static reg_errcode_t
1441 link_nfa_nodes (void *extra, bin_tree_t *node)
1442 {
1443   re_dfa_t *dfa = (re_dfa_t *) extra;
1444   Idx idx = node->node_idx;
1445   reg_errcode_t err = REG_NOERROR;
1446
1447   switch (node->token.type)
1448     {
1449     case CONCAT:
1450       break;
1451
1452     case END_OF_RE:
1453       assert (node->next == NULL);
1454       break;
1455
1456     case OP_DUP_ASTERISK:
1457     case OP_ALT:
1458       {
1459         Idx left, right;
1460         dfa->has_plural_match = 1;
1461         if (node->left != NULL)
1462           left = node->left->first->node_idx;
1463         else
1464           left = node->next->node_idx;
1465         if (node->right != NULL)
1466           right = node->right->first->node_idx;
1467         else
1468           right = node->next->node_idx;
1469         assert (REG_VALID_INDEX (left));
1470         assert (REG_VALID_INDEX (right));
1471         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1472       }
1473       break;
1474
1475     case ANCHOR:
1476     case OP_OPEN_SUBEXP:
1477     case OP_CLOSE_SUBEXP:
1478       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1479       break;
1480
1481     case OP_BACK_REF:
1482       dfa->nexts[idx] = node->next->node_idx;
1483       if (node->token.type == OP_BACK_REF)
1484         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1485       break;
1486
1487     default:
1488       assert (!IS_EPSILON_NODE (node->token.type));
1489       dfa->nexts[idx] = node->next->node_idx;
1490       break;
1491     }
1492
1493   return err;
1494 }
1495
1496 /* Duplicate the epsilon closure of the node ROOT_NODE.
1497    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1498    to their own constraint.  */
1499
1500 static reg_errcode_t
1501 internal_function
1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1503                         Idx root_node, unsigned int init_constraint)
1504 {
1505   Idx org_node, clone_node;
1506   bool ok;
1507   unsigned int constraint = init_constraint;
1508   for (org_node = top_org_node, clone_node = top_clone_node;;)
1509     {
1510       Idx org_dest, clone_dest;
1511       if (dfa->nodes[org_node].type == OP_BACK_REF)
1512         {
1513           /* If the back reference epsilon-transit, its destination must
1514              also have the constraint.  Then duplicate the epsilon closure
1515              of the destination of the back reference, and store it in
1516              edests of the back reference.  */
1517           org_dest = dfa->nexts[org_node];
1518           re_node_set_empty (dfa->edests + clone_node);
1519           clone_dest = duplicate_node (dfa, org_dest, constraint);
1520           if (BE (clone_dest == REG_MISSING, 0))
1521             return REG_ESPACE;
1522           dfa->nexts[clone_node] = dfa->nexts[org_node];
1523           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524           if (BE (! ok, 0))
1525             return REG_ESPACE;
1526         }
1527       else if (dfa->edests[org_node].nelem == 0)
1528         {
1529           /* In case of the node can't epsilon-transit, don't duplicate the
1530              destination and store the original destination as the
1531              destination of the node.  */
1532           dfa->nexts[clone_node] = dfa->nexts[org_node];
1533           break;
1534         }
1535       else if (dfa->edests[org_node].nelem == 1)
1536         {
1537           /* In case of the node can epsilon-transit, and it has only one
1538              destination.  */
1539           org_dest = dfa->edests[org_node].elems[0];
1540           re_node_set_empty (dfa->edests + clone_node);
1541           /* If the node is root_node itself, it means the epsilon closure
1542              has a loop.  Then tie it to the destination of the root_node.  */
1543           if (org_node == root_node && clone_node != org_node)
1544             {
1545               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1546               if (BE (! ok, 0))
1547                 return REG_ESPACE;
1548               break;
1549             }
1550           /* In case the node has another constraint, append it.  */
1551           constraint |= dfa->nodes[org_node].constraint;
1552           clone_dest = duplicate_node (dfa, org_dest, constraint);
1553           if (BE (clone_dest == REG_MISSING, 0))
1554             return REG_ESPACE;
1555           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556           if (BE (! ok, 0))
1557             return REG_ESPACE;
1558         }
1559       else /* dfa->edests[org_node].nelem == 2 */
1560         {
1561           /* In case of the node can epsilon-transit, and it has two
1562              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1563           org_dest = dfa->edests[org_node].elems[0];
1564           re_node_set_empty (dfa->edests + clone_node);
1565           /* Search for a duplicated node which satisfies the constraint.  */
1566           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1567           if (clone_dest == REG_MISSING)
1568             {
1569               /* There is no such duplicated node, create a new one.  */
1570               reg_errcode_t err;
1571               clone_dest = duplicate_node (dfa, org_dest, constraint);
1572               if (BE (clone_dest == REG_MISSING, 0))
1573                 return REG_ESPACE;
1574               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575               if (BE (! ok, 0))
1576                 return REG_ESPACE;
1577               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1578                                             root_node, constraint);
1579               if (BE (err != REG_NOERROR, 0))
1580                 return err;
1581             }
1582           else
1583             {
1584               /* There is a duplicated node which satisfies the constraint,
1585                  use it to avoid infinite loop.  */
1586               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587               if (BE (! ok, 0))
1588                 return REG_ESPACE;
1589             }
1590
1591           org_dest = dfa->edests[org_node].elems[1];
1592           clone_dest = duplicate_node (dfa, org_dest, constraint);
1593           if (BE (clone_dest == REG_MISSING, 0))
1594             return REG_ESPACE;
1595           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1596           if (BE (! ok, 0))
1597             return REG_ESPACE;
1598         }
1599       org_node = org_dest;
1600       clone_node = clone_dest;
1601     }
1602   return REG_NOERROR;
1603 }
1604
1605 /* Search for a node which is duplicated from the node ORG_NODE, and
1606    satisfies the constraint CONSTRAINT.  */
1607
1608 static Idx
1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1610                         unsigned int constraint)
1611 {
1612   Idx idx;
1613   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1614     {
1615       if (org_node == dfa->org_indices[idx]
1616           && constraint == dfa->nodes[idx].constraint)
1617         return idx; /* Found.  */
1618     }
1619   return REG_MISSING; /* Not found.  */
1620 }
1621
1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1623    Return the index of the new node, or REG_MISSING if insufficient storage is
1624    available.  */
1625
1626 static Idx
1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1628 {
1629   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1630   if (BE (dup_idx != REG_MISSING, 1))
1631     {
1632       dfa->nodes[dup_idx].constraint = constraint;
1633       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1634       dfa->nodes[dup_idx].duplicated = 1;
1635
1636       /* Store the index of the original node.  */
1637       dfa->org_indices[dup_idx] = org_idx;
1638     }
1639   return dup_idx;
1640 }
1641
1642 static reg_errcode_t
1643 calc_inveclosure (re_dfa_t *dfa)
1644 {
1645   Idx src, idx;
1646   bool ok;
1647   for (idx = 0; idx < dfa->nodes_len; ++idx)
1648     re_node_set_init_empty (dfa->inveclosures + idx);
1649
1650   for (src = 0; src < dfa->nodes_len; ++src)
1651     {
1652       Idx *elems = dfa->eclosures[src].elems;
1653       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1654         {
1655           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1656           if (BE (! ok, 0))
1657             return REG_ESPACE;
1658         }
1659     }
1660
1661   return REG_NOERROR;
1662 }
1663
1664 /* Calculate "eclosure" for all the node in DFA.  */
1665
1666 static reg_errcode_t
1667 calc_eclosure (re_dfa_t *dfa)
1668 {
1669   Idx node_idx;
1670   bool incomplete;
1671 #ifdef DEBUG
1672   assert (dfa->nodes_len > 0);
1673 #endif
1674   incomplete = false;
1675   /* For each nodes, calculate epsilon closure.  */
1676   for (node_idx = 0; ; ++node_idx)
1677     {
1678       reg_errcode_t err;
1679       re_node_set eclosure_elem;
1680       if (node_idx == dfa->nodes_len)
1681         {
1682           if (!incomplete)
1683             break;
1684           incomplete = false;
1685           node_idx = 0;
1686         }
1687
1688 #ifdef DEBUG
1689       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1690 #endif
1691
1692       /* If we have already calculated, skip it.  */
1693       if (dfa->eclosures[node_idx].nelem != 0)
1694         continue;
1695       /* Calculate epsilon closure of 'node_idx'.  */
1696       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1697       if (BE (err != REG_NOERROR, 0))
1698         return err;
1699
1700       if (dfa->eclosures[node_idx].nelem == 0)
1701         {
1702           incomplete = true;
1703           re_node_set_free (&eclosure_elem);
1704         }
1705     }
1706   return REG_NOERROR;
1707 }
1708
1709 /* Calculate epsilon closure of NODE.  */
1710
1711 static reg_errcode_t
1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1713 {
1714   reg_errcode_t err;
1715   Idx i;
1716   re_node_set eclosure;
1717   bool ok;
1718   bool incomplete = false;
1719   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1720   if (BE (err != REG_NOERROR, 0))
1721     return err;
1722
1723   /* This indicates that we are calculating this node now.
1724      We reference this value to avoid infinite loop.  */
1725   dfa->eclosures[node].nelem = REG_MISSING;
1726
1727   /* If the current node has constraints, duplicate all nodes
1728      since they must inherit the constraints.  */
1729   if (dfa->nodes[node].constraint
1730       && dfa->edests[node].nelem
1731       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1732     {
1733       err = duplicate_node_closure (dfa, node, node, node,
1734                                     dfa->nodes[node].constraint);
1735       if (BE (err != REG_NOERROR, 0))
1736         return err;
1737     }
1738
1739   /* Expand each epsilon destination nodes.  */
1740   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1741     for (i = 0; i < dfa->edests[node].nelem; ++i)
1742       {
1743         re_node_set eclosure_elem;
1744         Idx edest = dfa->edests[node].elems[i];
1745         /* If calculating the epsilon closure of 'edest' is in progress,
1746            return intermediate result.  */
1747         if (dfa->eclosures[edest].nelem == REG_MISSING)
1748           {
1749             incomplete = true;
1750             continue;
1751           }
1752         /* If we haven't calculated the epsilon closure of 'edest' yet,
1753            calculate now. Otherwise use calculated epsilon closure.  */
1754         if (dfa->eclosures[edest].nelem == 0)
1755           {
1756             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1757             if (BE (err != REG_NOERROR, 0))
1758               return err;
1759           }
1760         else
1761           eclosure_elem = dfa->eclosures[edest];
1762         /* Merge the epsilon closure of 'edest'.  */
1763         err = re_node_set_merge (&eclosure, &eclosure_elem);
1764         if (BE (err != REG_NOERROR, 0))
1765           return err;
1766         /* If the epsilon closure of 'edest' is incomplete,
1767            the epsilon closure of this node is also incomplete.  */
1768         if (dfa->eclosures[edest].nelem == 0)
1769           {
1770             incomplete = true;
1771             re_node_set_free (&eclosure_elem);
1772           }
1773       }
1774
1775   /* An epsilon closure includes itself.  */
1776   ok = re_node_set_insert (&eclosure, node);
1777   if (BE (! ok, 0))
1778     return REG_ESPACE;
1779   if (incomplete && !root)
1780     dfa->eclosures[node].nelem = 0;
1781   else
1782     dfa->eclosures[node] = eclosure;
1783   *new_set = eclosure;
1784   return REG_NOERROR;
1785 }
1786 \f
1787 /* Functions for token which are used in the parser.  */
1788
1789 /* Fetch a token from INPUT.
1790    We must not use this function inside bracket expressions.  */
1791
1792 static void
1793 internal_function
1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1795 {
1796   re_string_skip_bytes (input, peek_token (result, input, syntax));
1797 }
1798
1799 /* Peek a token from INPUT, and return the length of the token.
1800    We must not use this function inside bracket expressions.  */
1801
1802 static int
1803 internal_function
1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1805 {
1806   unsigned char c;
1807
1808   if (re_string_eoi (input))
1809     {
1810       token->type = END_OF_RE;
1811       return 0;
1812     }
1813
1814   c = re_string_peek_byte (input, 0);
1815   token->opr.c = c;
1816
1817   token->word_char = 0;
1818 #ifdef RE_ENABLE_I18N
1819   token->mb_partial = 0;
1820   if (input->mb_cur_max > 1 &&
1821       !re_string_first_byte (input, re_string_cur_idx (input)))
1822     {
1823       token->type = CHARACTER;
1824       token->mb_partial = 1;
1825       return 1;
1826     }
1827 #endif
1828   if (c == '\\')
1829     {
1830       unsigned char c2;
1831       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1832         {
1833           token->type = BACK_SLASH;
1834           return 1;
1835         }
1836
1837       c2 = re_string_peek_byte_case (input, 1);
1838       token->opr.c = c2;
1839       token->type = CHARACTER;
1840 #ifdef RE_ENABLE_I18N
1841       if (input->mb_cur_max > 1)
1842         {
1843           wint_t wc = re_string_wchar_at (input,
1844                                           re_string_cur_idx (input) + 1);
1845           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1846         }
1847       else
1848 #endif
1849         token->word_char = IS_WORD_CHAR (c2) != 0;
1850
1851       switch (c2)
1852         {
1853         case '|':
1854           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1855             token->type = OP_ALT;
1856           break;
1857         case '1': case '2': case '3': case '4': case '5':
1858         case '6': case '7': case '8': case '9':
1859           if (!(syntax & RE_NO_BK_REFS))
1860             {
1861               token->type = OP_BACK_REF;
1862               token->opr.idx = c2 - '1';
1863             }
1864           break;
1865         case '<':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             {
1868               token->type = ANCHOR;
1869               token->opr.ctx_type = WORD_FIRST;
1870             }
1871           break;
1872         case '>':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             {
1875               token->type = ANCHOR;
1876               token->opr.ctx_type = WORD_LAST;
1877             }
1878           break;
1879         case 'b':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = WORD_DELIM;
1884             }
1885           break;
1886         case 'B':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = NOT_WORD_DELIM;
1891             }
1892           break;
1893         case 'w':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             token->type = OP_WORD;
1896           break;
1897         case 'W':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             token->type = OP_NOTWORD;
1900           break;
1901         case 's':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             token->type = OP_SPACE;
1904           break;
1905         case 'S':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_NOTSPACE;
1908           break;
1909         case '`':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             {
1912               token->type = ANCHOR;
1913               token->opr.ctx_type = BUF_FIRST;
1914             }
1915           break;
1916         case '\'':
1917           if (!(syntax & RE_NO_GNU_OPS))
1918             {
1919               token->type = ANCHOR;
1920               token->opr.ctx_type = BUF_LAST;
1921             }
1922           break;
1923         case '(':
1924           if (!(syntax & RE_NO_BK_PARENS))
1925             token->type = OP_OPEN_SUBEXP;
1926           break;
1927         case ')':
1928           if (!(syntax & RE_NO_BK_PARENS))
1929             token->type = OP_CLOSE_SUBEXP;
1930           break;
1931         case '+':
1932           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1933             token->type = OP_DUP_PLUS;
1934           break;
1935         case '?':
1936           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937             token->type = OP_DUP_QUESTION;
1938           break;
1939         case '{':
1940           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1941             token->type = OP_OPEN_DUP_NUM;
1942           break;
1943         case '}':
1944           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945             token->type = OP_CLOSE_DUP_NUM;
1946           break;
1947         default:
1948           break;
1949         }
1950       return 2;
1951     }
1952
1953   token->type = CHARACTER;
1954 #ifdef RE_ENABLE_I18N
1955   if (input->mb_cur_max > 1)
1956     {
1957       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1958       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1959     }
1960   else
1961 #endif
1962     token->word_char = IS_WORD_CHAR (token->opr.c);
1963
1964   switch (c)
1965     {
1966     case '\n':
1967       if (syntax & RE_NEWLINE_ALT)
1968         token->type = OP_ALT;
1969       break;
1970     case '|':
1971       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1972         token->type = OP_ALT;
1973       break;
1974     case '*':
1975       token->type = OP_DUP_ASTERISK;
1976       break;
1977     case '+':
1978       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1979         token->type = OP_DUP_PLUS;
1980       break;
1981     case '?':
1982       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983         token->type = OP_DUP_QUESTION;
1984       break;
1985     case '{':
1986       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1987         token->type = OP_OPEN_DUP_NUM;
1988       break;
1989     case '}':
1990       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991         token->type = OP_CLOSE_DUP_NUM;
1992       break;
1993     case '(':
1994       if (syntax & RE_NO_BK_PARENS)
1995         token->type = OP_OPEN_SUBEXP;
1996       break;
1997     case ')':
1998       if (syntax & RE_NO_BK_PARENS)
1999         token->type = OP_CLOSE_SUBEXP;
2000       break;
2001     case '[':
2002       token->type = OP_OPEN_BRACKET;
2003       break;
2004     case '.':
2005       token->type = OP_PERIOD;
2006       break;
2007     case '^':
2008       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2009           re_string_cur_idx (input) != 0)
2010         {
2011           char prev = re_string_peek_byte (input, -1);
2012           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2013             break;
2014         }
2015       token->type = ANCHOR;
2016       token->opr.ctx_type = LINE_FIRST;
2017       break;
2018     case '$':
2019       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2020           re_string_cur_idx (input) + 1 != re_string_length (input))
2021         {
2022           re_token_t next;
2023           re_string_skip_bytes (input, 1);
2024           peek_token (&next, input, syntax);
2025           re_string_skip_bytes (input, -1);
2026           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2027             break;
2028         }
2029       token->type = ANCHOR;
2030       token->opr.ctx_type = LINE_LAST;
2031       break;
2032     default:
2033       break;
2034     }
2035   return 1;
2036 }
2037
2038 /* Peek a token from INPUT, and return the length of the token.
2039    We must not use this function out of bracket expressions.  */
2040
2041 static int
2042 internal_function
2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2044 {
2045   unsigned char c;
2046   if (re_string_eoi (input))
2047     {
2048       token->type = END_OF_RE;
2049       return 0;
2050     }
2051   c = re_string_peek_byte (input, 0);
2052   token->opr.c = c;
2053
2054 #ifdef RE_ENABLE_I18N
2055   if (input->mb_cur_max > 1 &&
2056       !re_string_first_byte (input, re_string_cur_idx (input)))
2057     {
2058       token->type = CHARACTER;
2059       return 1;
2060     }
2061 #endif /* RE_ENABLE_I18N */
2062
2063   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2064       && re_string_cur_idx (input) + 1 < re_string_length (input))
2065     {
2066       /* In this case, '\' escape a character.  */
2067       unsigned char c2;
2068       re_string_skip_bytes (input, 1);
2069       c2 = re_string_peek_byte (input, 0);
2070       token->opr.c = c2;
2071       token->type = CHARACTER;
2072       return 1;
2073     }
2074   if (c == '[') /* '[' is a special char in a bracket exps.  */
2075     {
2076       unsigned char c2;
2077       int token_len;
2078       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2079         c2 = re_string_peek_byte (input, 1);
2080       else
2081         c2 = 0;
2082       token->opr.c = c2;
2083       token_len = 2;
2084       switch (c2)
2085         {
2086         case '.':
2087           token->type = OP_OPEN_COLL_ELEM;
2088           break;
2089         case '=':
2090           token->type = OP_OPEN_EQUIV_CLASS;
2091           break;
2092         case ':':
2093           if (syntax & RE_CHAR_CLASSES)
2094             {
2095               token->type = OP_OPEN_CHAR_CLASS;
2096               break;
2097             }
2098           /* else fall through.  */
2099         default:
2100           token->type = CHARACTER;
2101           token->opr.c = c;
2102           token_len = 1;
2103           break;
2104         }
2105       return token_len;
2106     }
2107   switch (c)
2108     {
2109     case '-':
2110       token->type = OP_CHARSET_RANGE;
2111       break;
2112     case ']':
2113       token->type = OP_CLOSE_BRACKET;
2114       break;
2115     case '^':
2116       token->type = OP_NON_MATCH_LIST;
2117       break;
2118     default:
2119       token->type = CHARACTER;
2120     }
2121   return 1;
2122 }
2123 \f
2124 /* Functions for parser.  */
2125
2126 /* Entry point of the parser.
2127    Parse the regular expression REGEXP and return the structure tree.
2128    If an error occurs, ERR is set by error code, and return NULL.
2129    This function build the following tree, from regular expression <reg_exp>:
2130            CAT
2131            / \
2132           /   \
2133    <reg_exp>  EOR
2134
2135    CAT means concatenation.
2136    EOR means end of regular expression.  */
2137
2138 static bin_tree_t *
2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2140        reg_errcode_t *err)
2141 {
2142   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2143   bin_tree_t *tree, *eor, *root;
2144   re_token_t current_token;
2145   dfa->syntax = syntax;
2146   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2151   if (tree != NULL)
2152     root = create_tree (dfa, tree, eor, CONCAT);
2153   else
2154     root = eor;
2155   if (BE (eor == NULL || root == NULL, 0))
2156     {
2157       *err = REG_ESPACE;
2158       return NULL;
2159     }
2160   return root;
2161 }
2162
2163 /* This function build the following tree, from regular expression
2164    <branch1>|<branch2>:
2165            ALT
2166            / \
2167           /   \
2168    <branch1> <branch2>
2169
2170    ALT means alternative, which represents the operator '|'.  */
2171
2172 static bin_tree_t *
2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2175 {
2176   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177   bin_tree_t *tree, *branch = NULL;
2178   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2180     return NULL;
2181
2182   while (token->type == OP_ALT)
2183     {
2184       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2185       if (token->type != OP_ALT && token->type != END_OF_RE
2186           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187         {
2188           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2189           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2190             return NULL;
2191         }
2192       else
2193         branch = NULL;
2194       tree = create_tree (dfa, tree, branch, OP_ALT);
2195       if (BE (tree == NULL, 0))
2196         {
2197           *err = REG_ESPACE;
2198           return NULL;
2199         }
2200     }
2201   return tree;
2202 }
2203
2204 /* This function build the following tree, from regular expression
2205    <exp1><exp2>:
2206         CAT
2207         / \
2208        /   \
2209    <exp1> <exp2>
2210
2211    CAT means concatenation.  */
2212
2213 static bin_tree_t *
2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217   bin_tree_t *tree, *expr;
2218   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2219   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221     return NULL;
2222
2223   while (token->type != OP_ALT && token->type != END_OF_RE
2224          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225     {
2226       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2228         {
2229           if (tree != NULL)
2230             postorder (tree, free_tree, NULL);
2231           return NULL;
2232         }
2233       if (tree != NULL && expr != NULL)
2234         {
2235           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236           if (newtree == NULL)
2237             {
2238               postorder (expr, free_tree, NULL);
2239               postorder (tree, free_tree, NULL);
2240               *err = REG_ESPACE;
2241               return NULL;
2242             }
2243           tree = newtree;
2244         }
2245       else if (tree == NULL)
2246         tree = expr;
2247       /* Otherwise expr == NULL, we don't need to create new tree.  */
2248     }
2249   return tree;
2250 }
2251
2252 /* This function build the following tree, from regular expression a*:
2253          *
2254          |
2255          a
2256 */
2257
2258 static bin_tree_t *
2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261 {
2262   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2263   bin_tree_t *tree;
2264   switch (token->type)
2265     {
2266     case CHARACTER:
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269         {
2270           *err = REG_ESPACE;
2271           return NULL;
2272         }
2273 #ifdef RE_ENABLE_I18N
2274       if (dfa->mb_cur_max > 1)
2275         {
2276           while (!re_string_eoi (regexp)
2277                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278             {
2279               bin_tree_t *mbc_remain;
2280               fetch_token (token, regexp, syntax);
2281               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283               if (BE (mbc_remain == NULL || tree == NULL, 0))
2284                 {
2285                   *err = REG_ESPACE;
2286                   return NULL;
2287                 }
2288             }
2289         }
2290 #endif
2291       break;
2292     case OP_OPEN_SUBEXP:
2293       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295         return NULL;
2296       break;
2297     case OP_OPEN_BRACKET:
2298       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300         return NULL;
2301       break;
2302     case OP_BACK_REF:
2303       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2304         {
2305           *err = REG_ESUBREG;
2306           return NULL;
2307         }
2308       dfa->used_bkref_map |= 1 << token->opr.idx;
2309       tree = create_token_tree (dfa, NULL, NULL, token);
2310       if (BE (tree == NULL, 0))
2311         {
2312           *err = REG_ESPACE;
2313           return NULL;
2314         }
2315       ++dfa->nbackref;
2316       dfa->has_mb_node = 1;
2317       break;
2318     case OP_OPEN_DUP_NUM:
2319       if (syntax & RE_CONTEXT_INVALID_DUP)
2320         {
2321           *err = REG_BADRPT;
2322           return NULL;
2323         }
2324       /* FALLTHROUGH */
2325     case OP_DUP_ASTERISK:
2326     case OP_DUP_PLUS:
2327     case OP_DUP_QUESTION:
2328       if (syntax & RE_CONTEXT_INVALID_OPS)
2329         {
2330           *err = REG_BADRPT;
2331           return NULL;
2332         }
2333       else if (syntax & RE_CONTEXT_INDEP_OPS)
2334         {
2335           fetch_token (token, regexp, syntax);
2336           return parse_expression (regexp, preg, token, syntax, nest, err);
2337         }
2338       /* else fall through  */
2339     case OP_CLOSE_SUBEXP:
2340       if ((token->type == OP_CLOSE_SUBEXP) &&
2341           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2342         {
2343           *err = REG_ERPAREN;
2344           return NULL;
2345         }
2346       /* else fall through  */
2347     case OP_CLOSE_DUP_NUM:
2348       /* We treat it as a normal character.  */
2349
2350       /* Then we can these characters as normal characters.  */
2351       token->type = CHARACTER;
2352       /* mb_partial and word_char bits should be initialized already
2353          by peek_token.  */
2354       tree = create_token_tree (dfa, NULL, NULL, token);
2355       if (BE (tree == NULL, 0))
2356         {
2357           *err = REG_ESPACE;
2358           return NULL;
2359         }
2360       break;
2361     case ANCHOR:
2362       if ((token->opr.ctx_type
2363            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364           && dfa->word_ops_used == 0)
2365         init_word_char (dfa);
2366       if (token->opr.ctx_type == WORD_DELIM
2367           || token->opr.ctx_type == NOT_WORD_DELIM)
2368         {
2369           bin_tree_t *tree_first, *tree_last;
2370           if (token->opr.ctx_type == WORD_DELIM)
2371             {
2372               token->opr.ctx_type = WORD_FIRST;
2373               tree_first = create_token_tree (dfa, NULL, NULL, token);
2374               token->opr.ctx_type = WORD_LAST;
2375             }
2376           else
2377             {
2378               token->opr.ctx_type = INSIDE_WORD;
2379               tree_first = create_token_tree (dfa, NULL, NULL, token);
2380               token->opr.ctx_type = INSIDE_NOTWORD;
2381             }
2382           tree_last = create_token_tree (dfa, NULL, NULL, token);
2383           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2385             {
2386               *err = REG_ESPACE;
2387               return NULL;
2388             }
2389         }
2390       else
2391         {
2392           tree = create_token_tree (dfa, NULL, NULL, token);
2393           if (BE (tree == NULL, 0))
2394             {
2395               *err = REG_ESPACE;
2396               return NULL;
2397             }
2398         }
2399       /* We must return here, since ANCHORs can't be followed
2400          by repetition operators.
2401          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2403       fetch_token (token, regexp, syntax);
2404       return tree;
2405     case OP_PERIOD:
2406       tree = create_token_tree (dfa, NULL, NULL, token);
2407       if (BE (tree == NULL, 0))
2408         {
2409           *err = REG_ESPACE;
2410           return NULL;
2411         }
2412       if (dfa->mb_cur_max > 1)
2413         dfa->has_mb_node = 1;
2414       break;
2415     case OP_WORD:
2416     case OP_NOTWORD:
2417       tree = build_charclass_op (dfa, regexp->trans,
2418                                  (const unsigned char *) "alnum",
2419                                  (const unsigned char *) "_",
2420                                  token->type == OP_NOTWORD, err);
2421       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422         return NULL;
2423       break;
2424     case OP_SPACE:
2425     case OP_NOTSPACE:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427                                  (const unsigned char *) "space",
2428                                  (const unsigned char *) "",
2429                                  token->type == OP_NOTSPACE, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431         return NULL;
2432       break;
2433     case OP_ALT:
2434     case END_OF_RE:
2435       return NULL;
2436     case BACK_SLASH:
2437       *err = REG_EESCAPE;
2438       return NULL;
2439     default:
2440       /* Must not happen?  */
2441 #ifdef DEBUG
2442       assert (0);
2443 #endif
2444       return NULL;
2445     }
2446   fetch_token (token, regexp, syntax);
2447
2448   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2450     {
2451       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2452       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2453         return NULL;
2454       /* In BRE consecutive duplications are not allowed.  */
2455       if ((syntax & RE_CONTEXT_INVALID_DUP)
2456           && (token->type == OP_DUP_ASTERISK
2457               || token->type == OP_OPEN_DUP_NUM))
2458         {
2459           *err = REG_BADRPT;
2460           return NULL;
2461         }
2462     }
2463
2464   return tree;
2465 }
2466
2467 /* This function build the following tree, from regular expression
2468    (<reg_exp>):
2469          SUBEXP
2470             |
2471         <reg_exp>
2472 */
2473
2474 static bin_tree_t *
2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2477 {
2478   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2479   bin_tree_t *tree;
2480   size_t cur_nsub;
2481   cur_nsub = preg->re_nsub++;
2482
2483   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2484
2485   /* The subexpression may be a null string.  */
2486   if (token->type == OP_CLOSE_SUBEXP)
2487     tree = NULL;
2488   else
2489     {
2490       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2491       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2492         {
2493           if (tree != NULL)
2494             postorder (tree, free_tree, NULL);
2495           *err = REG_EPAREN;
2496         }
2497       if (BE (*err != REG_NOERROR, 0))
2498         return NULL;
2499     }
2500
2501   if (cur_nsub <= '9' - '1')
2502     dfa->completed_bkref_map |= 1 << cur_nsub;
2503
2504   tree = create_tree (dfa, tree, NULL, SUBEXP);
2505   if (BE (tree == NULL, 0))
2506     {
2507       *err = REG_ESPACE;
2508       return NULL;
2509     }
2510   tree->token.opr.idx = cur_nsub;
2511   return tree;
2512 }
2513
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2515
2516 static bin_tree_t *
2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2518               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2519 {
2520   bin_tree_t *tree = NULL, *old_tree = NULL;
2521   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2522   re_token_t start_token = *token;
2523
2524   if (token->type == OP_OPEN_DUP_NUM)
2525     {
2526       end = 0;
2527       start = fetch_number (regexp, token, syntax);
2528       if (start == REG_MISSING)
2529         {
2530           if (token->type == CHARACTER && token->opr.c == ',')
2531             start = 0; /* We treat "{,m}" as "{0,m}".  */
2532           else
2533             {
2534               *err = REG_BADBR; /* <re>{} is invalid.  */
2535               return NULL;
2536             }
2537         }
2538       if (BE (start != REG_ERROR, 1))
2539         {
2540           /* We treat "{n}" as "{n,n}".  */
2541           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542                  : ((token->type == CHARACTER && token->opr.c == ',')
2543                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2544         }
2545       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2546         {
2547           /* Invalid sequence.  */
2548           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549             {
2550               if (token->type == END_OF_RE)
2551                 *err = REG_EBRACE;
2552               else
2553                 *err = REG_BADBR;
2554
2555               return NULL;
2556             }
2557
2558           /* If the syntax bit is set, rollback.  */
2559           re_string_set_index (regexp, start_idx);
2560           *token = start_token;
2561           token->type = CHARACTER;
2562           /* mb_partial and word_char bits should be already initialized by
2563              peek_token.  */
2564           return elem;
2565         }
2566
2567       if (BE ((end != REG_MISSING && start > end)
2568               || token->type != OP_CLOSE_DUP_NUM, 0))
2569         {
2570           /* First number greater than second.  */
2571           *err = REG_BADBR;
2572           return NULL;
2573         }
2574     }
2575   else
2576     {
2577       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2578       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2579     }
2580
2581   fetch_token (token, regexp, syntax);
2582
2583   if (BE (elem == NULL, 0))
2584     return NULL;
2585   if (BE (start == 0 && end == 0, 0))
2586     {
2587       postorder (elem, free_tree, NULL);
2588       return NULL;
2589     }
2590
2591   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2592   if (BE (start > 0, 0))
2593     {
2594       tree = elem;
2595       for (i = 2; i <= start; ++i)
2596         {
2597           elem = duplicate_tree (elem, dfa);
2598           tree = create_tree (dfa, tree, elem, CONCAT);
2599           if (BE (elem == NULL || tree == NULL, 0))
2600             goto parse_dup_op_espace;
2601         }
2602
2603       if (start == end)
2604         return tree;
2605
2606       /* Duplicate ELEM before it is marked optional.  */
2607       elem = duplicate_tree (elem, dfa);
2608       old_tree = tree;
2609     }
2610   else
2611     old_tree = NULL;
2612
2613   if (elem->token.type == SUBEXP)
2614     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2615
2616   tree = create_tree (dfa, elem, NULL,
2617                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2618   if (BE (tree == NULL, 0))
2619     goto parse_dup_op_espace;
2620
2621 /* From gnulib's "intprops.h":
2622    True if the arithmetic type T is signed.  */
2623 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2624
2625   /* This loop is actually executed only when end != REG_MISSING,
2626      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2627      already created the start+1-th copy.  */
2628   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2629     for (i = start + 2; i <= end; ++i)
2630       {
2631         elem = duplicate_tree (elem, dfa);
2632         tree = create_tree (dfa, tree, elem, CONCAT);
2633         if (BE (elem == NULL || tree == NULL, 0))
2634           goto parse_dup_op_espace;
2635
2636         tree = create_tree (dfa, tree, NULL, OP_ALT);
2637         if (BE (tree == NULL, 0))
2638           goto parse_dup_op_espace;
2639       }
2640
2641   if (old_tree)
2642     tree = create_tree (dfa, old_tree, tree, CONCAT);
2643
2644   return tree;
2645
2646  parse_dup_op_espace:
2647   *err = REG_ESPACE;
2648   return NULL;
2649 }
2650
2651 /* Size of the names for collating symbol/equivalence_class/character_class.
2652    I'm not sure, but maybe enough.  */
2653 #define BRACKET_NAME_BUF_SIZE 32
2654
2655 #ifndef _LIBC
2656   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2657      Build the range expression which starts from START_ELEM, and ends
2658      at END_ELEM.  The result are written to MBCSET and SBCSET.
2659      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2660      mbcset->range_ends, is a pointer argument since we may
2661      update it.  */
2662
2663 static reg_errcode_t
2664 internal_function
2665 # ifdef RE_ENABLE_I18N
2666 build_range_exp (const reg_syntax_t syntax,
2667                  bitset_t sbcset,
2668                  re_charset_t *mbcset,
2669                  Idx *range_alloc,
2670                  const bracket_elem_t *start_elem,
2671                  const bracket_elem_t *end_elem)
2672 # else /* not RE_ENABLE_I18N */
2673 build_range_exp (const reg_syntax_t syntax,
2674                  bitset_t sbcset,
2675                  const bracket_elem_t *start_elem,
2676                  const bracket_elem_t *end_elem)
2677 # endif /* not RE_ENABLE_I18N */
2678 {
2679   unsigned int start_ch, end_ch;
2680   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2681   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2682           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2683           0))
2684     return REG_ERANGE;
2685
2686   /* We can handle no multi character collating elements without libc
2687      support.  */
2688   if (BE ((start_elem->type == COLL_SYM
2689            && strlen ((char *) start_elem->opr.name) > 1)
2690           || (end_elem->type == COLL_SYM
2691               && strlen ((char *) end_elem->opr.name) > 1), 0))
2692     return REG_ECOLLATE;
2693
2694 # ifdef RE_ENABLE_I18N
2695   {
2696     wchar_t wc;
2697     wint_t start_wc;
2698     wint_t end_wc;
2699     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2700
2701     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2702                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2703                    : 0));
2704     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2705               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2706                  : 0));
2707     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2708                 ? __btowc (start_ch) : start_elem->opr.wch);
2709     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2710               ? __btowc (end_ch) : end_elem->opr.wch);
2711     if (start_wc == WEOF || end_wc == WEOF)
2712       return REG_ECOLLATE;
2713     cmp_buf[0] = start_wc;
2714     cmp_buf[4] = end_wc;
2715
2716     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2717             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2718       return REG_ERANGE;
2719
2720     /* Got valid collation sequence values, add them as a new entry.
2721        However, for !_LIBC we have no collation elements: if the
2722        character set is single byte, the single byte character set
2723        that we build below suffices.  parse_bracket_exp passes
2724        no MBCSET if dfa->mb_cur_max == 1.  */
2725     if (mbcset)
2726       {
2727         /* Check the space of the arrays.  */
2728         if (BE (*range_alloc == mbcset->nranges, 0))
2729           {
2730             /* There is not enough space, need realloc.  */
2731             wchar_t *new_array_start, *new_array_end;
2732             Idx new_nranges;
2733
2734             /* +1 in case of mbcset->nranges is 0.  */
2735             new_nranges = 2 * mbcset->nranges + 1;
2736             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2737                are NULL if *range_alloc == 0.  */
2738             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2739                                           new_nranges);
2740             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2741                                         new_nranges);
2742
2743             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2744               return REG_ESPACE;
2745
2746             mbcset->range_starts = new_array_start;
2747             mbcset->range_ends = new_array_end;
2748             *range_alloc = new_nranges;
2749           }
2750
2751         mbcset->range_starts[mbcset->nranges] = start_wc;
2752         mbcset->range_ends[mbcset->nranges++] = end_wc;
2753       }
2754
2755     /* Build the table for single byte characters.  */
2756     for (wc = 0; wc < SBC_MAX; ++wc)
2757       {
2758         cmp_buf[2] = wc;
2759         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2760             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2761           bitset_set (sbcset, wc);
2762       }
2763   }
2764 # else /* not RE_ENABLE_I18N */
2765   {
2766     unsigned int ch;
2767     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2768                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2769                    : 0));
2770     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2771               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2772                  : 0));
2773     if (start_ch > end_ch)
2774       return REG_ERANGE;
2775     /* Build the table for single byte characters.  */
2776     for (ch = 0; ch < SBC_MAX; ++ch)
2777       if (start_ch <= ch  && ch <= end_ch)
2778         bitset_set (sbcset, ch);
2779   }
2780 # endif /* not RE_ENABLE_I18N */
2781   return REG_NOERROR;
2782 }
2783 #endif /* not _LIBC */
2784
2785 #ifndef _LIBC
2786 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2787    Build the collating element which is represented by NAME.
2788    The result are written to MBCSET and SBCSET.
2789    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2790    pointer argument since we may update it.  */
2791
2792 static reg_errcode_t
2793 internal_function
2794 # ifdef RE_ENABLE_I18N
2795 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2796                         Idx *coll_sym_alloc, const unsigned char *name)
2797 # else /* not RE_ENABLE_I18N */
2798 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2799 # endif /* not RE_ENABLE_I18N */
2800 {
2801   size_t name_len = strlen ((const char *) name);
2802   if (BE (name_len != 1, 0))
2803     return REG_ECOLLATE;
2804   else
2805     {
2806       bitset_set (sbcset, name[0]);
2807       return REG_NOERROR;
2808     }
2809 }
2810 #endif /* not _LIBC */
2811
2812 /* This function parse bracket expression like "[abc]", "[a-c]",
2813    "[[.a-a.]]" etc.  */
2814
2815 static bin_tree_t *
2816 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2817                    reg_syntax_t syntax, reg_errcode_t *err)
2818 {
2819 #ifdef _LIBC
2820   const unsigned char *collseqmb;
2821   const char *collseqwc;
2822   uint32_t nrules;
2823   int32_t table_size;
2824   const int32_t *symb_table;
2825   const unsigned char *extra;
2826
2827   /* Local function for parse_bracket_exp used in _LIBC environment.
2828      Seek the collating symbol entry corresponding to NAME.
2829      Return the index of the symbol in the SYMB_TABLE.  */
2830
2831   auto inline int32_t
2832   __attribute ((always_inline))
2833   seek_collating_symbol_entry (name, name_len)
2834          const unsigned char *name;
2835          size_t name_len;
2836     {
2837       int32_t hash = elem_hash ((const char *) name, name_len);
2838       int32_t elem = hash % table_size;
2839       if (symb_table[2 * elem] != 0)
2840         {
2841           int32_t second = hash % (table_size - 2) + 1;
2842
2843           do
2844             {
2845               /* First compare the hashing value.  */
2846               if (symb_table[2 * elem] == hash
2847                   /* Compare the length of the name.  */
2848                   && name_len == extra[symb_table[2 * elem + 1]]
2849                   /* Compare the name.  */
2850                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2851                              name_len) == 0)
2852                 {
2853                   /* Yep, this is the entry.  */
2854                   break;
2855                 }
2856
2857               /* Next entry.  */
2858               elem += second;
2859             }
2860           while (symb_table[2 * elem] != 0);
2861         }
2862       return elem;
2863     }
2864
2865   /* Local function for parse_bracket_exp used in _LIBC environment.
2866      Look up the collation sequence value of BR_ELEM.
2867      Return the value if succeeded, UINT_MAX otherwise.  */
2868
2869   auto inline unsigned int
2870   __attribute ((always_inline))
2871   lookup_collation_sequence_value (br_elem)
2872          bracket_elem_t *br_elem;
2873     {
2874       if (br_elem->type == SB_CHAR)
2875         {
2876           /*
2877           if (MB_CUR_MAX == 1)
2878           */
2879           if (nrules == 0)
2880             return collseqmb[br_elem->opr.ch];
2881           else
2882             {
2883               wint_t wc = __btowc (br_elem->opr.ch);
2884               return __collseq_table_lookup (collseqwc, wc);
2885             }
2886         }
2887       else if (br_elem->type == MB_CHAR)
2888         {
2889           if (nrules != 0)
2890             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2891         }
2892       else if (br_elem->type == COLL_SYM)
2893         {
2894           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2895           if (nrules != 0)
2896             {
2897               int32_t elem, idx;
2898               elem = seek_collating_symbol_entry (br_elem->opr.name,
2899                                                   sym_name_len);
2900               if (symb_table[2 * elem] != 0)
2901                 {
2902                   /* We found the entry.  */
2903                   idx = symb_table[2 * elem + 1];
2904                   /* Skip the name of collating element name.  */
2905                   idx += 1 + extra[idx];
2906                   /* Skip the byte sequence of the collating element.  */
2907                   idx += 1 + extra[idx];
2908                   /* Adjust for the alignment.  */
2909                   idx = (idx + 3) & ~3;
2910                   /* Skip the multibyte collation sequence value.  */
2911                   idx += sizeof (unsigned int);
2912                   /* Skip the wide char sequence of the collating element.  */
2913                   idx += sizeof (unsigned int) *
2914                     (1 + *(unsigned int *) (extra + idx));
2915                   /* Return the collation sequence value.  */
2916                   return *(unsigned int *) (extra + idx);
2917                 }
2918               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2919                 {
2920                   /* No valid character.  Match it as a single byte
2921                      character.  */
2922                   return collseqmb[br_elem->opr.name[0]];
2923                 }
2924             }
2925           else if (sym_name_len == 1)
2926             return collseqmb[br_elem->opr.name[0]];
2927         }
2928       return UINT_MAX;
2929     }
2930
2931   /* Local function for parse_bracket_exp used in _LIBC environment.
2932      Build the range expression which starts from START_ELEM, and ends
2933      at END_ELEM.  The result are written to MBCSET and SBCSET.
2934      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2935      mbcset->range_ends, is a pointer argument since we may
2936      update it.  */
2937
2938   auto inline reg_errcode_t
2939   __attribute ((always_inline))
2940   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2941          re_charset_t *mbcset;
2942          Idx *range_alloc;
2943          bitset_t sbcset;
2944          bracket_elem_t *start_elem, *end_elem;
2945     {
2946       unsigned int ch;
2947       uint32_t start_collseq;
2948       uint32_t end_collseq;
2949
2950       /* Equivalence Classes and Character Classes can't be a range
2951          start/end.  */
2952       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2953               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2954               0))
2955         return REG_ERANGE;
2956
2957       start_collseq = lookup_collation_sequence_value (start_elem);
2958       end_collseq = lookup_collation_sequence_value (end_elem);
2959       /* Check start/end collation sequence values.  */
2960       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961         return REG_ECOLLATE;
2962       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963         return REG_ERANGE;
2964
2965       /* Got valid collation sequence values, add them as a new entry.
2966          However, if we have no collation elements, and the character set
2967          is single byte, the single byte character set that we
2968          build below suffices. */
2969       if (nrules > 0 || dfa->mb_cur_max > 1)
2970         {
2971           /* Check the space of the arrays.  */
2972           if (BE (*range_alloc == mbcset->nranges, 0))
2973             {
2974               /* There is not enough space, need realloc.  */
2975               uint32_t *new_array_start;
2976               uint32_t *new_array_end;
2977               Idx new_nranges;
2978
2979               /* +1 in case of mbcset->nranges is 0.  */
2980               new_nranges = 2 * mbcset->nranges + 1;
2981               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982                                             new_nranges);
2983               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2984                                           new_nranges);
2985
2986               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2987                 return REG_ESPACE;
2988
2989               mbcset->range_starts = new_array_start;
2990               mbcset->range_ends = new_array_end;
2991               *range_alloc = new_nranges;
2992             }
2993
2994           mbcset->range_starts[mbcset->nranges] = start_collseq;
2995           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2996         }
2997
2998       /* Build the table for single byte characters.  */
2999       for (ch = 0; ch < SBC_MAX; ch++)
3000         {
3001           uint32_t ch_collseq;
3002           /*
3003           if (MB_CUR_MAX == 1)
3004           */
3005           if (nrules == 0)
3006             ch_collseq = collseqmb[ch];
3007           else
3008             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010             bitset_set (sbcset, ch);
3011         }
3012       return REG_NOERROR;
3013     }
3014
3015   /* Local function for parse_bracket_exp used in _LIBC environment.
3016      Build the collating element which is represented by NAME.
3017      The result are written to MBCSET and SBCSET.
3018      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019      pointer argument since we may update it.  */
3020
3021   auto inline reg_errcode_t
3022   __attribute ((always_inline))
3023   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3024          re_charset_t *mbcset;
3025          Idx *coll_sym_alloc;
3026          bitset_t sbcset;
3027          const unsigned char *name;
3028     {
3029       int32_t elem, idx;
3030       size_t name_len = strlen ((const char *) name);
3031       if (nrules != 0)
3032         {
3033           elem = seek_collating_symbol_entry (name, name_len);
3034           if (symb_table[2 * elem] != 0)
3035             {
3036               /* We found the entry.  */
3037               idx = symb_table[2 * elem + 1];
3038               /* Skip the name of collating element name.  */
3039               idx += 1 + extra[idx];
3040             }
3041           else if (symb_table[2 * elem] == 0 && name_len == 1)
3042             {
3043               /* No valid character, treat it as a normal
3044                  character.  */
3045               bitset_set (sbcset, name[0]);
3046               return REG_NOERROR;
3047             }
3048           else
3049             return REG_ECOLLATE;
3050
3051           /* Got valid collation sequence, add it as a new entry.  */
3052           /* Check the space of the arrays.  */
3053           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3054             {
3055               /* Not enough, realloc it.  */
3056               /* +1 in case of mbcset->ncoll_syms is 0.  */
3057               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3058               /* Use realloc since mbcset->coll_syms is NULL
3059                  if *alloc == 0.  */
3060               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3061                                                    new_coll_sym_alloc);
3062               if (BE (new_coll_syms == NULL, 0))
3063                 return REG_ESPACE;
3064               mbcset->coll_syms = new_coll_syms;
3065               *coll_sym_alloc = new_coll_sym_alloc;
3066             }
3067           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3068           return REG_NOERROR;
3069         }
3070       else
3071         {
3072           if (BE (name_len != 1, 0))
3073             return REG_ECOLLATE;
3074           else
3075             {
3076               bitset_set (sbcset, name[0]);
3077               return REG_NOERROR;
3078             }
3079         }
3080     }
3081 #endif
3082
3083   re_token_t br_token;
3084   re_bitset_ptr_t sbcset;
3085 #ifdef RE_ENABLE_I18N
3086   re_charset_t *mbcset;
3087   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3088   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3089 #endif /* not RE_ENABLE_I18N */
3090   bool non_match = false;
3091   bin_tree_t *work_tree;
3092   int token_len;
3093   bool first_round = true;
3094 #ifdef _LIBC
3095   collseqmb = (const unsigned char *)
3096     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3097   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3098   if (nrules)
3099     {
3100       /*
3101       if (MB_CUR_MAX > 1)
3102       */
3103       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3104       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3105       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3106                                                   _NL_COLLATE_SYMB_TABLEMB);
3107       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3108                                                    _NL_COLLATE_SYMB_EXTRAMB);
3109     }
3110 #endif
3111   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3112 #ifdef RE_ENABLE_I18N
3113   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3114 #endif /* RE_ENABLE_I18N */
3115 #ifdef RE_ENABLE_I18N
3116   if (BE (sbcset == NULL || mbcset == NULL, 0))
3117 #else
3118   if (BE (sbcset == NULL, 0))
3119 #endif /* RE_ENABLE_I18N */
3120     {
3121       re_free (sbcset);
3122 #ifdef RE_ENABLE_I18N
3123       re_free (mbcset);
3124 #endif
3125       *err = REG_ESPACE;
3126       return NULL;
3127     }
3128
3129   token_len = peek_token_bracket (token, regexp, syntax);
3130   if (BE (token->type == END_OF_RE, 0))
3131     {
3132       *err = REG_BADPAT;
3133       goto parse_bracket_exp_free_return;
3134     }
3135   if (token->type == OP_NON_MATCH_LIST)
3136     {
3137 #ifdef RE_ENABLE_I18N
3138       mbcset->non_match = 1;
3139 #endif /* not RE_ENABLE_I18N */
3140       non_match = true;
3141       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3142         bitset_set (sbcset, '\n');
3143       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3144       token_len = peek_token_bracket (token, regexp, syntax);
3145       if (BE (token->type == END_OF_RE, 0))
3146         {
3147           *err = REG_BADPAT;
3148           goto parse_bracket_exp_free_return;
3149         }
3150     }
3151
3152   /* We treat the first ']' as a normal character.  */
3153   if (token->type == OP_CLOSE_BRACKET)
3154     token->type = CHARACTER;
3155
3156   while (1)
3157     {
3158       bracket_elem_t start_elem, end_elem;
3159       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3160       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3161       reg_errcode_t ret;
3162       int token_len2 = 0;
3163       bool is_range_exp = false;
3164       re_token_t token2;
3165
3166       start_elem.opr.name = start_name_buf;
3167       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3168                                    syntax, first_round);
3169       if (BE (ret != REG_NOERROR, 0))
3170         {
3171           *err = ret;
3172           goto parse_bracket_exp_free_return;
3173         }
3174       first_round = false;
3175
3176       /* Get information about the next token.  We need it in any case.  */
3177       token_len = peek_token_bracket (token, regexp, syntax);
3178
3179       /* Do not check for ranges if we know they are not allowed.  */
3180       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3181         {
3182           if (BE (token->type == END_OF_RE, 0))
3183             {
3184               *err = REG_EBRACK;
3185               goto parse_bracket_exp_free_return;
3186             }
3187           if (token->type == OP_CHARSET_RANGE)
3188             {
3189               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3190               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3191               if (BE (token2.type == END_OF_RE, 0))
3192                 {
3193                   *err = REG_EBRACK;
3194                   goto parse_bracket_exp_free_return;
3195                 }
3196               if (token2.type == OP_CLOSE_BRACKET)
3197                 {
3198                   /* We treat the last '-' as a normal character.  */
3199                   re_string_skip_bytes (regexp, -token_len);
3200                   token->type = CHARACTER;
3201                 }
3202               else
3203                 is_range_exp = true;
3204             }
3205         }
3206
3207       if (is_range_exp == true)
3208         {
3209           end_elem.opr.name = end_name_buf;
3210           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3211                                        dfa, syntax, true);
3212           if (BE (ret != REG_NOERROR, 0))
3213             {
3214               *err = ret;
3215               goto parse_bracket_exp_free_return;
3216             }
3217
3218           token_len = peek_token_bracket (token, regexp, syntax);
3219
3220 #ifdef _LIBC
3221           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3222                                   &start_elem, &end_elem);
3223 #else
3224 # ifdef RE_ENABLE_I18N
3225           *err = build_range_exp (syntax, sbcset,
3226                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3227                                   &range_alloc, &start_elem, &end_elem);
3228 # else
3229           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3230 # endif
3231 #endif /* RE_ENABLE_I18N */
3232           if (BE (*err != REG_NOERROR, 0))
3233             goto parse_bracket_exp_free_return;
3234         }
3235       else
3236         {
3237           switch (start_elem.type)
3238             {
3239             case SB_CHAR:
3240               bitset_set (sbcset, start_elem.opr.ch);
3241               break;
3242 #ifdef RE_ENABLE_I18N
3243             case MB_CHAR:
3244               /* Check whether the array has enough space.  */
3245               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3246                 {
3247                   wchar_t *new_mbchars;
3248                   /* Not enough, realloc it.  */
3249                   /* +1 in case of mbcset->nmbchars is 0.  */
3250                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3251                   /* Use realloc since array is NULL if *alloc == 0.  */
3252                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3253                                             mbchar_alloc);
3254                   if (BE (new_mbchars == NULL, 0))
3255                     goto parse_bracket_exp_espace;
3256                   mbcset->mbchars = new_mbchars;
3257                 }
3258               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3259               break;
3260 #endif /* RE_ENABLE_I18N */
3261             case EQUIV_CLASS:
3262               *err = build_equiv_class (sbcset,
3263 #ifdef RE_ENABLE_I18N
3264                                         mbcset, &equiv_class_alloc,
3265 #endif /* RE_ENABLE_I18N */
3266                                         start_elem.opr.name);
3267               if (BE (*err != REG_NOERROR, 0))
3268                 goto parse_bracket_exp_free_return;
3269               break;
3270             case COLL_SYM:
3271               *err = build_collating_symbol (sbcset,
3272 #ifdef RE_ENABLE_I18N
3273                                              mbcset, &coll_sym_alloc,
3274 #endif /* RE_ENABLE_I18N */
3275                                              start_elem.opr.name);
3276               if (BE (*err != REG_NOERROR, 0))
3277                 goto parse_bracket_exp_free_return;
3278               break;
3279             case CHAR_CLASS:
3280               *err = build_charclass (regexp->trans, sbcset,
3281 #ifdef RE_ENABLE_I18N
3282                                       mbcset, &char_class_alloc,
3283 #endif /* RE_ENABLE_I18N */
3284                                       start_elem.opr.name, syntax);
3285               if (BE (*err != REG_NOERROR, 0))
3286                goto parse_bracket_exp_free_return;
3287               break;
3288             default:
3289               assert (0);
3290               break;
3291             }
3292         }
3293       if (BE (token->type == END_OF_RE, 0))
3294         {
3295           *err = REG_EBRACK;
3296           goto parse_bracket_exp_free_return;
3297         }
3298       if (token->type == OP_CLOSE_BRACKET)
3299         break;
3300     }
3301
3302   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3303
3304   /* If it is non-matching list.  */
3305   if (non_match)
3306     bitset_not (sbcset);
3307
3308 #ifdef RE_ENABLE_I18N
3309   /* Ensure only single byte characters are set.  */
3310   if (dfa->mb_cur_max > 1)
3311     bitset_mask (sbcset, dfa->sb_char);
3312
3313   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3314       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3315                                                      || mbcset->non_match)))
3316     {
3317       bin_tree_t *mbc_tree;
3318       int sbc_idx;
3319       /* Build a tree for complex bracket.  */
3320       dfa->has_mb_node = 1;
3321       br_token.type = COMPLEX_BRACKET;
3322       br_token.opr.mbcset = mbcset;
3323       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3324       if (BE (mbc_tree == NULL, 0))
3325         goto parse_bracket_exp_espace;
3326       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3327         if (sbcset[sbc_idx])
3328           break;
3329       /* If there are no bits set in sbcset, there is no point
3330          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3331       if (sbc_idx < BITSET_WORDS)
3332         {
3333           /* Build a tree for simple bracket.  */
3334           br_token.type = SIMPLE_BRACKET;
3335           br_token.opr.sbcset = sbcset;
3336           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337           if (BE (work_tree == NULL, 0))
3338             goto parse_bracket_exp_espace;
3339
3340           /* Then join them by ALT node.  */
3341           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3342           if (BE (work_tree == NULL, 0))
3343             goto parse_bracket_exp_espace;
3344         }
3345       else
3346         {
3347           re_free (sbcset);
3348           work_tree = mbc_tree;
3349         }
3350     }
3351   else
3352 #endif /* not RE_ENABLE_I18N */
3353     {
3354 #ifdef RE_ENABLE_I18N
3355       free_charset (mbcset);
3356 #endif
3357       /* Build a tree for simple bracket.  */
3358       br_token.type = SIMPLE_BRACKET;
3359       br_token.opr.sbcset = sbcset;
3360       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361       if (BE (work_tree == NULL, 0))
3362         goto parse_bracket_exp_espace;
3363     }
3364   return work_tree;
3365
3366  parse_bracket_exp_espace:
3367   *err = REG_ESPACE;
3368  parse_bracket_exp_free_return:
3369   re_free (sbcset);
3370 #ifdef RE_ENABLE_I18N
3371   free_charset (mbcset);
3372 #endif /* RE_ENABLE_I18N */
3373   return NULL;
3374 }
3375
3376 /* Parse an element in the bracket expression.  */
3377
3378 static reg_errcode_t
3379 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3380                        re_token_t *token, int token_len, re_dfa_t *dfa,
3381                        reg_syntax_t syntax, bool accept_hyphen)
3382 {
3383 #ifdef RE_ENABLE_I18N
3384   int cur_char_size;
3385   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3386   if (cur_char_size > 1)
3387     {
3388       elem->type = MB_CHAR;
3389       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3390       re_string_skip_bytes (regexp, cur_char_size);
3391       return REG_NOERROR;
3392     }
3393 #endif /* RE_ENABLE_I18N */
3394   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3395   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3396       || token->type == OP_OPEN_EQUIV_CLASS)
3397     return parse_bracket_symbol (elem, regexp, token);
3398   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3399     {
3400       /* A '-' must only appear as anything but a range indicator before
3401          the closing bracket.  Everything else is an error.  */
3402       re_token_t token2;
3403       (void) peek_token_bracket (&token2, regexp, syntax);
3404       if (token2.type != OP_CLOSE_BRACKET)
3405         /* The actual error value is not standardized since this whole
3406            case is undefined.  But ERANGE makes good sense.  */
3407         return REG_ERANGE;
3408     }
3409   elem->type = SB_CHAR;
3410   elem->opr.ch = token->opr.c;
3411   return REG_NOERROR;
3412 }
3413
3414 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3415    such as [:<character_class>:], [.<collating_element>.], and
3416    [=<equivalent_class>=].  */
3417
3418 static reg_errcode_t
3419 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3420                       re_token_t *token)
3421 {
3422   unsigned char ch, delim = token->opr.c;
3423   int i = 0;
3424   if (re_string_eoi(regexp))
3425     return REG_EBRACK;
3426   for (;; ++i)
3427     {
3428       if (i >= BRACKET_NAME_BUF_SIZE)
3429         return REG_EBRACK;
3430       if (token->type == OP_OPEN_CHAR_CLASS)
3431         ch = re_string_fetch_byte_case (regexp);
3432       else
3433         ch = re_string_fetch_byte (regexp);
3434       if (re_string_eoi(regexp))
3435         return REG_EBRACK;
3436       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3437         break;
3438       elem->opr.name[i] = ch;
3439     }
3440   re_string_skip_bytes (regexp, 1);
3441   elem->opr.name[i] = '\0';
3442   switch (token->type)
3443     {
3444     case OP_OPEN_COLL_ELEM:
3445       elem->type = COLL_SYM;
3446       break;
3447     case OP_OPEN_EQUIV_CLASS:
3448       elem->type = EQUIV_CLASS;
3449       break;
3450     case OP_OPEN_CHAR_CLASS:
3451       elem->type = CHAR_CLASS;
3452       break;
3453     default:
3454       break;
3455     }
3456   return REG_NOERROR;
3457 }
3458
3459   /* Helper function for parse_bracket_exp.
3460      Build the equivalence class which is represented by NAME.
3461      The result are written to MBCSET and SBCSET.
3462      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3463      is a pointer argument since we may update it.  */
3464
3465 static reg_errcode_t
3466 #ifdef RE_ENABLE_I18N
3467 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3468                    Idx *equiv_class_alloc, const unsigned char *name)
3469 #else /* not RE_ENABLE_I18N */
3470 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3471 #endif /* not RE_ENABLE_I18N */
3472 {
3473 #ifdef _LIBC
3474   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3475   if (nrules != 0)
3476     {
3477       const int32_t *table, *indirect;
3478       const unsigned char *weights, *extra, *cp;
3479       unsigned char char_buf[2];
3480       int32_t idx1, idx2;
3481       unsigned int ch;
3482       size_t len;
3483       /* This #include defines a local function!  */
3484 # include <locale/weight.h>
3485       /* Calculate the index for equivalence class.  */
3486       cp = name;
3487       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3488       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489                                                _NL_COLLATE_WEIGHTMB);
3490       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3491                                                    _NL_COLLATE_EXTRAMB);
3492       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3493                                                 _NL_COLLATE_INDIRECTMB);
3494       idx1 = findidx (&cp, -1);
3495       if (BE (idx1 == 0 || *cp != '\0', 0))
3496         /* This isn't a valid character.  */
3497         return REG_ECOLLATE;
3498
3499       /* Build single byte matching table for this equivalence class.  */
3500       len = weights[idx1 & 0xffffff];
3501       for (ch = 0; ch < SBC_MAX; ++ch)
3502         {
3503           char_buf[0] = ch;
3504           cp = char_buf;
3505           idx2 = findidx (&cp, 1);
3506 /*
3507           idx2 = table[ch];
3508 */
3509           if (idx2 == 0)
3510             /* This isn't a valid character.  */
3511             continue;
3512           /* Compare only if the length matches and the collation rule
3513              index is the same.  */
3514           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3515             {
3516               int cnt = 0;
3517
3518               while (cnt <= len &&
3519                      weights[(idx1 & 0xffffff) + 1 + cnt]
3520                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3521                 ++cnt;
3522
3523               if (cnt > len)
3524                 bitset_set (sbcset, ch);
3525             }
3526         }
3527       /* Check whether the array has enough space.  */
3528       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3529         {
3530           /* Not enough, realloc it.  */
3531           /* +1 in case of mbcset->nequiv_classes is 0.  */
3532           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3533           /* Use realloc since the array is NULL if *alloc == 0.  */
3534           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3535                                                    int32_t,
3536                                                    new_equiv_class_alloc);
3537           if (BE (new_equiv_classes == NULL, 0))
3538             return REG_ESPACE;
3539           mbcset->equiv_classes = new_equiv_classes;
3540           *equiv_class_alloc = new_equiv_class_alloc;
3541         }
3542       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3543     }
3544   else
3545 #endif /* _LIBC */
3546     {
3547       if (BE (strlen ((const char *) name) != 1, 0))
3548         return REG_ECOLLATE;
3549       bitset_set (sbcset, *name);
3550     }
3551   return REG_NOERROR;
3552 }
3553
3554   /* Helper function for parse_bracket_exp.
3555      Build the character class which is represented by NAME.
3556      The result are written to MBCSET and SBCSET.
3557      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3558      is a pointer argument since we may update it.  */
3559
3560 static reg_errcode_t
3561 #ifdef RE_ENABLE_I18N
3562 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3563                  re_charset_t *mbcset, Idx *char_class_alloc,
3564                  const unsigned char *class_name, reg_syntax_t syntax)
3565 #else /* not RE_ENABLE_I18N */
3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567                  const unsigned char *class_name, reg_syntax_t syntax)
3568 #endif /* not RE_ENABLE_I18N */
3569 {
3570   int i;
3571   const char *name = (const char *) class_name;
3572
3573   /* In case of REG_ICASE "upper" and "lower" match the both of
3574      upper and lower cases.  */
3575   if ((syntax & RE_ICASE)
3576       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3577     name = "alpha";
3578
3579 #ifdef RE_ENABLE_I18N
3580   /* Check the space of the arrays.  */
3581   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3582     {
3583       /* Not enough, realloc it.  */
3584       /* +1 in case of mbcset->nchar_classes is 0.  */
3585       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3586       /* Use realloc since array is NULL if *alloc == 0.  */
3587       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3588                                                new_char_class_alloc);
3589       if (BE (new_char_classes == NULL, 0))
3590         return REG_ESPACE;
3591       mbcset->char_classes = new_char_classes;
3592       *char_class_alloc = new_char_class_alloc;
3593     }
3594   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3595 #endif /* RE_ENABLE_I18N */
3596
3597 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3598   do {                                          \
3599     if (BE (trans != NULL, 0))                  \
3600       {                                         \
3601         for (i = 0; i < SBC_MAX; ++i)           \
3602           if (ctype_func (i))                   \
3603             bitset_set (sbcset, trans[i]);      \
3604       }                                         \
3605     else                                        \
3606       {                                         \
3607         for (i = 0; i < SBC_MAX; ++i)           \
3608           if (ctype_func (i))                   \
3609             bitset_set (sbcset, i);             \
3610       }                                         \
3611   } while (0)
3612
3613   if (strcmp (name, "alnum") == 0)
3614     BUILD_CHARCLASS_LOOP (isalnum);
3615   else if (strcmp (name, "cntrl") == 0)
3616     BUILD_CHARCLASS_LOOP (iscntrl);
3617   else if (strcmp (name, "lower") == 0)
3618     BUILD_CHARCLASS_LOOP (islower);
3619   else if (strcmp (name, "space") == 0)
3620     BUILD_CHARCLASS_LOOP (isspace);
3621   else if (strcmp (name, "alpha") == 0)
3622     BUILD_CHARCLASS_LOOP (isalpha);
3623   else if (strcmp (name, "digit") == 0)
3624     BUILD_CHARCLASS_LOOP (isdigit);
3625   else if (strcmp (name, "print") == 0)
3626     BUILD_CHARCLASS_LOOP (isprint);
3627   else if (strcmp (name, "upper") == 0)
3628     BUILD_CHARCLASS_LOOP (isupper);
3629   else if (strcmp (name, "blank") == 0)
3630     BUILD_CHARCLASS_LOOP (isblank);
3631   else if (strcmp (name, "graph") == 0)
3632     BUILD_CHARCLASS_LOOP (isgraph);
3633   else if (strcmp (name, "punct") == 0)
3634     BUILD_CHARCLASS_LOOP (ispunct);
3635   else if (strcmp (name, "xdigit") == 0)
3636     BUILD_CHARCLASS_LOOP (isxdigit);
3637   else
3638     return REG_ECTYPE;
3639
3640   return REG_NOERROR;
3641 }
3642
3643 static bin_tree_t *
3644 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3645                     const unsigned char *class_name,
3646                     const unsigned char *extra, bool non_match,
3647                     reg_errcode_t *err)
3648 {
3649   re_bitset_ptr_t sbcset;
3650 #ifdef RE_ENABLE_I18N
3651   re_charset_t *mbcset;
3652   Idx alloc = 0;
3653 #endif /* not RE_ENABLE_I18N */
3654   reg_errcode_t ret;
3655   re_token_t br_token;
3656   bin_tree_t *tree;
3657
3658   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3659 #ifdef RE_ENABLE_I18N
3660   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3661 #endif /* RE_ENABLE_I18N */
3662
3663 #ifdef RE_ENABLE_I18N
3664   if (BE (sbcset == NULL || mbcset == NULL, 0))
3665 #else /* not RE_ENABLE_I18N */
3666   if (BE (sbcset == NULL, 0))
3667 #endif /* not RE_ENABLE_I18N */
3668     {
3669       *err = REG_ESPACE;
3670       return NULL;
3671     }
3672
3673   if (non_match)
3674     {
3675 #ifdef RE_ENABLE_I18N
3676       mbcset->non_match = 1;
3677 #endif /* not RE_ENABLE_I18N */
3678     }
3679
3680   /* We don't care the syntax in this case.  */
3681   ret = build_charclass (trans, sbcset,
3682 #ifdef RE_ENABLE_I18N
3683                          mbcset, &alloc,
3684 #endif /* RE_ENABLE_I18N */
3685                          class_name, 0);
3686
3687   if (BE (ret != REG_NOERROR, 0))
3688     {
3689       re_free (sbcset);
3690 #ifdef RE_ENABLE_I18N
3691       free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3693       *err = ret;
3694       return NULL;
3695     }
3696   /* \w match '_' also.  */
3697   for (; *extra; extra++)
3698     bitset_set (sbcset, *extra);
3699
3700   /* If it is non-matching list.  */
3701   if (non_match)
3702     bitset_not (sbcset);
3703
3704 #ifdef RE_ENABLE_I18N
3705   /* Ensure only single byte characters are set.  */
3706   if (dfa->mb_cur_max > 1)
3707     bitset_mask (sbcset, dfa->sb_char);
3708 #endif
3709
3710   /* Build a tree for simple bracket.  */
3711   br_token.type = SIMPLE_BRACKET;
3712   br_token.opr.sbcset = sbcset;
3713   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714   if (BE (tree == NULL, 0))
3715     goto build_word_op_espace;
3716
3717 #ifdef RE_ENABLE_I18N
3718   if (dfa->mb_cur_max > 1)
3719     {
3720       bin_tree_t *mbc_tree;
3721       /* Build a tree for complex bracket.  */
3722       br_token.type = COMPLEX_BRACKET;
3723       br_token.opr.mbcset = mbcset;
3724       dfa->has_mb_node = 1;
3725       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726       if (BE (mbc_tree == NULL, 0))
3727         goto build_word_op_espace;
3728       /* Then join them by ALT node.  */
3729       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730       if (BE (mbc_tree != NULL, 1))
3731         return tree;
3732     }
3733   else
3734     {
3735       free_charset (mbcset);
3736       return tree;
3737     }
3738 #else /* not RE_ENABLE_I18N */
3739   return tree;
3740 #endif /* not RE_ENABLE_I18N */
3741
3742  build_word_op_espace:
3743   re_free (sbcset);
3744 #ifdef RE_ENABLE_I18N
3745   free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3747   *err = REG_ESPACE;
3748   return NULL;
3749 }
3750
3751 /* This is intended for the expressions like "a{1,3}".
3752    Fetch a number from 'input', and return the number.
3753    Return REG_MISSING if the number field is empty like "{,1}".
3754    Return REG_ERROR if an error occurred.  */
3755
3756 static Idx
3757 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3758 {
3759   Idx num = REG_MISSING;
3760   unsigned char c;
3761   while (1)
3762     {
3763       fetch_token (token, input, syntax);
3764       c = token->opr.c;
3765       if (BE (token->type == END_OF_RE, 0))
3766         return REG_ERROR;
3767       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3768         break;
3769       num = ((token->type != CHARACTER || c < '0' || '9' < c
3770               || num == REG_ERROR)
3771              ? REG_ERROR
3772              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3773       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3774     }
3775   return num;
3776 }
3777 \f
3778 #ifdef RE_ENABLE_I18N
3779 static void
3780 free_charset (re_charset_t *cset)
3781 {
3782   re_free (cset->mbchars);
3783 # ifdef _LIBC
3784   re_free (cset->coll_syms);
3785   re_free (cset->equiv_classes);
3786   re_free (cset->range_starts);
3787   re_free (cset->range_ends);
3788 # endif
3789   re_free (cset->char_classes);
3790   re_free (cset);
3791 }
3792 #endif /* RE_ENABLE_I18N */
3793 \f
3794 /* Functions for binary tree operation.  */
3795
3796 /* Create a tree node.  */
3797
3798 static bin_tree_t *
3799 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800              re_token_type_t type)
3801 {
3802   re_token_t t;
3803   t.type = type;
3804   return create_token_tree (dfa, left, right, &t);
3805 }
3806
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809                    const re_token_t *token)
3810 {
3811   bin_tree_t *tree;
3812   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813     {
3814       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816       if (storage == NULL)
3817         return NULL;
3818       storage->next = dfa->str_tree_storage;
3819       dfa->str_tree_storage = storage;
3820       dfa->str_tree_storage_idx = 0;
3821     }
3822   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824   tree->parent = NULL;
3825   tree->left = left;
3826   tree->right = right;
3827   tree->token = *token;
3828   tree->token.duplicated = 0;
3829   tree->token.opt_subexp = 0;
3830   tree->first = NULL;
3831   tree->next = NULL;
3832   tree->node_idx = REG_MISSING;
3833
3834   if (left != NULL)
3835     left->parent = tree;
3836   if (right != NULL)
3837     right->parent = tree;
3838   return tree;
3839 }
3840
3841 /* Mark the tree SRC as an optional subexpression.
3842    To be called from preorder or postorder.  */
3843
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3846 {
3847   Idx idx = (Idx) (long) extra;
3848   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849     node->token.opt_subexp = 1;
3850
3851   return REG_NOERROR;
3852 }
3853
3854 /* Free the allocated memory inside NODE. */
3855
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861     free_charset (node->opr.mbcset);
3862   else
3863 #endif /* RE_ENABLE_I18N */
3864     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865       re_free (node->opr.sbcset);
3866 }
3867
3868 /* Worker function for tree walking.  Free the allocated memory inside NODE
3869    and its children. */
3870
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874   free_token (&node->token);
3875   return REG_NOERROR;
3876 }
3877
3878
3879 /* Duplicate the node SRC, and return new node.  This is a preorder
3880    visit similar to the one implemented by the generic visitor, but
3881    we need more infrastructure to maintain two parallel trees --- so,
3882    it's easier to duplicate.  */
3883
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886 {
3887   const bin_tree_t *node;
3888   bin_tree_t *dup_root;
3889   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891   for (node = root; ; )
3892     {
3893       /* Create a new tree and link it back to the current parent.  */
3894       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895       if (*p_new == NULL)
3896         return NULL;
3897       (*p_new)->parent = dup_node;
3898       (*p_new)->token.duplicated = 1;
3899       dup_node = *p_new;
3900
3901       /* Go to the left node, or up and to the right.  */
3902       if (node->left)
3903         {
3904           node = node->left;
3905           p_new = &dup_node->left;
3906         }
3907       else
3908         {
3909           const bin_tree_t *prev = NULL;
3910           while (node->right == prev || node->right == NULL)
3911             {
3912               prev = node;
3913               node = node->parent;
3914               dup_node = dup_node->parent;
3915               if (!node)
3916                 return dup_root;
3917             }
3918           node = node->right;
3919           p_new = &dup_node->right;
3920         }
3921     }
3922 }