Update from glibc.

[gnulib.git] / lib / regex.c
diff --git a/lib/regex.c b/lib/regex.c

index d16bd60..74fcf9c 100644 (file)
--- a/lib/regex.c
+++ b/lib/regex.c
@@ -2,25 +2,22 @@
     version 0.12.
     (Implements POSIX draft P1003.2/D11.2, except for some of the
     internationalization features.)
-   Copyright (C) 1993, 94, 95, 96, 97, 98 Free Software Foundation, Inc.
+   Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
  
-   NOTE: The canonical source of this file is maintained with the GNU C Library.
-   Bugs can be reported to bug-glibc@prep.ai.mit.edu.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
  
-   This program is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by the
-   Free Software Foundation; either version 2, or (at your option) any
-   later version.
-
-   This program is distributed in the hope that it will be useful,
+   The GNU C Library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
  
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-   USA.  */
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
  
  /* AIX requires this to be the first thing in the file. */
  #if defined _AIX && !defined REGEX_MALLOC
@@ -49,16 +46,41 @@
  # include <sys/types.h>
  #endif
  
-#define WIDE_CHAR_SUPPORT \
-  defined _LIBC || (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
+#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
  
  /* For platform which support the ISO C amendement 1 functionality we
     support user defined character classes.  */
-#if WIDE_CHAR_SUPPORT
+#if defined _LIBC || WIDE_CHAR_SUPPORT
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
  # include <wchar.h>
  # include <wctype.h>
  #endif
  
+#ifdef _LIBC
+/* We have to keep the namespace clean.  */
+# define regfree(preg) __regfree (preg)
+# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
+# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
+# define regerror(errcode, preg, errbuf, errbuf_size) \
+       __regerror(errcode, preg, errbuf, errbuf_size)
+# define re_set_registers(bu, re, nu, st, en) \
+       __re_set_registers (bu, re, nu, st, en)
+# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
+       __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
+# define re_match(bufp, string, size, pos, regs) \
+       __re_match (bufp, string, size, pos, regs)
+# define re_search(bufp, string, size, startpos, range, regs) \
+       __re_search (bufp, string, size, startpos, range, regs)
+# define re_compile_pattern(pattern, length, bufp) \
+       __re_compile_pattern (pattern, length, bufp)
+# define re_set_syntax(syntax) __re_set_syntax (syntax)
+# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
+       __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
+# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
+
+#define btowc __btowc
+#endif
+
  /* This is for other GNU distributions with internationalized messages.  */
  #if HAVE_LIBINTL_H || defined _LIBC
  # include <libintl.h>
@@ -110,8 +132,12 @@ char *realloc ();
  # ifndef INHIBIT_STRING_HEADER
  #  if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
  #   include <string.h>
-#   if !defined bzero && !defined _LIBC
-#    define bzero(s, n)                (memset (s, '\0', n), (s))
+#   ifndef bzero
+#    ifndef _LIBC
+#     define bzero(s, n)       (memset (s, '\0', n), (s))
+#    else
+#     define bzero(s, n)       __bzero (s, n)
+#    endif
  #   endif
  #  else
  #   include <strings.h>
@@ -181,7 +207,7 @@ init_syntax_once ()
  #endif /* not emacs */
  \f
  /* Get the interface, including the syntax bits.  */
-#include "regex.h"
+#include <regex.h>
  
  /* isalpha etc. are used for the character classes.  */
  #include <ctype.h>
@@ -195,7 +221,8 @@ init_syntax_once ()
     STDC_HEADERS is defined, then autoconf has verified that the ctype
     macros don't need to be guarded with references to isascii. ...
     Defining isascii to 1 should let any compiler worth its salt
-   eliminate the && through constant folding."  */
+   eliminate the && through constant folding."
+   Solaris defines some of these symbols so we must undefine them first.  */
  
  #undef ISASCII
  #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
@@ -227,6 +254,12 @@ init_syntax_once ()
  #define ISUPPER(c) (ISASCII (c) && isupper (c))
  #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
  
+#ifdef _tolower
+# define TOLOWER(c) _tolower(c)
+#else
+# define TOLOWER(c) tolower(c)
+#endif
+
  #ifndef NULL
  # define NULL (void *)0
  #endif
@@ -971,6 +1004,9 @@ re_set_syntax (syntax)
  #endif /* DEBUG */
    return ret;
  }
+#ifdef _LIBC
+weak_alias (__re_set_syntax, re_set_syntax)
+#endif
  \f
  /* This table gives an error message for each of the error codes listed
     in regex.h.  Obviously the order here has to be same as there.
@@ -1688,7 +1724,7 @@ typedef struct
         }                                                               \
      }
  
-#if WIDE_CHAR_SUPPORT
+#if defined _LIBC || WIDE_CHAR_SUPPORT
  /* The GNU C library provides support for user-defined character classes
     and the functions from ISO C amendement 1.  */
  # ifdef CHARCLASS_NAME_MAX
@@ -1699,7 +1735,11 @@ typedef struct
  #  define CHAR_CLASS_MAX_LENGTH 256
  # endif
  
-# define IS_CHAR_CLASS(string) wctype (string)
+# ifdef _LIBC
+#  define IS_CHAR_CLASS(string) __wctype (string)
+# else
+#  define IS_CHAR_CLASS(string) wctype (string)
+# endif
  #else
  # define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */
  
@@ -2176,25 +2216,28 @@ regex_compile (pattern, size, syntax, bufp)
                      for (;;)
                        {
                          PATFETCH (c);
-                        if (c == ':' || c == ']' || p == pend
-                            || c1 == CHAR_CLASS_MAX_LENGTH)
+                        if ((c == ':' && *p == ']') || p == pend)
                            break;
-                        str[c1++] = c;
+                       if (c1 < CHAR_CLASS_MAX_LENGTH)
+                         str[c1++] = c;
+                       else
+                         /* This is in any case an invalid class name.  */
+                         str[0] = '\0';
                        }
                      str[c1] = '\0';
  
-                    /* If isn't a word bracketed by `[:' and:`]':
+                    /* If isn't a word bracketed by `[:' and `:]':
                         undo the ending character, the letters, and leave
                         the leading `:' and `[' (but set bits for them).  */
                      if (c == ':' && *p == ']')
                        {
-#if WIDE_CHAR_SUPPORT
+#if defined _LIBC || WIDE_CHAR_SUPPORT
                          boolean is_lower = STREQ (str, "lower");
                          boolean is_upper = STREQ (str, "upper");
                         wctype_t wt;
                          int ch;
  
-                       wt = wctype (str);
+                       wt = IS_CHAR_CLASS (str);
                         if (wt == 0)
                           FREE_STACK_RETURN (REG_ECTYPE);
  
@@ -2206,8 +2249,13 @@ regex_compile (pattern, size, syntax, bufp)
  
                          for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
                           {
+# ifdef _LIBC
+                           if (__iswctype (__btowc (ch), wt))
+                             SET_LIST_BIT (ch);
+# else
                             if (iswctype (btowc (ch), wt))
                               SET_LIST_BIT (ch);
+# endif
  
                             if (translate && (is_upper || is_lower)
                                 && (ISUPPER (ch) || ISLOWER (ch)))
@@ -2691,7 +2739,7 @@ regex_compile (pattern, size, syntax, bufp)
  
  
              case 'w':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                laststart = b;
                BUF_PUSH (wordchar);
@@ -2699,7 +2747,7 @@ regex_compile (pattern, size, syntax, bufp)
  
  
              case 'W':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                laststart = b;
                BUF_PUSH (notwordchar);
@@ -2707,37 +2755,37 @@ regex_compile (pattern, size, syntax, bufp)
  
  
              case '<':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (wordbeg);
                break;
  
              case '>':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (wordend);
                break;
  
              case 'b':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (wordbound);
                break;
  
              case 'B':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (notwordbound);
                break;
  
              case '`':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (begbuf);
                break;
  
              case '\'':
-             if (re_syntax_options & RE_NO_GNU_OPS)
+             if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
                BUF_PUSH (endbuf);
                break;
@@ -3375,6 +3423,9 @@ re_compile_fastmap (bufp)
    RESET_FAIL_STACK ();
    return 0;
  } /* re_compile_fastmap */
+#ifdef _LIBC
+weak_alias (__re_compile_fastmap, re_compile_fastmap)
+#endif
  \f
  /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
     ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
@@ -3410,6 +3461,9 @@ re_set_registers (bufp, regs, num_regs, starts, ends)
        regs->start = regs->end = (regoff_t *) 0;
      }
  }
+#ifdef _LIBC
+weak_alias (__re_set_registers, re_set_registers)
+#endif
  \f
  /* Searching routines.  */
  
@@ -3426,6 +3480,9 @@ re_search (bufp, string, size, startpos, range, regs)
    return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
                       regs, size);
  }
+#ifdef _LIBC
+weak_alias (__re_search, re_search)
+#endif
  
  
  /* Using the compiled pattern in BUFP->buffer, first tries to match the
@@ -3479,7 +3536,11 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
  
    /* If the search isn't to be a backwards one, don't waste time in a
       search for a pattern that must be anchored.  */
-  if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
+  if (bufp->used > 0 && range > 0
+      && ((re_opcode_t) bufp->buffer[0] == begbuf
+         /* `begline' is like `begbuf' if it cannot match at newlines.  */
+         || ((re_opcode_t) bufp->buffer[0] == begline
+             && !bufp->newline_anchor)))
      {
        if (startpos > 0)
         return -1;
@@ -3582,6 +3643,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
      }
    return -1;
  } /* re_search_2 */
+#ifdef _LIBC
+weak_alias (__re_search_2, re_search_2)
+#endif
  \f
  /* This converts PTR, a pointer into one of the search strings `string1'
     and `string2' into an offset from the beginning of that string.  */
@@ -3683,6 +3747,9 @@ re_match (bufp, string, size, pos, regs)
  # endif
    return result;
  }
+# ifdef _LIBC
+weak_alias (__re_match, re_match)
+# endif
  #endif /* not emacs */
  
  static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
@@ -3728,6 +3795,9 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
  #endif
    return result;
  }
+#ifdef _LIBC
+weak_alias (__re_match_2, re_match_2)
+#endif
  
  /* This is a separate function so that we can force an alloca cleanup
     afterwards.  */
@@ -5421,6 +5491,9 @@ re_compile_pattern (pattern, length, bufp)
      return NULL;
    return gettext (re_error_msgid[(int) ret]);
  }
+#ifdef _LIBC
+weak_alias (__re_compile_pattern, re_compile_pattern)
+#endif
  \f
  /* Entry points compatible with 4.2 BSD regex library.  We don't define
     them unless specifically requested.  */
@@ -5453,12 +5526,12 @@ re_comp (s)
      {
        re_comp_buf.buffer = (unsigned char *) malloc (200);
        if (re_comp_buf.buffer == NULL)
-        return gettext (re_error_msgid[(int) REG_ESPACE]);
+        return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
        re_comp_buf.allocated = 200;
  
        re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
        if (re_comp_buf.fastmap == NULL)
-       return gettext (re_error_msgid[(int) REG_ESPACE]);
+       return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
      }
  
    /* Since `re_exec' always passes NULL for the `regs' argument, we
@@ -5506,7 +5579,8 @@ re_exec (s)
         REG_EXTENDED bit in CFLAGS is set; otherwise, to
         RE_SYNTAX_POSIX_BASIC;
       `newline_anchor' to REG_NEWLINE being set in CFLAGS;
-     `fastmap' and `fastmap_accurate' to zero;
+     `fastmap' to an allocated space for the fastmap;
+     `fastmap_accurate' to zero;
       `re_nsub' to the number of subexpressions in PATTERN.
  
     PATTERN is the address of the pattern string.
@@ -5545,11 +5619,8 @@ regcomp (preg, pattern, cflags)
    preg->allocated = 0;
    preg->used = 0;
  
-  /* Don't bother to use a fastmap when searching.  This simplifies the
-     REG_NEWLINE case: if we used a fastmap, we'd have to put all the
-     characters after newlines into the fastmap.  This way, we just try
-     every character.  */
-  preg->fastmap = 0;
+  /* Try to allocate space for the fastmap.  */
+  preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
  
    if (cflags & REG_ICASE)
      {
@@ -5563,7 +5634,7 @@ regcomp (preg, pattern, cflags)
  
        /* Map uppercase characters to corresponding lowercase ones.  */
        for (i = 0; i < CHAR_SET_SIZE; i++)
-        preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
+        preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
      }
    else
      preg->translate = NULL;
@@ -5589,8 +5660,24 @@ regcomp (preg, pattern, cflags)
       unmatched close-group: both are REG_EPAREN.  */
    if (ret == REG_ERPAREN) ret = REG_EPAREN;
  
+  if (ret == REG_NOERROR && preg->fastmap)
+    {
+      /* Compute the fastmap now, since regexec cannot modify the pattern
+        buffer.  */
+      if (re_compile_fastmap (preg) == -2)
+       {
+         /* Some error occurred while computing the fastmap, just forget
+            about it.  */
+         free (preg->fastmap);
+         preg->fastmap = NULL;
+       }
+    }
+
    return (int) ret;
  }
+#ifdef _LIBC
+weak_alias (__regcomp, regcomp)
+#endif
  
  
  /* regexec searches for a given pattern, specified by PREG, in the
@@ -5634,10 +5721,10 @@ regexec (preg, string, nmatch, pmatch, eflags)
    if (want_reg_info)
      {
        regs.num_regs = nmatch;
-      regs.start = TALLOC (nmatch, regoff_t);
-      regs.end = TALLOC (nmatch, regoff_t);
-      if (regs.start == NULL || regs.end == NULL)
+      regs.start = TALLOC (nmatch * 2, regoff_t);
+      if (regs.start == NULL)
          return (int) REG_NOMATCH;
+      regs.end = regs.start + nmatch;
      }
  
    /* Perform the searching operation.  */
@@ -5661,12 +5748,14 @@ regexec (preg, string, nmatch, pmatch, eflags)
  
        /* If we needed the temporary register info, free the space now.  */
        free (regs.start);
-      free (regs.end);
      }
  
    /* We want zero return to mean success, unlike `re_search'.  */
    return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
  }
+#ifdef _LIBC
+weak_alias (__regexec, regexec)
+#endif
  
  
  /* Returns a message corresponding to an error code, ERRCODE, returned
@@ -5712,6 +5801,9 @@ regerror (errcode, preg, errbuf, errbuf_size)
  
    return msg_size;
  }
+#ifdef _LIBC
+weak_alias (__regerror, regerror)
+#endif
  
  
  /* Free dynamically allocated space used by PREG.  */
@@ -5736,5 +5828,8 @@ regfree (preg)
      free (preg->translate);
    preg->translate = NULL;
  }
+#ifdef _LIBC
+weak_alias (__regfree, regfree)
+#endif
  
  #endif /* not emacs  */