X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;ds=sidebyside;f=regex.c;h=c9219c66c83624e8f2273ed11577ef22967a558b;hb=040e6d8836a3ca85e1d2d9b76a2503864e89dc66;hp=450850609a6227f7c40b92dc517fba9658068ae6;hpb=7d4daee6123211176f93eec4b2958a4205a5f784;p=gnulib.git diff --git a/regex.c b/regex.c index 450850609..c9219c66c 100644 --- a/regex.c +++ b/regex.c @@ -2,7 +2,7 @@ 0.12. (Implements POSIX draft P10003.2/D11.2, except for internationalization features.) - Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc. + Copyright (C) 1993, 1994-1998, 1999 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -191,32 +191,25 @@ init_syntax_once () /* Get the interface, including the syntax bits. */ #include "regex.h" -/* Jim Meyering writes: +/* isalpha etc. are used for the character classes. */ +#include - "... Some ctype macros are valid only for character codes that - isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when - using /bin/cc or gcc but without giving an ansi option). So, all - ctype uses should be through macros like ISPRINT... If - STDC_HEADERS is defined, then autoconf has verified that the ctype - macros don't need to be guarded with references to isascii. ... - Defining isascii to 1 should let any compiler worth its salt - eliminate the && through constant folding." */ +#ifdef emacs -#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) -#define ISASCII(c) 1 -#else -#define ISASCII(c) isascii(c) -#endif +/* 1 if C is an ASCII character. */ +#define IS_REAL_ASCII(c) ((c) < 0200) -/* isalpha etc. are used for the character classes. */ -#include +/* 1 if C is a unibyte character. */ +#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c))) -/* In Emacs, these are only used for single-byte characters. */ -#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) -#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) -#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) +/* The Emacs definitions should not be directly affected by locales. */ -#ifdef emacs +/* In Emacs, these are only used for single-byte characters. */ +#define ISDIGIT(c) ((c) >= '0' && (c) <= '9') +#define ISCNTRL(c) ((c) < ' ') +#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \ + || ((c) >= 'a' && (c) <= 'f') \ + || ((c) >= 'A' && (c) <= 'F')) /* This is only used for single-byte characters. */ #define ISBLANK(c) ((c) == ' ' || (c) == '\t') @@ -224,25 +217,31 @@ init_syntax_once () /* The rest must handle multibyte characters. */ #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ - ? ISASCII (c) && isprint (c) && !isspace (c) \ + ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ - ? ISASCII (c) && isalnum (c) \ + ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) -#define ISALNUM(c) (SINGLE_BYTE_CHAR_P (c) \ - ? ISASCII (c) && isalnum (c) \ +#define ISALNUM(c) (IS_REAL_ASCII (c) \ + ? (((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z') \ + || ((c) >= '0' && (c) <= '9')) \ : SYNTAX (c) == Sword) -#define ISALPHA(c) (SINGLE_BYTE_CHAR_P (c) \ - ? ISASCII (c) && isalpha (c) \ +#define ISALPHA(c) (IS_REAL_ASCII (c) \ + ? (((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z')) \ : SYNTAX (c) == Sword) #define ISLOWER(c) (LOWERCASEP (c)) -#define ISPUNCT(c) (SINGLE_BYTE_CHAR_P (c) \ - ? ISASCII (c) && ispunct (c) \ +#define ISPUNCT(c) (IS_REAL_ASCII (c) \ + ? ((c) > ' ' && (c) < 0177 \ + && !(((c) >= 'a' && (c) <= 'z') \ + || ((c) >= 'A' && (c) <= 'Z') \ + || ((c) >= '0' && (c) <= '9'))) \ : SYNTAX (c) != Sword) #define ISSPACE(c) (SYNTAX (c) == Swhitespace) @@ -253,6 +252,33 @@ init_syntax_once () #else /* not emacs */ +/* Jim Meyering writes: + + "... Some ctype macros are valid only for character codes that + isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + using /bin/cc or gcc but without giving an ansi option). So, all + ctype uses should be through macros like ISPRINT... If + STDC_HEADERS is defined, then autoconf has verified that the ctype + macros don't need to be guarded with references to isascii. ... + Defining isascii to 1 should let any compiler worth its salt + eliminate the && through constant folding." */ + +#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) +#define ISASCII(c) 1 +#else +#define ISASCII(c) isascii(c) +#endif + +/* 1 if C is an ASCII character. */ +#define IS_REAL_ASCII(c) ((c) < 0200) + +/* This distinction is not meaningful, except in Emacs. */ +#define ISUNIBYTE(c) 1 + +#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) +#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) +#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) + #ifdef isblank #define ISBLANK(c) (ISASCII (c) && isblank (c)) #else @@ -1486,8 +1512,8 @@ typedef struct \ assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ \ - DEBUG_POP (&failure_id); \ - DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + DEBUG_POP (&failure_id.integer); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id.integer); \ \ /* If the saved string location is NULL, it came from an \ on_failure_keep_string_jump opcode, and we want to throw away the \ @@ -1809,12 +1835,16 @@ struct range_table_work_area #define BIT_ALNUM 0x1 #define BIT_ALPHA 0x2 #define BIT_WORD 0x4 +#define BIT_ASCII 0x8 +#define BIT_NONASCII 0x10 #define BIT_GRAPH 0x20 #define BIT_LOWER 0x40 #define BIT_PRINT 0x80 #define BIT_PUNCT 0x100 #define BIT_SPACE 0x200 #define BIT_UPPER 0x400 +#define BIT_UNIBYTE 0x800 +#define BIT_MULTIBYTE 0x1000 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ @@ -1869,7 +1899,9 @@ struct range_table_work_area || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank") \ - || STREQ (string, "word")) + || STREQ (string, "word") \ + || STREQ (string, "ascii") || STREQ (string, "nonascii") \ + || STREQ (string, "unibyte") || STREQ (string, "multibyte")) #ifndef MATCH_MAY_ALLOCATE @@ -2136,6 +2168,7 @@ regex_compile (pattern, size, syntax, bufp) /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; + char greedy = 1; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine @@ -2144,8 +2177,14 @@ regex_compile (pattern, size, syntax, bufp) for (;;) { - zero_times_ok |= c != '+'; - many_times_ok |= c != '?'; + if (!(syntax & RE_ALL_GREEDY) + && c == '?' && (zero_times_ok || many_times_ok)) + greedy = 0; + else + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + } if (p == pend) break; @@ -2186,6 +2225,8 @@ regex_compile (pattern, size, syntax, bufp) /* Now we know whether or not zero matches is allowed and also whether or not two or more matches is allowed. */ + if (greedy) + { if (many_times_ok) { /* More than one repetition is allowed, so put in at the end a backward relative jump from `b' to before the next @@ -2244,7 +2285,39 @@ regex_compile (pattern, size, syntax, bufp) INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); b += 3; } - } + + } + else /* not greedy */ + { /* I wish the greedy and non-greedy cases could be merged. */ + + if (many_times_ok) + { + /* The greedy multiple match looks like a repeat..until: + we only need a conditional jump at the end of the loop */ + GET_BUFFER_SPACE (3); + STORE_JUMP (on_failure_jump, b, laststart); + b += 3; + if (zero_times_ok) + { + /* The repeat...until naturally matches one or more. + To also match zero times, we need to first jump to + the end of the loop (its conditional jump). */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b); + b += 3; + } + } + else + { + /* non-greedy a?? */ + GET_BUFFER_SPACE (6); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + INSERT_JUMP (on_failure_jump, laststart, laststart + 6); + b += 3; + } + } + } break; @@ -2360,17 +2433,21 @@ regex_compile (pattern, size, syntax, bufp) int ch; boolean is_alnum = STREQ (str, "alnum"); boolean is_alpha = STREQ (str, "alpha"); + boolean is_ascii = STREQ (str, "ascii"); boolean is_blank = STREQ (str, "blank"); boolean is_cntrl = STREQ (str, "cntrl"); boolean is_digit = STREQ (str, "digit"); boolean is_graph = STREQ (str, "graph"); boolean is_lower = STREQ (str, "lower"); + boolean is_multibyte = STREQ (str, "multibyte"); + boolean is_nonascii = STREQ (str, "nonascii"); boolean is_print = STREQ (str, "print"); boolean is_punct = STREQ (str, "punct"); boolean is_space = STREQ (str, "space"); + boolean is_unibyte = STREQ (str, "unibyte"); boolean is_upper = STREQ (str, "upper"); - boolean is_xdigit = STREQ (str, "xdigit"); boolean is_word = STREQ (str, "word"); + boolean is_xdigit = STREQ (str, "xdigit"); if (!IS_CHAR_CLASS (str)) FREE_STACK_RETURN (REG_ECTYPE); @@ -2393,11 +2470,15 @@ regex_compile (pattern, size, syntax, bufp) if (is_alnum) bit = BIT_ALNUM; if (is_alpha) bit = BIT_ALPHA; + if (is_ascii) bit = BIT_ASCII; if (is_graph) bit = BIT_GRAPH; if (is_lower) bit = BIT_LOWER; + if (is_multibyte) bit = BIT_MULTIBYTE; + if (is_nonascii) bit = BIT_NONASCII; if (is_print) bit = BIT_PRINT; if (is_punct) bit = BIT_PUNCT; if (is_space) bit = BIT_SPACE; + if (is_unibyte) bit = BIT_UNIBYTE; if (is_upper) bit = BIT_UPPER; if (is_word) bit = BIT_WORD; if (bit) @@ -2426,6 +2507,12 @@ regex_compile (pattern, size, syntax, bufp) || (is_upper && ISUPPER (ch)) || (is_xdigit && ISXDIGIT (ch))) SET_LIST_BIT (translated); + if ( (is_ascii && IS_REAL_ASCII (ch)) + || (is_nonascii && !IS_REAL_ASCII (ch)) + || (is_unibyte && ISUNIBYTE (ch)) + || (is_multibyte && !ISUNIBYTE (ch))) + SET_LIST_BIT (translated); + if ( (is_word && ISWORD (ch))) SET_LIST_BIT (translated); } @@ -3064,8 +3151,8 @@ regex_compile (pattern, size, syntax, bufp) #ifdef emacs if (! SINGLE_BYTE_CHAR_P (c)) { - unsigned char work[4], *str; - int i = CHAR_STRING (c, work, str); + unsigned char str[MAX_MULTIBYTE_LENGTH]; + int i = CHAR_STRING (c, str); int j; for (j = 0; j < i; j++) { @@ -3434,7 +3521,7 @@ re_compile_fastmap (bufp) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) fastmap[j] = 1; - /* If we can match a syntax class, we can match + /* If we can match a character class, we can match any character set. */ if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0) @@ -3450,8 +3537,7 @@ re_compile_fastmap (bufp) /* Make P points the range table. */ p += CHARSET_BITMAP_SIZE (&p[-2]); - /* Extract the number of ranges in range table into - COUNT. */ + /* Extract the number of ranges in range table into COUNT. */ EXTRACT_NUMBER_AND_INCR (count, p); for (; count > 0; count--, p += 2 * 3) /* XXX */ { @@ -4802,11 +4888,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if ( (class_bits & BIT_ALNUM && ISALNUM (c)) | (class_bits & BIT_ALPHA && ISALPHA (c)) + | (class_bits & BIT_ASCII && IS_REAL_ASCII (c)) | (class_bits & BIT_GRAPH && ISGRAPH (c)) | (class_bits & BIT_LOWER && ISLOWER (c)) + | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c)) + | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c)) | (class_bits & BIT_PRINT && ISPRINT (c)) | (class_bits & BIT_PUNCT && ISPUNCT (c)) | (class_bits & BIT_SPACE && ISSPACE (c)) + | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c)) | (class_bits & BIT_UPPER && ISUPPER (c)) | (class_bits & BIT_WORD && ISWORD (c))) not = !not;