From fbfced515f325dd000c80fa35ade86e567d0ea3c Mon Sep 17 00:00:00 2001
From: Bruno Haible
Date: Tue, 8 Mar 2011 10:09:47 +0100
Subject: [PATCH] regex-quote: New API.

* lib/regex-quote.h: Include <stdbool.h>.
(struct regex_quote_spec): New type.
(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
New declarations.
(regex_quote_length, regex_quote_copy, regex_quote): Take a
'const struct regex_quote_spec *' argument.
* lib/regex-quote.c (RE_*, PCRE_*): New macros.
(pcre_special): New constant.
(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
New functions.
(regex_quote_length, regex_quote_copy, regex_quote): Take a
'const struct regex_quote_spec *' argument.
* modules/regex-quote (Depends-on): Add stdbool.
* tests/test-regex-quote.c (check): Update for new API. Add test for
anchored results.
* NEWS: Mention the API change.
Reported by Reuben Thomas and Eric Blake.
---
 ChangeLog                |  21 ++++++
 NEWS                     |   4 ++
 lib/regex-quote.c        | 182 ++++++++++++++++++++++++++++++++++++++++-------
 lib/regex-quote.h        |  71 ++++++++++++++----
 modules/regex-quote      |   1 +
 tests/test-regex-quote.c |  25 ++++++-
 6 files changed, 263 insertions(+), 41 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 6ceb41c1f..d93ea6928 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2011-03-08  Bruno Haible
+
+	regex-quote: New API.
+	* lib/regex-quote.h: Include <stdbool.h>.
+	(struct regex_quote_spec): New type.
+	(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
+	New declarations.
+	(regex_quote_length, regex_quote_copy, regex_quote): Take a
+	'const struct regex_quote_spec *' argument.
+	* lib/regex-quote.c (RE_*, PCRE_*): New macros.
+	(pcre_special): New constant.
+	(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
+	New functions.
+	(regex_quote_length, regex_quote_copy, regex_quote): Take a
+	'const struct regex_quote_spec *' argument.
+	* modules/regex-quote (Depends-on): Add stdbool.
+	* tests/test-regex-quote.c (check): Update for new API. Add test for
+	anchored results.
+	* NEWS: Mention the API change.
+	Reported by Reuben Thomas and Eric Blake.
+
 2011-03-06  Bruno Haible
 
 	regex-quote: Fix creation of POSIX extended regular expressions.
diff --git a/NEWS b/NEWS
index 9a65c6af0..767d0ab01 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,10 @@ User visible incompatible changes
 
 Date        Modules         Changes
 
+2011-03-08  regex-quote     The last argument is no longer an 'int cflags'
+                            but instead a pointer to a previously constructed
+                            'struct regex_quote_spec'.
+
 2011-02-25  dirname         These modules no longer put #defines for the
             dirname-lgpl    following symbols into <config.h>: ISSLASH,
             backupfile      FILE_SYSTEM_ACCEPTS_DRIVE_LETTER_PREFIX,
diff --git a/lib/regex-quote.c b/lib/regex-quote.c
index 361cff077..8b4cdb78b 100644
--- a/lib/regex-quote.c
+++ b/lib/regex-quote.c
@@ -31,56 +31,186 @@ static const char bre_special[] = "$^.*[]\\";
 /* Characters that are special in an ERE.  */
 static const char ere_special[] = "$^.*[]\\+?{}()|";
 
+struct regex_quote_spec
+regex_quote_spec_posix (int cflags, bool anchored)
+{
+  struct regex_quote_spec result;
+
+  strcpy (result.special, cflags != 0 ? ere_special : bre_special);
+  result.multibyte = true;
+  result.anchored = anchored;
+
+  return result;
+}
+
+/* Syntax bit values, defined in GNU <regex.h>.  We don't include it here,
+   otherwise this module would need to depend on gnulib module 'regex'.  */
+#define RE_BK_PLUS_QM    0x00000002
+#define RE_INTERVALS     0x00000200
+#define RE_LIMITED_OPS   0x00000400
+#define RE_NEWLINE_ALT   0x00000800
+#define RE_NO_BK_BRACES  0x00001000
+#define RE_NO_BK_PARENS  0x00002000
+#define RE_NO_BK_VBAR    0x00008000
+
+struct regex_quote_spec
+regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored)
+{
+  struct regex_quote_spec result;
+  char *p;
+
+  p = result.special;
+  memcpy (p, bre_special, sizeof (bre_special) - 1);
+  p += sizeof (bre_special) - 1;
+  if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_BK_PLUS_QM) == 0)
+    {
+      *p++ = '+';
+      *p++ = '?';
+    }
+  if ((syntax & RE_INTERVALS) != 0 && (syntax & RE_NO_BK_BRACES) != 0)
+    {
+      *p++ = '{';
+      *p++ = '}';
+    }
+  if ((syntax & RE_NO_BK_PARENS) != 0)
+    {
+      *p++ = '(';
+      *p++ = ')';
+    }
+  if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_NO_BK_VBAR) != 0)
+    *p++ = '|';
+  if ((syntax & RE_NEWLINE_ALT) != 0)
+    *p++ = '\n';
+  *p = '\0';
+
+  result.multibyte = true;
+  result.anchored = anchored;
+
+  return result;
+}
+
+/* Characters that are special in a PCRE.  */
+static const char pcre_special[] = "$^.*[]\\+?{}()|";
+
+/* Options bit values, defined in <pcre.h>.  We don't include it here, because
+   it is not a standard header.  */
+#define PCRE_ANCHORED 0x00000010
+#define PCRE_EXTENDED 0x00000008
+
+struct regex_quote_spec
+regex_quote_spec_pcre (int options, bool anchored)
+{
+  struct regex_quote_spec result;
+  char *p;
+
+  p = result.special;
+  memcpy (p, pcre_special, sizeof (pcre_special) - 1);
+  p += sizeof (pcre_special) - 1;
+  if (options & PCRE_EXTENDED)
+    {
+      *p++ = ' ';
+      *p++ = '\t';
+      *p++ = '\n';
+      *p++ = '\v';
+      *p++ = '\f';
+      *p++ = '\r';
+      *p++ = '#';
+    }
+  *p = '\0';
+
+  /* PCRE regular expressions consist of UTF-8 characters if options contains
+     PCRE_UTF8 and of single bytes otherwise.  */
+  result.multibyte = false;
+  /* If options contains PCRE_ANCHORED, the anchoring is implicit.  */
+  result.anchored = (options & PCRE_ANCHORED ? 0 : anchored);
+
+  return result;
+}
+
 size_t
-regex_quote_length (const char *string, int cflags)
+regex_quote_length (const char *string, const struct regex_quote_spec *spec)
 {
-  const char *special = (cflags != 0 ? ere_special : bre_special);
+  const char *special = spec->special;
   size_t length;
-  mbui_iterator_t iter;
 
   length = 0;
-  for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+  if (spec->anchored)
+    length += 2; /* for '^' at the beginning and '$' at the end */
+  if (spec->multibyte)
+    {
+      mbui_iterator_t iter;
+
+      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+        {
+          /* We know that special contains only ASCII characters.  */
+          if (mb_len (mbui_cur (iter)) == 1
+              && strchr (special, * mbui_cur_ptr (iter)))
+            length += 1;
+          length += mb_len (mbui_cur (iter));
+        }
+    }
+  else
     {
-      /* We know that special contains only ASCII characters.  */
-      if (mb_len (mbui_cur (iter)) == 1
-          && strchr (special, * mbui_cur_ptr (iter)))
-        length += 1;
-      length += mb_len (mbui_cur (iter));
+      const char *iter;
+
+      for (iter = string; *iter != '\0'; iter++)
+        {
+          if (strchr (special, *iter))
+            length += 1;
+          length += 1;
+        }
     }
+
   return length;
 }
 
-/* Copies the quoted string to p and returns the incremented p.
-   There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
- */
 char *
-regex_quote_copy (char *p, const char *string, int cflags)
+regex_quote_copy (char *p, const char *string, const struct regex_quote_spec *spec)
 {
-  const char *special = (cflags != 0 ? ere_special : bre_special);
-  mbui_iterator_t iter;
+  const char *special = spec->special;
 
-  for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+  if (spec->anchored)
+    *p++ = '^';
+  if (spec->multibyte)
     {
-      /* We know that special contains only ASCII characters.  */
-      if (mb_len (mbui_cur (iter)) == 1
-          && strchr (special, * mbui_cur_ptr (iter)))
-        *p++ = '\\';
-      memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
-      p += mb_len (mbui_cur (iter));
+      mbui_iterator_t iter;
+
+      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+        {
+          /* We know that special contains only ASCII characters.  */
+          if (mb_len (mbui_cur (iter)) == 1
+              && strchr (special, * mbui_cur_ptr (iter)))
+            *p++ = '\\';
+          memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
+          p += mb_len (mbui_cur (iter));
+        }
     }
+  else
+    {
+      const char *iter;
+
+      for (iter = string; *iter != '\0'; iter++)
+        {
+          if (strchr (special, *iter))
+            *p++ = '\\';
+          *p++ = *iter;
+        }
+    }
+  if (spec->anchored)
+    *p++ = '$';
+
   return p;
 }
 
-/* Returns the freshly allocated quoted string.  */
 char *
-regex_quote (const char *string, int cflags)
+regex_quote (const char *string, const struct regex_quote_spec *spec)
 {
-  size_t length = regex_quote_length (string, cflags);
+  size_t length = regex_quote_length (string, spec);
   char *result = XNMALLOC (length + 1, char);
   char *p;
 
   p = result;
-  p = regex_quote_copy (p, string, cflags);
+  p = regex_quote_copy (p, string, spec);
   *p = '\0';
   return result;
 }
diff --git a/lib/regex-quote.h b/lib/regex-quote.h
index e1e2a64ac..402dfaa09 100644
--- a/lib/regex-quote.h
+++ b/lib/regex-quote.h
@@ -15,27 +15,74 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
+#ifndef _REGEX_QUOTE_H
+#define _REGEX_QUOTE_H
+
 #include <stddef.h>
+#include <stdbool.h>
+
+
+/* Specifies a quotation task for converting a fixed string to a regular
+   expression pattern.  */
+struct regex_quote_spec
+{
+  /* True if the regular expression pattern consists of multibyte characters
+     (in the encoding given by the LC_CTYPE category of the locale),
+     false if it consists of single bytes or UTF-8 characters.  */
+  unsigned int /*bool*/ multibyte : 1;
+  /* True if the regular expression pattern shall match only entire lines.  */
+  unsigned int /*bool*/ anchored : 1;
+  /* Set of characters that need to be escaped (all ASCII), as a
+     NUL-terminated string.  */
+  char special[30 + 1];
+};
 
-/* regex_quote converts a literal string to a regular expression that will
-   look for this literal string.
-   cflags can be 0 or REG_EXTENDED.
+
+/* Creates a quotation task that produces a POSIX regular expression, that is,
+   a pattern that can be compiled with regcomp().
+   CFLAGS can be 0 or REG_EXTENDED.
    If it is 0, the result is a Basic Regular Expression (BRE)
    <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>.
    If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE)
    <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>.
-   The result is not anchored; if you want it to match only complete lines,
-   you need to add "^" at the beginning of the result and "$" at the end of the
-   result.
- */
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_posix (int cflags, bool anchored);
+
+/* Creates a quotation task that produces a regular expression that can be
+   compiled with the GNU API function re_compile_pattern().
+   SYNTAX describes the syntax of the regular expression (such as
+   RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_POSIX_EXTENDED, RE_SYNTAX_EMACS, all
+   defined in <regex.h>).  It must be the same value as 're_syntax_options'
+   at the moment of the re_compile_pattern() call.
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored);
+
+/* Creates a quotation task that produces a PCRE regular expression, that is,
+   a pattern that can be compiled with pcre_compile().
+   OPTIONS is the same value as the second argument passed to pcre_compile().
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_pcre (int options, bool anchored);
+
 
 /* Returns the number of bytes needed for the quoted string.  */
-extern size_t regex_quote_length (const char *string, int cflags);
+extern size_t
+       regex_quote_length (const char *string, const struct regex_quote_spec *spec);
 
 /* Copies the quoted string to p and returns the incremented p.
-   There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
- */
-extern char * regex_quote_copy (char *p, const char *string, int cflags);
+   There must be room for regex_quote_length (string, spec) + 1 bytes at p.  */
+extern char *
+       regex_quote_copy (char *p,
+                         const char *string, const struct regex_quote_spec *spec);
 
 /* Returns the freshly allocated quoted string.  */
-extern char * regex_quote (const char *string, int cflags);
+extern char *
+       regex_quote (const char *string, const struct regex_quote_spec *spec);
+
+
+#endif /* _REGEX_QUOTE_H */
diff --git a/modules/regex-quote b/modules/regex-quote
index 2ca57b004..0f008cf3d 100644
--- a/modules/regex-quote
+++ b/modules/regex-quote
@@ -6,6 +6,7 @@ lib/regex-quote.h
 lib/regex-quote.c
 
 Depends-on:
+stdbool
 xalloc
 mbuiter
 
diff --git a/tests/test-regex-quote.c b/tests/test-regex-quote.c
index 7f1e8f085..02728f934 100644
--- a/tests/test-regex-quote.c
+++ b/tests/test-regex-quote.c
@@ -29,18 +29,37 @@
 static void
 check (const char *literal, int cflags, const char *expected)
 {
+  struct regex_quote_spec spec;
   char *result;
   size_t length;
 
-  result = regex_quote (literal, cflags);
+  spec = regex_quote_spec_posix (cflags, false);
+  result = regex_quote (literal, &spec);
   ASSERT (strcmp (result, expected) == 0);
-  length = regex_quote_length (literal, cflags);
+  length = regex_quote_length (literal, &spec);
   ASSERT (length == strlen (result));
   free (result);
 
   result = (char *) xmalloc (1 + length + 1 + 1);
   result[0] = '^';
-  strcpy (regex_quote_copy (result + 1, literal, cflags), "$");
+  strcpy (regex_quote_copy (result + 1, literal, &spec), "$");
+  {
+    regex_t regex;
+    regmatch_t match[1];
+
+    ASSERT (regcomp (&regex, result, cflags) == 0);
+
+    ASSERT (regexec (&regex, literal, 1, match, 0) == 0);
+    ASSERT (match[0].rm_so == 0);
+    ASSERT (match[0].rm_eo == strlen (literal));
+    regfree (&regex);
+  }
+  free (result);
+
+  spec = regex_quote_spec_posix (cflags, true);
+  result = regex_quote (literal, &spec);
+  length = regex_quote_length (literal, &spec);
+  ASSERT (length == strlen (result));
   {
     regex_t regex;
     regmatch_t match[1];
-- 
2.11.0