From 8cada094a301d3f78c086ef0291e8ca88cbe7a1d Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sat, 13 Nov 2010 19:43:06 +0100 Subject: [PATCH] unistr/u8-mbtouc: Improve handling of ill-formed UTF-8 input. * lib/unistr/u8-mbtouc.c (u8_mbtouc): For an invalid multibyte character, return the number of bytes that belong together, not always 1. * lib/unistr/u8-mbtouc-unsafe.c (u8_mbtouc_unsafe): Likewise. * lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Likewise. * lib/unistr/u8-mbtouc-unsafe-aux.c (u8_mbtouc_unsafe_aux): Likewise. * lib/unistr/u8-mbsnlen.c (u8_mbsnlen): Use u8_mbtouc to determine the number of bytes of an invalid character. * tests/unistr/test-u8-mbtouc.c (test_safe_function): New function. (main): Invoke it. * tests/unistr/test-u8-mbtouc.h (test_function): Update two test results. * tests/unistr/test-u8-mbsnlen.c (main): Test various kinds of malformed byte sequences. * modules/unistr/u8-mbtouc (configure.ac): Bump version number. * modules/unistr/u8-mbtouc-unsafe (configure.ac): Likewise. * modules/unistr/u8-mbsnlen (configure.ac): Likewise. Reported by Ben Pfaff and Paolo Bonzini. --- ChangeLog | 21 +++++ lib/unistr/u8-mbsnlen.c | 4 +- lib/unistr/u8-mbtouc-aux.c | 164 +++++++++++++++++++++++++--------- lib/unistr/u8-mbtouc-unsafe-aux.c | 180 ++++++++++++++++++++++++++++---------- lib/unistr/u8-mbtouc-unsafe.c | 180 ++++++++++++++++++++++++++++---------- lib/unistr/u8-mbtouc.c | 164 +++++++++++++++++++++++++--------- modules/unistr/u8-mbsnlen | 2 +- modules/unistr/u8-mbtouc | 2 +- modules/unistr/u8-mbtouc-unsafe | 2 +- tests/unistr/test-u8-mbsnlen.c | 59 +++++++++++++ tests/unistr/test-u8-mbtouc.c | 159 +++++++++++++++++++++++++++++++++ tests/unistr/test-u8-mbtouc.h | 4 +- 12 files changed, 765 insertions(+), 176 deletions(-) diff --git a/ChangeLog b/ChangeLog index 95ce4abc1..fe84eade0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,26 @@ 2010-11-13 Bruno Haible + unistr/u8-mbtouc: Improve handling of ill-formed UTF-8 input. 
+ * lib/unistr/u8-mbtouc.c (u8_mbtouc): For an invalid multibyte + character, return the number of bytes that belong together, not always + 1. + * lib/unistr/u8-mbtouc-unsafe.c (u8_mbtouc_unsafe): Likewise. + * lib/unistr/u8-mbtouc-aux.c (u8_mbtouc_aux): Likewise. + * lib/unistr/u8-mbtouc-unsafe-aux.c (u8_mbtouc_unsafe_aux): Likewise. + * lib/unistr/u8-mbsnlen.c (u8_mbsnlen): Use u8_mbtouc to determine the + number of bytes of an invalid character. + * tests/unistr/test-u8-mbtouc.c (test_safe_function): New function. + (main): Invoke it. + * tests/unistr/test-u8-mbtouc.h (test_function): Update two test results. + * tests/unistr/test-u8-mbsnlen.c (main): Test various kinds of + malformed byte sequences. + * modules/unistr/u8-mbtouc (configure.ac): Bump version number. + * modules/unistr/u8-mbtouc-unsafe (configure.ac): Likewise. + * modules/unistr/u8-mbsnlen (configure.ac): Likewise. + Reported by Ben Pfaff and Paolo Bonzini. + +2010-11-13 Bruno Haible + openat: Work around glibc bug with fchownat() and empty file names. * m4/openat.m4 (gl_FUNC_FCHOWNAT_EMPTY_FILENAME_BUG): New macro. (gl_FUNC_FCHOWNAT): Invoke it. 
diff --git a/lib/unistr/u8-mbsnlen.c b/lib/unistr/u8-mbsnlen.c index 9ddc42ea2..f72c91c88 100644 --- a/lib/unistr/u8-mbsnlen.c +++ b/lib/unistr/u8-mbsnlen.c @@ -33,7 +33,9 @@ u8_mbsnlen (const uint8_t *s, size_t n) characters++; if (count == -2) break; - if (count <= 0) + if (count < 0) + count = u8_mbtouc (&uc, s, n); + else if (count == 0) count = 1; s += count; n -= count; diff --git a/lib/unistr/u8-mbtouc-aux.c b/lib/unistr/u8-mbtouc-aux.c index c9975896f..378680285 100644 --- a/lib/unistr/u8-mbtouc-aux.c +++ b/lib/unistr/u8-mbtouc-aux.c @@ -45,21 +45,32 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + return 1; } } else if (c < 0xf0) { if (n >= 3) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; + if ((s[2] ^ 0x80) < 0x40) + { + if ((c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ } @@ -67,26 +78,45 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else + return 2; } } else if (c < 0xf8) { if (n >= 4) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 - && (c >= 0xf1 || s[1] >= 0x90) + if ((s[1] ^ 0x80) < 0x40) + { + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((c >= 0xf1 || s[1] >= 0x90) #if 1 - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) + && (c < 0xf4 || (c == 0xf4 && 
s[1] < 0x90)) #endif - ) - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) (s[3] ^ 0x80); - return 4; + ) + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ } @@ -94,7 +124,12 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else if (n == 2 || (s[2] ^ 0x80) >= 0x40) + return 2; + else + return 3; } } #if 0 @@ -102,16 +137,37 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 5) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (c >= 0xf9 || s[1] >= 0x88)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x03) << 24) - | ((unsigned int) (s[1] ^ 0x80) << 18) - | ((unsigned int) (s[2] ^ 0x80) << 12) - | ((unsigned int) (s[3] ^ 0x80) << 6) - | (unsigned int) (s[4] ^ 0x80); - return 5; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if (c >= 0xf9 || s[1] >= 0x88) + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte 
character */ } @@ -126,18 +182,44 @@ u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 6) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (s[5] ^ 0x80) < 0x40 - && (c >= 0xfd || s[1] >= 0x84)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x01) << 30) - | ((unsigned int) (s[1] ^ 0x80) << 24) - | ((unsigned int) (s[2] ^ 0x80) << 18) - | ((unsigned int) (s[3] ^ 0x80) << 12) - | ((unsigned int) (s[4] ^ 0x80) << 6) - | (unsigned int) (s[5] ^ 0x80); - return 6; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if ((s[5] ^ 0x80) < 0x40) + { + if (c >= 0xfd || s[1] >= 0x84) + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ } diff --git a/lib/unistr/u8-mbtouc-unsafe-aux.c b/lib/unistr/u8-mbtouc-unsafe-aux.c index 47590e392..1965e9636 100644 --- a/lib/unistr/u8-mbtouc-unsafe-aux.c +++ b/lib/unistr/u8-mbtouc-unsafe-aux.c @@ -41,13 +41,15 @@ u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n) | (unsigned int) (s[1] ^ 0x80); return 2; } +#if CONFIG_UNICODE_SAFETY /* invalid multibyte character */ +#endif } else { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + return 1; } } else if (c < 0xf0) @@ -55,23 +57,39 @@ u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 3) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 
0x80) < 0x40 - && (c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; + if ((s[2] ^ 0x80) < 0x40) + { + if ((c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) +#endif + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ +#endif } else { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else + return 2; } } else if (c < 0xf8) @@ -79,28 +97,51 @@ u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 4) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 - && (c >= 0xf1 || s[1] >= 0x90) + if ((s[1] ^ 0x80) < 0x40) + { + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((c >= 0xf1 || s[1] >= 0x90) #if 1 - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) #endif - ) + ) #endif - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) (s[3] ^ 0x80); - return 4; + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ +#endif } else { /* incomplete 
multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else if (n == 2 || (s[2] ^ 0x80) >= 0x40) + return 2; + else + return 3; } } #if 0 @@ -109,19 +150,42 @@ u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 5) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (c >= 0xf9 || s[1] >= 0x88)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x03) << 24) - | ((unsigned int) (s[1] ^ 0x80) << 18) - | ((unsigned int) (s[2] ^ 0x80) << 12) - | ((unsigned int) (s[3] ^ 0x80) << 6) - | (unsigned int) (s[4] ^ 0x80); - return 5; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if (c >= 0xf9 || s[1] >= 0x88) +#endif + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ +#endif } else { @@ -135,21 +199,49 @@ u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 6) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (s[5] ^ 0x80) < 0x40 - && (c >= 0xfd || s[1] >= 0x84)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x01) << 30) - | ((unsigned int) (s[1] ^ 0x80) << 24) - | ((unsigned int) (s[2] ^ 0x80) << 18) - | ((unsigned int) (s[3] ^ 0x80) << 12) - | ((unsigned int) (s[4] ^ 0x80) << 6) - | (unsigned int) (s[5] ^ 0x80); - return 6; + if ((s[2] ^ 0x80) < 0x40) + { + if 
((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if ((s[5] ^ 0x80) < 0x40) + { + if (c >= 0xfd || s[1] >= 0x84) +#endif + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ +#endif } else { diff --git a/lib/unistr/u8-mbtouc-unsafe.c b/lib/unistr/u8-mbtouc-unsafe.c index 41583f96a..16af8f98f 100644 --- a/lib/unistr/u8-mbtouc-unsafe.c +++ b/lib/unistr/u8-mbtouc-unsafe.c @@ -52,13 +52,15 @@ u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) | (unsigned int) (s[1] ^ 0x80); return 2; } +#if CONFIG_UNICODE_SAFETY /* invalid multibyte character */ +#endif } else { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + return 1; } } else if (c < 0xf0) @@ -66,23 +68,39 @@ u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 3) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; + if ((s[2] ^ 0x80) < 0x40) + { + if ((c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) +#endif + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 
0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ +#endif } else { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else + return 2; } } else if (c < 0xf8) @@ -90,28 +108,51 @@ u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 4) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 - && (c >= 0xf1 || s[1] >= 0x90) + if ((s[1] ^ 0x80) < 0x40) + { + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((c >= 0xf1 || s[1] >= 0x90) #if 1 - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) #endif - ) + ) #endif - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) (s[3] ^ 0x80); - return 4; + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ +#endif } else { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else if (n == 2 || (s[2] ^ 0x80) >= 0x40) + return 2; + else + return 3; } } #if 0 @@ -120,19 +161,42 @@ u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 5) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (c >= 0xf9 || s[1] >= 0x88)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x03) << 24) - | ((unsigned int) (s[1] ^ 0x80) << 18) - | 
((unsigned int) (s[2] ^ 0x80) << 12) - | ((unsigned int) (s[3] ^ 0x80) << 6) - | (unsigned int) (s[4] ^ 0x80); - return 5; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if (c >= 0xf9 || s[1] >= 0x88) +#endif + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ +#endif } else { @@ -146,21 +210,49 @@ u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) if (n >= 6) { #if CONFIG_UNICODE_SAFETY - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (s[5] ^ 0x80) < 0x40 - && (c >= 0xfd || s[1] >= 0x84)) -#endif + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x01) << 30) - | ((unsigned int) (s[1] ^ 0x80) << 24) - | ((unsigned int) (s[2] ^ 0x80) << 18) - | ((unsigned int) (s[3] ^ 0x80) << 12) - | ((unsigned int) (s[4] ^ 0x80) << 6) - | (unsigned int) (s[5] ^ 0x80); - return 6; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if ((s[5] ^ 0x80) < 0x40) + { + if (c >= 0xfd || s[1] >= 0x84) +#endif + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + *puc = 0xfffd; + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid 
multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ +#endif } else { diff --git a/lib/unistr/u8-mbtouc.c b/lib/unistr/u8-mbtouc.c index 96cd5b741..2ddd864a0 100644 --- a/lib/unistr/u8-mbtouc.c +++ b/lib/unistr/u8-mbtouc.c @@ -55,21 +55,32 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + return 1; } } else if (c < 0xf0) { if (n >= 3) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (c >= 0xe1 || s[1] >= 0xa0) - && (c != 0xed || s[1] < 0xa0)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x0f) << 12) - | ((unsigned int) (s[1] ^ 0x80) << 6) - | (unsigned int) (s[2] ^ 0x80); - return 3; + if ((s[2] ^ 0x80) < 0x40) + { + if ((c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ } @@ -77,26 +88,45 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else + return 2; } } else if (c < 0xf8) { if (n >= 4) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 - && (c >= 0xf1 || s[1] >= 0x90) + if ((s[1] ^ 0x80) < 0x40) + { + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((c >= 0xf1 || s[1] >= 0x90) #if 1 - && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) #endif - ) - { - *puc = ((unsigned int) (c & 0x07) << 18) - | ((unsigned int) (s[1] ^ 0x80) << 12) - | ((unsigned int) (s[2] ^ 0x80) << 6) - | (unsigned int) 
(s[3] ^ 0x80); - return 4; + ) + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 2; } /* invalid multibyte character */ } @@ -104,7 +134,12 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { /* incomplete multibyte character */ *puc = 0xfffd; - return n; + if (n == 1 || (s[1] ^ 0x80) >= 0x40) + return 1; + else if (n == 2 || (s[2] ^ 0x80) >= 0x40) + return 2; + else + return 3; } } #if 0 @@ -112,16 +147,37 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 5) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 - && (c >= 0xf9 || s[1] >= 0x88)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x03) << 24) - | ((unsigned int) (s[1] ^ 0x80) << 18) - | ((unsigned int) (s[2] ^ 0x80) << 12) - | ((unsigned int) (s[3] ^ 0x80) << 6) - | (unsigned int) (s[4] ^ 0x80); - return 5; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if (c >= 0xf9 || s[1] >= 0x88) + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ } @@ -136,18 +192,44 @@ u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) { if (n >= 6) { - if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 - && (s[3] ^ 0x80) < 0x40 
&& (s[4] ^ 0x80) < 0x40 - && (s[5] ^ 0x80) < 0x40 - && (c >= 0xfd || s[1] >= 0x84)) + if ((s[1] ^ 0x80) < 0x40) { - *puc = ((unsigned int) (c & 0x01) << 30) - | ((unsigned int) (s[1] ^ 0x80) << 24) - | ((unsigned int) (s[2] ^ 0x80) << 18) - | ((unsigned int) (s[3] ^ 0x80) << 12) - | ((unsigned int) (s[4] ^ 0x80) << 6) - | (unsigned int) (s[5] ^ 0x80); - return 6; + if ((s[2] ^ 0x80) < 0x40) + { + if ((s[3] ^ 0x80) < 0x40) + { + if ((s[4] ^ 0x80) < 0x40) + { + if ((s[5] ^ 0x80) < 0x40) + { + if (c >= 0xfd || s[1] >= 0x84) + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 6; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 5; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 4; + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 3; + } + /* invalid multibyte character */ + return 2; } /* invalid multibyte character */ } diff --git a/modules/unistr/u8-mbsnlen b/modules/unistr/u8-mbsnlen index bc5c60cd6..372626a07 100644 --- a/modules/unistr/u8-mbsnlen +++ b/modules/unistr/u8-mbsnlen @@ -9,7 +9,7 @@ unistr/base unistr/u8-mbtoucr configure.ac: -gl_LIBUNISTRING_MODULE([0.9.3], [unistr/u8-mbsnlen]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbsnlen]) Makefile.am: if LIBUNISTRING_COMPILE_UNISTR_U8_MBSNLEN diff --git a/modules/unistr/u8-mbtouc b/modules/unistr/u8-mbtouc index d64acf7d4..7b5d6e56a 100644 --- a/modules/unistr/u8-mbtouc +++ b/modules/unistr/u8-mbtouc @@ -10,7 +10,7 @@ unistr/base configure.ac: gl_MODULE_INDICATOR([unistr/u8-mbtouc]) -gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-mbtouc]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbtouc]) Makefile.am: if LIBUNISTRING_COMPILE_UNISTR_U8_MBTOUC diff --git a/modules/unistr/u8-mbtouc-unsafe 
b/modules/unistr/u8-mbtouc-unsafe index e083a8681..268ab2f1f 100644 --- a/modules/unistr/u8-mbtouc-unsafe +++ b/modules/unistr/u8-mbtouc-unsafe @@ -10,7 +10,7 @@ unistr/base configure.ac: gl_MODULE_INDICATOR([unistr/u8-mbtouc-unsafe]) -gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-mbtouc-unsafe]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mbtouc-unsafe]) Makefile.am: if LIBUNISTRING_COMPILE_UNISTR_U8_MBTOUC_UNSAFE diff --git a/tests/unistr/test-u8-mbsnlen.c b/tests/unistr/test-u8-mbsnlen.c index 8e68550c0..d4bc2760e 100644 --- a/tests/unistr/test-u8-mbsnlen.c +++ b/tests/unistr/test-u8-mbsnlen.c @@ -57,5 +57,64 @@ main () } } + /* Test behaviour required by ISO 10646-1, sections R.7 and 2.3c, namely, + that a "malformed sequence" is interpreted in the same way as + "a character that is outside the adopted subset". + Reference: + Markus Kuhn: UTF-8 decoder capability and stress test + <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> + <http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html> + */ + /* 3.1. Test that each unexpected continuation byte is signalled as a + malformed sequence of its own. */ + { + static const uint8_t input[] = { '"', 0x80, 0xBF, 0x80, 0xBF, '"' }; + ASSERT (u8_mbsnlen (input, 6) == 6); + } + /* 3.2. Lonely start characters. */ + { + ucs4_t c; + uint8_t input[2]; + + for (c = 0xC0; c <= 0xFF; c++) + { + input[0] = c; + input[1] = ' '; + + ASSERT (u8_mbsnlen (input, 2) == 2); + } + } + /* 3.3. Sequences with last continuation byte missing. */ + /* 3.3.1. 2-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xC0, '"' }; + ASSERT (u8_mbsnlen (input, 3) == 3); + } + /* 3.3.6. 2-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xDF, '"' }; + ASSERT (u8_mbsnlen (input, 3) == 3); + } + /* 3.3.2. 3-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + ASSERT (u8_mbsnlen (input, 4) == 3); + } + /* 3.3.7. 3-byte sequence with last byte missing.
*/ + { + static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' }; + ASSERT (u8_mbsnlen (input, 4) == 3); + } + /* 3.3.3. 4-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + ASSERT (u8_mbsnlen (input, 5) == 3); + } + /* 3.3.8. 4-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + ASSERT (u8_mbsnlen (input, 5) == 3); + } + return 0; } diff --git a/tests/unistr/test-u8-mbtouc.c b/tests/unistr/test-u8-mbtouc.c index f6960a680..377d887af 100644 --- a/tests/unistr/test-u8-mbtouc.c +++ b/tests/unistr/test-u8-mbtouc.c @@ -24,10 +24,169 @@ #include "test-u8-mbtouc.h" +static void +test_safe_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) +{ + ucs4_t uc; + int ret; + + /* Test behaviour required by ISO 10646-1, sections R.7 and 2.3c, namely, + that a "malformed sequence" is interpreted in the same way as + "a character that is outside the adopted subset". + Reference: + Markus Kuhn: UTF-8 decoder capability and stress test + <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> + <http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html> + */ + /* 3.1. Test that each unexpected continuation byte is signalled as a + malformed sequence of its own. */ + { + static const uint8_t input[] = { '"', 0x80, 0xBF, 0x80, 0xBF, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 6); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 5); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 4); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 3); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 4, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 5, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.2. Lonely start characters.
*/ + { + ucs4_t c; + uint8_t input[2]; + + for (c = 0xC0; c <= 0xFF; c++) + { + input[0] = c; + input[1] = ' '; + + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + } + } + /* 3.3. Sequences with last continuation byte missing. */ + /* 3.3.1. 2-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xC0, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 3); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.3.6. 2-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xDF, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 3); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 2); + ASSERT (ret == 1); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 2, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.3.2. 3-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xE0, 0x80, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 4); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 3); + ASSERT (ret == 2); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.3.7. 3-byte sequence with last byte missing. 
*/ + { + static const uint8_t input[] = { '"', 0xEF, 0xBF, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 4); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 3); + ASSERT (ret == 2); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 3, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.3.3. 4-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xF0, 0x80, 0x80, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 5); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 4); + ASSERT (ret == 3); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 4, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } + /* 3.3.8. 4-byte sequence with last byte missing. */ + { + static const uint8_t input[] = { '"', 0xF7, 0xBF, 0xBF, '"' }; + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input, 5); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 1, 4); + ASSERT (ret == 3); + ASSERT (uc == 0xFFFD); + uc = 0xBADFACE; + ret = my_u8_mbtouc (&uc, input + 4, 1); + ASSERT (ret == 1); + ASSERT (uc == 0x0022); + } +} + int main () { test_function (u8_mbtouc); + test_safe_function (u8_mbtouc); return 0; } diff --git a/tests/unistr/test-u8-mbtouc.h b/tests/unistr/test-u8-mbtouc.h index bcafb0527..ee5e91ae4 100644 --- a/tests/unistr/test-u8-mbtouc.h +++ b/tests/unistr/test-u8-mbtouc.h @@ -166,14 +166,14 @@ test_function (int (*my_u8_mbtouc) (ucs4_t *, const uint8_t *, size_t)) static const uint8_t input[] = { 0xF3, 0xD0, 0xBF }; uc = 0xBADFACE; ret = my_u8_mbtouc (&uc, input, 3); - ASSERT (ret == 1 || ret == 3); + ASSERT (ret == 1); ASSERT (uc == 0xFFFD); } { static const uint8_t input[] = { 0xF3, 0x8F, 0xD0 }; uc = 0xBADFACE; ret = my_u8_mbtouc (&uc, input, 3); - ASSERT (ret == 1 || ret == 3); + ASSERT (ret == 2); ASSERT (uc == 
0xFFFD); } } -- 2.11.0