lib/unistr/u8-chr.c

   1 /* Search character in piece of UTF-8 string.
   2    Copyright (C) 1999, 2002, 2006-2007, 2009-2010 Free Software Foundation,
   3    Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2002.
   5
   6    This program is free software: you can redistribute it and/or modify it
   7    under the terms of the GNU Lesser General Public License as published
   8    by the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #include <config.h>
  20
  21 /* Specification.  */
  22 #include "unistr.h"
  23
  24 uint8_t *
  25 u8_chr (const uint8_t *s, size_t n, ucs4_t uc)
  26 {
  27   if (uc < 0x80)
  28     {
  29       uint8_t c0 = uc;
  30
  31       return (uint8_t *) memchr ((const char *) s, c0, n);
  32     }
  33
  34   {
  35     uint8_t c[6];
  36     size_t uc_size;
  37     uc_size = u8_uctomb_aux (c, uc, 6);
  38
  39     if (n < uc_size)
  40       return NULL;
  41
  42     /* For multibyte character matching we use a Boyer-Moore like
  43        algorithm that searches for the last byte, skipping multi-byte
  44        jumps, and matches back from there.
  45
  46        Instead of using a table as is usual for Boyer-Moore, we compare
  47        the candidate last byte s[UC_SIZE-1] with each of the possible
  48        bytes in the UTF-8 representation of UC.  If the final byte does
  49        not match, we will perform up to UC_SIZE comparisons per memory
  50        load---but each comparison lets us skip one byte in the input!
  51
  52        If the final byte matches, the "real" Boyer-Moore algorithm
  53        is approximated.  Instead, u8_chr just looks for other cN that
  54        are equal to the final byte and uses those to try realigning to
  55        another possible match.  For example, when searching for 0xF0
  56        0xAA 0xBB 0xAA it will always skip forward by two bytes, even if
  57        the character in the string was for example 0xF1 0xAA 0xBB 0xAA.
  58        The advantage of this scheme is that the skip count after a failed
  59        match can be computed outside the loop, and that it keeps the
  60        complexity low for a pretty rare case.  In particular, since c[0]
  61        is never between 0x80 and 0xBF, c[0] is never equal to c[UC_SIZE-1]
  62        and this is optimal for two-byte UTF-8 characters.  */
  63     switch (uc_size)
  64       {
  65       case 2:
  66         {
  67           uint8_t c0 = c[0];
  68           uint8_t c1 = c[1];
  69           const uint8_t *end = s + n - 1;
  70
  71           while (s < end)
  72             {
  73               uint8_t s1 = s[1];
  74               if (s1 == c1)
  75                 {
  76                   if (*s == c0)
  77                     return (uint8_t *) s;
  78                   else
  79                     s += 2;
  80                 }
  81               else
  82                 {
  83                   if (s1 == c0)
  84                     s++;
  85                   else
  86                     s += 2;
  87                 }
  88             }
  89           break;
  90         }
  91
  92       case 3:
  93         {
  94           uint8_t c0 = c[0];
  95           uint8_t c1 = c[1];
  96           uint8_t c2 = c[2];
  97           const uint8_t *end = s + n - 2;
  98           size_t skip;
  99
 100           if (c2 == c1)
 101             skip = 1;
 102           else
 103             skip = 3;
 104
 105           while (s < end)
 106             {
 107               uint8_t s2 = s[2];
 108               if (s2 == c2)
 109                 {
 110                   if (s[1] == c1 && *s == c0)
 111                     return (uint8_t *) s;
 112                   else
 113                     s += skip;
 114                 }
 115               else
 116                 {
 117                   if (s2 == c1)
 118                     s++;
 119                   else if (s2 == c0)
 120                     s += 2;
 121                   else
 122                     s += 3;
 123                 }
 124             }
 125           break;
 126         }
 127
 128       case 4:
 129         {
 130           uint8_t c0 = c[0];
 131           uint8_t c1 = c[1];
 132           uint8_t c2 = c[2];
 133           uint8_t c3 = c[3];
 134           const uint8_t *end = s + n - 3;
 135           size_t skip;
 136
 137           if (c3 == c2)
 138             skip = 1;
 139           else if (c3 == c1)
 140             skip = 2;
 141           else
 142             skip = 4;
 143
 144           while (s < end)
 145             {
 146               uint8_t s3 = s[3];
 147               if (s3 == c3)
 148                 {
 149                   if (s[2] == c2 && s[1] == c1 && *s == c0)
 150                     return (uint8_t *) s;
 151                   else
 152                     s += skip;
 153                 }
 154               else
 155                 {
 156                   if (s3 == c2)
 157                     s++;
 158                   else if (s3 == c1)
 159                     s += 2;
 160                   else if (s3 == c0)
 161                     s += 3;
 162                   else
 163                     s += 4;
 164                 }
 165             }
 166           break;
 167         }
 168       }
 169     return NULL;
 170   }
 171 }