X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=lib%2Fregex_internal.c;h=21298889a3161b644bf9bc653d08149de6fbb53f;hb=e3adb96fc71c48e1d3e638e7e1afadcd3ac0d840;hp=a377d2b288309312c42de3296b63f09fdd1e54e9;hpb=ce0f0550c109a4d97551924165659d7a80978999;p=gnulib.git

diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index a377d2b28..21298889a 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1,5 +1,6 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software
+   Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -488,16 +489,17 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
   mbstate_t prev_st;
   Idx rawbuf_idx;
   size_t mbclen;
-  wchar_t wc = WEOF;
+  wint_t wc = WEOF;
 
   /* Skip the characters which are not necessary to check.  */
   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
        rawbuf_idx < new_raw_idx;)
     {
+      wchar_t wc2;
       Idx remain_len;
       remain_len = pstr->len - rawbuf_idx;
       prev_st = pstr->cur_state;
-      mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
+      mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 			remain_len, &pstr->cur_state);
       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 	{
@@ -509,10 +511,12 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 	  mbclen = 1;
 	  pstr->cur_state = prev_st;
 	}
+      else
+	wc = wc2;
       /* Then proceed the next character.  */
       rawbuf_idx += mbclen;
     }
-  *last_wc = (wint_t) wc;
+  *last_wc = wc;
   return rawbuf_idx;
 }
 #endif /* RE_ENABLE_I18N  */
@@ -594,34 +598,98 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 
   if (BE (offset != 0, 1))
     {
-      /* Are the characters which are already checked remain?  */
-      if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
-	  /* Handling this would enlarge the code too much.
-	     Accept a slowdown in that case.  */
-	  && pstr->offsets_needed == 0
-#endif
-	 )
+      /* Should the already checked characters be kept?  */
+      if (BE (offset < pstr->valid_raw_len, 1))
 	{
 	  /* Yes, move them to the front of the buffer.  */
-	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 #ifdef RE_ENABLE_I18N
-	  if (pstr->mb_cur_max > 1)
-	    memmove (pstr->wcs, pstr->wcs + offset,
-		     (pstr->valid_len - offset) * sizeof (wint_t));
+	  if (BE (pstr->offsets_needed, 0))
+	    {
+	      Idx low = 0, high = pstr->valid_len, mid;
+	      do
+		{
+		  mid = (high + low) / 2;
+		  if (pstr->offsets[mid] > offset)
+		    high = mid;
+		  else if (pstr->offsets[mid] < offset)
+		    low = mid + 1;
+		  else
+		    break;
+		}
+	      while (low < high);
+	      if (pstr->offsets[mid] < offset)
+		++mid;
+	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
+							eflags);
+	      /* This can be quite complicated, so handle specially
+		 only the common and easy case where the character with
+		 different length representation of lower and upper
+		 case is present at or after offset.  */
+	      if (pstr->valid_len > offset
+		  && mid == offset && pstr->offsets[mid] == offset)
+		{
+		  memmove (pstr->wcs, pstr->wcs + offset,
+			   (pstr->valid_len - offset) * sizeof (wint_t));
+		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+		  pstr->valid_len -= offset;
+		  pstr->valid_raw_len -= offset;
+		  for (low = 0; low < pstr->valid_len; low++)
+		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+		}
+	      else
+		{
+		  /* Otherwise, just find out how long the partial multibyte
+		     character at offset is and fill it with WEOF/255.  */
+		  pstr->len = pstr->raw_len - idx + offset;
+		  pstr->stop = pstr->raw_stop - idx + offset;
+		  pstr->offsets_needed = 0;
+		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
+		    --mid;
+		  while (mid < pstr->valid_len)
+		    if (pstr->wcs[mid] != WEOF)
+		      break;
+		    else
+		      ++mid;
+		  if (mid == pstr->valid_len)
+		    pstr->valid_len = 0;
+		  else
+		    {
+		      pstr->valid_len = pstr->offsets[mid] - offset;
+		      if (pstr->valid_len)
+			{
+			  for (low = 0; low < pstr->valid_len; ++low)
+			    pstr->wcs[low] = WEOF;
+			  memset (pstr->mbs, 255, pstr->valid_len);
+			}
+		    }
+		  pstr->valid_raw_len = pstr->valid_len;
+		}
+	    }
+	  else
+#endif
+	    {
+	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
+							eflags);
+#ifdef RE_ENABLE_I18N
+	      if (pstr->mb_cur_max > 1)
+		memmove (pstr->wcs, pstr->wcs + offset,
+			 (pstr->valid_len - offset) * sizeof (wint_t));
 #endif /* RE_ENABLE_I18N */
-	  if (BE (pstr->mbs_allocated, 0))
-	    memmove (pstr->mbs, pstr->mbs + offset,
-		     pstr->valid_len - offset);
-	  pstr->valid_len -= offset;
-	  pstr->valid_raw_len -= offset;
+	      if (BE (pstr->mbs_allocated, 0))
+		memmove (pstr->mbs, pstr->mbs + offset,
+			 pstr->valid_len - offset);
+	      pstr->valid_len -= offset;
+	      pstr->valid_raw_len -= offset;
 #if DEBUG
-	  assert (pstr->valid_len > 0);
+	      assert (pstr->valid_len > 0);
 #endif
+	    }
 	}
       else
 	{
 	  /* No, skip all characters until IDX.  */
+	  Idx prev_valid_len = pstr->valid_len;
+
 #ifdef RE_ENABLE_I18N
 	  if (BE (pstr->offsets_needed, 0))
 	    {
@@ -639,12 +707,14 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 
 	      if (pstr->is_utf8)
 		{
-		  const unsigned char *raw, *p, *q, *end;
+		  const unsigned char *raw, *p, *end;
 
 		  /* Special case UTF-8.  Multi-byte chars start with any
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 		  end = raw + (offset - pstr->mb_cur_max);
+		  if (end < pstr->raw_mbs)
+		    end = pstr->raw_mbs;
 		  p = raw + offset - 1;
 #ifdef _LIBC
 		  /* We know the wchar_t encoding is UCS4, so for the simple
@@ -652,7 +722,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
 		    {
 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
-		      pstr->valid_len = 0;
+		      /* pstr->valid_len = 0; */
 		      wc = (wchar_t) *p;
 		    }
 		  else
@@ -666,13 +736,11 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 			  unsigned char buf[6];
 			  size_t mbclen;
 
-			  q = p;
 			  if (BE (pstr->trans != NULL, 0))
 			    {
 			      int i = mlen < 6 ? mlen : 6;
 			      while (--i >= 0)
 				buf[i] = pstr->trans[p[i]];
-			      q = buf;
 			    }
 			  /* XXX Don't use mbrtowc, we know which conversion
 			     to use (UTF-8 -> UCS4).  */
@@ -695,7 +763,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 	      if (wc == WEOF)
 		pstr->tip_context
-		  = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 	      else
 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 				      && IS_WIDE_WORD_CHAR (wc))
@@ -708,7 +776,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 		    pstr->wcs[wcs_idx] = WEOF;
 		  if (pstr->mbs_allocated)
-		    memset (pstr->mbs, -1, pstr->valid_len);
+		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
 	    }
@@ -1337,7 +1405,6 @@ static Idx
 internal_function
 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
 {
-  int type = token.type;
   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
     {
       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
@@ -1373,8 +1440,11 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
   dfa->nodes[dfa->nodes_len] = token;
   dfa->nodes[dfa->nodes_len].constraint = 0;
 #ifdef RE_ENABLE_I18N
+  {
+  int type = token.type;
   dfa->nodes[dfa->nodes_len].accept_mb =
     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
+  }
 #endif
   dfa->nexts[dfa->nodes_len] = REG_MISSING;
   re_node_set_init_empty (dfa->edests + dfa->nodes_len);