[1/3] Refactor strdiff.

Message ID	20150513085810.GA31782@domone
State	New
Headers	show Return-Path: <libc-alpha-return-58821-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:date:from:to:subject:message-id:mime-version :content-type; q=dns; s=default; b=kdJ99SxfSZOyhlXoLHx5yQYdY9xjs M4LnUg45z9emwaWJbv0ZNGfTwLbauxNlmW//dZr/p9JCPVi3c9oAdd3brFwSHWoW M6D/rNiZFpQkIN499LH2ToVWPF16n2SqFUNXR+ZHXw8kqnBo/IZcVd+4qzeIrHNE ea8rL27DaVyMlA= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Date: Wed, 13 May 2015 10:58:10 +0200 From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz> To: libc-alpha@sourceware.org Subject: [PATCH 1/3] Refactor strdiff. Message-ID: <20150513085810.GA31782@domone> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.20 (2009-06-14)

Message ID

20150513085810.GA31782@domone

State

New

Headers

DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id
	:list-unsubscribe:list-subscribe:list-archive:list-post
	:list-help:sender:date:from:to:subject:message-id:mime-version
	:content-type; q=dns; s=default; b=kdJ99SxfSZOyhlXoLHx5yQYdY9xjs
	M4LnUg45z9emwaWJbv0ZNGfTwLbauxNlmW//dZr/p9JCPVi3c9oAdd3brFwSHWoW
	M6D/rNiZFpQkIN499LH2ToVWPF16n2SqFUNXR+ZHXw8kqnBo/IZcVd+4qzeIrHNE
	ea8rL27DaVyMlA=
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
Sender: libc-alpha-owner@sourceware.org
Date: Wed, 13 May 2015 10:58:10 +0200
From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
To: libc-alpha@sourceware.org
Subject: [PATCH 1/3] Refactor strdiff.
Message-ID: <20150513085810.GA31782@domone>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.5.20 (2009-06-14)

Commit Message

Ondřej Bílka May 13, 2015, 8:58 a.m. UTC

Hi, as I want to improve strcasecmp using strdiff, the first step is to move
it to a separate file. I also factored out the UTF-8 handling. I also added a
micro-optimization for finding the start of a character, as you can do an
a < x < b check with a single subtraction and comparison, and unrolled the
loop, as it can run at most three times.

Then there is the wide-character handling. I pass an explicit encoding there
so that the wide-character version can be used directly.

OK with this?

	* string/strdiff.h: New file.
	* string/strcoll_l.c: Move out STRDIFF implementation.

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 0fa005f..297ec9c 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -30,6 +30,7 @@ 
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
+# define STRDIFF_L __strdiff_l
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -42,19 +43,7 @@ 
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
-#define MASK_UTF8_7BIT  (1 << 7)
-#define MASK_UTF8_START (3 << 6)
-
-size_t
-STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
-{
-  size_t n;
-
-  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
-    continue;
-
-  return n;
-}
+#include "string/strdiff.h"
 
 /* Track status while looking for sequences in a string.  */
 typedef struct
@@ -274,24 +263,14 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   if (nrules == 0)
     return STRCMP (s1, s2);
 
-  /* Fast forward to the position of the first difference.  Needs to be
-     encoding aware as the byte-by-byte comparison can stop in the middle
-     of a char sequence for multibyte encodings like UTF-8.  */
+  /* Fast forward to the position of the first difference.  */
   uint_fast32_t encoding =
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
-  if (encoding != __cet_other)
-    {
-      size_t diff = STRDIFF (s1, s2);
-      if (diff > 0)
-	{
-	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
-	    do
-	      diff--;
-	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
-	  s1 += diff;
-	  s2 += diff;
-	}
-    }
+
+  if (sizeof (STRING_TYPE) > 1)
+    STRDIFF_L (&s1, &s2, __cet_8bit);
+  else if (encoding != __cet_other)
+    STRDIFF_L (&s1, &s2, encoding);
 
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
diff --git a/string/strdiff.h b/string/strdiff.h
new file mode 100644
index 0000000..224d899
--- /dev/null
+++ b/string/strdiff.h
@@ -0,0 +1,49 @@ 
+/* Return the length of the longest common prefix of S and T, measured in
+   STRING_TYPE units.  The scan stops at the first differing unit or at
+   the end of S.  */
+static size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
+/* UTF-8 continuation bytes have the bit pattern 10xxxxxx, i.e. the range
+   0x80..0xBF.  Bytes 0xC2 and above start a new multibyte sequence and
+   must not be skipped over.  */
+#define UTF8_CONT_START 0x80
+#define UTF8_CONT_END 0xBF
+
+/* A UTF-8 sequence has at most three continuation bytes.  */
+#define UTF8_MAX_CONT 3
+
+/* Advance *S1 and *S2 past their common prefix.  For __cet_utf8 the
+   position is moved back to a character boundary when the first
+   difference falls on a continuation byte, so that neither pointer is
+   left in the middle of a multibyte sequence.  For any other ENCODING
+   the pointers are advanced by the raw common prefix length.  */
+static void
+STRDIFF_L (const STRING_TYPE **s1, const STRING_TYPE **s2,
+           uint_fast32_t encoding)
+{
+  size_t diff = STRDIFF (*s1, *s2);
+
+  if (encoding == __cet_utf8)
+    {
+      /* Step back over at most UTF8_MAX_CONT continuation bytes.  */
+      for (int i = 0; i < UTF8_MAX_CONT && diff > 0; i++)
+        {
+          USTRING_TYPE c = *(*s1 + diff);
+          if (c < UTF8_CONT_START || c > UTF8_CONT_END)
+            break;
+          diff--;
+        }
+    }
+
+  *s1 += diff;
+  *s2 += diff;
+}

[1/3] Refactor strdiff.

Commit Message

Patch