From patchwork Wed May 13 08:58:10 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: =?utf-8?b?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
X-Patchwork-Id: 471763
Return-Path: 
 <libc-alpha-return-58821-incoming=patchwork.ozlabs.org@sourceware.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from sourceware.org (server1.sourceware.org [209.132.180.131])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256
	bits)) (No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 1C6531402C2
	for <incoming@patchwork.ozlabs.org>;
	Wed, 13 May 2015 18:58:29 +1000 (AEST)
Authentication-Results: ozlabs.org; dkim=pass (1024-bit key;
	unprotected) header.d=sourceware.org header.i=@sourceware.org
	header.b=yzJAOIcP; dkim-atps=neutral
DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id
	:list-unsubscribe:list-subscribe:list-archive:list-post
	:list-help:sender:date:from:to:subject:message-id:mime-version
	:content-type; q=dns; s=default; b=kdJ99SxfSZOyhlXoLHx5yQYdY9xjs
	M4LnUg45z9emwaWJbv0ZNGfTwLbauxNlmW//dZr/p9JCPVi3c9oAdd3brFwSHWoW
	M6D/rNiZFpQkIN499LH2ToVWPF16n2SqFUNXR+ZHXw8kqnBo/IZcVd+4qzeIrHNE
	ea8rL27DaVyMlA=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id
	:list-unsubscribe:list-subscribe:list-archive:list-post
	:list-help:sender:date:from:to:subject:message-id:mime-version
	:content-type; s=default; bh=HDCoOLBLRjPgBdaDqmU2HSwvz+4=; b=yzJ
	AOIcPTg3tZl5Mq/qRWkqnBMISTUkYqxvLCHrtMQ2W5rkoFFlK4KY6Mzf8Tn281KH
	f417xh6fYvoFyoJmKFkjCWxf/LfVxxFZhEzI97khwcvDrb/ElWEX0ZPUykxZ6yg0
	HSTnNwcw1VxGbYKBCj7rfdKLqKZjWLzuDafUurzA=
Received: (qmail 33646 invoked by alias); 13 May 2015 08:58:23 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: 
 <mailto:libc-alpha-unsubscribe-incoming=patchwork.ozlabs.org@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 33636 invoked by uid 89); 13 May 2015 08:58:22 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=0.3 required=5.0 tests=AWL, BAYES_40,
	FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2
X-HELO: popelka.ms.mff.cuni.cz
Date: Wed, 13 May 2015 10:58:10 +0200
From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
To: libc-alpha@sourceware.org
Subject: [PATCH 1/3] Refactor strdiff.
Message-ID: <20150513085810.GA31782@domone>
MIME-Version: 1.0
Content-Disposition: inline
User-Agent: Mutt/1.5.20 (2009-06-14)

Hi, as I want to improve strcasecmp with strdiff, the first step is to move
it to a separate file. I also factored out the UTF-8 handling and added a
micro-optimization for finding the start of a character: an a < x < b check
can be done with a single subtraction and comparison, and the loop is
unrolled because it can iterate at most three times.

Then there is the wide-character handling. I pass an explicit encoding
there, as the wide-character version can be used directly.

OK with this?

	* string/strdiff.h: New file.
	* string/strcoll_l.c: Move out STRDIFF implementation.

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 0fa005f..297ec9c 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -30,6 +30,7 @@
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
+# define STRDIFF_L __strdiff_l
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -42,19 +43,7 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
-#define MASK_UTF8_7BIT  (1 << 7)
-#define MASK_UTF8_START (3 << 6)
-
-size_t
-STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
-{
-  size_t n;
-
-  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
-    continue;
-
-  return n;
-}
+#include "string/strdiff.h"
 
 /* Track status while looking for sequences in a string.  */
 typedef struct
@@ -274,24 +263,14 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   if (nrules == 0)
     return STRCMP (s1, s2);
 
-  /* Fast forward to the position of the first difference.  Needs to be
-     encoding aware as the byte-by-byte comparison can stop in the middle
-     of a char sequence for multibyte encodings like UTF-8.  */
+  /* Fast forward to the position of the first difference.  */
   uint_fast32_t encoding =
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
-  if (encoding != __cet_other)
-    {
-      size_t diff = STRDIFF (s1, s2);
-      if (diff > 0)
-	{
-	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
-	    do
-	      diff--;
-	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
-	  s1 += diff;
-	  s2 += diff;
-	}
-    }
+
+  if (sizeof (STRING_TYPE) > 1)
+    STRDIFF_L (&s1, &s2, __cet_8bit);
+  else if (encoding != __cet_other)
+    STRDIFF_L (&s1, &s2, encoding);
 
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
diff --git a/string/strdiff.h b/string/strdiff.h
new file mode 100644
index 0000000..224d899
--- /dev/null
+++ b/string/strdiff.h
@@ -0,0 +1,36 @@
+/* Return the length, in elements, of the common prefix of S and T.  */
+static size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t len = 0;
+
+  while (s[len] != '\0' && s[len] == t[len])
+    ++len;
+  return len;
+}
+
+/* UTF-8 continuation bytes are 10xxxxxx, i.e. 0x80..0xBF inclusive.  */
+#define UTF8_CONT_START 128
+#define UTF8_CONT_END 191
+
+/* Advance *S1 and *S2 past their common prefix.  For __cet_utf8, when
+   the first differing byte of *S1 is a continuation byte, step back (at
+   most three bytes, the longest UTF-8 tail) to the character's lead byte
+   so comparison never resumes in the middle of a multibyte sequence.  */
+static void
+STRDIFF_L (const STRING_TYPE **s1, const STRING_TYPE **s2, uint_fast32_t encoding)
+{
+  size_t diff = STRDIFF (*s1, *s2);
+
+  if (encoding == __cet_utf8)
+    for (int steps = 0; steps < 3 && diff > 0; steps++)
+      {
+        USTRING_TYPE c = *(*s1 + diff);
+        if (c < UTF8_CONT_START || c > UTF8_CONT_END)
+          break;
+        diff--;
+      }
+
+  *s1 += diff;
+  *s2 += diff;
+}