powerpc64: strcpy optimization for unaligned string

Message ID	20141218211348.GA16854@domone
State	New
Headers	show Return-Path: <libc-alpha-return-55649-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:date:from:to:cc:subject:message-id:references :mime-version:content-type:in-reply-to; q=dns; s=default; b=c+1Y uIK2DwLNn0YnWPvvL7O+bfbCIlHAFSUUjECMOfkybIvqp/GhatjQSyIe1gi9N1f9 X2JNzxbQqVC1RiTCrsKb1ivFwN/LOrZlAU6aGiWzgPKgk1xVOwCcbzV8G607/Jed 5z0agAkZ581RMCI5nbRdjIaZuGdFrupqRIbulYM= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Date: Thu, 18 Dec 2014 22:13:48 +0100 From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz> To: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> Cc: libc-alpha@sourceware.org Subject: Re: [PATCH] powerpc64: strcpy optimization for unaligned string Message-ID: <20141218211348.GA16854@domone> References: <1418832071-93495-1-git-send-email-raji@linux.vnet.ibm.com> <5491A9A5.2000400@linux.vnet.ibm.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <5491A9A5.2000400@linux.vnet.ibm.com> User-Agent: Mutt/1.5.20 (2009-06-14)

Message ID

20141218211348.GA16854@domone

State

New

Headers

DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id
	:list-unsubscribe:list-subscribe:list-archive:list-post
	:list-help:sender:date:from:to:cc:subject:message-id:references
	:mime-version:content-type:in-reply-to; q=dns; s=default; b=c+1Y
	uIK2DwLNn0YnWPvvL7O+bfbCIlHAFSUUjECMOfkybIvqp/GhatjQSyIe1gi9N1f9
	X2JNzxbQqVC1RiTCrsKb1ivFwN/LOrZlAU6aGiWzgPKgk1xVOwCcbzV8G607/Jed
	5z0agAkZ581RMCI5nbRdjIaZuGdFrupqRIbulYM=
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
Sender: libc-alpha-owner@sourceware.org
Date: Thu, 18 Dec 2014 22:13:48 +0100
From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
To: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH] powerpc64: strcpy optimization for unaligned string
Message-ID: <20141218211348.GA16854@domone>
References: <1418832071-93495-1-git-send-email-raji@linux.vnet.ibm.com>
	<5491A9A5.2000400@linux.vnet.ibm.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <5491A9A5.2000400@linux.vnet.ibm.com>
User-Agent: Mutt/1.5.20 (2009-06-14)

Commit Message

Ondřej Bílka Dec. 18, 2014, 9:13 p.m. UTC

On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> This patch optimizes strcpy for ppc64 for unaligned source or
> destination address. The source or destination address is aligned
> to doubleword and data is shifted based on the alignment and
> added with the previous loaded data to be written as a doubleword.
> For each load, cmpb instruction is used for faster null check.
> 
> More combinations of unaligned inputs are also added to the benchtest
> to measure the improvement. The new optimization shows a 2 to 80%
> performance improvement for longer strings, though it does not show a
> big difference for string sizes less than 16 due to additional checks.
> 
> This patch is tested on powerpc64 BE and LE and I have also attached
> the benchtest result.
> 
As I wrote, the benchtests are suspect. First, retest what happens if you
do not always call strcpy with the same input and output buffers. What
difference does that make in the benchmark?

Comments

Rajalakshmi Srinivasaraghavan Dec. 19, 2014, 3 p.m. UTC | #1

On 12/19/2014 02:43 AM, Ondřej Bílka wrote:
> On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
>>
>> This patch optimizes strcpy for ppc64 for unaligned source or
>> destination address. The source or destination address is aligned
>> to doubleword and data is shifted based on the alignment and
>> added with the previous loaded data to be written as a doubleword.
>> For each load, cmpb instruction is used for faster null check.
>>
>> More combinations of unaligned inputs are also added to the benchtest
>> to measure the improvement. The new optimization shows a 2 to 80%
>> performance improvement for longer strings, though it does not show a
>> big difference for string sizes less than 16 due to additional checks.
>>
>> This patch is tested on powerpc64 BE and LE and I have also attached
>> the benchtest result.
>>
> As I wrote, the benchtests are suspect. First, retest what happens if you
> do not always call strcpy with the same input and output buffers. What
> difference does that make in the benchmark?
>
I applied this patch both with and without my optimization, and I
could not see any decrease in performance. The results are attached.
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index c3ab4cf..0329f60 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
>   typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>   static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> +do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
>   	     size_t len __attribute__((unused)))
>   {
>     size_t i, iters = INNER_LOOP_ITERS;
>     timing_t start, stop, cur;
>
> -  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
> +  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
I modified this line as follows, since len is a size_t and cannot be indexed:

  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len))

>       {
>         error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
> +	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
>         ret = 1;
>         return;
>       }
>
> -  if (STRCMP (dst, src) != 0)
> +  if (STRCMP (dst[0], src[0]) != 0)
>       {
>         error (0, 0,
>   	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> -	     impl->name, dst, src);
> +	     impl->name, dst[0], src[0]);
>         ret = 1;
>         return;
>       }
> @@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>     TIMING_NOW (start);
>     for (i = 0; i < iters; ++i)
>       {
> -	  CALL (impl, dst, src);
> +	  CALL (impl, dst[i % 16], src[i % 16]);
>       }
>     TIMING_NOW (stop);
>
> @@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>   static void
>   do_test (size_t align1, size_t align2, size_t len, int max_char)
>   {
> -  size_t i;
> -  CHAR *s1, *s2;
> +  size_t i, j;
> +  CHAR **s1, **s2;
>   /* For wcscpy: align1 and align2 here mean alignment not in bytes,
>      but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>      len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> @@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>     if ((align2 + len) * sizeof(CHAR) >= page_size)
>       return;
>
> -  s1 = (CHAR *) (buf1) + align1;
> -  s2 = (CHAR *) (buf2) + align2;
> +  s1 = calloc (sizeof (char *), 16);
> +  s2 = calloc (sizeof (char *), 16);
> +  for (j = 0; j < 16; j++)
> +    {
> +      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
> +      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
>
> -  for (i = 0; i < len; i++)
> -    s1[i] = 32 + 23 * i % (max_char - 32);
> -  s1[len] = 0;
> +      for (i = 0; i < len; i++)
> +        s1[j][i] = 32 + 23 * i % (max_char - 32);
> +      s1[j][len] = 0;
> +    }
>
>     printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));
>
>
>

diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..0329f60 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -71,25 +71,25 @@  SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
+do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
 	     size_t len __attribute__((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
 
-  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
+  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
     {
       error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
+	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
       ret = 1;
       return;
     }
 
-  if (STRCMP (dst, src) != 0)
+  if (STRCMP (dst[0], src[0]) != 0)
     {
       error (0, 0,
 	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
-	     impl->name, dst, src);
+	     impl->name, dst[0], src[0]);
       ret = 1;
       return;
     }
@@ -97,7 +97,7 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+	  CALL (impl, dst[i % 16], src[i % 16]);
     }
   TIMING_NOW (stop);
 
@@ -109,8 +109,8 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
 static void
 do_test (size_t align1, size_t align2, size_t len, int max_char)
 {
-  size_t i;
-  CHAR *s1, *s2;
+  size_t i, j;
+  CHAR **s1, **s2;
 /* For wcscpy: align1 and align2 here mean alignment not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
@@ -122,12 +122,17 @@  do_test (size_t align1, size_t align2, size_t len, int max_char)
   if ((align2 + len) * sizeof(CHAR) >= page_size)
     return;
 
-  s1 = (CHAR *) (buf1) + align1;
-  s2 = (CHAR *) (buf2) + align2;
+  s1 = calloc (sizeof (char *), 16);
+  s2 = calloc (sizeof (char *), 16);
+  for (j = 0; j < 16; j++)
+    {
+      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
+      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
 
-  for (i = 0; i < len; i++)
-    s1[i] = 32 + 23 * i % (max_char - 32);
-  s1[len] = 0;
+      for (i = 0; i < len; i++)
+        s1[j][i] = 32 + 23 * i % (max_char - 32);
+      s1[j][len] = 0;
+    }
 
   printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));

powerpc64: strcpy optimization for unaligned string

Commit Message

Comments

Patch