diff mbox

powerpc64: strcpy optimization for unaligned string

Message ID 20141218211348.GA16854@domone
State New
Headers show

Commit Message

Ondřej Bílka Dec. 18, 2014, 9:13 p.m. UTC
On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> This patch optimizes strcpy for ppc64 for unaligned source or
> destination address. The source or destination address is aligned
> to doubleword and data is shifted based on the alignment and
> added with the previous loaded data to be written as a doubleword.
> For each load, cmpb instruction is used for faster null check.
> 
> More combinations of unaligned inputs are also added in benchtest
> to measure the improvement. The new optimization shows a 2 to 80%
> performance improvement for longer strings, though it does not show
> a big difference for string sizes less than 16 due to additional checks.
> 
> This patch is tested on powerpc64 BE and LE and I have also attached
> the benchtest result.
> 
As I wrote, the benchtests are suspect. First retest what happens if you
do not always call strcpy with the same input and output buffers. What
difference does that make in the benchmark?

Comments

Rajalakshmi Srinivasaraghavan Dec. 19, 2014, 3 p.m. UTC | #1
On 12/19/2014 02:43 AM, Ondřej Bílka wrote:
> On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
>>
>> This patch optimizes strcpy for ppc64 for unaligned source or
>> destination address. The source or destination address is aligned
>> to doubleword and data is shifted based on the alignment and
>> added with the previous loaded data to be written as a doubleword.
>> For each load, cmpb instruction is used for faster null check.
>>
>> More combinations of unaligned inputs are also added in benchtest
>> to measure the improvement. The new optimization shows a 2 to 80%
>> performance improvement for longer strings, though it does not show
>> a big difference for string sizes less than 16 due to additional checks.
>>
>> This patch is tested on powerpc64 BE and LE and I have also attached
>> the benchtest result.
>>
> As I wrote, the benchtests are suspect. First retest what happens if you
> do not always call strcpy with the same input and output buffers. What
> difference does that make in the benchmark?
>
I applied this patch with and without my optimization and I
could not see any decrease in performance. I have attached the results.
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index c3ab4cf..0329f60 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
>   typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>   static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> +do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
>   	     size_t len __attribute__((unused)))
>   {
>     size_t i, iters = INNER_LOOP_ITERS;
>     timing_t start, stop, cur;
>
> -  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
> +  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
Modified it as

  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len))

>       {
>         error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
> +	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
>         ret = 1;
>         return;
>       }
>
> -  if (STRCMP (dst, src) != 0)
> +  if (STRCMP (dst[0], src[0]) != 0)
>       {
>         error (0, 0,
>   	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> -	     impl->name, dst, src);
> +	     impl->name, dst[0], src[0]);
>         ret = 1;
>         return;
>       }
> @@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>     TIMING_NOW (start);
>     for (i = 0; i < iters; ++i)
>       {
> -	  CALL (impl, dst, src);
> +	  CALL (impl, dst[i % 16], src[i % 16]);
>       }
>     TIMING_NOW (stop);
>
> @@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>   static void
>   do_test (size_t align1, size_t align2, size_t len, int max_char)
>   {
> -  size_t i;
> -  CHAR *s1, *s2;
> +  size_t i, j;
> +  CHAR **s1, **s2;
>   /* For wcscpy: align1 and align2 here mean alignment not in bytes,
>      but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>      len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> @@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>     if ((align2 + len) * sizeof(CHAR) >= page_size)
>       return;
>
> -  s1 = (CHAR *) (buf1) + align1;
> -  s2 = (CHAR *) (buf2) + align2;
> +  s1 = calloc (sizeof (char *), 16);
> +  s2 = calloc (sizeof (char *), 16);
> +  for (j = 0; j < 16; j++)
> +    {
> +      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
> +      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
>
> -  for (i = 0; i < len; i++)
> -    s1[i] = 32 + 23 * i % (max_char - 32);
> -  s1[len] = 0;
> +      for (i = 0; i < len; i++)
> +        s1[j][i] = 32 + 23 * i % (max_char - 32);
> +      s1[j][len] = 0;
> +    }
>
>     printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));
>
>
>
diff mbox

Patch

diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..0329f60 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -71,25 +71,25 @@  SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
+do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
 	     size_t len __attribute__((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
 
-  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
+  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
     {
       error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
+	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
       ret = 1;
       return;
     }
 
-  if (STRCMP (dst, src) != 0)
+  if (STRCMP (dst[0], src[0]) != 0)
     {
       error (0, 0,
 	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
-	     impl->name, dst, src);
+	     impl->name, dst[0], src[0]);
       ret = 1;
       return;
     }
@@ -97,7 +97,7 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+	  CALL (impl, dst[i % 16], src[i % 16]);
     }
   TIMING_NOW (stop);
 
@@ -109,8 +109,8 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
 static void
 do_test (size_t align1, size_t align2, size_t len, int max_char)
 {
-  size_t i;
-  CHAR *s1, *s2;
+  size_t i, j;
+  CHAR **s1, **s2;
 /* For wcscpy: align1 and align2 here mean alignment not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
@@ -122,12 +122,17 @@  do_test (size_t align1, size_t align2, size_t len, int max_char)
   if ((align2 + len) * sizeof(CHAR) >= page_size)
     return;
 
-  s1 = (CHAR *) (buf1) + align1;
-  s2 = (CHAR *) (buf2) + align2;
+  s1 = calloc (sizeof (char *), 16);
+  s2 = calloc (sizeof (char *), 16);
+  for (j = 0; j < 16; j++)
+    {
+      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
+      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
 
-  for (i = 0; i < len; i++)
-    s1[i] = 32 + 23 * i % (max_char - 32);
-  s1[len] = 0;
+      for (i = 0; i < len; i++)
+        s1[j][i] = 32 + 23 * i % (max_char - 32);
+      s1[j][len] = 0;
+    }
 
   printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));