diff mbox series

powerpc64le: Optimized strcat for POWER10

Message ID 20241108140138.3880456-1-bmahi496@linux.ibm.com
State New
Headers show
Series powerpc64le: Optimized strcat for POWER10 | expand

Commit Message

Mahesh Bodapati Nov. 8, 2024, 2:01 p.m. UTC
With the new optimized strcpy and strlen implementations, this patch adds an
optimized strcat for POWER10 that uses them together with the default
implementation in string/strcat.c.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  5 +--
 .../powerpc64/multiarch/ifunc-impl-list.c     |  5 +++
 .../powerpc64/multiarch/strcat-power10.c      | 33 +++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/strcat.c  | 23 +++++++++----
 4 files changed, 57 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c

Comments

Mahesh Bodapati Nov. 8, 2024, 2:05 p.m. UTC | #1
Attached is the strcat benchmark results file.

Thanks,
Mahesh B


On 08/11/24 7:31 pm, Mahesh Bodapati wrote:
> With the new optimized strcpy and strlen implementation, this patch adds an
> optimized strcat which uses it along with default implementation at strings.
> ---
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |  5 +--
>   .../powerpc64/multiarch/ifunc-impl-list.c     |  5 +++
>   .../powerpc64/multiarch/strcat-power10.c      | 33 +++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/strcat.c  | 23 +++++++++----
>   4 files changed, 57 insertions(+), 9 deletions(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c
>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index b847c19049..dc7c5b14ee 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -34,8 +34,9 @@ ifneq (,$(filter %le,$(config-machine)))
>   sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
>   		   memmove-power10 memset-power10 rawmemchr-power9 \
>   		   rawmemchr-power10 strcmp-power9 strcmp-power10 \
> -		   strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \
> -		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
> +		   strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \
> +		   stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
> +		   strlen-power10
>   endif
>   CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>   CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 2bb47d3527..ab9e7c6142 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   
>     /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
>     IFUNC_IMPL (i, name, strcat,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1
> +			      && hwcap & PPC_FEATURE_HAS_VSX,
> +			      __strcat_power10)
> +#endif
>   	      IFUNC_IMPL_ADD (array, i, strcat,
>   			      hwcap2 & PPC_FEATURE2_ARCH_2_07
>   			      && hwcap & PPC_FEATURE_HAS_VSX,
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c
> new file mode 100644
> index 0000000000..8d653ab500
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c
> @@ -0,0 +1,33 @@
> +/* Copyright (C) 2015-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __LITTLE_ENDIAN__
> +#include <string.h>
> +
> +#define STRCAT __strcat_power10
> +
> +#undef libc_hidden_def
> +#define libc_hidden_def(name)
> +
> +extern typeof (strcpy) __strcpy_power9;
> +extern typeof (strlen) __strlen_power10;
> +
> +#define strcpy __strcpy_power9
> +#define strlen __strlen_power10
> +
> +#include <string/strcat.c>
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
> index 27e636e0ff..3493716c3c 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
> @@ -25,14 +25,23 @@
>   extern __typeof (strcat) __strcat_ppc attribute_hidden;
>   extern __typeof (strcat) __strcat_power7 attribute_hidden;
>   extern __typeof (strcat) __strcat_power8 attribute_hidden;
> +#ifdef __LITTLE_ENDIAN__
> +extern __typeof (strcat) __strcat_power10 attribute_hidden;
> +#endif
>   # undef strcat
>   
> +
>   libc_ifunc_redirected (__redirect_strcat, strcat,
> -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
> -			&& hwcap & PPC_FEATURE_HAS_VSX)
> -		       ? __strcat_power8
> -		       : (hwcap & PPC_FEATURE_ARCH_2_06
> -			  && hwcap & PPC_FEATURE_HAS_VSX)
> -			 ? __strcat_power7
> -			 : __strcat_ppc);
> +#ifdef __LITTLE_ENDIAN__
> +			(hwcap2 & PPC_FEATURE2_ARCH_3_1
> +			 && hwcap & PPC_FEATURE_HAS_VSX)
> +			? __strcat_power10 :
> +#endif
> +			  (hwcap2 & PPC_FEATURE2_ARCH_2_07
> +			   && hwcap & PPC_FEATURE_HAS_VSX)
> +		           ? __strcat_power8
> +		           : (hwcap & PPC_FEATURE_ARCH_2_06
> +			      && hwcap & PPC_FEATURE_HAS_VSX)
> +			     ? __strcat_power7
> +			     : __strcat_ppc);
>   #endif
Function: strcat
Variant: 
                                    __strcat_power10	__strcat_power8
========================================================================================================================
  align1=0, align2=0, len1=0, len2=0:         4.89 ( 18.42%)	        5.99	
  align1=0, align2=0, len1=0, len2=0:         4.90 ( 18.20%)	        5.99	
  align1=0, align2=0, len1=0, len2=0:         4.90 ( 18.26%)	        6.00	
  align1=0, align2=0, len1=0, len2=0:         4.90 ( 18.15%)	        5.99	
  align1=0, align2=0, len1=1, len2=1:         4.90 ( 18.27%)	        6.00	
  align1=0, align2=0, len1=1, len2=1:         4.91 ( 18.02%)	        5.99	
  align1=0, align2=1, len1=1, len2=1:         4.90 ( 18.21%)	        5.99	
  align1=1, align2=0, len1=1, len2=1:         4.90 ( 18.19%)	        5.99	
  align1=0, align2=0, len1=2, len2=2:         4.91 ( 18.76%)	        6.05	
  align1=0, align2=0, len1=2, len2=2:         4.90 ( 18.90%)	        6.04	
  align1=0, align2=2, len1=2, len2=2:         4.90 ( 18.92%)	        6.05	
  align1=2, align2=0, len1=2, len2=2:         4.90 ( 18.71%)	        6.03	
  align1=0, align2=0, len1=3, len2=3:         4.90 ( 18.21%)	        5.99	
  align1=0, align2=0, len1=3, len2=3:         4.90 ( 18.16%)	        5.99	
  align1=0, align2=3, len1=3, len2=3:         4.90 ( 18.16%)	        5.99	
  align1=3, align2=0, len1=3, len2=3:         4.90 ( 18.28%)	        6.00	
  align1=0, align2=0, len1=4, len2=4:         4.90 ( 19.29%)	        6.07	
  align1=0, align2=0, len1=4, len2=4:         4.90 ( 19.30%)	        6.07	
  align1=0, align2=4, len1=4, len2=4:         4.91 ( 15.73%)	        5.83	
  align1=4, align2=0, len1=4, len2=4:         4.90 ( 19.30%)	        6.07	
  align1=0, align2=0, len1=5, len2=5:         4.90 ( 19.11%)	        6.06	
  align1=0, align2=0, len1=5, len2=5:         4.90 ( 19.12%)	        6.06	
  align1=0, align2=5, len1=5, len2=5:         4.90 ( 15.62%)	        5.81	
  align1=5, align2=0, len1=5, len2=5:         4.90 ( 19.10%)	        6.06	
  align1=0, align2=0, len1=6, len2=6:         4.90 ( 20.41%)	        6.16	
  align1=0, align2=0, len1=6, len2=6:         4.90 ( 20.52%)	        6.17	
  align1=0, align2=6, len1=6, len2=6:         4.90 ( 17.05%)	        5.91	
  align1=6, align2=0, len1=6, len2=6:         4.90 ( 20.43%)	        6.16	
  align1=0, align2=0, len1=7, len2=7:         5.21 ( 36.94%)	        8.26	
  align1=0, align2=0, len1=7, len2=7:         5.46 ( 21.56%)	        6.96	
  align1=0, align2=7, len1=7, len2=7:         5.40 ( 26.55%)	        7.35	
  align1=7, align2=0, len1=7, len2=7:         5.40 ( 24.96%)	        7.20	
  align1=0, align2=0, len1=8, len2=8:         5.40 ( 36.89%)	        8.56	
  align1=0, align2=0, len1=8, len2=8:         5.41 ( 37.09%)	        8.59	
  align1=0, align2=0, len1=8, len2=8:         5.41 ( 36.93%)	        8.57	
  align1=0, align2=0, len1=8, len2=8:         4.90 ( 18.03%)	        5.98	
  align1=0, align2=0, len1=9, len2=9:         4.90 ( 19.42%)	        6.08	
  align1=0, align2=0, len1=9, len2=9:         4.90 ( 18.37%)	        6.00	
  align1=0, align2=1, len1=9, len2=9:         4.90 ( 18.36%)	        6.00	
  align1=1, align2=0, len1=9, len2=9:         4.95 ( 17.52%)	        6.01	
align1=0, align2=0, len1=10, len2=10:         4.90 ( 19.16%)	        6.06	
align1=0, align2=0, len1=10, len2=10:         4.90 ( 19.15%)	        6.06	
align1=0, align2=2, len1=10, len2=10:         5.05 ( 16.63%)	        6.06	
align1=2, align2=0, len1=10, len2=10:         4.90 ( 19.18%)	        6.06	
align1=0, align2=0, len1=11, len2=11:         4.90 ( 18.04%)	        5.98	
align1=0, align2=0, len1=11, len2=11:         4.90 ( 18.06%)	        5.98	
align1=0, align2=3, len1=11, len2=11:         4.90 ( 18.26%)	        6.00	
align1=3, align2=0, len1=11, len2=11:         4.90 ( 18.00%)	        5.98	
align1=0, align2=0, len1=12, len2=12:         4.90 ( 19.36%)	        6.08	
align1=0, align2=0, len1=12, len2=12:         4.90 ( 19.62%)	        6.10	
align1=0, align2=4, len1=12, len2=12:         4.96 ( 20.04%)	        6.20	
align1=4, align2=0, len1=12, len2=12:         5.21 ( 14.32%)	        6.08	
align1=0, align2=0, len1=13, len2=13:         4.92 ( 19.06%)	        6.08	
align1=0, align2=0, len1=13, len2=13:         4.90 ( 19.43%)	        6.08	
align1=0, align2=5, len1=13, len2=13:         4.96 ( 33.66%)	        7.47	
align1=5, align2=0, len1=13, len2=13:         5.21 ( 14.39%)	        6.08	
align1=0, align2=0, len1=14, len2=14:         4.90 ( 20.95%)	        6.20	
align1=0, align2=0, len1=14, len2=14:         4.90 ( 19.97%)	        6.12	
align1=0, align2=6, len1=14, len2=14:         4.96 ( 34.51%)	        7.57	
align1=6, align2=0, len1=14, len2=14:         5.23 ( 15.41%)	        6.19	
align1=0, align2=0, len1=15, len2=15:         4.90 ( 14.39%)	        5.73	
align1=0, align2=0, len1=15, len2=15:         4.90 ( 14.32%)	        5.72	
align1=0, align2=7, len1=15, len2=15:         5.00 ( 34.00%)	        7.57	
align1=7, align2=0, len1=15, len2=15:         5.21 (  9.29%)	        5.74	
align1=0, align2=0, len1=16, len2=16:         5.21 ( 37.77%)	        8.37	
align1=7, align2=2, len1=16, len2=16:         5.24 ( 32.38%)	        7.75	
 align1=0, align2=0, len1=16, len2=4:         5.21 ( 20.43%)	        6.55	
 align1=7, align2=2, len1=16, len2=4:         5.21 ( 20.58%)	        6.56	
align1=0, align2=0, len1=32, len2=32:         5.59 ( 34.38%)	        8.52	
align1=6, align2=4, len1=32, len2=32:         5.72 ( 30.00%)	        8.17	
 align1=0, align2=0, len1=32, len2=8:         5.51 ( 16.13%)	        6.57	
 align1=6, align2=4, len1=32, len2=8:         5.51 ( 18.61%)	        6.77	
align1=0, align2=0, len1=64, len2=64:         5.67 ( 36.02%)	        8.86	
align1=5, align2=6, len1=64, len2=64:         6.75 ( 28.17%)	        9.40	
align1=0, align2=0, len1=64, len2=16:         5.69 ( 33.65%)	        8.57	
align1=5, align2=6, len1=64, len2=16:         5.71 ( 28.33%)	        7.97	
align1=0, align2=0, len1=128, len2=128:         6.16 ( 37.03%)	        9.78	
align1=4, align2=0, len1=128, len2=128:         7.05 ( 31.25%)	       10.25	
align1=0, align2=0, len1=128, len2=32:         6.05 ( 23.56%)	        7.91	
align1=4, align2=0, len1=128, len2=32:         6.86 ( 27.46%)	        9.45	
align1=0, align2=0, len1=256, len2=256:         8.22 ( 29.53%)	       11.67	
align1=3, align2=2, len1=256, len2=256:         9.54 ( 44.34%)	       17.14	
align1=0, align2=0, len1=256, len2=64:         6.99 ( 35.56%)	       10.85	
align1=3, align2=2, len1=256, len2=64:         8.01 ( 52.61%)	       16.90	
align1=0, align2=0, len1=512, len2=512:        11.00 ( 30.02%)	       15.72	
align1=2, align2=4, len1=512, len2=512:        11.37 ( 54.42%)	       24.94	
align1=0, align2=0, len1=512, len2=128:         9.16 ( 35.01%)	       14.09	
align1=2, align2=4, len1=512, len2=128:        10.08 ( 56.41%)	       23.12	
align1=0, align2=0, len1=1024, len2=1024:        17.78 ( 25.21%)	       23.78	
align1=1, align2=6, len1=1024, len2=1024:        17.95 ( 58.32%)	       43.07	
align1=0, align2=0, len1=1024, len2=256:        15.03 ( 24.49%)	       19.90	
align1=1, align2=6, len1=1024, len2=256:        15.45 ( 60.52%)	       39.14	
 align1=1, align2=2, len1=16, len2=1:         5.21 ( 20.46%)	        6.55	
 align1=2, align2=1, len1=16, len2=1:         5.21 ( 21.64%)	        6.65	
align1=1, align2=1, len1=16, len2=10:         5.21 ( 17.44%)	        6.31	
align1=1, align2=1, len1=16, len2=10:         5.21 ( 17.16%)	        6.29	
 align1=2, align2=4, len1=32, len2=1:         5.51 ( 20.52%)	        6.93	
 align1=4, align2=2, len1=32, len2=1:         5.53 ( 20.41%)	        6.95	
align1=2, align2=2, len1=32, len2=10:         5.51 ( 17.39%)	        6.67	
align1=2, align2=2, len1=32, len2=10:         5.51 ( 17.24%)	        6.66	
 align1=3, align2=6, len1=64, len2=1:         5.66 ( 23.41%)	        7.38	
 align1=6, align2=3, len1=64, len2=1:         5.78 ( 24.57%)	        7.66	
align1=3, align2=3, len1=64, len2=10:         5.68 ( 20.29%)	        7.12	
align1=3, align2=3, len1=64, len2=10:         5.66 ( 20.88%)	        7.15	
align1=4, align2=0, len1=128, len2=1:         6.97 ( 29.61%)	        9.90	
align1=0, align2=4, len1=128, len2=1:         6.97 ( 27.45%)	        9.61	
align1=4, align2=4, len1=128, len2=10:         6.98 ( 28.63%)	        9.78	
align1=4, align2=4, len1=128, len2=10:         6.97 ( 26.62%)	        9.50	
align1=5, align2=2, len1=256, len2=1:         8.18 ( 43.05%)	       14.37	
align1=2, align2=5, len1=256, len2=1:         8.10 ( 44.39%)	       14.57	
align1=5, align2=5, len1=256, len2=10:         8.14 ( 31.25%)	       11.85	
align1=5, align2=5, len1=256, len2=10:         8.18 ( 31.12%)	       11.87	
align1=6, align2=4, len1=512, len2=1:         9.87 ( 46.66%)	       18.50	
align1=4, align2=6, len1=512, len2=1:         9.85 ( 47.04%)	       18.59	
align1=6, align2=6, len1=512, len2=10:         9.89 ( 45.08%)	       18.01	
align1=6, align2=6, len1=512, len2=10:         9.88 ( 46.26%)	       18.38	
align1=7, align2=6, len1=1024, len2=1:        13.33 ( 24.67%)	       17.70	
align1=6, align2=7, len1=1024, len2=1:        13.97 ( 61.26%)	       36.05	
align1=7, align2=7, len1=1024, len2=10:        14.02 ( 61.05%)	       35.99	
align1=7, align2=7, len1=1024, len2=10:        13.96 ( 60.91%)	       35.70	
align1=1, align2=0, len1=32, len2=31:         5.57 ( 19.42%)	        6.92	
align1=0, align2=0, len1=32, len2=31:         5.57 ( 19.73%)	        6.94	
align1=0, align2=0, len1=32, len2=31:         5.57 ( 19.54%)	        6.92	
align1=0, align2=0, len1=32, len2=31:         5.57 ( 19.57%)	        6.93	
align1=1, align2=0, len1=64, len2=31:         5.72 ( 23.20%)	        7.45	
align1=0, align2=0, len1=64, len2=31:         5.73 ( 23.50%)	        7.49	
align1=0, align2=0, len1=64, len2=31:         5.72 ( 23.81%)	        7.51	
align1=0, align2=0, len1=64, len2=31:         5.71 ( 23.65%)	        7.48	
align1=1, align2=0, len1=96, len2=31:         5.88 ( 27.16%)	        8.08	
align1=0, align2=0, len1=96, len2=31:         5.90 ( 30.87%)	        8.53	
align1=0, align2=0, len1=96, len2=31:         5.90 ( 29.63%)	        8.38	
align1=0, align2=0, len1=96, len2=31:         5.92 ( 32.72%)	        8.80	
align1=1, align2=0, len1=128, len2=31:         6.85 ( 28.01%)	        9.52	
align1=0, align2=0, len1=128, len2=31:         6.83 ( 28.85%)	        9.60	
align1=0, align2=0, len1=128, len2=31:         6.85 ( 28.69%)	        9.60	
align1=0, align2=0, len1=128, len2=31:         6.85 ( 28.70%)	        9.61	
align1=1, align2=0, len1=160, len2=31:         7.16 ( 42.05%)	       12.35	
align1=0, align2=0, len1=160, len2=31:         7.14 ( 44.06%)	       12.76	
align1=0, align2=0, len1=160, len2=31:         7.13 ( 44.17%)	       12.76	
align1=0, align2=0, len1=160, len2=31:         7.11 ( 44.36%)	       12.78	
align1=1, align2=0, len1=192, len2=31:         7.39 ( 44.87%)	       13.41	
align1=0, align2=0, len1=192, len2=31:         7.39 ( 46.09%)	       13.71	
align1=0, align2=0, len1=192, len2=31:         7.39 ( 46.05%)	       13.70	
align1=0, align2=0, len1=192, len2=31:         7.39 ( 46.13%)	       13.72	
align1=1, align2=0, len1=224, len2=31:         7.55 ( 47.08%)	       14.27	
align1=0, align2=0, len1=224, len2=31:         7.57 ( 48.12%)	       14.59	
align1=0, align2=0, len1=224, len2=31:         7.54 ( 48.27%)	       14.58	
align1=0, align2=0, len1=224, len2=31:         7.55 ( 48.19%)	       14.57	
align1=1, align2=0, len1=256, len2=31:         7.84 ( 48.13%)	       15.12	
align1=0, align2=0, len1=256, len2=31:         7.80 ( 49.69%)	       15.51	
align1=0, align2=0, len1=256, len2=31:         7.83 ( 49.52%)	       15.50	
align1=0, align2=0, len1=256, len2=31:         7.80 ( 49.67%)	       15.50	
align1=1, align2=0, len1=320, len2=31:         8.34 ( 50.23%)	       16.76	
align1=0, align2=0, len1=320, len2=31:         8.34 ( 50.67%)	       16.92	
align1=0, align2=0, len1=320, len2=31:         8.32 ( 51.12%)	       17.01	
align1=0, align2=0, len1=320, len2=31:         8.34 ( 50.99%)	       17.03	
align1=1, align2=0, len1=384, len2=31:         8.85 ( 51.87%)	       18.38	
align1=0, align2=0, len1=384, len2=31:         8.85 ( 52.39%)	       18.59	
align1=0, align2=0, len1=384, len2=31:         8.85 ( 52.48%)	       18.62	
align1=0, align2=0, len1=384, len2=31:         8.87 ( 52.28%)	       18.58	
align1=1, align2=0, len1=448, len2=31:         9.29 ( 53.08%)	       19.80	
align1=0, align2=0, len1=448, len2=31:         9.29 ( 53.86%)	       20.13	
align1=0, align2=0, len1=448, len2=31:         9.28 ( 53.78%)	       20.07	
align1=0, align2=0, len1=448, len2=31:         9.32 ( 53.67%)	       20.13	
align1=1, align2=0, len1=512, len2=31:         9.90 ( 53.88%)	       21.46	
align1=0, align2=0, len1=512, len2=31:         9.87 ( 54.16%)	       21.54	
align1=0, align2=0, len1=512, len2=31:         9.87 ( 54.44%)	       21.67	
align1=0, align2=0, len1=512, len2=31:         9.86 ( 54.22%)	       21.53	
align1=1, align2=0, len1=640, len2=31:        10.82 ( 55.02%)	       24.06	
align1=0, align2=0, len1=640, len2=31:        10.82 ( 55.14%)	       24.11	
align1=0, align2=0, len1=640, len2=31:        10.93 ( 54.51%)	       24.02	
align1=0, align2=0, len1=640, len2=31:        10.89 ( 54.67%)	       24.02	
align1=1, align2=0, len1=768, len2=31:        11.83 ( 55.94%)	       26.84	
align1=0, align2=0, len1=768, len2=31:        11.79 ( 56.24%)	       26.95	
align1=0, align2=0, len1=768, len2=31:        11.79 ( 56.24%)	       26.95	
align1=0, align2=0, len1=768, len2=31:        11.79 ( 56.28%)	       26.97	
align1=1, align2=0, len1=896, len2=31:        12.82 ( 62.09%)	       33.81	
align1=0, align2=0, len1=896, len2=31:        12.86 ( 62.45%)	       34.26	
align1=0, align2=0, len1=896, len2=31:        12.91 ( 62.61%)	       34.52	
align1=0, align2=0, len1=896, len2=31:        12.82 ( 62.86%)	       34.51	
align1=1, align2=0, len1=1024, len2=31:        13.94 ( 62.05%)	       36.74	
align1=0, align2=0, len1=1024, len2=31:        13.89 ( 62.74%)	       37.27	
align1=0, align2=0, len1=1024, len2=31:        14.03 ( 62.20%)	       37.11	
align1=0, align2=0, len1=1024, len2=31:        14.01 ( 62.44%)	       37.29	
align1=1, align2=0, len1=1280, len2=31:        16.10 ( 58.21%)	       38.53	
align1=0, align2=0, len1=1280, len2=31:        16.02 ( 59.38%)	       39.44	
align1=0, align2=0, len1=1280, len2=31:        16.04 ( 59.50%)	       39.60	
align1=0, align2=0, len1=1280, len2=31:        16.10 ( 59.26%)	       39.53	
align1=1, align2=0, len1=1536, len2=31:        18.21 ( 58.54%)	       43.92	
align1=0, align2=0, len1=1536, len2=31:        18.32 ( 59.22%)	       44.92	
align1=0, align2=0, len1=1536, len2=31:        18.19 ( 59.62%)	       45.05	
align1=0, align2=0, len1=1536, len2=31:        18.27 ( 59.39%)	       44.99	
align1=1, align2=0, len1=1792, len2=31:        20.27 ( 59.19%)	       49.66	
align1=0, align2=0, len1=1792, len2=31:        20.17 ( 59.91%)	       50.30	
align1=0, align2=0, len1=1792, len2=31:        20.18 ( 59.68%)	       50.05	
align1=0, align2=0, len1=1792, len2=31:        20.27 ( 59.53%)	       50.08
Peter Bergner Nov. 8, 2024, 7:12 p.m. UTC | #2
On 11/8/24 8:01 AM, Mahesh Bodapati wrote:
> With the new optimized strcpy and strlen implementation, this patch adds an
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
>    IFUNC_IMPL (i, name, strcat,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1
> +			      && hwcap & PPC_FEATURE_HAS_VSX,
> +			      __strcat_power10)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strcat,
>  			      hwcap2 & PPC_FEATURE2_ARCH_2_07
>  			      && hwcap & PPC_FEATURE_HAS_VSX,

Why the limitation here and in the other files of this being guarded
by #ifdef __LITTLE_ENDIAN__ ?  The strcat-power8.c optimization is not
guarded by that, so why the Power10 version?  If it can be used on
big-endian too, then please remove the #ifdef tests and enable it
everywhere.


> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1

I believe the strcpy in the line above should be strcat like the entry
just below it?






> --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
> @@ -25,14 +25,23 @@
>  extern __typeof (strcat) __strcat_ppc attribute_hidden;
>  extern __typeof (strcat) __strcat_power7 attribute_hidden;
>  extern __typeof (strcat) __strcat_power8 attribute_hidden;
> +#ifdef __LITTLE_ENDIAN__
> +extern __typeof (strcat) __strcat_power10 attribute_hidden;
> +#endif
>  # undef strcat
>  
> +

Stray whitespace.


Peter
Mahesh Bodapati Nov. 11, 2024, 5:23 a.m. UTC | #3
On 09/11/24 12:42 am, Peter Bergner wrote:
> On 11/8/24 8:01 AM, Mahesh Bodapati wrote:
>> With the new optimized strcpy and strlen implementation, this patch adds an
>> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>> @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>   
>>     /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
>>     IFUNC_IMPL (i, name, strcat,
>> +#ifdef __LITTLE_ENDIAN__
>> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1
>> +			      && hwcap & PPC_FEATURE_HAS_VSX,
>> +			      __strcat_power10)
>> +#endif
>>   	      IFUNC_IMPL_ADD (array, i, strcat,
>>   			      hwcap2 & PPC_FEATURE2_ARCH_2_07
>>   			      && hwcap & PPC_FEATURE_HAS_VSX,
> Why the limitation here and in the other files of this being guarded
> by #ifdef __LITTLE_ENDIAN__ ?  The strcat-power8.c optimization is not
> guarded by that, so why the Power10 version?  If it can be used on
> big-endian too, then please remove the #ifdef tests and enable it
> everywhere.

strcat uses the strlen and strcpy library functions, and these functions are
optimized for little endian, so it is guarded with
#ifdef __LITTLE_ENDIAN__, whereas the power8 versions of strlen and strcpy
are optimized for both little endian and big endian.

>
>> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1
> I believe the strcpy in the line above should be strcat like the entry
> just below it?

Yes, you are right. I will update.

>
>
>
>
>
>
>> --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
>> @@ -25,14 +25,23 @@
>>   extern __typeof (strcat) __strcat_ppc attribute_hidden;
>>   extern __typeof (strcat) __strcat_power7 attribute_hidden;
>>   extern __typeof (strcat) __strcat_power8 attribute_hidden;
>> +#ifdef __LITTLE_ENDIAN__
>> +extern __typeof (strcat) __strcat_power10 attribute_hidden;
>> +#endif
>>   # undef strcat
>>   
>> +
> Stray whitespace.
I will update.
>
> Peter
>
>
Peter Bergner Nov. 11, 2024, 4:27 p.m. UTC | #4
On 11/10/24 11:23 PM, MAHESH BODAPATI wrote:
> On 09/11/24 12:42 am, Peter Bergner wrote:
>> Why the limitation here and in the other files of this being guarded
>> by #ifdef __LITTLE_ENDIAN__ ?  The strcat-power8.c optimization is not
>> guarded by that, so why the Power10 version?  If it can be used on
>> big-endian too, then please remove the #ifdef tests and enable it
>> everywhere.
> 
> strcat uses strlen and strcpy library functions and these functions are
> optimized for little endian so guarded with #ifdef __LITTLE_ENDIAN__ where
> as power8 versions of strlen and strcpy are optimized for both little
> endian and big endian.

Ah, right you are.  It might be nice to get those to be endian agnostic,
but that is for another day and another patch.

Peter
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index b847c19049..dc7c5b14ee 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -34,8 +34,9 @@  ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
 		   memmove-power10 memset-power10 rawmemchr-power9 \
 		   rawmemchr-power10 strcmp-power9 strcmp-power10 \
-		   strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \
-		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
+		   strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \
+		   stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
+		   strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2bb47d3527..ab9e7c6142 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -406,6 +406,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1
+			      && hwcap & PPC_FEATURE_HAS_VSX,
+			      __strcat_power10)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strcat,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 			      && hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c
new file mode 100644
index 0000000000..8d653ab500
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c
@@ -0,0 +1,33 @@ 
+/* Copyright (C) 2015-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __LITTLE_ENDIAN__
+#include <string.h>
+
+#define STRCAT __strcat_power10
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power9;
+extern typeof (strlen) __strlen_power10;
+
+#define strcpy __strcpy_power9
+#define strlen __strlen_power10
+
+#include <string/strcat.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index 27e636e0ff..3493716c3c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -25,14 +25,23 @@ 
 extern __typeof (strcat) __strcat_ppc attribute_hidden;
 extern __typeof (strcat) __strcat_power7 attribute_hidden;
 extern __typeof (strcat) __strcat_power8 attribute_hidden;
+#ifdef __LITTLE_ENDIAN__
+extern __typeof (strcat) __strcat_power10 attribute_hidden;
+#endif
 # undef strcat
 
+
 libc_ifunc_redirected (__redirect_strcat, strcat,
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-			&& hwcap & PPC_FEATURE_HAS_VSX)
-		       ? __strcat_power8
-		       : (hwcap & PPC_FEATURE_ARCH_2_06
-			  && hwcap & PPC_FEATURE_HAS_VSX)
-			 ? __strcat_power7
-			 : __strcat_ppc);
+#ifdef __LITTLE_ENDIAN__
+			(hwcap2 & PPC_FEATURE2_ARCH_3_1
+			 && hwcap & PPC_FEATURE_HAS_VSX)
+			? __strcat_power10 :
+#endif
+			  (hwcap2 & PPC_FEATURE2_ARCH_2_07
+			   && hwcap & PPC_FEATURE_HAS_VSX)
+		           ? __strcat_power8
+		           : (hwcap & PPC_FEATURE_ARCH_2_06
+			      && hwcap & PPC_FEATURE_HAS_VSX)
+			     ? __strcat_power7
+			     : __strcat_ppc);
 #endif