Message ID | 20241108140138.3880456-1-bmahi496@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | powerpc64le: Optimized strcat for POWER10 | expand |
Attached the strcat benchmark results file. Thanks, Mahesh B On 08/11/24 7:31 pm, Mahesh Bodapati wrote: > With the new optimized strcpy and strlen implementation, this patch adds an > optimized strcat which uses it along with default implementation at strings. > --- > sysdeps/powerpc/powerpc64/multiarch/Makefile | 5 +-- > .../powerpc64/multiarch/ifunc-impl-list.c | 5 +++ > .../powerpc64/multiarch/strcat-power10.c | 33 +++++++++++++++++++ > sysdeps/powerpc/powerpc64/multiarch/strcat.c | 23 +++++++++---- > 4 files changed, 57 insertions(+), 9 deletions(-) > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index b847c19049..dc7c5b14ee 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -34,8 +34,9 @@ ifneq (,$(filter %le,$(config-machine))) > sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ > memmove-power10 memset-power10 rawmemchr-power9 \ > rawmemchr-power10 strcmp-power9 strcmp-power10 \ > - strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \ > - strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 > + strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \ > + stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ > + strlen-power10 > endif > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index 2bb47d3527..ab9e7c6142 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. 
*/ > IFUNC_IMPL (i, name, strcat, > +#ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 > + && hwcap & PPC_FEATURE_HAS_VSX, > + __strcat_power10) > +#endif > IFUNC_IMPL_ADD (array, i, strcat, > hwcap2 & PPC_FEATURE2_ARCH_2_07 > && hwcap & PPC_FEATURE_HAS_VSX, > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c > new file mode 100644 > index 0000000000..8d653ab500 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c > @@ -0,0 +1,33 @@ > +/* Copyright (C) 2015-2024 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/ >. 
*/ > + > +#ifdef __LITTLE_ENDIAN__ > +#include <string.h> > + > +#define STRCAT __strcat_power10 > + > +#undef libc_hidden_def > +#define libc_hidden_def(name) > + > +extern typeof (strcpy) __strcpy_power9; > +extern typeof (strlen) __strlen_power10; > + > +#define strcpy __strcpy_power9 > +#define strlen __strlen_power10 > + > +#include <string/strcat.c> > +#endif > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c > index 27e636e0ff..3493716c3c 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c > @@ -25,14 +25,23 @@ > extern __typeof (strcat) __strcat_ppc attribute_hidden; > extern __typeof (strcat) __strcat_power7 attribute_hidden; > extern __typeof (strcat) __strcat_power8 attribute_hidden; > +#ifdef __LITTLE_ENDIAN__ > +extern __typeof (strcat) __strcat_power10 attribute_hidden; > +#endif > # undef strcat > > + > libc_ifunc_redirected (__redirect_strcat, strcat, > - (hwcap2 & PPC_FEATURE2_ARCH_2_07 > - && hwcap & PPC_FEATURE_HAS_VSX) > - ? __strcat_power8 > - : (hwcap & PPC_FEATURE_ARCH_2_06 > - && hwcap & PPC_FEATURE_HAS_VSX) > - ? __strcat_power7 > - : __strcat_ppc); > +#ifdef __LITTLE_ENDIAN__ > + (hwcap2 & PPC_FEATURE2_ARCH_3_1 > + && hwcap & PPC_FEATURE_HAS_VSX) > + ? __strcat_power10 : > +#endif > + (hwcap2 & PPC_FEATURE2_ARCH_2_07 > + && hwcap & PPC_FEATURE_HAS_VSX) > + ? __strcat_power8 > + : (hwcap & PPC_FEATURE_ARCH_2_06 > + && hwcap & PPC_FEATURE_HAS_VSX) > + ? 
__strcat_power7 > + : __strcat_ppc); > #endif Function: strcat Variant: __strcat_power10 __strcat_power8 ======================================================================================================================== align1=0, align2=0, len1=0, len2=0: 4.89 ( 18.42%) 5.99 align1=0, align2=0, len1=0, len2=0: 4.90 ( 18.20%) 5.99 align1=0, align2=0, len1=0, len2=0: 4.90 ( 18.26%) 6.00 align1=0, align2=0, len1=0, len2=0: 4.90 ( 18.15%) 5.99 align1=0, align2=0, len1=1, len2=1: 4.90 ( 18.27%) 6.00 align1=0, align2=0, len1=1, len2=1: 4.91 ( 18.02%) 5.99 align1=0, align2=1, len1=1, len2=1: 4.90 ( 18.21%) 5.99 align1=1, align2=0, len1=1, len2=1: 4.90 ( 18.19%) 5.99 align1=0, align2=0, len1=2, len2=2: 4.91 ( 18.76%) 6.05 align1=0, align2=0, len1=2, len2=2: 4.90 ( 18.90%) 6.04 align1=0, align2=2, len1=2, len2=2: 4.90 ( 18.92%) 6.05 align1=2, align2=0, len1=2, len2=2: 4.90 ( 18.71%) 6.03 align1=0, align2=0, len1=3, len2=3: 4.90 ( 18.21%) 5.99 align1=0, align2=0, len1=3, len2=3: 4.90 ( 18.16%) 5.99 align1=0, align2=3, len1=3, len2=3: 4.90 ( 18.16%) 5.99 align1=3, align2=0, len1=3, len2=3: 4.90 ( 18.28%) 6.00 align1=0, align2=0, len1=4, len2=4: 4.90 ( 19.29%) 6.07 align1=0, align2=0, len1=4, len2=4: 4.90 ( 19.30%) 6.07 align1=0, align2=4, len1=4, len2=4: 4.91 ( 15.73%) 5.83 align1=4, align2=0, len1=4, len2=4: 4.90 ( 19.30%) 6.07 align1=0, align2=0, len1=5, len2=5: 4.90 ( 19.11%) 6.06 align1=0, align2=0, len1=5, len2=5: 4.90 ( 19.12%) 6.06 align1=0, align2=5, len1=5, len2=5: 4.90 ( 15.62%) 5.81 align1=5, align2=0, len1=5, len2=5: 4.90 ( 19.10%) 6.06 align1=0, align2=0, len1=6, len2=6: 4.90 ( 20.41%) 6.16 align1=0, align2=0, len1=6, len2=6: 4.90 ( 20.52%) 6.17 align1=0, align2=6, len1=6, len2=6: 4.90 ( 17.05%) 5.91 align1=6, align2=0, len1=6, len2=6: 4.90 ( 20.43%) 6.16 align1=0, align2=0, len1=7, len2=7: 5.21 ( 36.94%) 8.26 align1=0, align2=0, len1=7, len2=7: 5.46 ( 21.56%) 6.96 align1=0, align2=7, len1=7, len2=7: 5.40 ( 26.55%) 7.35 align1=7, align2=0, len1=7, len2=7: 
5.40 ( 24.96%) 7.20 align1=0, align2=0, len1=8, len2=8: 5.40 ( 36.89%) 8.56 align1=0, align2=0, len1=8, len2=8: 5.41 ( 37.09%) 8.59 align1=0, align2=0, len1=8, len2=8: 5.41 ( 36.93%) 8.57 align1=0, align2=0, len1=8, len2=8: 4.90 ( 18.03%) 5.98 align1=0, align2=0, len1=9, len2=9: 4.90 ( 19.42%) 6.08 align1=0, align2=0, len1=9, len2=9: 4.90 ( 18.37%) 6.00 align1=0, align2=1, len1=9, len2=9: 4.90 ( 18.36%) 6.00 align1=1, align2=0, len1=9, len2=9: 4.95 ( 17.52%) 6.01 align1=0, align2=0, len1=10, len2=10: 4.90 ( 19.16%) 6.06 align1=0, align2=0, len1=10, len2=10: 4.90 ( 19.15%) 6.06 align1=0, align2=2, len1=10, len2=10: 5.05 ( 16.63%) 6.06 align1=2, align2=0, len1=10, len2=10: 4.90 ( 19.18%) 6.06 align1=0, align2=0, len1=11, len2=11: 4.90 ( 18.04%) 5.98 align1=0, align2=0, len1=11, len2=11: 4.90 ( 18.06%) 5.98 align1=0, align2=3, len1=11, len2=11: 4.90 ( 18.26%) 6.00 align1=3, align2=0, len1=11, len2=11: 4.90 ( 18.00%) 5.98 align1=0, align2=0, len1=12, len2=12: 4.90 ( 19.36%) 6.08 align1=0, align2=0, len1=12, len2=12: 4.90 ( 19.62%) 6.10 align1=0, align2=4, len1=12, len2=12: 4.96 ( 20.04%) 6.20 align1=4, align2=0, len1=12, len2=12: 5.21 ( 14.32%) 6.08 align1=0, align2=0, len1=13, len2=13: 4.92 ( 19.06%) 6.08 align1=0, align2=0, len1=13, len2=13: 4.90 ( 19.43%) 6.08 align1=0, align2=5, len1=13, len2=13: 4.96 ( 33.66%) 7.47 align1=5, align2=0, len1=13, len2=13: 5.21 ( 14.39%) 6.08 align1=0, align2=0, len1=14, len2=14: 4.90 ( 20.95%) 6.20 align1=0, align2=0, len1=14, len2=14: 4.90 ( 19.97%) 6.12 align1=0, align2=6, len1=14, len2=14: 4.96 ( 34.51%) 7.57 align1=6, align2=0, len1=14, len2=14: 5.23 ( 15.41%) 6.19 align1=0, align2=0, len1=15, len2=15: 4.90 ( 14.39%) 5.73 align1=0, align2=0, len1=15, len2=15: 4.90 ( 14.32%) 5.72 align1=0, align2=7, len1=15, len2=15: 5.00 ( 34.00%) 7.57 align1=7, align2=0, len1=15, len2=15: 5.21 ( 9.29%) 5.74 align1=0, align2=0, len1=16, len2=16: 5.21 ( 37.77%) 8.37 align1=7, align2=2, len1=16, len2=16: 5.24 ( 32.38%) 7.75 align1=0, align2=0, 
len1=16, len2=4: 5.21 ( 20.43%) 6.55 align1=7, align2=2, len1=16, len2=4: 5.21 ( 20.58%) 6.56 align1=0, align2=0, len1=32, len2=32: 5.59 ( 34.38%) 8.52 align1=6, align2=4, len1=32, len2=32: 5.72 ( 30.00%) 8.17 align1=0, align2=0, len1=32, len2=8: 5.51 ( 16.13%) 6.57 align1=6, align2=4, len1=32, len2=8: 5.51 ( 18.61%) 6.77 align1=0, align2=0, len1=64, len2=64: 5.67 ( 36.02%) 8.86 align1=5, align2=6, len1=64, len2=64: 6.75 ( 28.17%) 9.40 align1=0, align2=0, len1=64, len2=16: 5.69 ( 33.65%) 8.57 align1=5, align2=6, len1=64, len2=16: 5.71 ( 28.33%) 7.97 align1=0, align2=0, len1=128, len2=128: 6.16 ( 37.03%) 9.78 align1=4, align2=0, len1=128, len2=128: 7.05 ( 31.25%) 10.25 align1=0, align2=0, len1=128, len2=32: 6.05 ( 23.56%) 7.91 align1=4, align2=0, len1=128, len2=32: 6.86 ( 27.46%) 9.45 align1=0, align2=0, len1=256, len2=256: 8.22 ( 29.53%) 11.67 align1=3, align2=2, len1=256, len2=256: 9.54 ( 44.34%) 17.14 align1=0, align2=0, len1=256, len2=64: 6.99 ( 35.56%) 10.85 align1=3, align2=2, len1=256, len2=64: 8.01 ( 52.61%) 16.90 align1=0, align2=0, len1=512, len2=512: 11.00 ( 30.02%) 15.72 align1=2, align2=4, len1=512, len2=512: 11.37 ( 54.42%) 24.94 align1=0, align2=0, len1=512, len2=128: 9.16 ( 35.01%) 14.09 align1=2, align2=4, len1=512, len2=128: 10.08 ( 56.41%) 23.12 align1=0, align2=0, len1=1024, len2=1024: 17.78 ( 25.21%) 23.78 align1=1, align2=6, len1=1024, len2=1024: 17.95 ( 58.32%) 43.07 align1=0, align2=0, len1=1024, len2=256: 15.03 ( 24.49%) 19.90 align1=1, align2=6, len1=1024, len2=256: 15.45 ( 60.52%) 39.14 align1=1, align2=2, len1=16, len2=1: 5.21 ( 20.46%) 6.55 align1=2, align2=1, len1=16, len2=1: 5.21 ( 21.64%) 6.65 align1=1, align2=1, len1=16, len2=10: 5.21 ( 17.44%) 6.31 align1=1, align2=1, len1=16, len2=10: 5.21 ( 17.16%) 6.29 align1=2, align2=4, len1=32, len2=1: 5.51 ( 20.52%) 6.93 align1=4, align2=2, len1=32, len2=1: 5.53 ( 20.41%) 6.95 align1=2, align2=2, len1=32, len2=10: 5.51 ( 17.39%) 6.67 align1=2, align2=2, len1=32, len2=10: 5.51 ( 17.24%) 6.66 
align1=3, align2=6, len1=64, len2=1: 5.66 ( 23.41%) 7.38 align1=6, align2=3, len1=64, len2=1: 5.78 ( 24.57%) 7.66 align1=3, align2=3, len1=64, len2=10: 5.68 ( 20.29%) 7.12 align1=3, align2=3, len1=64, len2=10: 5.66 ( 20.88%) 7.15 align1=4, align2=0, len1=128, len2=1: 6.97 ( 29.61%) 9.90 align1=0, align2=4, len1=128, len2=1: 6.97 ( 27.45%) 9.61 align1=4, align2=4, len1=128, len2=10: 6.98 ( 28.63%) 9.78 align1=4, align2=4, len1=128, len2=10: 6.97 ( 26.62%) 9.50 align1=5, align2=2, len1=256, len2=1: 8.18 ( 43.05%) 14.37 align1=2, align2=5, len1=256, len2=1: 8.10 ( 44.39%) 14.57 align1=5, align2=5, len1=256, len2=10: 8.14 ( 31.25%) 11.85 align1=5, align2=5, len1=256, len2=10: 8.18 ( 31.12%) 11.87 align1=6, align2=4, len1=512, len2=1: 9.87 ( 46.66%) 18.50 align1=4, align2=6, len1=512, len2=1: 9.85 ( 47.04%) 18.59 align1=6, align2=6, len1=512, len2=10: 9.89 ( 45.08%) 18.01 align1=6, align2=6, len1=512, len2=10: 9.88 ( 46.26%) 18.38 align1=7, align2=6, len1=1024, len2=1: 13.33 ( 24.67%) 17.70 align1=6, align2=7, len1=1024, len2=1: 13.97 ( 61.26%) 36.05 align1=7, align2=7, len1=1024, len2=10: 14.02 ( 61.05%) 35.99 align1=7, align2=7, len1=1024, len2=10: 13.96 ( 60.91%) 35.70 align1=1, align2=0, len1=32, len2=31: 5.57 ( 19.42%) 6.92 align1=0, align2=0, len1=32, len2=31: 5.57 ( 19.73%) 6.94 align1=0, align2=0, len1=32, len2=31: 5.57 ( 19.54%) 6.92 align1=0, align2=0, len1=32, len2=31: 5.57 ( 19.57%) 6.93 align1=1, align2=0, len1=64, len2=31: 5.72 ( 23.20%) 7.45 align1=0, align2=0, len1=64, len2=31: 5.73 ( 23.50%) 7.49 align1=0, align2=0, len1=64, len2=31: 5.72 ( 23.81%) 7.51 align1=0, align2=0, len1=64, len2=31: 5.71 ( 23.65%) 7.48 align1=1, align2=0, len1=96, len2=31: 5.88 ( 27.16%) 8.08 align1=0, align2=0, len1=96, len2=31: 5.90 ( 30.87%) 8.53 align1=0, align2=0, len1=96, len2=31: 5.90 ( 29.63%) 8.38 align1=0, align2=0, len1=96, len2=31: 5.92 ( 32.72%) 8.80 align1=1, align2=0, len1=128, len2=31: 6.85 ( 28.01%) 9.52 align1=0, align2=0, len1=128, len2=31: 6.83 ( 28.85%) 9.60 
align1=0, align2=0, len1=128, len2=31: 6.85 ( 28.69%) 9.60 align1=0, align2=0, len1=128, len2=31: 6.85 ( 28.70%) 9.61 align1=1, align2=0, len1=160, len2=31: 7.16 ( 42.05%) 12.35 align1=0, align2=0, len1=160, len2=31: 7.14 ( 44.06%) 12.76 align1=0, align2=0, len1=160, len2=31: 7.13 ( 44.17%) 12.76 align1=0, align2=0, len1=160, len2=31: 7.11 ( 44.36%) 12.78 align1=1, align2=0, len1=192, len2=31: 7.39 ( 44.87%) 13.41 align1=0, align2=0, len1=192, len2=31: 7.39 ( 46.09%) 13.71 align1=0, align2=0, len1=192, len2=31: 7.39 ( 46.05%) 13.70 align1=0, align2=0, len1=192, len2=31: 7.39 ( 46.13%) 13.72 align1=1, align2=0, len1=224, len2=31: 7.55 ( 47.08%) 14.27 align1=0, align2=0, len1=224, len2=31: 7.57 ( 48.12%) 14.59 align1=0, align2=0, len1=224, len2=31: 7.54 ( 48.27%) 14.58 align1=0, align2=0, len1=224, len2=31: 7.55 ( 48.19%) 14.57 align1=1, align2=0, len1=256, len2=31: 7.84 ( 48.13%) 15.12 align1=0, align2=0, len1=256, len2=31: 7.80 ( 49.69%) 15.51 align1=0, align2=0, len1=256, len2=31: 7.83 ( 49.52%) 15.50 align1=0, align2=0, len1=256, len2=31: 7.80 ( 49.67%) 15.50 align1=1, align2=0, len1=320, len2=31: 8.34 ( 50.23%) 16.76 align1=0, align2=0, len1=320, len2=31: 8.34 ( 50.67%) 16.92 align1=0, align2=0, len1=320, len2=31: 8.32 ( 51.12%) 17.01 align1=0, align2=0, len1=320, len2=31: 8.34 ( 50.99%) 17.03 align1=1, align2=0, len1=384, len2=31: 8.85 ( 51.87%) 18.38 align1=0, align2=0, len1=384, len2=31: 8.85 ( 52.39%) 18.59 align1=0, align2=0, len1=384, len2=31: 8.85 ( 52.48%) 18.62 align1=0, align2=0, len1=384, len2=31: 8.87 ( 52.28%) 18.58 align1=1, align2=0, len1=448, len2=31: 9.29 ( 53.08%) 19.80 align1=0, align2=0, len1=448, len2=31: 9.29 ( 53.86%) 20.13 align1=0, align2=0, len1=448, len2=31: 9.28 ( 53.78%) 20.07 align1=0, align2=0, len1=448, len2=31: 9.32 ( 53.67%) 20.13 align1=1, align2=0, len1=512, len2=31: 9.90 ( 53.88%) 21.46 align1=0, align2=0, len1=512, len2=31: 9.87 ( 54.16%) 21.54 align1=0, align2=0, len1=512, len2=31: 9.87 ( 54.44%) 21.67 align1=0, align2=0, 
len1=512, len2=31: 9.86 ( 54.22%) 21.53 align1=1, align2=0, len1=640, len2=31: 10.82 ( 55.02%) 24.06 align1=0, align2=0, len1=640, len2=31: 10.82 ( 55.14%) 24.11 align1=0, align2=0, len1=640, len2=31: 10.93 ( 54.51%) 24.02 align1=0, align2=0, len1=640, len2=31: 10.89 ( 54.67%) 24.02 align1=1, align2=0, len1=768, len2=31: 11.83 ( 55.94%) 26.84 align1=0, align2=0, len1=768, len2=31: 11.79 ( 56.24%) 26.95 align1=0, align2=0, len1=768, len2=31: 11.79 ( 56.24%) 26.95 align1=0, align2=0, len1=768, len2=31: 11.79 ( 56.28%) 26.97 align1=1, align2=0, len1=896, len2=31: 12.82 ( 62.09%) 33.81 align1=0, align2=0, len1=896, len2=31: 12.86 ( 62.45%) 34.26 align1=0, align2=0, len1=896, len2=31: 12.91 ( 62.61%) 34.52 align1=0, align2=0, len1=896, len2=31: 12.82 ( 62.86%) 34.51 align1=1, align2=0, len1=1024, len2=31: 13.94 ( 62.05%) 36.74 align1=0, align2=0, len1=1024, len2=31: 13.89 ( 62.74%) 37.27 align1=0, align2=0, len1=1024, len2=31: 14.03 ( 62.20%) 37.11 align1=0, align2=0, len1=1024, len2=31: 14.01 ( 62.44%) 37.29 align1=1, align2=0, len1=1280, len2=31: 16.10 ( 58.21%) 38.53 align1=0, align2=0, len1=1280, len2=31: 16.02 ( 59.38%) 39.44 align1=0, align2=0, len1=1280, len2=31: 16.04 ( 59.50%) 39.60 align1=0, align2=0, len1=1280, len2=31: 16.10 ( 59.26%) 39.53 align1=1, align2=0, len1=1536, len2=31: 18.21 ( 58.54%) 43.92 align1=0, align2=0, len1=1536, len2=31: 18.32 ( 59.22%) 44.92 align1=0, align2=0, len1=1536, len2=31: 18.19 ( 59.62%) 45.05 align1=0, align2=0, len1=1536, len2=31: 18.27 ( 59.39%) 44.99 align1=1, align2=0, len1=1792, len2=31: 20.27 ( 59.19%) 49.66 align1=0, align2=0, len1=1792, len2=31: 20.17 ( 59.91%) 50.30 align1=0, align2=0, len1=1792, len2=31: 20.18 ( 59.68%) 50.05 align1=0, align2=0, len1=1792, len2=31: 20.27 ( 59.53%) 50.08
On 11/8/24 8:01 AM, Mahesh Bodapati wrote: > With the new optimized strcpy and strlen implementation, this patch adds an > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */ > IFUNC_IMPL (i, name, strcat, > +#ifdef __LITTLE_ENDIAN__ > + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 > + && hwcap & PPC_FEATURE_HAS_VSX, > + __strcat_power10) > +#endif > IFUNC_IMPL_ADD (array, i, strcat, > hwcap2 & PPC_FEATURE2_ARCH_2_07 > && hwcap & PPC_FEATURE_HAS_VSX, Why the limitation here and in the other files of this being guarded by #ifdef __LITTLE_ENDIAN__ ? The strcat-power8.c optimization is not guarded by that, so why the Power10 version? If it can be used on big-endian too, then please remove the #ifdef tests and enable it everywhere. > + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 I believe the strcpy in the line above should be strcat like the entry just below it? > --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c > @@ -25,14 +25,23 @@ > extern __typeof (strcat) __strcat_ppc attribute_hidden; > extern __typeof (strcat) __strcat_power7 attribute_hidden; > extern __typeof (strcat) __strcat_power8 attribute_hidden; > +#ifdef __LITTLE_ENDIAN__ > +extern __typeof (strcat) __strcat_power10 attribute_hidden; > +#endif > # undef strcat > > + Stray whitespace. Peter
On 09/11/24 12:42 am, Peter Bergner wrote: > On 11/8/24 8:01 AM, Mahesh Bodapati wrote: >> With the new optimized strcpy and strlen implementation, this patch adds an >> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */ >> IFUNC_IMPL (i, name, strcat, >> +#ifdef __LITTLE_ENDIAN__ >> + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 >> + && hwcap & PPC_FEATURE_HAS_VSX, >> + __strcat_power10) >> +#endif >> IFUNC_IMPL_ADD (array, i, strcat, >> hwcap2 & PPC_FEATURE2_ARCH_2_07 >> && hwcap & PPC_FEATURE_HAS_VSX, > Why the limitation here and in the other files of this being guarded > by #ifdef __LITTLE_ENDIAN__ ? The strcat-power8.c optimization is not > guarded by that, so why the Power10 version? If it can be used on > big-endian too, then please remove the #ifdef tests and enable it > everywhere. The POWER10 strcat is built on the strlen and strcpy library functions (__strlen_power10 and __strcpy_power9), and these functions are optimized for little endian, so it is guarded with #ifdef __LITTLE_ENDIAN__, whereas the power8 versions of strlen and strcpy are optimized for both little endian and big endian. > >> + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 > I believe the strcpy in the line above should be strcat like the entry > just below it? Yes, you are right. I will update. > > > > > > >> --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c >> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c >> @@ -25,14 +25,23 @@ >> extern __typeof (strcat) __strcat_ppc attribute_hidden; >> extern __typeof (strcat) __strcat_power7 attribute_hidden; >> extern __typeof (strcat) __strcat_power8 attribute_hidden; >> +#ifdef __LITTLE_ENDIAN__ >> +extern __typeof (strcat) __strcat_power10 attribute_hidden; >> +#endif >> # undef strcat >> >> + > Stray whitespace. I will update. > > Peter > >
On 11/10/24 11:23 PM, MAHESH BODAPATI wrote: > On 09/11/24 12:42 am, Peter Bergner wrote: >> Why the limitation here and in the other files of this being guarded >> by #ifdef __LITTLE_ENDIAN__ ? The strcat-power8.c optimization is not >> guarded by that, so why the Power10 version? If it can be used on >> big-endian too, then please remove the #ifdef tests and enable it >> everywhere. > > strcat uses strlen and strcpy library functions and these functions are > optimized for little endian so guarded with #ifdef __LITTLE_ENDIAN__ > whereas power8 versions of strlen and strcpy are optimized for both little > endian and big endian. Ah, right you are. It might be nice to get those to be endian agnostic, but that is for another day and another patch. Peter
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index b847c19049..dc7c5b14ee 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -34,8 +34,9 @@ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ memmove-power10 memset-power10 rawmemchr-power9 \ rawmemchr-power10 strcmp-power9 strcmp-power10 \ - strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \ - strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \ + stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ + strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 2bb47d3527..ab9e7c6142 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -406,6 +406,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */ IFUNC_IMPL (i, name, strcat, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX, + __strcat_power10) +#endif IFUNC_IMPL_ADD (array, i, strcat, hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_VSX, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c new file mode 100644 index 0000000000..8d653ab500 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power10.c @@ -0,0 +1,33 @@ +/* Copyright (C) 2015-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/ >. */ + +#ifdef __LITTLE_ENDIAN__ +#include <string.h> + +#define STRCAT __strcat_power10 + +#undef libc_hidden_def +#define libc_hidden_def(name) + +extern typeof (strcpy) __strcpy_power9; +extern typeof (strlen) __strlen_power10; + +#define strcpy __strcpy_power9 +#define strlen __strlen_power10 + +#include <string/strcat.c> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c index 27e636e0ff..3493716c3c 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c @@ -25,14 +25,23 @@ extern __typeof (strcat) __strcat_ppc attribute_hidden; extern __typeof (strcat) __strcat_power7 attribute_hidden; extern __typeof (strcat) __strcat_power8 attribute_hidden; +#ifdef __LITTLE_ENDIAN__ +extern __typeof (strcat) __strcat_power10 attribute_hidden; +#endif # undef strcat + libc_ifunc_redirected (__redirect_strcat, strcat, - (hwcap2 & PPC_FEATURE2_ARCH_2_07 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strcat_power8 - : (hwcap & PPC_FEATURE_ARCH_2_06 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strcat_power7 - : __strcat_ppc); +#ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX) + ? 
__strcat_power10 : +#endif + (hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_VSX) + ? __strcat_power8 + : (hwcap & PPC_FEATURE_ARCH_2_06 + && hwcap & PPC_FEATURE_HAS_VSX) + ? __strcat_power7 + : __strcat_ppc); #endif