diff mbox series

[v1] x86: Remove generic strncat, strncpy, and stpncpy implementations

Message ID 20220706000641.3657347-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Remove generic strncat, strncpy, and stpncpy implementations | expand

Commit Message

Noah Goldstein July 6, 2022, 12:06 a.m. UTC
These functions all have optimized versions:
__strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
stpncpy_sse2_unaligned which are faster than their respective generic
implementations.  Since the sse2 versions can run on baseline x86_64,
we should use these as the baseline implementation and can remove the
generic implementations.

Geometric mean of N=20 runs of the entire benchmark suite on:
11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)

__strncat_sse2_unaligned / __strncat_generic: .944
__strncpy_sse2_unaligned / __strncpy_generic: .726
__stpncpy_sse2_unaligned / __stpncpy_generic: .650

Tested build with and without multiarch and full check with multiarch.
---
 sysdeps/x86_64/multiarch/Makefile          |  3 --
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
 sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
 sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
 sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
 sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
 sysdeps/x86_64/multiarch/strncat.c         |  3 +-
 sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
 sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
 10 files changed, 56 insertions(+), 92 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
 delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
 delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
 delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c

Comments

H.J. Lu July 6, 2022, 1:32 a.m. UTC | #1
On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> These functions all have optimized versions:
> __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> stpncpy_sse2_unaligned which are faster than their respective generic
> implementations.  Since the sse2 versions can run on baseline x86_64,

Is this true on all x86-64 processors?

> we should use these as the baseline implementation and can remove the
> generic implementations.
>
> Geometric mean of N=20 runs of the entire benchmark suite on:
> 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
>
> __strncat_sse2_unaligned / __strncat_generic: .944
> __strncpy_sse2_unaligned / __strncpy_generic: .726
> __stpncpy_sse2_unaligned / __stpncpy_generic: .650
>
> Tested build with and without multiarch and full check with multiarch.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |  3 --
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
>  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
>  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
>  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
>  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
>  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
>  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
>  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
>  10 files changed, 56 insertions(+), 92 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
>  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 18cea04423..04e238efa0 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -45,7 +45,6 @@ sysdep_routines += \
>    stpcpy-sse2-unaligned \
>    stpncpy-avx2 \
>    stpncpy-avx2-rtm \
> -  stpncpy-generic \
>    stpncpy-evex \
>    stpncpy-sse2-unaligned \
>    strcasecmp_l-avx2 \
> @@ -92,7 +91,6 @@ sysdep_routines += \
>    strncase_l-sse4_2 \
>    strncat-avx2 \
>    strncat-avx2-rtm \
> -  strncat-generic \
>    strncat-evex \
>    strncat-sse2-unaligned \
>    strncmp-avx2 \
> @@ -102,7 +100,6 @@ sysdep_routines += \
>    strncmp-sse4_2 \
>    strncpy-avx2 \
>    strncpy-avx2-rtm \
> -  strncpy-generic \
>    strncpy-evex \
>    strncpy-sse2-unaligned \
>    strnlen-avx2 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index adf7d4bafd..2c96cb62d2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __stpncpy_evex)
>               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> -                             __stpncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> +                             __stpncpy_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
>    IFUNC_IMPL (i, name, stpcpy,
> @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strncat_evex)
>               IFUNC_IMPL_ADD (array, i, strncat, 1,
> -                             __strncat_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> +                             __strncat_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
> @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strncpy_evex)
>               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> -                             __strncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> +                             __strncpy_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
>    IFUNC_IMPL (i, name, strpbrk,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> index 80529458d1..a15afa44e9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> @@ -20,11 +20,7 @@
>
>  #include <init-arch.h>
>
> -#ifndef GENERIC
> -# define GENERIC sse2
> -#endif
> -
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
>    attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
>      return OPTIMIZE (sse2_unaligned);
>
> -  return OPTIMIZE (GENERIC);
> +  return OPTIMIZE (sse2);
>  }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> new file mode 100644
> index 0000000000..323225af4d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> @@ -0,0 +1,48 @@
> +/* Common definition for ifunc st{r|p}n{cpy|cat}
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +       return OPTIMIZE (evex);
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       return OPTIMIZE (avx2_rtm);
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +       return OPTIMIZE (avx2);
> +    }
> +
> +  return OPTIMIZE (sse2_unaligned);
> +}
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> deleted file mode 100644
> index 87826845b0..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> +++ /dev/null
> @@ -1,26 +0,0 @@
> -/* stpncpy.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STPNCPY __stpncpy_generic
> -#undef weak_alias
> -#define weak_alias(ignored1, ignored2)
> -#undef libc_hidden_def
> -#define libc_hidden_def(stpncpy)
> -
> -#include <string/stpncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> index 879bc83f0b..a8d083ff0d 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy.c
> +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> @@ -25,9 +25,8 @@
>  # undef stpncpy
>  # undef __stpncpy
>
> -# define GENERIC generic
>  # define SYMBOL_NAME stpncpy
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
>
> diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> deleted file mode 100644
> index 0090669cd1..0000000000
> --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> +++ /dev/null
> @@ -1,21 +0,0 @@
> -/* strncat.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STRNCAT __strncat_generic
> -#include <string/strncat.c>
> diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> index 50fba8a41f..a590c25d51 100644
> --- a/sysdeps/x86_64/multiarch/strncat.c
> +++ b/sysdeps/x86_64/multiarch/strncat.c
> @@ -24,8 +24,7 @@
>  # undef strncat
>
>  # define SYMBOL_NAME strncat
> -# define GENERIC generic
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
>  strong_alias (strncat, __strncat);
> diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> deleted file mode 100644
> index 9916153dd5..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> +++ /dev/null
> @@ -1,24 +0,0 @@
> -/* strncpy.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STRNCPY __strncpy_generic
> -#undef libc_hidden_builtin_def
> -#define libc_hidden_builtin_def(strncpy)
> -
> -#include <string/strncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> index 7fc7d72ec5..c83440f0e3 100644
> --- a/sysdeps/x86_64/multiarch/strncpy.c
> +++ b/sysdeps/x86_64/multiarch/strncpy.c
> @@ -24,8 +24,7 @@
>  # undef strncpy
>
>  # define SYMBOL_NAME strncpy
> -# define GENERIC generic
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
>
> --
> 2.34.1
>


--
H.J.
Noah Goldstein July 6, 2022, 2:05 a.m. UTC | #2
On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > These functions all have optimized versions:
> > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > stpncpy_sse2_unaligned which are faster than their respective generic
> > implementations.  Since the sse2 versions can run on baseline x86_64,
>
> Is this true on all x86-64 processors?

That they are faster or that sse2 can run on baseline x86_64?


>
> > we should use these as the baseline implementation and can remove the
> > generic implementations.
> >
> > Geometric mean of N=20 runs of the entire benchmark suite on:
> > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> >
> > __strncat_sse2_unaligned / __strncat_generic: .944
> > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> >
> > Tested build with and without multiarch and full check with multiarch.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> >  10 files changed, 56 insertions(+), 92 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 18cea04423..04e238efa0 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -45,7 +45,6 @@ sysdep_routines += \
> >    stpcpy-sse2-unaligned \
> >    stpncpy-avx2 \
> >    stpncpy-avx2-rtm \
> > -  stpncpy-generic \
> >    stpncpy-evex \
> >    stpncpy-sse2-unaligned \
> >    strcasecmp_l-avx2 \
> > @@ -92,7 +91,6 @@ sysdep_routines += \
> >    strncase_l-sse4_2 \
> >    strncat-avx2 \
> >    strncat-avx2-rtm \
> > -  strncat-generic \
> >    strncat-evex \
> >    strncat-sse2-unaligned \
> >    strncmp-avx2 \
> > @@ -102,7 +100,6 @@ sysdep_routines += \
> >    strncmp-sse4_2 \
> >    strncpy-avx2 \
> >    strncpy-avx2-rtm \
> > -  strncpy-generic \
> >    strncpy-evex \
> >    strncpy-sse2-unaligned \
> >    strnlen-avx2 \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index adf7d4bafd..2c96cb62d2 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)),
> >                               __stpncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > -                             __stpncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > +                             __stpncpy_sse2_unaligned))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> >    IFUNC_IMPL (i, name, stpcpy,
> > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)),
> >                               __strncat_evex)
> >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > -                             __strncat_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > +                             __strncat_sse2_unaligned))
> >
> >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> >    IFUNC_IMPL (i, name, strncpy,
> > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)),
> >                               __strncpy_evex)
> >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > -                             __strncpy_sse2_unaligned)
> > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > +                             __strncpy_sse2_unaligned))
> >
> >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> >    IFUNC_IMPL (i, name, strpbrk,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > index 80529458d1..a15afa44e9 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > @@ -20,11 +20,7 @@
> >
> >  #include <init-arch.h>
> >
> > -#ifndef GENERIC
> > -# define GENERIC sse2
> > -#endif
> > -
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> >    attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> >      return OPTIMIZE (sse2_unaligned);
> >
> > -  return OPTIMIZE (GENERIC);
> > +  return OPTIMIZE (sse2);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > new file mode 100644
> > index 0000000000..323225af4d
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > @@ -0,0 +1,48 @@
> > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <init-arch.h>
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > +       return OPTIMIZE (evex);
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +       return OPTIMIZE (avx2_rtm);
> > +
> > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > +       return OPTIMIZE (avx2);
> > +    }
> > +
> > +  return OPTIMIZE (sse2_unaligned);
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > deleted file mode 100644
> > index 87826845b0..0000000000
> > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > +++ /dev/null
> > @@ -1,26 +0,0 @@
> > -/* stpncpy.
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -
> > -#define STPNCPY __stpncpy_generic
> > -#undef weak_alias
> > -#define weak_alias(ignored1, ignored2)
> > -#undef libc_hidden_def
> > -#define libc_hidden_def(stpncpy)
> > -
> > -#include <string/stpncpy.c>
> > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > index 879bc83f0b..a8d083ff0d 100644
> > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > @@ -25,9 +25,8 @@
> >  # undef stpncpy
> >  # undef __stpncpy
> >
> > -# define GENERIC generic
> >  # define SYMBOL_NAME stpncpy
> > -# include "ifunc-strcpy.h"
> > +# include "ifunc-strncpy.h"
> >
> >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> >
> > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > deleted file mode 100644
> > index 0090669cd1..0000000000
> > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > +++ /dev/null
> > @@ -1,21 +0,0 @@
> > -/* strncat.
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -
> > -#define STRNCAT __strncat_generic
> > -#include <string/strncat.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > index 50fba8a41f..a590c25d51 100644
> > --- a/sysdeps/x86_64/multiarch/strncat.c
> > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > @@ -24,8 +24,7 @@
> >  # undef strncat
> >
> >  # define SYMBOL_NAME strncat
> > -# define GENERIC generic
> > -# include "ifunc-strcpy.h"
> > +# include "ifunc-strncpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> >  strong_alias (strncat, __strncat);
> > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > deleted file mode 100644
> > index 9916153dd5..0000000000
> > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > +++ /dev/null
> > @@ -1,24 +0,0 @@
> > -/* strncpy.
> > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -
> > -#define STRNCPY __strncpy_generic
> > -#undef libc_hidden_builtin_def
> > -#define libc_hidden_builtin_def(strncpy)
> > -
> > -#include <string/strncpy.c>
> > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > index 7fc7d72ec5..c83440f0e3 100644
> > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > @@ -24,8 +24,7 @@
> >  # undef strncpy
> >
> >  # define SYMBOL_NAME strncpy
> > -# define GENERIC generic
> > -# include "ifunc-strcpy.h"
> > +# include "ifunc-strncpy.h"
> >
> >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> >
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
Noah Goldstein July 6, 2022, 2:14 a.m. UTC | #3
On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > These functions all have optimized versions:
> > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > implementations.  Since the sse2 versions can run on baseline x86_64,
> >
> > Is this true on all x86-64 processors?
>
> That they are faster or that sse2 can run on baseline x86_64?

Assuming you mean faster:

maybe not for large strings on a processor that heavily prefers aligned
access.

The generics are implemented as:

strncat = strnlen + strlen + memcpy
strncpy = strnlen + memset + memcpy
stpncpy = strnlen + memset + memcpy

For very large strings on processors with SSSE3
all of those functions can be implemented with only aligned
access in the loops.

Not sure if it would work in favor of the generic implementations though as
they are essentially doing 2x + the number of memory accesses.

mayshao do you have benchmark numbers on these functions?

>
>
> >
> > > we should use these as the baseline implementation and can remove the
> > > generic implementations.
> > >
> > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > >
> > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > >
> > > Tested build with and without multiarch and full check with multiarch.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index 18cea04423..04e238efa0 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > >    stpcpy-sse2-unaligned \
> > >    stpncpy-avx2 \
> > >    stpncpy-avx2-rtm \
> > > -  stpncpy-generic \
> > >    stpncpy-evex \
> > >    stpncpy-sse2-unaligned \
> > >    strcasecmp_l-avx2 \
> > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > >    strncase_l-sse4_2 \
> > >    strncat-avx2 \
> > >    strncat-avx2-rtm \
> > > -  strncat-generic \
> > >    strncat-evex \
> > >    strncat-sse2-unaligned \
> > >    strncmp-avx2 \
> > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > >    strncmp-sse4_2 \
> > >    strncpy-avx2 \
> > >    strncpy-avx2-rtm \
> > > -  strncpy-generic \
> > >    strncpy-evex \
> > >    strncpy-sse2-unaligned \
> > >    strnlen-avx2 \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index adf7d4bafd..2c96cb62d2 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > >                               __stpncpy_evex)
> > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > -                             __stpncpy_sse2_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > +                             __stpncpy_sse2_unaligned))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > >    IFUNC_IMPL (i, name, stpcpy,
> > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > >                               __strncat_evex)
> > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > -                             __strncat_sse2_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > +                             __strncat_sse2_unaligned))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > >    IFUNC_IMPL (i, name, strncpy,
> > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > >                               __strncpy_evex)
> > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > -                             __strncpy_sse2_unaligned)
> > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > +                             __strncpy_sse2_unaligned))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > >    IFUNC_IMPL (i, name, strpbrk,
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > index 80529458d1..a15afa44e9 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > @@ -20,11 +20,7 @@
> > >
> > >  #include <init-arch.h>
> > >
> > > -#ifndef GENERIC
> > > -# define GENERIC sse2
> > > -#endif
> > > -
> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > >    attribute_hidden;
> > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > >      return OPTIMIZE (sse2_unaligned);
> > >
> > > -  return OPTIMIZE (GENERIC);
> > > +  return OPTIMIZE (sse2);
> > >  }
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > new file mode 100644
> > > index 0000000000..323225af4d
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > @@ -0,0 +1,48 @@
> > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > +   All versions must be listed in ifunc-impl-list.c.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <init-arch.h>
> > > +
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > +  attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > +
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > +
> > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > +    {
> > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > +       return OPTIMIZE (evex);
> > > +
> > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > +       return OPTIMIZE (avx2_rtm);
> > > +
> > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > +       return OPTIMIZE (avx2);
> > > +    }
> > > +
> > > +  return OPTIMIZE (sse2_unaligned);
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > deleted file mode 100644
> > > index 87826845b0..0000000000
> > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > +++ /dev/null
> > > @@ -1,26 +0,0 @@
> > > -/* stpncpy.
> > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > -   This file is part of the GNU C Library.
> > > -
> > > -   The GNU C Library is free software; you can redistribute it and/or
> > > -   modify it under the terms of the GNU Lesser General Public
> > > -   License as published by the Free Software Foundation; either
> > > -   version 2.1 of the License, or (at your option) any later version.
> > > -
> > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > -   Lesser General Public License for more details.
> > > -
> > > -   You should have received a copy of the GNU Lesser General Public
> > > -   License along with the GNU C Library; if not, see
> > > -   <https://www.gnu.org/licenses/>.  */
> > > -
> > > -
> > > -#define STPNCPY __stpncpy_generic
> > > -#undef weak_alias
> > > -#define weak_alias(ignored1, ignored2)
> > > -#undef libc_hidden_def
> > > -#define libc_hidden_def(stpncpy)
> > > -
> > > -#include <string/stpncpy.c>
> > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > index 879bc83f0b..a8d083ff0d 100644
> > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > @@ -25,9 +25,8 @@
> > >  # undef stpncpy
> > >  # undef __stpncpy
> > >
> > > -# define GENERIC generic
> > >  # define SYMBOL_NAME stpncpy
> > > -# include "ifunc-strcpy.h"
> > > +# include "ifunc-strncpy.h"
> > >
> > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > deleted file mode 100644
> > > index 0090669cd1..0000000000
> > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > +++ /dev/null
> > > @@ -1,21 +0,0 @@
> > > -/* strncat.
> > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > -   This file is part of the GNU C Library.
> > > -
> > > -   The GNU C Library is free software; you can redistribute it and/or
> > > -   modify it under the terms of the GNU Lesser General Public
> > > -   License as published by the Free Software Foundation; either
> > > -   version 2.1 of the License, or (at your option) any later version.
> > > -
> > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > -   Lesser General Public License for more details.
> > > -
> > > -   You should have received a copy of the GNU Lesser General Public
> > > -   License along with the GNU C Library; if not, see
> > > -   <https://www.gnu.org/licenses/>.  */
> > > -
> > > -
> > > -#define STRNCAT __strncat_generic
> > > -#include <string/strncat.c>
> > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > index 50fba8a41f..a590c25d51 100644
> > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > @@ -24,8 +24,7 @@
> > >  # undef strncat
> > >
> > >  # define SYMBOL_NAME strncat
> > > -# define GENERIC generic
> > > -# include "ifunc-strcpy.h"
> > > +# include "ifunc-strncpy.h"
> > >
> > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > >  strong_alias (strncat, __strncat);
> > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > deleted file mode 100644
> > > index 9916153dd5..0000000000
> > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > +++ /dev/null
> > > @@ -1,24 +0,0 @@
> > > -/* strncpy.
> > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > -   This file is part of the GNU C Library.
> > > -
> > > -   The GNU C Library is free software; you can redistribute it and/or
> > > -   modify it under the terms of the GNU Lesser General Public
> > > -   License as published by the Free Software Foundation; either
> > > -   version 2.1 of the License, or (at your option) any later version.
> > > -
> > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > -   Lesser General Public License for more details.
> > > -
> > > -   You should have received a copy of the GNU Lesser General Public
> > > -   License along with the GNU C Library; if not, see
> > > -   <https://www.gnu.org/licenses/>.  */
> > > -
> > > -
> > > -#define STRNCPY __strncpy_generic
> > > -#undef libc_hidden_builtin_def
> > > -#define libc_hidden_builtin_def(strncpy)
> > > -
> > > -#include <string/strncpy.c>
> > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > index 7fc7d72ec5..c83440f0e3 100644
> > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > @@ -24,8 +24,7 @@
> > >  # undef strncpy
> > >
> > >  # define SYMBOL_NAME strncpy
> > > -# define GENERIC generic
> > > -# include "ifunc-strcpy.h"
> > > +# include "ifunc-strncpy.h"
> > >
> > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > >
> > > --
> > > 2.34.1
> > >
> >
> >
> > --
> > H.J.
Noah Goldstein July 6, 2022, 2:47 a.m. UTC | #4
On Tue, Jul 5, 2022 at 7:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > These functions all have optimized versions:
> > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > >
> > > Is this true on all x86-64 processors?
> >
> > That they are faster or that sse2 can run on baseline x86_64?
>
> Assuming you mean faster:
>
> maybe not for large strings on a processor that heavily prefers aligned
> access.
>
> The generics are implemented as:
>
> strncat = strnlen + strlen + memcpy
> strncpy = strnlen + memset + memcpy
> stpncpy = strnlen + memset + memcpy
>
> For very large strings on processors with SSSE3
> all of those functions can be implemented with only aligned
> access in the loops.
>
> Not sure if it would work in favor of the generic implementations though as
> they are essentially doing 2x + the number of memory accesses.
>
> mayshao do you have benchmark numbers on these functions?

If you do benchmark the strcpy family, any chance you can also
report results for strcpy/stpcpy/strcat sse2 vs sse2_unaligned?

My guess is the sse2_unaligned versions are much faster and if
so we can drop the sse2 implementations.

>
> >
> >
> > >
> > > > we should use these as the baseline implementation and can remove the
> > > > generic implementations.
> > > >
> > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > >
> > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > >
> > > > Tested build with and without multiarch and full check with multiarch.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index 18cea04423..04e238efa0 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > >    stpcpy-sse2-unaligned \
> > > >    stpncpy-avx2 \
> > > >    stpncpy-avx2-rtm \
> > > > -  stpncpy-generic \
> > > >    stpncpy-evex \
> > > >    stpncpy-sse2-unaligned \
> > > >    strcasecmp_l-avx2 \
> > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > >    strncase_l-sse4_2 \
> > > >    strncat-avx2 \
> > > >    strncat-avx2-rtm \
> > > > -  strncat-generic \
> > > >    strncat-evex \
> > > >    strncat-sse2-unaligned \
> > > >    strncmp-avx2 \
> > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > >    strncmp-sse4_2 \
> > > >    strncpy-avx2 \
> > > >    strncpy-avx2-rtm \
> > > > -  strncpy-generic \
> > > >    strncpy-evex \
> > > >    strncpy-sse2-unaligned \
> > > >    strnlen-avx2 \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index adf7d4bafd..2c96cb62d2 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __stpncpy_evex)
> > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > -                             __stpncpy_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > +                             __stpncpy_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __strncat_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > -                             __strncat_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > +                             __strncat_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > >    IFUNC_IMPL (i, name, strncpy,
> > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __strncpy_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > -                             __strncpy_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > +                             __strncpy_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > index 80529458d1..a15afa44e9 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > @@ -20,11 +20,7 @@
> > > >
> > > >  #include <init-arch.h>
> > > >
> > > > -#ifndef GENERIC
> > > > -# define GENERIC sse2
> > > > -#endif
> > > > -
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > >    attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > >      return OPTIMIZE (sse2_unaligned);
> > > >
> > > > -  return OPTIMIZE (GENERIC);
> > > > +  return OPTIMIZE (sse2);
> > > >  }
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > new file mode 100644
> > > > index 0000000000..323225af4d
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > @@ -0,0 +1,48 @@
> > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <init-arch.h>
> > > > +
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > +  attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > +
> > > > +static inline void *
> > > > +IFUNC_SELECTOR (void)
> > > > +{
> > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > +
> > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > +    {
> > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > +       return OPTIMIZE (evex);
> > > > +
> > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > +       return OPTIMIZE (avx2_rtm);
> > > > +
> > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > +       return OPTIMIZE (avx2);
> > > > +    }
> > > > +
> > > > +  return OPTIMIZE (sse2_unaligned);
> > > > +}
> > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > deleted file mode 100644
> > > > index 87826845b0..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > +++ /dev/null
> > > > @@ -1,26 +0,0 @@
> > > > -/* stpncpy.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STPNCPY __stpncpy_generic
> > > > -#undef weak_alias
> > > > -#define weak_alias(ignored1, ignored2)
> > > > -#undef libc_hidden_def
> > > > -#define libc_hidden_def(stpncpy)
> > > > -
> > > > -#include <string/stpncpy.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > index 879bc83f0b..a8d083ff0d 100644
> > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > @@ -25,9 +25,8 @@
> > > >  # undef stpncpy
> > > >  # undef __stpncpy
> > > >
> > > > -# define GENERIC generic
> > > >  # define SYMBOL_NAME stpncpy
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > deleted file mode 100644
> > > > index 0090669cd1..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > +++ /dev/null
> > > > @@ -1,21 +0,0 @@
> > > > -/* strncat.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STRNCAT __strncat_generic
> > > > -#include <string/strncat.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > index 50fba8a41f..a590c25d51 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > @@ -24,8 +24,7 @@
> > > >  # undef strncat
> > > >
> > > >  # define SYMBOL_NAME strncat
> > > > -# define GENERIC generic
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > >  strong_alias (strncat, __strncat);
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > deleted file mode 100644
> > > > index 9916153dd5..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > +++ /dev/null
> > > > @@ -1,24 +0,0 @@
> > > > -/* strncpy.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STRNCPY __strncpy_generic
> > > > -#undef libc_hidden_builtin_def
> > > > -#define libc_hidden_builtin_def(strncpy)
> > > > -
> > > > -#include <string/strncpy.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > @@ -24,8 +24,7 @@
> > > >  # undef strncpy
> > > >
> > > >  # define SYMBOL_NAME strncpy
> > > > -# define GENERIC generic
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > >
> > > > --
> > > > 2.34.1
> > > >
> > >
> > >
> > > --
> > > H.J.
Mayshao-oc July 6, 2022, 7:42 a.m. UTC | #5
On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > These functions all have optimized versions:
> > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > >
> > > Is this true on all x86-64 processors?
> >
> > That they are faster or that sse2 can run on baseline x86_64?

Both cases are true on Zhaoxin processors.

>
> Assuming you mean faster:
>
> maybe not for large strings on a processor that heavily prefers aligned
> access.
>
> The generics are implemented as:
>
> strncat = strnlen + strlen + memcpy
> strncpy = strnlen + memset + memcpy
> stpncpy = strnlen + memset + memcpy
>
> For very large strings on processors with SSSE3
> all of those functions can be implemented with only aligned
> access in the loops.
>
> Not sure if it would work in favor of the generic implementations though as
> they are essentially doing 2x + the number of memory accesses.
>
> mayshao do you have benchmark numbers on these functions?

The results are attached.  If there is any problem with the results, please
feel free to let me know.


Best Regards,
May Shao

>
> >
> >
> > >
> > > > we should use these as the baseline implementation and can remove the
> > > > generic implementations.
> > > >
> > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > >
> > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > >
> > > > Tested build with and without multiarch and full check with multiarch.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index 18cea04423..04e238efa0 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > >    stpcpy-sse2-unaligned \
> > > >    stpncpy-avx2 \
> > > >    stpncpy-avx2-rtm \
> > > > -  stpncpy-generic \
> > > >    stpncpy-evex \
> > > >    stpncpy-sse2-unaligned \
> > > >    strcasecmp_l-avx2 \
> > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > >    strncase_l-sse4_2 \
> > > >    strncat-avx2 \
> > > >    strncat-avx2-rtm \
> > > > -  strncat-generic \
> > > >    strncat-evex \
> > > >    strncat-sse2-unaligned \
> > > >    strncmp-avx2 \
> > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > >    strncmp-sse4_2 \
> > > >    strncpy-avx2 \
> > > >    strncpy-avx2-rtm \
> > > > -  strncpy-generic \
> > > >    strncpy-evex \
> > > >    strncpy-sse2-unaligned \
> > > >    strnlen-avx2 \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index adf7d4bafd..2c96cb62d2 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __stpncpy_evex)
> > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > -                             __stpncpy_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > +                             __stpncpy_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __strncat_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > -                             __strncat_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > +                             __strncat_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > >    IFUNC_IMPL (i, name, strncpy,
> > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                               __strncpy_evex)
> > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > -                             __strncpy_sse2_unaligned)
> > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > +                             __strncpy_sse2_unaligned))
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > index 80529458d1..a15afa44e9 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > @@ -20,11 +20,7 @@
> > > >
> > > >  #include <init-arch.h>
> > > >
> > > > -#ifndef GENERIC
> > > > -# define GENERIC sse2
> > > > -#endif
> > > > -
> > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > >    attribute_hidden;
> > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > >      return OPTIMIZE (sse2_unaligned);
> > > >
> > > > -  return OPTIMIZE (GENERIC);
> > > > +  return OPTIMIZE (sse2);
> > > >  }
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > new file mode 100644
> > > > index 0000000000..323225af4d
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > @@ -0,0 +1,48 @@
> > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <init-arch.h>
> > > > +
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > +  attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > +
> > > > +static inline void *
> > > > +IFUNC_SELECTOR (void)
> > > > +{
> > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > +
> > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > +    {
> > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > +       return OPTIMIZE (evex);
> > > > +
> > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > +       return OPTIMIZE (avx2_rtm);
> > > > +
> > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > +       return OPTIMIZE (avx2);
> > > > +    }
> > > > +
> > > > +  return OPTIMIZE (sse2_unaligned);
> > > > +}
> > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > deleted file mode 100644
> > > > index 87826845b0..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > +++ /dev/null
> > > > @@ -1,26 +0,0 @@
> > > > -/* stpncpy.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STPNCPY __stpncpy_generic
> > > > -#undef weak_alias
> > > > -#define weak_alias(ignored1, ignored2)
> > > > -#undef libc_hidden_def
> > > > -#define libc_hidden_def(stpncpy)
> > > > -
> > > > -#include <string/stpncpy.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > index 879bc83f0b..a8d083ff0d 100644
> > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > @@ -25,9 +25,8 @@
> > > >  # undef stpncpy
> > > >  # undef __stpncpy
> > > >
> > > > -# define GENERIC generic
> > > >  # define SYMBOL_NAME stpncpy
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > deleted file mode 100644
> > > > index 0090669cd1..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > +++ /dev/null
> > > > @@ -1,21 +0,0 @@
> > > > -/* strncat.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STRNCAT __strncat_generic
> > > > -#include <string/strncat.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > index 50fba8a41f..a590c25d51 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > @@ -24,8 +24,7 @@
> > > >  # undef strncat
> > > >
> > > >  # define SYMBOL_NAME strncat
> > > > -# define GENERIC generic
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > >  strong_alias (strncat, __strncat);
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > deleted file mode 100644
> > > > index 9916153dd5..0000000000
> > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > +++ /dev/null
> > > > @@ -1,24 +0,0 @@
> > > > -/* strncpy.
> > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > -   This file is part of the GNU C Library.
> > > > -
> > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > -   modify it under the terms of the GNU Lesser General Public
> > > > -   License as published by the Free Software Foundation; either
> > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > -
> > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > -   Lesser General Public License for more details.
> > > > -
> > > > -   You should have received a copy of the GNU Lesser General Public
> > > > -   License along with the GNU C Library; if not, see
> > > > -   <https://www.gnu.org/licenses/>.  */
> > > > -
> > > > -
> > > > -#define STRNCPY __strncpy_generic
> > > > -#undef libc_hidden_builtin_def
> > > > -#define libc_hidden_builtin_def(strncpy)
> > > > -
> > > > -#include <string/strncpy.c>
> > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > @@ -24,8 +24,7 @@
> > > >  # undef strncpy
> > > >
> > > >  # define SYMBOL_NAME strncpy
> > > > -# define GENERIC generic
> > > > -# include "ifunc-strcpy.h"
> > > > +# include "ifunc-strncpy.h"
> > > >
> > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > >
> > > > --
> > > > 2.34.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
H.J. Lu July 6, 2022, 6:08 p.m. UTC | #6
On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> These functions all have optimized versions:
> __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> stpncpy_sse2_unaligned which are faster than their respective generic
> implementations.  Since the sse2 versions can run on baseline x86_64,
> we should use these as the baseline implementation and can remove the
> generic implementations.
>
> Geometric mean of N=20 runs of the entire benchmark suite on:
> 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
>
> __strncat_sse2_unaligned / __strncat_generic: .944
> __strncpy_sse2_unaligned / __strncpy_generic: .726
> __stpncpy_sse2_unaligned / __stpncpy_generic: .650
>
> Tested build with and without multiarch and full check with multiarch.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |  3 --
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
>  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
>  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
>  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
>  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
>  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
>  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
>  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
>  10 files changed, 56 insertions(+), 92 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
>  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
>  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 18cea04423..04e238efa0 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -45,7 +45,6 @@ sysdep_routines += \
>    stpcpy-sse2-unaligned \
>    stpncpy-avx2 \
>    stpncpy-avx2-rtm \
> -  stpncpy-generic \
>    stpncpy-evex \
>    stpncpy-sse2-unaligned \
>    strcasecmp_l-avx2 \
> @@ -92,7 +91,6 @@ sysdep_routines += \
>    strncase_l-sse4_2 \
>    strncat-avx2 \
>    strncat-avx2-rtm \
> -  strncat-generic \
>    strncat-evex \
>    strncat-sse2-unaligned \
>    strncmp-avx2 \
> @@ -102,7 +100,6 @@ sysdep_routines += \
>    strncmp-sse4_2 \
>    strncpy-avx2 \
>    strncpy-avx2-rtm \
> -  strncpy-generic \
>    strncpy-evex \
>    strncpy-sse2-unaligned \
>    strnlen-avx2 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index adf7d4bafd..2c96cb62d2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __stpncpy_evex)
>               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> -                             __stpncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> +                             __stpncpy_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
>    IFUNC_IMPL (i, name, stpcpy,
> @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strncat_evex)
>               IFUNC_IMPL_ADD (array, i, strncat, 1,
> -                             __strncat_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> +                             __strncat_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
> @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strncpy_evex)
>               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> -                             __strncpy_sse2_unaligned)
> -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> +                             __strncpy_sse2_unaligned))
>
>    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
>    IFUNC_IMPL (i, name, strpbrk,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> index 80529458d1..a15afa44e9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> @@ -20,11 +20,7 @@
>
>  #include <init-arch.h>
>
> -#ifndef GENERIC
> -# define GENERIC sse2
> -#endif
> -
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
>    attribute_hidden;
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
>      return OPTIMIZE (sse2_unaligned);
>
> -  return OPTIMIZE (GENERIC);
> +  return OPTIMIZE (sse2);
>  }
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> new file mode 100644
> index 0000000000..323225af4d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> @@ -0,0 +1,48 @@
> +/* Common definition for ifunc st{r|p}n{cpy|cat}
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +       return OPTIMIZE (evex);
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       return OPTIMIZE (avx2_rtm);
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +       return OPTIMIZE (avx2);
> +    }
> +
> +  return OPTIMIZE (sse2_unaligned);
> +}
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> deleted file mode 100644
> index 87826845b0..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> +++ /dev/null
> @@ -1,26 +0,0 @@
> -/* stpncpy.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STPNCPY __stpncpy_generic
> -#undef weak_alias
> -#define weak_alias(ignored1, ignored2)
> -#undef libc_hidden_def
> -#define libc_hidden_def(stpncpy)
> -
> -#include <string/stpncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> index 879bc83f0b..a8d083ff0d 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy.c
> +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> @@ -25,9 +25,8 @@
>  # undef stpncpy
>  # undef __stpncpy
>
> -# define GENERIC generic
>  # define SYMBOL_NAME stpncpy
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
>
> diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> deleted file mode 100644
> index 0090669cd1..0000000000
> --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> +++ /dev/null
> @@ -1,21 +0,0 @@
> -/* strncat.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STRNCAT __strncat_generic
> -#include <string/strncat.c>
> diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> index 50fba8a41f..a590c25d51 100644
> --- a/sysdeps/x86_64/multiarch/strncat.c
> +++ b/sysdeps/x86_64/multiarch/strncat.c
> @@ -24,8 +24,7 @@
>  # undef strncat
>
>  # define SYMBOL_NAME strncat
> -# define GENERIC generic
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
>  strong_alias (strncat, __strncat);
> diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> deleted file mode 100644
> index 9916153dd5..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> +++ /dev/null
> @@ -1,24 +0,0 @@
> -/* strncpy.
> -   Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -
> -#define STRNCPY __strncpy_generic
> -#undef libc_hidden_builtin_def
> -#define libc_hidden_builtin_def(strncpy)
> -
> -#include <string/strncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> index 7fc7d72ec5..c83440f0e3 100644
> --- a/sysdeps/x86_64/multiarch/strncpy.c
> +++ b/sysdeps/x86_64/multiarch/strncpy.c
> @@ -24,8 +24,7 @@
>  # undef strncpy
>
>  # define SYMBOL_NAME strncpy
> -# define GENERIC generic
> -# include "ifunc-strcpy.h"
> +# include "ifunc-strncpy.h"
>
>  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
>
> --
> 2.34.1
>

LGTM.

Thanks.
Noah Goldstein July 6, 2022, 6:49 p.m. UTC | #7
On Wed, Jul 6, 2022 at 12:43 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
>
> On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > These functions all have optimized versions:
> > > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > > >
> > > > Is this true on all x86-64 processors?
> > >
> > > That they are faster or that sse2 can run on baseline x86_64?
>
> Both cases are true on Zhaoxin processors.
>
> >
> > Assuming you mean faster:
> >
> > maybe not for large strings on a processor that heavily prefers aligned
> > access.
> >
> > The generics are implemented as:
> >
> > strncat = strnlen + strlen + memcpy
> > strncpy = strnlen + memset + memcpy
> > stpncpy = strnlen + memset + memcpy
> >
> > For very large strings on processors with SSSE3
> > all of those functions can be implemented with only aligned
> > access in the loops.
> >
> > Not sure if it would work in favor of the generic implementations though as
> > they are essentially doing 2x + the number of memory accesses.
> >
> > mayshao do you have benchmark numbers on these functions?
>
> The results are attached.  If there is any problem with the results, please
> feel free to let me know.

Thank you,

Any chance you have performance numbers for:

strcpy_sse2 vs strcpy_sse2_unaligned
stpcpy_sse2 vs stpcpy_sse2_unaligned
strcat_sse2 vs strcat_sse2_unaligned
?
>
>
> Best Regards,
> May Shao
>
> >
> > >
> > >
> > > >
> > > > > we should use these as the baseline implementation and can remove the
> > > > > generic implementations.
> > > > >
> > > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > > >
> > > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > > >
> > > > > Tested build with and without multiarch and full check with multiarch.
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > index 18cea04423..04e238efa0 100644
> > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > > >    stpcpy-sse2-unaligned \
> > > > >    stpncpy-avx2 \
> > > > >    stpncpy-avx2-rtm \
> > > > > -  stpncpy-generic \
> > > > >    stpncpy-evex \
> > > > >    stpncpy-sse2-unaligned \
> > > > >    strcasecmp_l-avx2 \
> > > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > > >    strncase_l-sse4_2 \
> > > > >    strncat-avx2 \
> > > > >    strncat-avx2-rtm \
> > > > > -  strncat-generic \
> > > > >    strncat-evex \
> > > > >    strncat-sse2-unaligned \
> > > > >    strncmp-avx2 \
> > > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > > >    strncmp-sse4_2 \
> > > > >    strncpy-avx2 \
> > > > >    strncpy-avx2-rtm \
> > > > > -  strncpy-generic \
> > > > >    strncpy-evex \
> > > > >    strncpy-sse2-unaligned \
> > > > >    strnlen-avx2 \
> > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > index adf7d4bafd..2c96cb62d2 100644
> > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > >                               __stpncpy_evex)
> > > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > > -                             __stpncpy_sse2_unaligned)
> > > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > > +                             __stpncpy_sse2_unaligned))
> > > > >
> > > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > >                               __strncat_evex)
> > > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > > -                             __strncat_sse2_unaligned)
> > > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > > +                             __strncat_sse2_unaligned))
> > > > >
> > > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > > >    IFUNC_IMPL (i, name, strncpy,
> > > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > >                               __strncpy_evex)
> > > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > > -                             __strncpy_sse2_unaligned)
> > > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > > +                             __strncpy_sse2_unaligned))
> > > > >
> > > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > index 80529458d1..a15afa44e9 100644
> > > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > @@ -20,11 +20,7 @@
> > > > >
> > > > >  #include <init-arch.h>
> > > > >
> > > > > -#ifndef GENERIC
> > > > > -# define GENERIC sse2
> > > > > -#endif
> > > > > -
> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > >    attribute_hidden;
> > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > >      return OPTIMIZE (sse2_unaligned);
> > > > >
> > > > > -  return OPTIMIZE (GENERIC);
> > > > > +  return OPTIMIZE (sse2);
> > > > >  }
> > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > new file mode 100644
> > > > > index 0000000000..323225af4d
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > @@ -0,0 +1,48 @@
> > > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > +   This file is part of the GNU C Library.
> > > > > +
> > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > +   License as published by the Free Software Foundation; either
> > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > +
> > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +   Lesser General Public License for more details.
> > > > > +
> > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > +   License along with the GNU C Library; if not, see
> > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > +
> > > > > +#include <init-arch.h>
> > > > > +
> > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > +  attribute_hidden;
> > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > > +
> > > > > +static inline void *
> > > > > +IFUNC_SELECTOR (void)
> > > > > +{
> > > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > +
> > > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > > +    {
> > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > > +       return OPTIMIZE (evex);
> > > > > +
> > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > > +       return OPTIMIZE (avx2_rtm);
> > > > > +
> > > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > > +       return OPTIMIZE (avx2);
> > > > > +    }
> > > > > +
> > > > > +  return OPTIMIZE (sse2_unaligned);
> > > > > +}
> > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > deleted file mode 100644
> > > > > index 87826845b0..0000000000
> > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > +++ /dev/null
> > > > > @@ -1,26 +0,0 @@
> > > > > -/* stpncpy.
> > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > -   This file is part of the GNU C Library.
> > > > > -
> > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > -   License as published by the Free Software Foundation; either
> > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > -
> > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > -   Lesser General Public License for more details.
> > > > > -
> > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > -   License along with the GNU C Library; if not, see
> > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > -
> > > > > -
> > > > > -#define STPNCPY __stpncpy_generic
> > > > > -#undef weak_alias
> > > > > -#define weak_alias(ignored1, ignored2)
> > > > > -#undef libc_hidden_def
> > > > > -#define libc_hidden_def(stpncpy)
> > > > > -
> > > > > -#include <string/stpncpy.c>
> > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > index 879bc83f0b..a8d083ff0d 100644
> > > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > @@ -25,9 +25,8 @@
> > > > >  # undef stpncpy
> > > > >  # undef __stpncpy
> > > > >
> > > > > -# define GENERIC generic
> > > > >  # define SYMBOL_NAME stpncpy
> > > > > -# include "ifunc-strcpy.h"
> > > > > +# include "ifunc-strncpy.h"
> > > > >
> > > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > deleted file mode 100644
> > > > > index 0090669cd1..0000000000
> > > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > +++ /dev/null
> > > > > @@ -1,21 +0,0 @@
> > > > > -/* strncat.
> > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > -   This file is part of the GNU C Library.
> > > > > -
> > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > -   License as published by the Free Software Foundation; either
> > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > -
> > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > -   Lesser General Public License for more details.
> > > > > -
> > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > -   License along with the GNU C Library; if not, see
> > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > -
> > > > > -
> > > > > -#define STRNCAT __strncat_generic
> > > > > -#include <string/strncat.c>
> > > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > > index 50fba8a41f..a590c25d51 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > > @@ -24,8 +24,7 @@
> > > > >  # undef strncat
> > > > >
> > > > >  # define SYMBOL_NAME strncat
> > > > > -# define GENERIC generic
> > > > > -# include "ifunc-strcpy.h"
> > > > > +# include "ifunc-strncpy.h"
> > > > >
> > > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > > >  strong_alias (strncat, __strncat);
> > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > deleted file mode 100644
> > > > > index 9916153dd5..0000000000
> > > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > +++ /dev/null
> > > > > @@ -1,24 +0,0 @@
> > > > > -/* strncpy.
> > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > -   This file is part of the GNU C Library.
> > > > > -
> > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > -   License as published by the Free Software Foundation; either
> > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > -
> > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > -   Lesser General Public License for more details.
> > > > > -
> > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > -   License along with the GNU C Library; if not, see
> > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > -
> > > > > -
> > > > > -#define STRNCPY __strncpy_generic
> > > > > -#undef libc_hidden_builtin_def
> > > > > -#define libc_hidden_builtin_def(strncpy)
> > > > > -
> > > > > -#include <string/strncpy.c>
> > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > @@ -24,8 +24,7 @@
> > > > >  # undef strncpy
> > > > >
> > > > >  # define SYMBOL_NAME strncpy
> > > > > -# define GENERIC generic
> > > > > -# include "ifunc-strcpy.h"
> > > > > +# include "ifunc-strncpy.h"
> > > > >
> > > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > > >
> > > > > --
> > > > > 2.34.1
> > > > >
> > > >
> > > >
> > > > --
> > > > H.J.
> >
Mayshao-oc July 7, 2022, 2:57 a.m. UTC | #8
On Thur, Jul 7, 2022 at 2:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jul 6, 2022 at 12:43 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> >
> > On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > These functions all have optimized versions:
> > > > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > > > >
> > > > > Is this true on all x86-64 processors?
> > > >
> > > > That they are faster or that sse2 can run on baseline x86_64?
> >
> > Both cases are true on Zhaoxin processors.
> >
> > >
> > > Assuming you mean faster:
> > >
> > > maybe not for large strings on a processor that heavily prefers aligned
> > > access.
> > >
> > > The generics are implemented as:
> > >
> > > strncat = strnlen + strlen + memcpy
> > > strncpy = strnlen + memset + memcpy
> > > stpncpy = strnlen + memset + memcpy
> > >
> > > For very large strings on processors with SSSE3
> > > all of those functions can be implemented with only aligned
> > > access in the loops.
> > >
> > > Not sure if it would work in favor of the generic implementations though as
> > > they are essentially doing 2x + the number of memory accesses.
> > >
> > > mayshao do you have benchmark numbers on these functions?
> >
> > The results are attached.  If there is any problem with the results, please
> > feel free to let me know.
>
> Thank you,
>
> Any chance you have performance numbers for:
>
> strcpy_sse2 vs strcpy_sse2_unaligned
> stpcpy_sse2 vs stpcpy_sse2_unaligned
> strcat_sse2 vs strcat_sse2_unaligned
> ?

The requested performance data has been attached,and the overall performance
of sse2_unaligned is better than that of sse2.

BTW, will the sse2 implementation of strcpy,stpcpy,strcat functions also be
removed?If so, this should be ok for Zhaoxin processors.

Thank you for your interest in this matter.

> >
> >
> > Best Regards,
> > May Shao
> >
> > >
> > > >
> > > >
> > > > >
> > > > > > we should use these as the baseline implementation and can remove the
> > > > > > generic implementations.
> > > > > >
> > > > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > > > >
> > > > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > > > >
> > > > > > Tested build with and without multiarch and full check with multiarch.
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > > index 18cea04423..04e238efa0 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > > > >    stpcpy-sse2-unaligned \
> > > > > >    stpncpy-avx2 \
> > > > > >    stpncpy-avx2-rtm \
> > > > > > -  stpncpy-generic \
> > > > > >    stpncpy-evex \
> > > > > >    stpncpy-sse2-unaligned \
> > > > > >    strcasecmp_l-avx2 \
> > > > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > > > >    strncase_l-sse4_2 \
> > > > > >    strncat-avx2 \
> > > > > >    strncat-avx2-rtm \
> > > > > > -  strncat-generic \
> > > > > >    strncat-evex \
> > > > > >    strncat-sse2-unaligned \
> > > > > >    strncmp-avx2 \
> > > > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > > > >    strncmp-sse4_2 \
> > > > > >    strncpy-avx2 \
> > > > > >    strncpy-avx2-rtm \
> > > > > > -  strncpy-generic \
> > > > > >    strncpy-evex \
> > > > > >    strncpy-sse2-unaligned \
> > > > > >    strnlen-avx2 \
> > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > index adf7d4bafd..2c96cb62d2 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > >                               __stpncpy_evex)
> > > > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > > > -                             __stpncpy_sse2_unaligned)
> > > > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > > > +                             __stpncpy_sse2_unaligned))
> > > > > >
> > > > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > >                               __strncat_evex)
> > > > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > > > -                             __strncat_sse2_unaligned)
> > > > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > > > +                             __strncat_sse2_unaligned))
> > > > > >
> > > > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > > > >    IFUNC_IMPL (i, name, strncpy,
> > > > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > >                               __strncpy_evex)
> > > > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > > > -                             __strncpy_sse2_unaligned)
> > > > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > > > +                             __strncpy_sse2_unaligned))
> > > > > >
> > > > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > index 80529458d1..a15afa44e9 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > @@ -20,11 +20,7 @@
> > > > > >
> > > > > >  #include <init-arch.h>
> > > > > >
> > > > > > -#ifndef GENERIC
> > > > > > -# define GENERIC sse2
> > > > > > -#endif
> > > > > > -
> > > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > >    attribute_hidden;
> > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > > >      return OPTIMIZE (sse2_unaligned);
> > > > > >
> > > > > > -  return OPTIMIZE (GENERIC);
> > > > > > +  return OPTIMIZE (sse2);
> > > > > >  }
> > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > new file mode 100644
> > > > > > index 0000000000..323225af4d
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > @@ -0,0 +1,48 @@
> > > > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > +   This file is part of the GNU C Library.
> > > > > > +
> > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > +   License as published by the Free Software Foundation; either
> > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > +
> > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +   Lesser General Public License for more details.
> > > > > > +
> > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > +   License along with the GNU C Library; if not, see
> > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > +
> > > > > > +#include <init-arch.h>
> > > > > > +
> > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > > +  attribute_hidden;
> > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > > > +
> > > > > > +static inline void *
> > > > > > +IFUNC_SELECTOR (void)
> > > > > > +{
> > > > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > +
> > > > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > > > +    {
> > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > > > +       return OPTIMIZE (evex);
> > > > > > +
> > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > > > +       return OPTIMIZE (avx2_rtm);
> > > > > > +
> > > > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > > > +       return OPTIMIZE (avx2);
> > > > > > +    }
> > > > > > +
> > > > > > +  return OPTIMIZE (sse2_unaligned);
> > > > > > +}
> > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > deleted file mode 100644
> > > > > > index 87826845b0..0000000000
> > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > +++ /dev/null
> > > > > > @@ -1,26 +0,0 @@
> > > > > > -/* stpncpy.
> > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > -   This file is part of the GNU C Library.
> > > > > > -
> > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > -   License as published by the Free Software Foundation; either
> > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > -
> > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > -   Lesser General Public License for more details.
> > > > > > -
> > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > -   License along with the GNU C Library; if not, see
> > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > -
> > > > > > -
> > > > > > -#define STPNCPY __stpncpy_generic
> > > > > > -#undef weak_alias
> > > > > > -#define weak_alias(ignored1, ignored2)
> > > > > > -#undef libc_hidden_def
> > > > > > -#define libc_hidden_def(stpncpy)
> > > > > > -
> > > > > > -#include <string/stpncpy.c>
> > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > index 879bc83f0b..a8d083ff0d 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > @@ -25,9 +25,8 @@
> > > > > >  # undef stpncpy
> > > > > >  # undef __stpncpy
> > > > > >
> > > > > > -# define GENERIC generic
> > > > > >  # define SYMBOL_NAME stpncpy
> > > > > > -# include "ifunc-strcpy.h"
> > > > > > +# include "ifunc-strncpy.h"
> > > > > >
> > > > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > deleted file mode 100644
> > > > > > index 0090669cd1..0000000000
> > > > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > +++ /dev/null
> > > > > > @@ -1,21 +0,0 @@
> > > > > > -/* strncat.
> > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > -   This file is part of the GNU C Library.
> > > > > > -
> > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > -   License as published by the Free Software Foundation; either
> > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > -
> > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > -   Lesser General Public License for more details.
> > > > > > -
> > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > -   License along with the GNU C Library; if not, see
> > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > -
> > > > > > -
> > > > > > -#define STRNCAT __strncat_generic
> > > > > > -#include <string/strncat.c>
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > index 50fba8a41f..a590c25d51 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > @@ -24,8 +24,7 @@
> > > > > >  # undef strncat
> > > > > >
> > > > > >  # define SYMBOL_NAME strncat
> > > > > > -# define GENERIC generic
> > > > > > -# include "ifunc-strcpy.h"
> > > > > > +# include "ifunc-strncpy.h"
> > > > > >
> > > > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > > > >  strong_alias (strncat, __strncat);
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > deleted file mode 100644
> > > > > > index 9916153dd5..0000000000
> > > > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > +++ /dev/null
> > > > > > @@ -1,24 +0,0 @@
> > > > > > -/* strncpy.
> > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > -   This file is part of the GNU C Library.
> > > > > > -
> > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > -   License as published by the Free Software Foundation; either
> > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > -
> > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > -   Lesser General Public License for more details.
> > > > > > -
> > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > -   License along with the GNU C Library; if not, see
> > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > -
> > > > > > -
> > > > > > -#define STRNCPY __strncpy_generic
> > > > > > -#undef libc_hidden_builtin_def
> > > > > > -#define libc_hidden_builtin_def(strncpy)
> > > > > > -
> > > > > > -#include <string/strncpy.c>
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > @@ -24,8 +24,7 @@
> > > > > >  # undef strncpy
> > > > > >
> > > > > >  # define SYMBOL_NAME strncpy
> > > > > > -# define GENERIC generic
> > > > > > -# include "ifunc-strcpy.h"
> > > > > > +# include "ifunc-strncpy.h"
> > > > > >
> > > > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > > > >
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > H.J.
> > >
Noah Goldstein July 7, 2022, 3:11 a.m. UTC | #9
On Wed, Jul 6, 2022 at 7:59 PM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
>
> On Thur, Jul 7, 2022 at 2:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Jul 6, 2022 at 12:43 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> > >
> > > On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > These functions all have optimized versions:
> > > > > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > > > > >
> > > > > > Is this true on all x86-64 processors?
> > > > >
> > > > > That they are faster or that sse2 can run on baseline x86_64?
> > >
> > > Both cases are true on Zhaoxin processors.
> > >
> > > >
> > > > Assuming you mean faster:
> > > >
> > > > maybe not for large strings on a processor that heavily prefers aligned
> > > > access.
> > > >
> > > > The generics are implemented as:
> > > >
> > > > strncat = strnlen + strlen + memcpy
> > > > strncpy = strnlen + memset + memcpy
> > > > stpncpy = strnlen + memset + memcpy
> > > >
> > > > For very large strings on processors with SSSE3
> > > > all of those functions can be implemented with only aligned
> > > > access in the loops.
> > > >
> > > > Not sure if it would work in favor of the generic implementations though as
> > > > they are essentially doing 2x + the number of memory accesses.
> > > >
> > > > mayshao do you have benchmark numbers on these functions?
> > >
> > > The results are attached.  If there is any problem with the results, please
> > > feel free to let me know.
> >
> > Thank you,
> >
> > Any chance you have performance numbers for:
> >
> > strcpy_sse2 vs strcpy_sse2_unaligned
> > stpcpy_sse2 vs stpcpy_sse2_unaligned
> > strcat_sse2 vs strcat_sse2_unaligned
> > ?
>
> The requested performance data has been attached,and the overall performance
> of sse2_unaligned is better than that of sse2.
>
> BTW, will the sse2 implementation of strcpy,stpcpy,strcat functions also be
> removed?If so, this should be ok for Zhaoxin processors.

Will probably add a patch for that in not too long.
>
> Thank you for your interest in this matter.
>
> > >
> > >
> > > Best Regards,
> > > May Shao
> > >
> > > >
> > > > >
> > > > >
> > > > > >
> > > > > > > we should use these as the baseline implementation and can remove the
> > > > > > > generic implementations.
> > > > > > >
> > > > > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > > > > >
> > > > > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > > > > >
> > > > > > > Tested build with and without multiarch and full check with multiarch.
> > > > > > > ---
> > > > > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > > > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > > > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > > > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > > > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > > > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > > > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > > > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > > > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > > > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > > > index 18cea04423..04e238efa0 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > > > > >    stpcpy-sse2-unaligned \
> > > > > > >    stpncpy-avx2 \
> > > > > > >    stpncpy-avx2-rtm \
> > > > > > > -  stpncpy-generic \
> > > > > > >    stpncpy-evex \
> > > > > > >    stpncpy-sse2-unaligned \
> > > > > > >    strcasecmp_l-avx2 \
> > > > > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > > > > >    strncase_l-sse4_2 \
> > > > > > >    strncat-avx2 \
> > > > > > >    strncat-avx2-rtm \
> > > > > > > -  strncat-generic \
> > > > > > >    strncat-evex \
> > > > > > >    strncat-sse2-unaligned \
> > > > > > >    strncmp-avx2 \
> > > > > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > > > > >    strncmp-sse4_2 \
> > > > > > >    strncpy-avx2 \
> > > > > > >    strncpy-avx2-rtm \
> > > > > > > -  strncpy-generic \
> > > > > > >    strncpy-evex \
> > > > > > >    strncpy-sse2-unaligned \
> > > > > > >    strnlen-avx2 \
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > index adf7d4bafd..2c96cb62d2 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > >                               __stpncpy_evex)
> > > > > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > > > > -                             __stpncpy_sse2_unaligned)
> > > > > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > > > > +                             __stpncpy_sse2_unaligned))
> > > > > > >
> > > > > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > > > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > >                               __strncat_evex)
> > > > > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > > > > -                             __strncat_sse2_unaligned)
> > > > > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > > > > +                             __strncat_sse2_unaligned))
> > > > > > >
> > > > > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > > > > >    IFUNC_IMPL (i, name, strncpy,
> > > > > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > >                               __strncpy_evex)
> > > > > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > > > > -                             __strncpy_sse2_unaligned)
> > > > > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > > > > +                             __strncpy_sse2_unaligned))
> > > > > > >
> > > > > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > > > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > index 80529458d1..a15afa44e9 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > @@ -20,11 +20,7 @@
> > > > > > >
> > > > > > >  #include <init-arch.h>
> > > > > > >
> > > > > > > -#ifndef GENERIC
> > > > > > > -# define GENERIC sse2
> > > > > > > -#endif
> > > > > > > -
> > > > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > > >    attribute_hidden;
> > > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > > > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > > > >      return OPTIMIZE (sse2_unaligned);
> > > > > > >
> > > > > > > -  return OPTIMIZE (GENERIC);
> > > > > > > +  return OPTIMIZE (sse2);
> > > > > > >  }
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..323225af4d
> > > > > > > --- /dev/null
> > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > > @@ -0,0 +1,48 @@
> > > > > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > +   This file is part of the GNU C Library.
> > > > > > > +
> > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > +
> > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > +   Lesser General Public License for more details.
> > > > > > > +
> > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > +
> > > > > > > +#include <init-arch.h>
> > > > > > > +
> > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > > > +  attribute_hidden;
> > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > > > > +
> > > > > > > +static inline void *
> > > > > > > +IFUNC_SELECTOR (void)
> > > > > > > +{
> > > > > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > > +
> > > > > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > > > > +    {
> > > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > > > > +       return OPTIMIZE (evex);
> > > > > > > +
> > > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > > > > +       return OPTIMIZE (avx2_rtm);
> > > > > > > +
> > > > > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > > > > +       return OPTIMIZE (avx2);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  return OPTIMIZE (sse2_unaligned);
> > > > > > > +}
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > > deleted file mode 100644
> > > > > > > index 87826845b0..0000000000
> > > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > > +++ /dev/null
> > > > > > > @@ -1,26 +0,0 @@
> > > > > > > -/* stpncpy.
> > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > -   This file is part of the GNU C Library.
> > > > > > > -
> > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > -
> > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > -   Lesser General Public License for more details.
> > > > > > > -
> > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > -
> > > > > > > -
> > > > > > > -#define STPNCPY __stpncpy_generic
> > > > > > > -#undef weak_alias
> > > > > > > -#define weak_alias(ignored1, ignored2)
> > > > > > > -#undef libc_hidden_def
> > > > > > > -#define libc_hidden_def(stpncpy)
> > > > > > > -
> > > > > > > -#include <string/stpncpy.c>
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > index 879bc83f0b..a8d083ff0d 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > @@ -25,9 +25,8 @@
> > > > > > >  # undef stpncpy
> > > > > > >  # undef __stpncpy
> > > > > > >
> > > > > > > -# define GENERIC generic
> > > > > > >  # define SYMBOL_NAME stpncpy
> > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > +# include "ifunc-strncpy.h"
> > > > > > >
> > > > > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > > deleted file mode 100644
> > > > > > > index 0090669cd1..0000000000
> > > > > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > > +++ /dev/null
> > > > > > > @@ -1,21 +0,0 @@
> > > > > > > -/* strncat.
> > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > -   This file is part of the GNU C Library.
> > > > > > > -
> > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > -
> > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > -   Lesser General Public License for more details.
> > > > > > > -
> > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > -
> > > > > > > -
> > > > > > > -#define STRNCAT __strncat_generic
> > > > > > > -#include <string/strncat.c>
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > index 50fba8a41f..a590c25d51 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > @@ -24,8 +24,7 @@
> > > > > > >  # undef strncat
> > > > > > >
> > > > > > >  # define SYMBOL_NAME strncat
> > > > > > > -# define GENERIC generic
> > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > +# include "ifunc-strncpy.h"
> > > > > > >
> > > > > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > > > > >  strong_alias (strncat, __strncat);
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > > deleted file mode 100644
> > > > > > > index 9916153dd5..0000000000
> > > > > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > > +++ /dev/null
> > > > > > > @@ -1,24 +0,0 @@
> > > > > > > -/* strncpy.
> > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > -   This file is part of the GNU C Library.
> > > > > > > -
> > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > -
> > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > -   Lesser General Public License for more details.
> > > > > > > -
> > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > -
> > > > > > > -
> > > > > > > -#define STRNCPY __strncpy_generic
> > > > > > > -#undef libc_hidden_builtin_def
> > > > > > > -#define libc_hidden_builtin_def(strncpy)
> > > > > > > -
> > > > > > > -#include <string/strncpy.c>
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > @@ -24,8 +24,7 @@
> > > > > > >  # undef strncpy
> > > > > > >
> > > > > > >  # define SYMBOL_NAME strncpy
> > > > > > > -# define GENERIC generic
> > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > +# include "ifunc-strncpy.h"
> > > > > > >
> > > > > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > > > > >
> > > > > > > --
> > > > > > > 2.34.1
> > > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > H.J.
> > > >
Noah Goldstein July 12, 2022, 6:43 p.m. UTC | #10
On Wed, Jul 6, 2022 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jul 6, 2022 at 7:59 PM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> >
> > On Thur, Jul 7, 2022 at 2:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Jul 6, 2022 at 12:43 AM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> > > >
> > > > On Wed, Jul 6, 2022 at 10:14 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 5, 2022 at 7:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 5, 2022 at 6:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jul 5, 2022 at 5:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > >
> > > > > > > > These functions all have optimized versions:
> > > > > > > > __strncat_sse2_unaligned, __strncpy_sse2_unaligned, and
> > > > > > > > stpncpy_sse2_unaligned which are faster than their respective generic
> > > > > > > > implementations.  Since the sse2 versions can run on baseline x86_64,
> > > > > > >
> > > > > > > Is this true on all x86-64 processors?
> > > > > >
> > > > > > That they are faster or that sse2 can run on baseline x86_64?
> > > >
> > > > Both cases are true on Zhaoxin processors.
> > > >
> > > > >
> > > > > Assuming you mean faster:
> > > > >
> > > > > maybe not for large strings on a processor that heavily prefers aligned
> > > > > access.
> > > > >
> > > > > The generics are implemented as:
> > > > >
> > > > > strncat = strnlen + strlen + memcpy
> > > > > strncpy = strnlen + memset + memcpy
> > > > > stpncpy = strnlen + memset + memcpy
> > > > >
> > > > > For very large strings on processors with SSSE3
> > > > > all of those functions can be implemented with only aligned
> > > > > access in the loops.
> > > > >
> > > > > Not sure if it would work in favor of the generic implementations though as
> > > > > they are essentially doing 2x + the number of memory accesses.
> > > > >
> > > > > mayshao do you have benchmark numbers on these functions?
> > > >
> > > > The results are attached.  If there is any problem with the results, please
> > > > feel free to let me know.
> > >
> > > Thank you,
> > >
> > > Any chance you have performance numbers for:
> > >
> > > strcpy_sse2 vs strcpy_sse2_unaligned
> > > stpcpy_sse2 vs stpcpy_sse2_unaligned
> > > strcat_sse2 vs strcat_sse2_unaligned
> > > ?
> >
> > The requested performance data has been attached,and the overall performance
> > of sse2_unaligned is better than that of sse2.
> >
> > BTW, will the sse2 implementation of strcpy,stpcpy,strcat functions also be
> > removed?If so, this should be ok for Zhaoxin processors.
>
> Will probably add a patch for that in not too long.
> >
> > Thank you for your interest in this matter.
> >
> > > >
> > > >
> > > > Best Regards,
> > > > May Shao
> > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > > we should use these as the baseline implementation and can remove the
> > > > > > > > generic implementations.
> > > > > > > >
> > > > > > > > Geometric mean of N=20 runs of the entire benchmark suite on:
> > > > > > > > 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz (Tigerlake)
> > > > > > > >
> > > > > > > > __strncat_sse2_unaligned / __strncat_generic: .944
> > > > > > > > __strncpy_sse2_unaligned / __strncpy_generic: .726
> > > > > > > > __stpncpy_sse2_unaligned / __stpncpy_generic: .650
> > > > > > > >
> > > > > > > > Tested build with and without multiarch and full check with multiarch.
> > > > > > > > ---
> > > > > > > >  sysdeps/x86_64/multiarch/Makefile          |  3 --
> > > > > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  9 ++--
> > > > > > > >  sysdeps/x86_64/multiarch/ifunc-strcpy.h    |  8 +---
> > > > > > > >  sysdeps/x86_64/multiarch/ifunc-strncpy.h   | 48 ++++++++++++++++++++++
> > > > > > > >  sysdeps/x86_64/multiarch/stpncpy-generic.c | 26 ------------
> > > > > > > >  sysdeps/x86_64/multiarch/stpncpy.c         |  3 +-
> > > > > > > >  sysdeps/x86_64/multiarch/strncat-generic.c | 21 ----------
> > > > > > > >  sysdeps/x86_64/multiarch/strncat.c         |  3 +-
> > > > > > > >  sysdeps/x86_64/multiarch/strncpy-generic.c | 24 -----------
> > > > > > > >  sysdeps/x86_64/multiarch/strncpy.c         |  3 +-
> > > > > > > >  10 files changed, 56 insertions(+), 92 deletions(-)
> > > > > > > >  create mode 100644 sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > > >  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > > > > index 18cea04423..04e238efa0 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > > > > @@ -45,7 +45,6 @@ sysdep_routines += \
> > > > > > > >    stpcpy-sse2-unaligned \
> > > > > > > >    stpncpy-avx2 \
> > > > > > > >    stpncpy-avx2-rtm \
> > > > > > > > -  stpncpy-generic \
> > > > > > > >    stpncpy-evex \
> > > > > > > >    stpncpy-sse2-unaligned \
> > > > > > > >    strcasecmp_l-avx2 \
> > > > > > > > @@ -92,7 +91,6 @@ sysdep_routines += \
> > > > > > > >    strncase_l-sse4_2 \
> > > > > > > >    strncat-avx2 \
> > > > > > > >    strncat-avx2-rtm \
> > > > > > > > -  strncat-generic \
> > > > > > > >    strncat-evex \
> > > > > > > >    strncat-sse2-unaligned \
> > > > > > > >    strncmp-avx2 \
> > > > > > > > @@ -102,7 +100,6 @@ sysdep_routines += \
> > > > > > > >    strncmp-sse4_2 \
> > > > > > > >    strncpy-avx2 \
> > > > > > > >    strncpy-avx2-rtm \
> > > > > > > > -  strncpy-generic \
> > > > > > > >    strncpy-evex \
> > > > > > > >    strncpy-sse2-unaligned \
> > > > > > > >    strnlen-avx2 \
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > > index adf7d4bafd..2c96cb62d2 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > > > @@ -403,8 +403,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > > >                               __stpncpy_evex)
> > > > > > > >               IFUNC_IMPL_ADD (array, i, stpncpy, 1,
> > > > > > > > -                             __stpncpy_sse2_unaligned)
> > > > > > > > -             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
> > > > > > > > +                             __stpncpy_sse2_unaligned))
> > > > > > > >
> > > > > > > >    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
> > > > > > > >    IFUNC_IMPL (i, name, stpcpy,
> > > > > > > > @@ -618,8 +617,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > > >                               __strncat_evex)
> > > > > > > >               IFUNC_IMPL_ADD (array, i, strncat, 1,
> > > > > > > > -                             __strncat_sse2_unaligned)
> > > > > > > > -             IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
> > > > > > > > +                             __strncat_sse2_unaligned))
> > > > > > > >
> > > > > > > >    /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
> > > > > > > >    IFUNC_IMPL (i, name, strncpy,
> > > > > > > > @@ -634,8 +632,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > > > >                                && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > > > >                               __strncpy_evex)
> > > > > > > >               IFUNC_IMPL_ADD (array, i, strncpy, 1,
> > > > > > > > -                             __strncpy_sse2_unaligned)
> > > > > > > > -             IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
> > > > > > > > +                             __strncpy_sse2_unaligned))
> > > > > > > >
> > > > > > > >    /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
> > > > > > > >    IFUNC_IMPL (i, name, strpbrk,
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > > index 80529458d1..a15afa44e9 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> > > > > > > > @@ -20,11 +20,7 @@
> > > > > > > >
> > > > > > > >  #include <init-arch.h>
> > > > > > > >
> > > > > > > > -#ifndef GENERIC
> > > > > > > > -# define GENERIC sse2
> > > > > > > > -#endif
> > > > > > > > -
> > > > > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
> > > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > > > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > > > >    attribute_hidden;
> > > > > > > >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > > > @@ -53,5 +49,5 @@ IFUNC_SELECTOR (void)
> > > > > > > >    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > > > > >      return OPTIMIZE (sse2_unaligned);
> > > > > > > >
> > > > > > > > -  return OPTIMIZE (GENERIC);
> > > > > > > > +  return OPTIMIZE (sse2);
> > > > > > > >  }
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000000..323225af4d
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
> > > > > > > > @@ -0,0 +1,48 @@
> > > > > > > > +/* Common definition for ifunc st{r|p}n{cpy|cat}
> > > > > > > > +   All versions must be listed in ifunc-impl-list.c.
> > > > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > +   This file is part of the GNU C Library.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > > > +   License as published by the Free Software Foundation; either
> > > > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > +
> > > > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > +   Lesser General Public License for more details.
> > > > > > > > +
> > > > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > > > +   License along with the GNU C Library; if not, see
> > > > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > > > +
> > > > > > > > +#include <init-arch.h>
> > > > > > > > +
> > > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> > > > > > > > +  attribute_hidden;
> > > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> > > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> > > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> > > > > > > > +
> > > > > > > > +static inline void *
> > > > > > > > +IFUNC_SELECTOR (void)
> > > > > > > > +{
> > > > > > > > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > > > +
> > > > > > > > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > > > > +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > > > > > +    {
> > > > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > > > > > +         && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > > > > > > > +       return OPTIMIZE (evex);
> > > > > > > > +
> > > > > > > > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > > > > > > > +       return OPTIMIZE (avx2_rtm);
> > > > > > > > +
> > > > > > > > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > > > > > > > +       return OPTIMIZE (avx2);
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  return OPTIMIZE (sse2_unaligned);
> > > > > > > > +}
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > > > deleted file mode 100644
> > > > > > > > index 87826845b0..0000000000
> > > > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
> > > > > > > > +++ /dev/null
> > > > > > > > @@ -1,26 +0,0 @@
> > > > > > > > -/* stpncpy.
> > > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > -   This file is part of the GNU C Library.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > -   Lesser General Public License for more details.
> > > > > > > > -
> > > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > > -
> > > > > > > > -
> > > > > > > > -#define STPNCPY __stpncpy_generic
> > > > > > > > -#undef weak_alias
> > > > > > > > -#define weak_alias(ignored1, ignored2)
> > > > > > > > -#undef libc_hidden_def
> > > > > > > > -#define libc_hidden_def(stpncpy)
> > > > > > > > -
> > > > > > > > -#include <string/stpncpy.c>
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > > index 879bc83f0b..a8d083ff0d 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c
> > > > > > > > @@ -25,9 +25,8 @@
> > > > > > > >  # undef stpncpy
> > > > > > > >  # undef __stpncpy
> > > > > > > >
> > > > > > > > -# define GENERIC generic
> > > > > > > >  # define SYMBOL_NAME stpncpy
> > > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > > +# include "ifunc-strncpy.h"
> > > > > > > >
> > > > > > > >  libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > > > deleted file mode 100644
> > > > > > > > index 0090669cd1..0000000000
> > > > > > > > --- a/sysdeps/x86_64/multiarch/strncat-generic.c
> > > > > > > > +++ /dev/null
> > > > > > > > @@ -1,21 +0,0 @@
> > > > > > > > -/* strncat.
> > > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > -   This file is part of the GNU C Library.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > -   Lesser General Public License for more details.
> > > > > > > > -
> > > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > > -
> > > > > > > > -
> > > > > > > > -#define STRNCAT __strncat_generic
> > > > > > > > -#include <string/strncat.c>
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > > index 50fba8a41f..a590c25d51 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/strncat.c
> > > > > > > > @@ -24,8 +24,7 @@
> > > > > > > >  # undef strncat
> > > > > > > >
> > > > > > > >  # define SYMBOL_NAME strncat
> > > > > > > > -# define GENERIC generic
> > > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > > +# include "ifunc-strncpy.h"
> > > > > > > >
> > > > > > > >  libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
> > > > > > > >  strong_alias (strncat, __strncat);
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > > > deleted file mode 100644
> > > > > > > > index 9916153dd5..0000000000
> > > > > > > > --- a/sysdeps/x86_64/multiarch/strncpy-generic.c
> > > > > > > > +++ /dev/null
> > > > > > > > @@ -1,24 +0,0 @@
> > > > > > > > -/* strncpy.
> > > > > > > > -   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > > > -   This file is part of the GNU C Library.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is free software; you can redistribute it and/or
> > > > > > > > -   modify it under the terms of the GNU Lesser General Public
> > > > > > > > -   License as published by the Free Software Foundation; either
> > > > > > > > -   version 2.1 of the License, or (at your option) any later version.
> > > > > > > > -
> > > > > > > > -   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > > > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > > > -   Lesser General Public License for more details.
> > > > > > > > -
> > > > > > > > -   You should have received a copy of the GNU Lesser General Public
> > > > > > > > -   License along with the GNU C Library; if not, see
> > > > > > > > -   <https://www.gnu.org/licenses/>.  */
> > > > > > > > -
> > > > > > > > -
> > > > > > > > -#define STRNCPY __strncpy_generic
> > > > > > > > -#undef libc_hidden_builtin_def
> > > > > > > > -#define libc_hidden_builtin_def(strncpy)
> > > > > > > > -
> > > > > > > > -#include <string/strncpy.c>
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > > index 7fc7d72ec5..c83440f0e3 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/strncpy.c
> > > > > > > > @@ -24,8 +24,7 @@
> > > > > > > >  # undef strncpy
> > > > > > > >
> > > > > > > >  # define SYMBOL_NAME strncpy
> > > > > > > > -# define GENERIC generic
> > > > > > > > -# include "ifunc-strcpy.h"
> > > > > > > > +# include "ifunc-strncpy.h"
> > > > > > > >
> > > > > > > >  libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
> > > > > > > >
> > > > > > > > --
> > > > > > > > 2.34.1
> > > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > H.J.
> > > > >

Going to pushing this.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 18cea04423..04e238efa0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -45,7 +45,6 @@  sysdep_routines += \
   stpcpy-sse2-unaligned \
   stpncpy-avx2 \
   stpncpy-avx2-rtm \
-  stpncpy-generic \
   stpncpy-evex \
   stpncpy-sse2-unaligned \
   strcasecmp_l-avx2 \
@@ -92,7 +91,6 @@  sysdep_routines += \
   strncase_l-sse4_2 \
   strncat-avx2 \
   strncat-avx2-rtm \
-  strncat-generic \
   strncat-evex \
   strncat-sse2-unaligned \
   strncmp-avx2 \
@@ -102,7 +100,6 @@  sysdep_routines += \
   strncmp-sse4_2 \
   strncpy-avx2 \
   strncpy-avx2-rtm \
-  strncpy-generic \
   strncpy-evex \
   strncpy-sse2-unaligned \
   strnlen-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index adf7d4bafd..2c96cb62d2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -403,8 +403,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __stpncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
-			      __stpncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic))
+			      __stpncpy_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
@@ -618,8 +617,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strncat_evex)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
-			      __strncat_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic))
+			      __strncat_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
@@ -634,8 +632,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
-			      __strncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic))
+			      __strncpy_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/strpbrk.c.  */
   IFUNC_IMPL (i, name, strpbrk,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 80529458d1..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -20,11 +20,7 @@ 
 
 #include <init-arch.h>
 
-#ifndef GENERIC
-# define GENERIC sse2
-#endif
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
@@ -53,5 +49,5 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
-  return OPTIMIZE (GENERIC);
+  return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/ifunc-strncpy.h b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
new file mode 100644
index 0000000000..323225af4d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-strncpy.h
@@ -0,0 +1,48 @@ 
+/* Common definition for ifunc st{r|p}n{cpy|cat}
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  return OPTIMIZE (sse2_unaligned);
+}
diff --git a/sysdeps/x86_64/multiarch/stpncpy-generic.c b/sysdeps/x86_64/multiarch/stpncpy-generic.c
deleted file mode 100644
index 87826845b0..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-generic.c
+++ /dev/null
@@ -1,26 +0,0 @@ 
-/* stpncpy.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-
-#define STPNCPY __stpncpy_generic
-#undef weak_alias
-#define weak_alias(ignored1, ignored2)
-#undef libc_hidden_def
-#define libc_hidden_def(stpncpy)
-
-#include <string/stpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 879bc83f0b..a8d083ff0d 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -25,9 +25,8 @@ 
 # undef stpncpy
 # undef __stpncpy
 
-# define GENERIC generic
 # define SYMBOL_NAME stpncpy
-# include "ifunc-strcpy.h"
+# include "ifunc-strncpy.h"
 
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strncat-generic.c b/sysdeps/x86_64/multiarch/strncat-generic.c
deleted file mode 100644
index 0090669cd1..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-generic.c
+++ /dev/null
@@ -1,21 +0,0 @@ 
-/* strncat.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-
-#define STRNCAT __strncat_generic
-#include <string/strncat.c>
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 50fba8a41f..a590c25d51 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,8 +24,7 @@ 
 # undef strncat
 
 # define SYMBOL_NAME strncat
-# define GENERIC generic
-# include "ifunc-strcpy.h"
+# include "ifunc-strncpy.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
 strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-generic.c b/sysdeps/x86_64/multiarch/strncpy-generic.c
deleted file mode 100644
index 9916153dd5..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-generic.c
+++ /dev/null
@@ -1,24 +0,0 @@ 
-/* strncpy.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-
-#define STRNCPY __strncpy_generic
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strncpy)
-
-#include <string/strncpy.c>
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 7fc7d72ec5..c83440f0e3 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,8 +24,7 @@ 
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
-# define GENERIC generic
-# include "ifunc-strcpy.h"
+# include "ifunc-strncpy.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());