
[v2] x86-64: Optimize bzero

Message ID 20220208224319.40271-1-hjl.tools@gmail.com
State New
Series [v2] x86-64: Optimize bzero

Commit Message

H.J. Lu Feb. 8, 2022, 10:43 p.m. UTC
Rebase against the current master branch.

--
memset with zero as the value to set is by far the most common case
(99%+ of calls for Python3 and GCC).

bzero can be slightly more optimized for this case by using a zero-idiom
xor for broadcasting the set value to a register (vector or GPR).
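
For illustration, here is a minimal sketch (plain C with AVX2 intrinsics,
not code from this patch; the helper names are made up) of why the zero
idiom is cheaper than broadcasting the set value:

/* Sketch only, assuming AVX2; not part of the patch.  */
#include <immintrin.h>

/* memset path: the set byte must be moved into a vector register and
   broadcast across all lanes before any store can issue.  */
static inline __m256i
broadcast_byte (int c)
{
  return _mm256_set1_epi8 ((char) c);	/* vmovd + vpbroadcastb  */
}

/* bzero path: a zero idiom (vpxor reg,reg,reg) has no input dependency,
   so the stores do not have to wait on a broadcast chain.  */
static inline __m256i
zero_vec (void)
{
  return _mm256_setzero_si256 ();	/* typically vpxor %xmm0, %xmm0, %xmm0  */
}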

Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/memset.S                       |   8 ++
 sysdeps/x86_64/multiarch/Makefile             |   1 +
 sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
 .../memset-avx2-unaligned-erms-rtm.S          |   1 +
 .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
 .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
 .../multiarch/memset-evex-unaligned-erms.S    |   3 +
 .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
 .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
 10 files changed, 256 insertions(+), 25 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/bzero.c

Comments

Noah Goldstein Feb. 8, 2022, 11:56 p.m. UTC | #1
On Tue, Feb 8, 2022 at 4:43 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Rebase against the current master branch.
>
> --
> memset with zero as the value to set is by far the most common case
> (99%+ of calls for Python3 and GCC).
>
> bzero can be slightly more optimized for this case by using a zero-idiom
> xor for broadcasting the set value to a register (vector or GPR).
>
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/memset.S                       |   8 ++
>  sysdeps/x86_64/multiarch/Makefile             |   1 +
>  sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
>  .../memset-avx2-unaligned-erms-rtm.S          |   1 +
>  .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
>  .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
>  .../multiarch/memset-evex-unaligned-erms.S    |   3 +
>  .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
>  .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
>  10 files changed, 256 insertions(+), 25 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 3f0517bbfc..af26e9cedc 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -35,6 +35,9 @@
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
> +# define BZERO_ZERO_VEC0() \
> +  pxor %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0; \
> @@ -53,6 +56,10 @@
>  # define MEMSET_SYMBOL(p,s)    memset
>  #endif
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)     __bzero
> +#endif
> +
>  #ifndef WMEMSET_SYMBOL
>  # define WMEMSET_CHK_SYMBOL(p,s) p
>  # define WMEMSET_SYMBOL(p,s)   __wmemset
> @@ -63,6 +70,7 @@
>  libc_hidden_builtin_def (memset)
>
>  #if IS_IN (libc)
> +weak_alias (__bzero, bzero)
>  libc_hidden_def (__wmemset)
>  weak_alias (__wmemset, wmemset)
>  libc_hidden_weak (wmemset)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 4274bfdd0d..e7b413edad 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -1,6 +1,7 @@
>  ifeq ($(subdir),string)
>
>  sysdep_routines += \
> +  bzero \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> new file mode 100644
> index 0000000000..58a14b2c33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/bzero.c
> @@ -0,0 +1,106 @@
> +/* Multiple versions of bzero.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __bzero __redirect___bzero
> +# include <string.h>
> +# undef __bzero
> +
> +# define SYMBOL_NAME __bzero
> +# include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> +  attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx512_unaligned_erms);
> +
> +         return OPTIMIZE1 (avx512_unaligned);
> +       }
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (evex_unaligned_erms);
> +
> +         return OPTIMIZE1 (evex_unaligned);
> +       }
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> +
> +         return OPTIMIZE1 (avx2_unaligned_rtm);
> +       }
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx2_unaligned_erms);
> +
> +         return OPTIMIZE1 (avx2_unaligned);
> +       }
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +    return OPTIMIZE1 (sse2_unaligned_erms);
> +
> +  return OPTIMIZE1 (sse2_unaligned);
> +}
> +
> +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> +
> +weak_alias (__bzero, bzero)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 68a56797d4..a594f4176e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memset_avx512_no_vzeroupper)
>              )
>
> +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> +  IFUNC_IMPL (i, name, bzero,
> +             IFUNC_IMPL_ADD (array, i, bzero, 1,
> +                             __bzero_sse2_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero, 1,
> +                             __bzero_sse2_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __bzero_avx2_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __bzero_avx2_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __bzero_avx2_unaligned_rtm)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __bzero_avx2_unaligned_erms_rtm)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_evex_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_evex_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_avx512_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_avx512_unaligned)
> +            )
> +
>    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>    IFUNC_IMPL (i, name, rawmemchr,
>               IFUNC_IMPL_ADD (array, i, rawmemchr,
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..5a5ee6f672 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -5,6 +5,7 @@
>
>  #define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> +#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>
>  #include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index c0bf2875d0..a093a2831f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -14,6 +14,9 @@
>    vmovd d, %xmm0; \
>    movq r, %rax;
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>
> @@ -29,6 +32,9 @@
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
>  # endif
> +# ifndef BZERO_SYMBOL
> +#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> +# endif
>  # ifndef WMEMSET_SYMBOL
>  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 5241216a77..727c92133a 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 6370021506..5d8fa78f05 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 8a6f0c561a..329c58ee46 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -22,6 +22,7 @@
>
>  #if IS_IN (libc)
>  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> +# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
>  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
>
>  # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1b502b78e4..7c94fcdae1 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -26,6 +26,10 @@
>
>  #include <sysdep.h>
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> +#endif
> +
>  #ifndef MEMSET_CHK_SYMBOL
>  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
>  #endif
> @@ -87,6 +91,18 @@
>  # define XMM_SMALL     0
>  #endif
>
> +#ifdef USE_LESS_VEC_MASK_STORE
> +# define SET_REG64     rcx
> +# define SET_REG32     ecx
> +# define SET_REG16     cx
> +# define SET_REG8      cl
> +#else
> +# define SET_REG64     rsi
> +# define SET_REG32     esi
> +# define SET_REG16     si
> +# define SET_REG8      sil
> +#endif
> +
>  #define PAGE_SIZE 4096
>
>  /* Macro to calculate size of small memset block for aligning
> @@ -96,18 +112,6 @@
>
>  #ifndef SECTION
>  # error SECTION is not defined!
> -#endif
> -
> -       .section SECTION(.text),"ax",@progbits
> -#if VEC_SIZE == 16 && IS_IN (libc)
> -ENTRY (__bzero)
> -       mov     %RDI_LP, %RAX_LP /* Set return value.  */
> -       mov     %RSI_LP, %RDX_LP /* Set n.  */
> -       xorl    %esi, %esi
> -       pxor    %XMM0, %XMM0
> -       jmp     L(entry_from_bzero)
> -END (__bzero)
> -weak_alias (__bzero, bzero)
>  #endif
>
>  #if IS_IN (libc)
> @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>         WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
>         WMEMSET_VDUP_TO_VEC0_LOW()
>         cmpq    $VEC_SIZE, %rdx
> -       jb      L(less_vec_no_vdup)
> +       jb      L(less_vec_from_wmemset)
>         WMEMSET_VDUP_TO_VEC0_HIGH()
>         jmp     L(entry_from_wmemset)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> +#if VEC_SIZE > 16
> +       BZERO_ZERO_VEC0 ()
> +#endif
> +       mov     %RDI_LP, %RAX_LP
> +       mov     %RSI_LP, %RDX_LP
> +#ifndef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +#endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec_no_vdup)
> +#ifdef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +#endif
> +#if VEC_SIZE <= 16
> +       BZERO_ZERO_VEC0 ()
> +#endif
> +       cmp     $(VEC_SIZE * 2), %RDX_LP
> +       ja      L(more_2x_vec)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned))
> +
>  #if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>         cmp     %RDX_LP, %RCX_LP
> @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
>  # endif
> -L(entry_from_bzero):
>         cmpq    $VEC_SIZE, %rdx
>         jb      L(less_vec)
>         MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -187,6 +215,31 @@ END (__memset_erms)
>  END (MEMSET_SYMBOL (__memset, erms))
>  # endif
>
> +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> +# if VEC_SIZE > 16
> +       BZERO_ZERO_VEC0 ()
> +# endif
> +       mov     %RDI_LP, %RAX_LP
> +       mov     %RSI_LP, %RDX_LP
> +# ifndef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +# endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec_no_vdup)
> +# ifdef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +# endif
> +# if VEC_SIZE <= 16
> +       BZERO_ZERO_VEC0 ()
> +# endif
> +       cmp     $(VEC_SIZE * 2), %RDX_LP
> +       ja      L(stosb_more_2x_vec)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> +
>  # if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>         cmp     %RDX_LP, %RCX_LP
> @@ -229,6 +282,7 @@ L(last_2x_vec):
>         .p2align 4,, 10
>  L(less_vec):
>  L(less_vec_no_vdup):
> +L(less_vec_from_wmemset):
>         /* Less than 1 VEC.  */
>  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  #  error Unsupported VEC_SIZE!
> @@ -374,8 +428,11 @@ L(less_vec):
>         /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
>            xmm). This is only does anything for AVX2.  */
>         MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_from_wmemset):
> +#if VEC_SIZE > 16
>  L(less_vec_no_vdup):
>  #endif
> +#endif
>  L(cross_page):
>  #if VEC_SIZE > 32
>         cmpl    $32, %edx
> @@ -386,7 +443,10 @@ L(cross_page):
>         jge     L(between_16_31)
>  #endif
>  #ifndef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, %rcx
> +       MOVQ    %XMM0, %SET_REG64
> +#endif
> +#if VEC_SIZE <= 16
> +L(less_vec_no_vdup):
>  #endif
>         cmpl    $8, %edx
>         jge     L(between_8_15)
> @@ -395,7 +455,7 @@ L(cross_page):
>         cmpl    $1, %edx
>         jg      L(between_2_3)
>         jl      L(between_0_0)
> -       movb    %sil, (%LESS_VEC_REG)
> +       movb    %SET_REG8, (%LESS_VEC_REG)
>  L(between_0_0):
>         ret
>
> @@ -428,8 +488,8 @@ L(between_8_15):
>         MOVQ    %XMM0, (%rdi)
>         MOVQ    %XMM0, -8(%rdi, %rdx)
>  #else
> -       movq    %rcx, (%LESS_VEC_REG)
> -       movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> +       movq    %SET_REG64, (%LESS_VEC_REG)
> +       movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>
> @@ -442,8 +502,8 @@ L(between_4_7):
>         MOVD    %XMM0, (%rdi)
>         MOVD    %XMM0, -4(%rdi, %rdx)
>  #else
> -       movl    %ecx, (%LESS_VEC_REG)
> -       movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> +       movl    %SET_REG32, (%LESS_VEC_REG)
> +       movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>
> @@ -452,12 +512,12 @@ L(between_4_7):
>  L(between_2_3):
>         /* From 2 to 3.  No branch when size == 2.  */
>  #ifdef USE_XMM_LESS_VEC
> -       movb    %sil, (%rdi)
> -       movb    %sil, 1(%rdi)
> -       movb    %sil, -1(%rdi, %rdx)
> +       movb    %SET_REG8, (%rdi)
> +       movb    %SET_REG8, 1(%rdi)
> +       movb    %SET_REG8, -1(%rdi, %rdx)
>  #else
> -       movw    %cx, (%LESS_VEC_REG)
> -       movb    %sil, -1(%LESS_VEC_REG, %rdx)
> +       movw    %SET_REG16, (%LESS_VEC_REG)
> +       movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.34.1
>

LGTM.
Adhemerval Zanella Netto Feb. 9, 2022, 11:41 a.m. UTC | #2
On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> Rebase against the current master branch.
> 
> --
> memset with zero as the value to set is by far the most common case
> (99%+ of calls for Python3 and GCC).
> 
> bzero can be slightly more optimized for this case by using a zero-idiom
> xor for broadcasting the set value to a register (vector or GPR).
> 
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>

Is it really worth resurrecting bzero with these multiple ifunc variants?
Would Python3/GCC or any other programs start to replace memset with bzero
for the sake of this optimization?

I agree with Wilco that the gains are marginal in this case.

Noah Goldstein Feb. 9, 2022, 10:14 p.m. UTC | #3
On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> > Rebase against the current master branch.
> >
> > --
> > memset with zero as the value to set is by far the most common case
> > (99%+ of calls for Python3 and GCC).
> >
> > bzero can be slightly more optimized for this case by using a zero-idiom
> > xor for broadcasting the set value to a register (vector or GPR).
> >
> > Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
>
> Is it really worth resurrecting bzero with these multiple ifunc variants?
> Would Python3/GCC or any other programs start to replace memset with bzero
> for the sake of this optimization?
>
> I agree with Wilco that the gains are marginal in this case.

The cost is only 1 cache line and it doesn't interfere with memset at all
so it's unlikely to cause any problems.

The saving is in the lane-cross broadcast which is on the critical
path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).

There is also a slight benefit for EVEX + AVX512, because they use
predicated execution for [0, VEC_SIZE] (although only in throughput,
since the critical path through mask construction is longer than the
lane VEC setup).

Agreed it's not clear if it's worth it to start replacing memset calls with
bzero calls, but at the very least this will improve existing code that
uses bzero.
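
As a rough illustration of that [VEC_SIZE, 2 * VEC_SIZE] fast path, here
is a C sketch with AVX2 intrinsics (assuming VEC_SIZE == 32; not the
patch's actual code, and the function name is made up):

/* Sketch only, assuming AVX2 and 32 <= n <= 64; not part of the patch.  */
#include <immintrin.h>
#include <stddef.h>

static inline void
bzero_32_to_64 (void *dst, size_t n)
{
  /* Zero idiom: no dependency on a broadcast of the set value.  */
  __m256i zero = _mm256_setzero_si256 ();
  /* Head store plus a possibly-overlapping tail store, mirroring
     VMOVU %VEC(0), (%rdi) and VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx).  */
  _mm256_storeu_si256 ((__m256i *) dst, zero);
  _mm256_storeu_si256 ((__m256i *) ((char *) dst + n - 32), zero);
}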


>
> > ---
> >  sysdeps/x86_64/memset.S                       |   8 ++
> >  sysdeps/x86_64/multiarch/Makefile             |   1 +
> >  sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
> >  .../memset-avx2-unaligned-erms-rtm.S          |   1 +
> >  .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
> >  .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
> >  .../multiarch/memset-evex-unaligned-erms.S    |   3 +
> >  .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
> >  .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
> >  10 files changed, 256 insertions(+), 25 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 3f0517bbfc..af26e9cedc 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -35,6 +35,9 @@
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  pxor %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0; \
> > @@ -53,6 +56,10 @@
> >  # define MEMSET_SYMBOL(p,s)  memset
> >  #endif
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s)   __bzero
> > +#endif
> > +
> >  #ifndef WMEMSET_SYMBOL
> >  # define WMEMSET_CHK_SYMBOL(p,s) p
> >  # define WMEMSET_SYMBOL(p,s) __wmemset
> > @@ -63,6 +70,7 @@
> >  libc_hidden_builtin_def (memset)
> >
> >  #if IS_IN (libc)
> > +weak_alias (__bzero, bzero)
> >  libc_hidden_def (__wmemset)
> >  weak_alias (__wmemset, wmemset)
> >  libc_hidden_weak (wmemset)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 4274bfdd0d..e7b413edad 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -1,6 +1,7 @@
> >  ifeq ($(subdir),string)
> >
> >  sysdep_routines += \
> > +  bzero \
> >    memchr-avx2 \
> >    memchr-avx2-rtm \
> >    memchr-evex \
> > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > new file mode 100644
> > index 0000000000..58a14b2c33
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/bzero.c
> > @@ -0,0 +1,106 @@
> > +/* Multiple versions of bzero.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __bzero __redirect___bzero
> > +# include <string.h>
> > +# undef __bzero
> > +
> > +# define SYMBOL_NAME __bzero
> > +# include <init-arch.h>
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > +  attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx512_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (avx512_unaligned);
> > +     }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (evex_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (evex_unaligned);
> > +     }
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > +
> > +       return OPTIMIZE1 (avx2_unaligned_rtm);
> > +     }
> > +
> > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx2_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (avx2_unaligned);
> > +     }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +    return OPTIMIZE1 (sse2_unaligned_erms);
> > +
> > +  return OPTIMIZE1 (sse2_unaligned);
> > +}
> > +
> > +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > +
> > +weak_alias (__bzero, bzero)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 68a56797d4..a594f4176e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             __memset_avx512_no_vzeroupper)
> >            )
> >
> > +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > +  IFUNC_IMPL (i, name, bzero,
> > +           IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                           __bzero_sse2_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                           __bzero_sse2_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           CPU_FEATURE_USABLE (AVX2),
> > +                           __bzero_avx2_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           CPU_FEATURE_USABLE (AVX2),
> > +                           __bzero_avx2_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (RTM)),
> > +                           __bzero_avx2_unaligned_rtm)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (RTM)),
> > +                           __bzero_avx2_unaligned_erms_rtm)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_evex_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_evex_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_avx512_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_avx512_unaligned)
> > +          )
> > +
> >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> >    IFUNC_IMPL (i, name, rawmemchr,
> >             IFUNC_IMPL_ADD (array, i, rawmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > index 8ac3e479bb..5a5ee6f672 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > @@ -5,6 +5,7 @@
> >
> >  #define SECTION(p) p##.avx.rtm
> >  #define MEMSET_SYMBOL(p,s)   p##_avx2_##s##_rtm
> > +#define BZERO_SYMBOL(p,s)    p##_avx2_##s##_rtm
> >  #define WMEMSET_SYMBOL(p,s)  p##_avx2_##s##_rtm
> >
> >  #include "memset-avx2-unaligned-erms.S"
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index c0bf2875d0..a093a2831f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -14,6 +14,9 @@
> >    vmovd d, %xmm0; \
> >    movq r, %rax;
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> >
> > @@ -29,6 +32,9 @@
> >  # ifndef MEMSET_SYMBOL
> >  #  define MEMSET_SYMBOL(p,s) p##_avx2_##s
> >  # endif
> > +# ifndef BZERO_SYMBOL
> > +#  define BZERO_SYMBOL(p,s)  p##_avx2_##s
> > +# endif
> >  # ifndef WMEMSET_SYMBOL
> >  #  define WMEMSET_SYMBOL(p,s)        p##_avx2_##s
> >  # endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index 5241216a77..727c92133a 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 6370021506..5d8fa78f05 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > index 8a6f0c561a..329c58ee46 100644
> > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > @@ -22,6 +22,7 @@
> >
> >  #if IS_IN (libc)
> >  # define MEMSET_SYMBOL(p,s)  p##_sse2_##s
> > +# define BZERO_SYMBOL(p,s)   MEMSET_SYMBOL (p, s)
> >  # define WMEMSET_SYMBOL(p,s) p##_sse2_##s
> >
> >  # ifdef SHARED
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 1b502b78e4..7c94fcdae1 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -26,6 +26,10 @@
> >
> >  #include <sysdep.h>
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s)           MEMSET_SYMBOL (p, s)
> > +#endif
> > +
> >  #ifndef MEMSET_CHK_SYMBOL
> >  # define MEMSET_CHK_SYMBOL(p,s)              MEMSET_SYMBOL(p, s)
> >  #endif
> > @@ -87,6 +91,18 @@
> >  # define XMM_SMALL   0
> >  #endif
> >
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +# define SET_REG64   rcx
> > +# define SET_REG32   ecx
> > +# define SET_REG16   cx
> > +# define SET_REG8    cl
> > +#else
> > +# define SET_REG64   rsi
> > +# define SET_REG32   esi
> > +# define SET_REG16   si
> > +# define SET_REG8    sil
> > +#endif
> > +
> >  #define PAGE_SIZE 4096
> >
> >  /* Macro to calculate size of small memset block for aligning
> > @@ -96,18 +112,6 @@
> >
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> > -#endif
> > -
> > -     .section SECTION(.text),"ax",@progbits
> > -#if VEC_SIZE == 16 && IS_IN (libc)
> > -ENTRY (__bzero)
> > -     mov     %RDI_LP, %RAX_LP /* Set return value.  */
> > -     mov     %RSI_LP, %RDX_LP /* Set n.  */
> > -     xorl    %esi, %esi
> > -     pxor    %XMM0, %XMM0
> > -     jmp     L(entry_from_bzero)
> > -END (__bzero)
> > -weak_alias (__bzero, bzero)
> >  #endif
> >
> >  #if IS_IN (libc)
> > @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >       WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> >       WMEMSET_VDUP_TO_VEC0_LOW()
> >       cmpq    $VEC_SIZE, %rdx
> > -     jb      L(less_vec_no_vdup)
> > +     jb      L(less_vec_from_wmemset)
> >       WMEMSET_VDUP_TO_VEC0_HIGH()
> >       jmp     L(entry_from_wmemset)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > +#if VEC_SIZE > 16
> > +     BZERO_ZERO_VEC0 ()
> > +#endif
> > +     mov     %RDI_LP, %RAX_LP
> > +     mov     %RSI_LP, %RDX_LP
> > +#ifndef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +#endif
> > +     cmp     $VEC_SIZE, %RDX_LP
> > +     jb      L(less_vec_no_vdup)
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +#endif
> > +#if VEC_SIZE <= 16
> > +     BZERO_ZERO_VEC0 ()
> > +#endif
> > +     cmp     $(VEC_SIZE * 2), %RDX_LP
> > +     ja      L(more_2x_vec)
> > +     /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +     VMOVU   %VEC(0), (%rdi)
> > +     VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +     VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned))
> > +
> >  #if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >       cmp     %RDX_LP, %RCX_LP
> > @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >       /* Clear the upper 32 bits.  */
> >       mov     %edx, %edx
> >  # endif
> > -L(entry_from_bzero):
> >       cmpq    $VEC_SIZE, %rdx
> >       jb      L(less_vec)
> >       MEMSET_VDUP_TO_VEC0_HIGH()
> > @@ -187,6 +215,31 @@ END (__memset_erms)
> >  END (MEMSET_SYMBOL (__memset, erms))
> >  # endif
> >
> > +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > +# if VEC_SIZE > 16
> > +     BZERO_ZERO_VEC0 ()
> > +# endif
> > +     mov     %RDI_LP, %RAX_LP
> > +     mov     %RSI_LP, %RDX_LP
> > +# ifndef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +# endif
> > +     cmp     $VEC_SIZE, %RDX_LP
> > +     jb      L(less_vec_no_vdup)
> > +# ifdef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +# endif
> > +# if VEC_SIZE <= 16
> > +     BZERO_ZERO_VEC0 ()
> > +# endif
> > +     cmp     $(VEC_SIZE * 2), %RDX_LP
> > +     ja      L(stosb_more_2x_vec)
> > +     /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +     VMOVU   %VEC(0), (%rdi)
> > +     VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +     VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > +
> >  # if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >       cmp     %RDX_LP, %RCX_LP
> > @@ -229,6 +282,7 @@ L(last_2x_vec):
> >       .p2align 4,, 10
> >  L(less_vec):
> >  L(less_vec_no_vdup):
> > +L(less_vec_from_wmemset):
> >       /* Less than 1 VEC.  */
> >  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  #  error Unsupported VEC_SIZE!
> > @@ -374,8 +428,11 @@ L(less_vec):
> >       /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> >          xmm). This is only does anything for AVX2.  */
> >       MEMSET_VDUP_TO_VEC0_LOW ()
> > +L(less_vec_from_wmemset):
> > +#if VEC_SIZE > 16
> >  L(less_vec_no_vdup):
> >  #endif
> > +#endif
> >  L(cross_page):
> >  #if VEC_SIZE > 32
> >       cmpl    $32, %edx
> > @@ -386,7 +443,10 @@ L(cross_page):
> >       jge     L(between_16_31)
> >  #endif
> >  #ifndef USE_XMM_LESS_VEC
> > -     MOVQ    %XMM0, %rcx
> > +     MOVQ    %XMM0, %SET_REG64
> > +#endif
> > +#if VEC_SIZE <= 16
> > +L(less_vec_no_vdup):
> >  #endif
> >       cmpl    $8, %edx
> >       jge     L(between_8_15)
> > @@ -395,7 +455,7 @@ L(cross_page):
> >       cmpl    $1, %edx
> >       jg      L(between_2_3)
> >       jl      L(between_0_0)
> > -     movb    %sil, (%LESS_VEC_REG)
> > +     movb    %SET_REG8, (%LESS_VEC_REG)
> >  L(between_0_0):
> >       ret
> >
> > @@ -428,8 +488,8 @@ L(between_8_15):
> >       MOVQ    %XMM0, (%rdi)
> >       MOVQ    %XMM0, -8(%rdi, %rdx)
> >  #else
> > -     movq    %rcx, (%LESS_VEC_REG)
> > -     movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> > +     movq    %SET_REG64, (%LESS_VEC_REG)
> > +     movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >
> > @@ -442,8 +502,8 @@ L(between_4_7):
> >       MOVD    %XMM0, (%rdi)
> >       MOVD    %XMM0, -4(%rdi, %rdx)
> >  #else
> > -     movl    %ecx, (%LESS_VEC_REG)
> > -     movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> > +     movl    %SET_REG32, (%LESS_VEC_REG)
> > +     movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >
> > @@ -452,12 +512,12 @@ L(between_4_7):
> >  L(between_2_3):
> >       /* From 2 to 3.  No branch when size == 2.  */
> >  #ifdef USE_XMM_LESS_VEC
> > -     movb    %sil, (%rdi)
> > -     movb    %sil, 1(%rdi)
> > -     movb    %sil, -1(%rdi, %rdx)
> > +     movb    %SET_REG8, (%rdi)
> > +     movb    %SET_REG8, 1(%rdi)
> > +     movb    %SET_REG8, -1(%rdi, %rdx)
> >  #else
> > -     movw    %cx, (%LESS_VEC_REG)
> > -     movb    %sil, -1(%LESS_VEC_REG, %rdx)
> > +     movw    %SET_REG16, (%LESS_VEC_REG)
> > +     movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >  END (MEMSET_SYMBOL (__memset, unaligned_erms))
Adhemerval Zanella Netto Feb. 10, 2022, 12:35 p.m. UTC | #4
On 09/02/2022 19:14, Noah Goldstein wrote:
> On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
>>> Rebase against the current master branch.
>>>
>>> --
>>> memset with zero as the value to set is by far the majority value (99%+
>>> for Python3 and GCC).
>>>
>>> bzero can be slightly more optimized for this case by using a zero-idiom
>>> xor for broadcasting the set value to a register (vector or GPR).
>>>
>>> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
>>
>> Is it really worth to ressurerect bzero with this multiple ifunc variants?
>> Would Python3/GCC or any programs start to replace memset with bzero for
>> the sake of this optimization?
>>
>> I agree with Wilco where the gain are marginal in this case.
> 
> The cost is only 1 cache line and it doesn't interfere with memset at all
> so it's unlikely to cause any problems.
> 
> The saving is in the lane-cross broadcast which is on the critical
> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
> 
> As well for EVEX + AVX512, because it uses predicate execution for
> [0, VEC_SIZE] there is a slight benefit there (although only in throughput
> because the critical path in mask construction is longer than the lane
> VEC setup).
> 
> Agreed it's not clear if it's worth it to start replacing memset calls with
> bzero calls, but at the very least this will improve existing code that
> uses bzero.
> 

My point is this is a lot of code and infrastructure for a symbol marked
as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
marginal gains in specific cases.
Wilco Dijkstra Feb. 10, 2022, 1:01 p.m. UTC | #5
Hi,

>> The saving is in the lane-cross broadcast which is on the critical
>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).

What is the speedup in eg. bench-memset? Generally the OoO engine will
be able to hide a small increase in latency, so I'd be surprised it shows up
as a significant gain.

If you can show a good speedup in an important application (or benchmark
like SPEC2017) then it may be worth pursuing. However there are other
optimization opportunities that may be easier or give a larger benefit.

>> Agreed it's not clear if it's worth it to start replacing memset calls with
>> bzero calls, but at the very least this will improve existing code that
>> uses bzero.

No code uses bzero, no compiler emits bzero. It died 2 decades ago...

> My point is this is a lot of code and infrastructure for a symbol marked
> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> marginal gains in specific cases.

Indeed, what we really should discuss is how to remove the last traces of
bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
or could we just get rid of it altogether?

Cheers,
Wilco
Adhemerval Zanella Netto Feb. 10, 2022, 1:10 p.m. UTC | #6
On 10/02/2022 10:01, Wilco Dijkstra wrote:
> Hi,
> 
>>> The saving is in the lane-cross broadcast which is on the critical
>>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
> 
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.
> 
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.
> 
>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>> bzero calls, but at the very least this will improve existing code that
>>> uses bzero.
> 
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
> 
>> My point is this is a lot of code and infrastructure for a symbol marked
>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>> marginal gains in specific cases.
> 
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?

We need to keep the symbols as-is afaiu, since callers might still target
old POSIX where the symbol is defined as supported.  We might add either a
compiler or a linker warning stating the symbol is deprecated, but imho it
would just be better if we stop trying to microoptimize it and just use
the generic interface (which calls memset).
Adhemerval Zanella Netto Feb. 10, 2022, 1:16 p.m. UTC | #7
On 10/02/2022 10:10, Adhemerval Zanella wrote:
> 
> 
> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>> Hi,
>>
>>>> The saving is in the lane-cross broadcast which is on the critical
>>>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>>
>> What is the speedup in eg. bench-memset? Generally the OoO engine will
>> be able to hide a small increase in latency, so I'd be surprised it shows up
>> as a significant gain.
>>
>> If you can show a good speedup in an important application (or benchmark
>> like SPEC2017) then it may be worth pursuing. However there are other
>> optimization opportunities that may be easier or give a larger benefit.
>>
>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>> bzero calls, but at the very least this will improve existing code that
>>>> uses bzero.
>>
>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>>
>>> My point is this is a lot of code and infrastructure for a symbol marked
>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>> marginal gains in specific cases.
>>
>> Indeed, what we really should discuss is how to remove the last traces of
>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>> or could we just get rid of it altogether?
> 
> We need to keep the symbols as-is afaiu, since callers might still target
> old POSIX where the symbol is defined as supported.  We might add either
> compiler or linker warning stating the symbols is deprecated, but imho it
> would just be better if we stop trying to microoptimize it and just use
> the generic interface (which call memset).

And it makes even more sense since gcc either generates a memset call or
expands bzero inline, so there is even less point in adding either an
optimized version of it or ifunc variants that call memset (to avoid the
intra-plt call).

I will probably clean up all the bzero optimizations we have; it is
unfortunate that this patch got in without further discussion.
Wilco Dijkstra Feb. 10, 2022, 1:17 p.m. UTC | #8
Hi,

> We need to keep the symbols as-is afaiu, since callers might still target
> old POSIX where the symbol is defined as supported.  We might add either
> compiler or linker warning stating the symbols is deprecated, but imho it
> would just be better if we stop trying to microoptimize it and just use
> the generic interface (which call memset).

Compilers have been doing that forever - if you use bzero GCC and LLVM
emit a call to memset. No new calls are emitted to these functions, so there
is no point in having target-specific optimization at all.

However we could disallow use of these functions at all in future GLIBCs.

Cheers,
Wilco
Adhemerval Zanella Netto Feb. 10, 2022, 1:22 p.m. UTC | #9
On 10/02/2022 10:17, Wilco Dijkstra wrote:
> Hi,
> 
>> We need to keep the symbols as-is afaiu, since callers might still target
>> old POSIX where the symbol is defined as supported.  We might add either
>> compiler or linker warning stating the symbols is deprecated, but imho it
>> would just be better if we stop trying to microoptimize it and just use
>> the generic interface (which call memset).
> 
> Compilers have been doing that forever - if you use bzero GCC and LLVM
> emit a call to memset. No new calls are emitted to these functions, so there
> is no point in having target-specific optimization at all.
> 
> However we could disallow use of these functions at all in future GLIBCs.

Unfortunately we cannot stop providing such symbols unless we stop being
compatible with legacy POSIX.  And I think we already do not use these
internally.

Also we can start to remove the other bstring functions as well, bcmp and
bcopy.
Alejandro Colomar Feb. 10, 2022, 5:50 p.m. UTC | #10
Hi,


On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote:
> On 10/02/2022 10:10, Adhemerval Zanella wrote:
>> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>>> bzero calls, but at the very least this will improve existing code that
>>>>> uses bzero.
>>>
>>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...

Sorry to ruin your day, but there's a bzero(3) user here :)

There are rational reasons to prefer bzero(3) due to its better
interface, and only one to prefer memset(3): standards people decided to
remove bzero(3).  See <https://stackoverflow.com/a/17097978/6872717>.

Consider the following quote from "UNIX Network Programming" by Stevens
et al., Section 1.2 (emphasis added):
> `bzero` is not an ANSI C function. It is derived from early Berkely
> networking code. Nevertheless, we use it throughout the text, instead
> of the ANSI C `memset` function, because `bzero` is easier to remember
> (with only two arguments) than `memset` (with three arguments). Almost
> every vendor that supports the sockets API also provides `bzero`, and
> if not, we provide a macro definition in our `unp.h` header.
>
> Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens
> 1996] made the mistake of swapping the second
> and third arguments to `memset` in 10 occurrences in the first
> printing**. A C compiler cannot catch this error because both arguments
> are of the same type. (Actually, the second argument is an `int` and
> the third argument is `size_t`, which is typically an `unsigned int`,
> but the values specified, 0 and 16, respectively, are still acceptable
> for the other type of argument.) The call to `memset` still worked,
> because only a few of the socket functions actually require that the
> final 8 bytes of an Internet socket address structure be set to 0.
> Nevertheless, it was an error, and one that could be avoided by using
> `bzero`, because swapping the two arguments to `bzero` will always be
> caught by the C compiler if function prototypes are used.

I consistently use bzero(3), and dislike the interface of memset(3) for
zeroing memory.  I checked how many memset(3) calls there are in a
codebase of mine, and there is exactly 1 call to memset(3), for setting
an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX,
sizeof(arr)).  And there are 41 calls to bzero(3).
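
A minimal sketch of that kind of slip, using a hypothetical 16-byte
buffer (not the TCPv3 code):

#include <string.h>
#include <strings.h>

int main(void)
{
	char addr[16];

	/* Arguments swapped: fills 0 bytes instead of 16.  Both the fill
	   value and the length are integers, so the type system does not
	   object.  */
	memset(addr, sizeof(addr), 0);

	/* The same swap with bzero is a pointer/integer mismatch as soon
	   as the prototype is in scope:
	   bzero(sizeof(addr), addr);  */
	bzero(addr, sizeof(addr));

	return 0;
}

The swapped memset call gets past the type checker (at best a heuristic
such as gcc's -Wmemset-transposed-args catches it), while uncommenting
the swapped bzero call draws a diagnostic from any compiler that has seen
the prototype.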

>>>
>>>> My point is this is a lot of code and infrastructure for a symbol marked
>>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>>> marginal gains in specific cases.
>>>
>>>> Indeed, what we really should discuss is how to remove the last traces of
>>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>>> or could we just get rid of it altogether?

I think it's sad that POSIX removed bzero(3).  In the end, people need
to zero memory, and there's no simpler interface than bzero(3) for that.
 memset(3) has a useless extra parameter.  Even if you just do a simple
wrapper as the following, which is no big overhead for glibc I guess,
you would be improving (or not worsening) my and hopefully others' lives:

static inline void bzero(void *s, size_t n)
{
	memset(s, 0, n);
}

Is that really a pain to maintain?

If libc ever removes bzero(3), it's not like I'm going to start using
memset(3).  I'll just define it myself.  That's not an improvement, but
the opposite, IMO.

Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
but I don't expect that any soon :(.

>>
>> We need to keep the symbols as-is afaiu, since callers might still target
>> old POSIX where the symbol is defined as supported.  We might add either
>> compiler or linker warning stating the symbols is deprecated, but imho it
>> would just be better if we stop trying to microoptimize it and just use
>> the generic interface (which call memset).

No please, I know they are deprecated, and I explicitly want to use them.
I don't need some extra spurious warning.

Other projects that I have touched (including nginx and shadow-utils)
seem to have a similar opinion to mine, and they define memzero() or
some other similar name, with an interface identical to bzero(3) (just a
different name to avoid problems), to wrap either bzero(3) or memset(3),
depending on what is available.
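
As a rough sketch of what such a wrapper tends to look like (HAVE_BZERO
is a hypothetical configure-time macro here, not necessarily what those
projects use):

#include <stddef.h>
#include <string.h>
#ifdef HAVE_BZERO
# include <strings.h>
#endif

/* Two-argument zeroing helper; the name avoids clashing with bzero(3).  */
static inline void memzero(void *s, size_t n)
{
#ifdef HAVE_BZERO
	bzero(s, n);
#else
	memset(s, 0, n);
#endif
}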

Thanks,

Alex
Noah Goldstein Feb. 10, 2022, 6:28 p.m. UTC | #11
On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> >> The saving is in the lane-cross broadcast which is on the critical
> >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.
>
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.

I very much doubt any benefit on SPEC/other unless the compiler
decided to build zeros with long dependency chains instead of
xor.

>
> >> Agreed it's not clear if it's worth it to start replacing memset calls with
> >> bzero calls, but at the very least this will improve existing code that
> >> uses bzero.
>
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
> > My point is this is a lot of code and infrastructure for a symbol marked
> > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> > marginal gains in specific cases.
>
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?
>
> Cheers,
> Wilco
Noah Goldstein Feb. 10, 2022, 6:35 p.m. UTC | #12
On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> >> The saving is in the lane-cross broadcast which is on the critical
> >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.

Well, comparing the previous sse2 bzero against the avx2/evex/avx512
versions there is obviously a speedup. Comparing memset-${version} vs
bzero-${version} it's ambiguous whether there is any benefit.

>
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.
>
> >> Agreed it's not clear if it's worth it to start replacing memset calls with
> >> bzero calls, but at the very least this will improve existing code that
> >> uses bzero.
>
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
> > My point is this is a lot of code and infrastructure for a symbol marked
> > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> > marginal gains in specific cases.
>
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?
>
> Cheers,
> Wilco
Wilco Dijkstra Feb. 10, 2022, 7:19 p.m. UTC | #13
Hi Alex,

Don't worry, you're not a bzero user - just check your binaries and
I bet you will find zero calls to bzero!

GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to
avoid unnecessary proliferation of almost identical string functions where
there is absolutely no performance benefit. In fact this duplication is typically
a net loss, partly due to reducing optimization effort and the extra runtime
overhead due to having multiple copies of the same function in caches and
predictors.

However if you care about having a standardized call to zero memory, just
propose it for the next C/C++ standard. Given we will not agree on naming,
we could just add memzero, memclr, memset_zero and bzero as synonyms
for memset.

Cheers,
Wilco
Adhemerval Zanella Netto Feb. 10, 2022, 7:42 p.m. UTC | #14
On 10/02/2022 14:50, Alejandro Colomar (man-pages) wrote:
> Hi,
> 
> 
> On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote:
>> On 10/02/2022 10:10, Adhemerval Zanella wrote:
>>> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>>>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>>>> bzero calls, but at the very least this will improve existing code that
>>>>>> uses bzero.
>>>>
>>>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
> 
> Sorry to ruin your day, but there's a bzero(3) user here :)
> 
> There are rational reasons to prefer bzero(3) due to it's better
> interface, and only one to prefer memset(3): standards people decided to
> remove bzero(3).  See <https://stackoverflow.com/a/17097978/6872717>.
> 
> Consider the following quote from "UNIX Network Programming" by Stevens
> et al., Section 1.2 (emphasis added):
>> `bzero` is not an ANSI C function. It is derived from early Berkely
>> networking code. Nevertheless, we use it throughout the text, instead
>> of the ANSI C `memset` function, because `bzero` is easier to remember
>> (with only two arguments) than `memset` (with three arguments). Almost
>> every vendor that supports the sockets API also provides `bzero`, and
>> if not, we provide a macro definition in our `unp.h` header.
>>
>> Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens
>> 1996] made the mistake of swapping the second
>> and third arguments to `memset` in 10 occurrences in the first
>> printing**. A C compiler cannot catch this error because both arguments
>> are of the same type. (Actually, the second argument is an `int` and
>> the third argument is `size_t`, which is typically an `unsigned int`,
>> but the values specified, 0 and 16, respectively, are still acceptable
>> for the other type of argument.) The call to `memset` still worked,
>> because only a few of the socket functions actually require that the
>> final 8 bytes of an Internet socket address structure be set to 0.
>> Nevertheless, it was an error, and one that could be avoided by using
>> `bzero`, because swapping the two arguments to `bzero` will always be
>> caught by the C compiler if function prototypes are used.
> 
> I consistently use bzero(3), and dislike the interface of memset(3) for
> zeroing memory.  I checked how many memset(3) calls there are in a
> codebase of mine, and there is exactly 1 call to memset(3), for setting
> an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX,
> sizeof(arr)).  And there are 41 calls to bzero(3).

The bzero symbol won't go away since glibc will continue to support old
POSIX interfaces, but Wilco has a point: it is an interface that has been
deprecated for more than 2 decades and usually only BSD-like code
still keeps using it (explicit_bzero originally came from OpenBSD).

> 
>>>>
>>>>> My point is this is a lot of code and infrastructure for a symbol marked
>>>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>>>> marginal gains in specific cases.
>>>>
>>>> Indeed, what we really should discuss is how to remove the last traces of
>>>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>>>> or could we just get rid of it altogether?
> 
> I think it's sad that POSIX removed bzero(3).  In the end, people need
> to zero memory, and there's no simpler interface than bzero(3) for that.
>  memset(3) has a useless extra parameter.  Even if you just do a simple
> wrapper as the following, which is no big overhead for glibc I guess,
> you would be improving (or not worsening) my and hopefully others' lives:
> 
> static inline bzero(s, n)
> {
> 	memset(s, 0, n);
> }
> 
> Is that really a pain to maintain?

Yes, this is *exactly* what we are trying to remove and I will oppose
reintroducing it after all the cleanups we did on this front (for instance
18b10de7ced9e9).

Besides, your code is most likely calling memset under the hood; gcc by
default converts bzero calls to memset.  It seems to be really tricky to
actually force gcc to emit a bzero call to libc; I could only make it work
by not including string.h, adding an explicit prototype for bzero, and
using -fno-builtin:

$ cat bzero.c 
#include <stddef.h>
extern void bzero (void *, size_t s);

int main (int argc, char *argv[])
{
  char b[1024];
  bzero (b, sizeof b);
  return 0;
}
$ gcc -Wall -fno-builtin bzero.c -o test && objdump -R test | grep -w bzero
0000000000003fd0 R_X86_64_JUMP_SLOT  bzero@GLIBC_2.2.5

> 
> If libc ever removes bzero(3), it's not like I'm going to start using
> memset(3).  I'll just define it myself.  That's not an improvement, but
> the opposite, IMO.
> 
> Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
> but I don't expect that any soon :(.

I really think it is unlikely, but it is just my view.

> 
>>>
>>> We need to keep the symbols as-is afaiu, since callers might still target
>>> old POSIX where the symbol is defined as supported.  We might add either
>>> compiler or linker warning stating the symbols is deprecated, but imho it
>>> would just be better if we stop trying to microoptimize it and just use
>>> the generic interface (which call memset).
> 
> No please, I know they are deprecated, and explicitly want to use it.
> Don't need some extra spurious warning.
> 
> Other projects that I have touched (including nginx and shadow-utils),
> seem to have some similar opinion to me, and they define memzero() or
> some other similar name, with an interface identical to bzero(3) (just a
> different name to avoid problems), to wrap either bzero(3) or memset(3),
> depending on what is available.

We are discussing different subjects here: what I want is to remove the
glibc *internal* optimization for bzero, which is essentially an
implementation detail.  At first glance that might seem to change
performance; however, gcc does a thorough job of replacing bzero/bcmp/bcopy
with their str* counterparts, so it is highly unlikely that newer binaries
will actually call bzero.

I did an experiment on my system and indeed there are no calls to bzero.

$ cat count_bzero.sh
#!/bin/bash

files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
total=0
for file in $files; do
  bzero=`objdump -R $file 2>&1`
  if [ $? -eq 0 ]; then
    ncalls=`echo $bzero | grep -w bzero | wc -l`
    ((total=total+ncalls))
    if [ $ncalls -gt 0 ]; then
      echo "$file: $ncalls"
    fi
  fi
done
echo "TOTAL=$total"
$ ./count_bzero.sh
TOTAL=0
Alejandro Colomar Feb. 10, 2022, 8:27 p.m. UTC | #15
Hi Wilco and Adhemerval,


On 2/10/22 14:01, Wilco Dijkstra via Libc-alpha wrote:
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
>> My point is this is a lot of code and infrastructure for a symbol marked
>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>> marginal gains in specific cases.

That was what triggered my fears.  I as a user, not a compiler or libc
writer, am not really interested in the implementation details, as long
as the implementation does the magic for me.  I want to be able to keep
using bzero(3) in source code, but if the compiler transforms it into
memset(3) (or a for loop), I don't even want to know.



On 2/10/22 20:19, Wilco Dijkstra wrote:
> Hi Alex,
> 
> Don't worry, you're not a bzero user - just check your binaries and
> I bet you will find zero calls to bzero!
> 

:)

> GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to
> avoid unnecessary proliferation of almost identical string functions where
> there is absolutely no performance benefit. In fact this duplication is typically
> a net loss, partly due to reducing optimization effort and the extra runtime
> overhead due to having multiple copies of the same function in caches and
> predictors.
> 
> However if you care about having a standardized call to zero memory, just
> propose it for the next C/C++ standard. Given we will not agree on naming,
> we could just add memzero, memclr, memset_zero and bcopy as synonyms
> for memset.

Hmm, I'll do so.  Will try to press bzero(3) into C3x.

On 2/10/22 20:42, Adhemerval Zanella wrote:
> The bzero won't go away since glibc will continue to support old POSIX
> interfaces, but Wilco has a point: it is an interface that has been
> deprecated for more than 2 decades and usually only BSD-like code
> still keeps using it (explicit_bzero was initialized from openbsd).
> [...]
>> static inline bzero(s, n)
>> {
>> 	memset(s, 0, n);
>> }
>>
>> Is that really a pain to maintain?
>
> Yes, this is *exactly* what we are trying to remove and I will oppose to
> reintroduce it after all the cleanups we did in this front (for instance
> 18b10de7ced9e9).
>
> Besides your code most likely is calling memset under the hoods, gcc by
> default converts bzero calls to memset.  It seems to be really tricky to
> actually force gcc emit a bzero call to libc, I would could make by not
> including string.h, adding an explicit prototype to bzero, along with
> -fno-builtin:

I guess that it's GCC that defines that (or equivalent) code, instead of
glibc.  As a user-space programmer, that's fine by me :-).

>> If libc ever removes bzero(3), it's not like I'm going to start using
>> memset(3).  I'll just define it myself.  That's not an improvement, but
>> the opposite, IMO.
>>
>> Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
>> but I don't expect that any soon :(.
>
> I really think it is unlikely, but it is just my view.
> [...]
>> Other projects that I have touched (including nginx and shadow-utils),
>> seem to have some similar opinion to me, and they define memzero() or
>> some other similar name, with an interface identical to bzero(3) (just a
>> different name to avoid problems), to wrap either bzero(3) or memset(3),
>> depending on what is available.
>
> We are discussing different subjects here: what I want is to remove the
> glibc *internal* optimization for bzero, which is essentially an
> implementation detail.  In a first glance it would change performance,
> however gcc does a hard job replacing bzero/bcmp/bcopy with their
> str* counterparts, so it highly unlike that newer binaries will actually
> call bzero.

Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
continue supporting it.  Just remember that some users keep writing and
wanting to write bzero(3) instead of memset(3) in their .c files, so
it's far from being dead in source code.

Cheers,

Alex
Adhemerval Zanella Netto Feb. 10, 2022, 8:42 p.m. UTC | #16
On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>> We are discussing different subjects here: what I want is to remove the
>> glibc *internal* optimization for bzero, which is essentially an
>> implementation detail.  In a first glance it would change performance,
>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>> str* counterparts, so it highly unlike that newer binaries will actually
>> call bzero.
> 
> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
> continue supporting it.  Just remember that some users keep writing and
> wanting to write bzero(3) instead of memset(3) in their .c files, so
> it's far from being dead in source code.

Again, I am not proposing to *remove* bzero, but rather the internal
optimizations that currently only add code complexity and maintenance
burden.  My patchset [1] will keep the ABI as-is; the difference is that
bcopy and bzero will use the default implementation on all architectures.

[1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
Patrick McGehearty Feb. 10, 2022, 9:07 p.m. UTC | #17
Just as another point of information, Solaris libc implemented
bzero as moving arguments around appropriately then jumping to
memset. No one noticed enough to file a complaint. Of course,
short fixed-length bzero was handled with inline stores of zero
by the compiler. For long vector bzeroing, the overhead was
negligible.

When certain Sparc hardware implementations provided faster methods
for zeroing a cache line at a time on cache line boundaries,
memset added a single test for zero if and only if the length passed
to memset was over a threshold that seemed likely to make it
worthwhile to use the faster method. The principal advantage
of the fast zeroing operation is that it did not require data
to move from memory to cache before writing zeros to memory,
protecting cache locality in the face of large block zeroing.
I was responsible for much of that optimization effort.
Whether that optimization was really worth it is open for debate
for a variety of reasons that I won't go into just now.
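
In rough outline, the dispatch looked something like the sketch below;
the names and the threshold are made up here, and the cache-line zeroing
path is only stubbed out with a plain memset:

#include <stddef.h>
#include <string.h>

#define ZERO_THRESHOLD 256	/* made-up value; tuned per CPU in practice */

/* Stand-in for a block-zeroing path that writes whole cache lines
   without first reading them into the cache.  */
static void zero_cache_lines(void *s, size_t n)
{
	memset(s, 0, n);
}

static void *memset_with_zero_path(void *s, int c, size_t n)
{
	/* The single extra test: only large zeroing requests take the
	   block-zeroing path.  */
	if (c == 0 && n >= ZERO_THRESHOLD) {
		zero_cache_lines(s, n);
		return s;
	}
	return memset(s, c, n);
}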

Apps still used bzero or memset(target,zero,length) according to
their preferences, but the code was unified under memset.

I am inclined to agree with keeping bzero in the API for
compatibility with old code/old binaries/old programmers. :-)

Using shared memset code for the implementation of bzero
is worthwhile for reducing future maintenance costs.

- Patrick McGehearty
former Sparc/Solaris performance tuning person



On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>
> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>>> We are discussing different subjects here: what I want is to remove the
>>> glibc *internal* optimization for bzero, which is essentially an
>>> implementation detail.  In a first glance it would change performance,
>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>>> str* counterparts, so it highly unlike that newer binaries will actually
>>> call bzero.
>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
>> continue supporting it.  Just remember that some users keep writing and
>> wanting to write bzero(3) instead of memset(3) in their .c files, so
>> it's far from being dead in source code.
> Again, I am not proposing to *remove* bzero, but rather the internal
> optimizations that currently only adds code complexity and maintenance
> burden.  My patchset [1] will keep the ABI as-is, the difference is
> bcopy and bzero will use the default implementation on all architectures.
>
> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
Alejandro Colomar Feb. 10, 2022, 10 p.m. UTC | #18
On 2/10/22 21:42, Adhemerval Zanella wrote:
> Again, I am not proposing to *remove* bzero, but rather the internal
> optimizations that currently only adds code complexity and maintenance
> burden.  My patchset [1] will keep the ABI as-is, the difference is
> bcopy and bzero will use the default implementation on all architectures.

That's good.

Thanks,

Alex
Adhemerval Zanella Netto Feb. 11, 2022, 1:01 p.m. UTC | #19
On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> Just as another point of information, Solaris libc implemented
> bzero as moving arguments around appropriately then jumping to
> memset. Noone noticed enough to file a complaint. Of course,
> short fixed-length bzero was handled with in line stores of zero
> by the compiler. For long vector bzeroing, the overhead was
> negligible.
> 
> When certain Sparc hardware implementations provided faster methods
> for zeroing a cache line at a time on cache line boundaries,
> memset added a single test for zero ifandonlyif the length of code
> to memset was over a threshold that seemed likely to make it
> worthwhile to use the faster method. The principal advantage
> of the fast zeroing operation is that it did not require data
> to move from memory to cache before writing zeros to memory,
> protecting cache locality in the face of large block zeroing.
> I was responsible for much of that optimization effort.
> Whether that optimization was really worth it is open for debate
> for a variety of reasons that I won't go into just now.

Afaik this is pretty much what optimized memset implementations
do, if the architecture allows it. For instance, aarch64 uses
'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
similar strategy.

> 
> Apps still used bzero or memset(target,zero,length) according to
> their preferences, but the code was unified under memset.
> 
> I am inclined to agree with keeping bzero in the API for
> compatibility with old code/old binaries/old programmers. :-)

The main driver to remove the bzero internal implementation is just
that *currently* gcc does not generate bzero calls by default
(I couldn't find a single binary that calls bzero in my system).

So to actually see any performance advantage from the optimized
bzero, we would need to revisit the gcc optimization that transforms
it into memset (which would need to be applied on a per-architecture
basis), and it seems highly unlikely the gcc maintainers would accept
that.

Some time ago LLVM tried to do something similar with bcmp, but in the
end it was not a good idea to reuse an already defined symbol and it
ended up with __memcmp_eq instead.

> 
> Using shared memset code for the implementation of bzero
> is worthwhile for reducing future maintenance costs.
> 
> - Patrick McGehearty
> former Sparc/Solaris performance tuning person
> 
> 
> 
> On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>
>> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>>>> We are discussing different subjects here: what I want is to remove the
>>>> glibc *internal* optimization for bzero, which is essentially an
>>>> implementation detail.  In a first glance it would change performance,
>>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>>>> str* counterparts, so it highly unlike that newer binaries will actually
>>>> call bzero.
>>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
>>> continue supporting it.  Just remember that some users keep writing and
>>> wanting to write bzero(3) instead of memset(3) in their .c files, so
>>> it's far from being dead in source code.
>> Again, I am not proposing to *remove* bzero, but rather the internal
>> optimizations that currently only adds code complexity and maintenance
>> burden.  My patchset [1] will keep the ABI as-is, the difference is
>> bcopy and bzero will use the default implementation on all architectures.
>>
>> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
>
Noah Goldstein Feb. 12, 2022, 11:46 p.m. UTC | #20
On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> > Just as another point of information, Solaris libc implemented
> > bzero as moving arguments around appropriately then jumping to
> > memset. Noone noticed enough to file a complaint. Of course,
> > short fixed-length bzero was handled with in line stores of zero
> > by the compiler. For long vector bzeroing, the overhead was
> > negligible.
> >
> > When certain Sparc hardware implementations provided faster methods
> > for zeroing a cache line at a time on cache line boundaries,
> > memset added a single test for zero ifandonlyif the length of code
> > to memset was over a threshold that seemed likely to make it
> > worthwhile to use the faster method. The principal advantage
> > of the fast zeroing operation is that it did not require data
> > to move from memory to cache before writing zeros to memory,
> > protecting cache locality in the face of large block zeroing.
> > I was responsible for much of that optimization effort.
> > Whether that optimization was really worth it is open for debate
> > for a variety of reasons that I won't go into just now.
>
> Afaik this is pretty much what optimized memset implementations
> does, if architecture allows it. For instance, aarch64 uses
> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> similar strategy.
>
> >
> > Apps still used bzero or memset(target,zero,length) according to
> > their preferences, but the code was unified under memset.
> >
> > I am inclined to agree with keeping bzero in the API for
> > compatibility with old code/old binaries/old programmers. :-)
>
> The main driver to remove the bzero internal implementation is just
> the *currently* gcc just do not generate bzero calls as default
> (I couldn't find a single binary that calls bzero in my system).

Does it make sense then to add '__memsetzero' so that we can have
a function optimized for setting zero?

>
> So to actually see any performance advantage from the optimized
> bzero, we will need to reevaluate the gcc optimization to transform
> it on memset (which will need to be applied per-architecture base)
> which I seem highly unlikely gcc maintainer will accept it.
>
> Some time ago LLVM tried to do something similar to bcmp, but in the
> end it was not a good idea to use an already define symbol and it
> ended up with __memcmp_eq instead.
>
> >
> > Using shared memset code for the implementation of bzero
> > is worthwhile for reducing future maintenance costs.
> >
> > - Patrick McGehearty
> > former Sparc/Solaris performance tuning person
> >
> >
> >
> > On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
> >>
> >> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
> >>>> We are discussing different subjects here: what I want is to remove the
> >>>> glibc *internal* optimization for bzero, which is essentially an
> >>>> implementation detail.  In a first glance it would change performance,
> >>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
> >>>> str* counterparts, so it highly unlike that newer binaries will actually
> >>>> call bzero.
> >>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
> >>> continue supporting it.  Just remember that some users keep writing and
> >>> wanting to write bzero(3) instead of memset(3) in their .c files, so
> >>> it's far from being dead in source code.
> >> Again, I am not proposing to *remove* bzero, but rather the internal
> >> optimizations that currently only adds code complexity and maintenance
> >> burden.  My patchset [1] will keep the ABI as-is, the difference is
> >> bcopy and bzero will use the default implementation on all architectures.
> >>
> >> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
> >
Adhemerval Zanella Netto Feb. 14, 2022, 12:07 p.m. UTC | #21
On 12/02/2022 20:46, Noah Goldstein wrote:
> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>> Just as another point of information, Solaris libc implemented
>>> bzero as moving arguments around appropriately then jumping to
>>> memset. Noone noticed enough to file a complaint. Of course,
>>> short fixed-length bzero was handled with in line stores of zero
>>> by the compiler. For long vector bzeroing, the overhead was
>>> negligible.
>>>
>>> When certain Sparc hardware implementations provided faster methods
>>> for zeroing a cache line at a time on cache line boundaries,
>>> memset added a single test for zero ifandonlyif the length of code
>>> to memset was over a threshold that seemed likely to make it
>>> worthwhile to use the faster method. The principal advantage
>>> of the fast zeroing operation is that it did not require data
>>> to move from memory to cache before writing zeros to memory,
>>> protecting cache locality in the face of large block zeroing.
>>> I was responsible for much of that optimization effort.
>>> Whether that optimization was really worth it is open for debate
>>> for a variety of reasons that I won't go into just now.
>>
>> Afaik this is pretty much what optimized memset implementations
>> does, if architecture allows it. For instance, aarch64 uses
>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>> similar strategy.
>>
>>>
>>> Apps still used bzero or memset(target,zero,length) according to
>>> their preferences, but the code was unified under memset.
>>>
>>> I am inclined to agree with keeping bzero in the API for
>>> compatibility with old code/old binaries/old programmers. :-)
>>
>> The main driver to remove the bzero internal implementation is just
>> the *currently* gcc just do not generate bzero calls as default
>> (I couldn't find a single binary that calls bzero in my system).
> 
> Does it make sense then to add '__memsetzero' so that we can have
> a function optimized for setting zero?

Will it really be a huge gain, rather than a microoptimization that will
just add a bunch more ifunc variants along with the maintenance cost
associated with them?

My understanding is __memsetzero would maybe yield some gain in the
store mask generation (some architectures have a zero register
or an instruction to generate one); however, it would require using
the same strategy as memset, i.e. the architecture-specific instructions
that optimize cache utilization (dc zva, dcbz).

So it would mostly require a lot of arch-specific code to share
the memset code with __memsetzero (to avoid increasing code size),
so I am not sure if this is really a gain in the long term.
Noah Goldstein Feb. 14, 2022, 12:41 p.m. UTC | #22
On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 12/02/2022 20:46, Noah Goldstein wrote:
> > On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >>
> >>
> >> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> >>> Just as another point of information, Solaris libc implemented
> >>> bzero as moving arguments around appropriately then jumping to
> >>> memset. Noone noticed enough to file a complaint. Of course,
> >>> short fixed-length bzero was handled with in line stores of zero
> >>> by the compiler. For long vector bzeroing, the overhead was
> >>> negligible.
> >>>
> >>> When certain Sparc hardware implementations provided faster methods
> >>> for zeroing a cache line at a time on cache line boundaries,
> >>> memset added a single test for zero ifandonlyif the length of code
> >>> to memset was over a threshold that seemed likely to make it
> >>> worthwhile to use the faster method. The principal advantage
> >>> of the fast zeroing operation is that it did not require data
> >>> to move from memory to cache before writing zeros to memory,
> >>> protecting cache locality in the face of large block zeroing.
> >>> I was responsible for much of that optimization effort.
> >>> Whether that optimization was really worth it is open for debate
> >>> for a variety of reasons that I won't go into just now.
> >>
> >> Afaik this is pretty much what optimized memset implementations
> >> does, if architecture allows it. For instance, aarch64 uses
> >> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> >> similar strategy.
> >>
> >>>
> >>> Apps still used bzero or memset(target,zero,length) according to
> >>> their preferences, but the code was unified under memset.
> >>>
> >>> I am inclined to agree with keeping bzero in the API for
> >>> compatibility with old code/old binaries/old programmers. :-)
> >>
> >> The main driver to remove the bzero internal implementation is just
> >> the *currently* gcc just do not generate bzero calls as default
> >> (I couldn't find a single binary that calls bzero in my system).
> >
> > Does it make sense then to add '__memsetzero' so that we can have
> > a function optimized for setting zero?
>
> Will it be really a huge gain instead of a microoptimization that will
> just a bunch of more ifunc variants along with the maintenance cost
> associated with this?
Is there any way it can be set up so that one C impl can cover all the
arches that want to just leave `__memsetzero` as an alias to `memset`?
I know they have incompatible interfaces that make it hard, but would
a weak static inline in string.h work?

For some of the shorter control flows (which are generally small sizes
and very hot) we saw reasonable benefits on x86_64.

The most significant was the EVEX/AVX2 [32, 64] case where it netted
us ~25% throughput. This is a pretty hot set value so it may be worth it.

>
> My understanding is __memsetzero would maybe yield some gain in the
> store mask generation (some architecture might have a zero register
> or some instruction to generate one), however it would require to
> use the same strategy as memset to use specific architecture instruction
> that optimize cache utilization (dc zva, dcbz).
>
> So it would mostly require a lot of arch-specific code to to share
> the memset code with __memsetzero (to avoid increasing code size),
> so I am not sure if this is really a gain in the long term.

It's worth noting that between the two `memset` is the cold function
and `__memsetzero` is the hot one. Based on profiles of GCC11 and
Python3.7.7, setting zero covers 99%+ of cases.
Adhemerval Zanella Netto Feb. 14, 2022, 2:07 p.m. UTC | #23
On 14/02/2022 09:41, Noah Goldstein wrote:
> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 12/02/2022 20:46, Noah Goldstein wrote:
>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>>
>>>>
>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>>>> Just as another point of information, Solaris libc implemented
>>>>> bzero as moving arguments around appropriately then jumping to
>>>>> memset. Noone noticed enough to file a complaint. Of course,
>>>>> short fixed-length bzero was handled with in line stores of zero
>>>>> by the compiler. For long vector bzeroing, the overhead was
>>>>> negligible.
>>>>>
>>>>> When certain Sparc hardware implementations provided faster methods
>>>>> for zeroing a cache line at a time on cache line boundaries,
>>>>> memset added a single test for zero ifandonlyif the length of code
>>>>> to memset was over a threshold that seemed likely to make it
>>>>> worthwhile to use the faster method. The principal advantage
>>>>> of the fast zeroing operation is that it did not require data
>>>>> to move from memory to cache before writing zeros to memory,
>>>>> protecting cache locality in the face of large block zeroing.
>>>>> I was responsible for much of that optimization effort.
>>>>> Whether that optimization was really worth it is open for debate
>>>>> for a variety of reasons that I won't go into just now.
>>>>
>>>> Afaik this is pretty much what optimized memset implementations
>>>> does, if architecture allows it. For instance, aarch64 uses
>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>>>> similar strategy.
>>>>
>>>>>
>>>>> Apps still used bzero or memset(target,zero,length) according to
>>>>> their preferences, but the code was unified under memset.
>>>>>
>>>>> I am inclined to agree with keeping bzero in the API for
>>>>> compatibility with old code/old binaries/old programmers. :-)
>>>>
>>>> The main driver to remove the bzero internal implementation is just
>>>> the *currently* gcc just do not generate bzero calls as default
>>>> (I couldn't find a single binary that calls bzero in my system).
>>>
>>> Does it make sense then to add '__memsetzero' so that we can have
>>> a function optimized for setting zero?
>>
>> Will it be really a huge gain instead of a microoptimization that will
>> just a bunch of more ifunc variants along with the maintenance cost
>> associated with this?
> Is there any way it can be setup so that one C impl can cover all the
> arch that want to just leave `__memsetzero` as an alias to `memset`?
> I know they have incompatible interfaces that make it hard but would
> a weak static inline in string.h work?
> 
> For some of the shorter control flows (which are generally small sizes
> and very hot) we saw reasonable benefits on x86_64.
> 
> The most significant was the EVEX/AVX2 [32, 64] case where it net
> us ~25% throughput. This is a pretty hot set value so it may be worth it.

With different prototypes and semantics we won't be able to define an
alias. What we used to do, but moved away from in recent versions, was to
define a static inline function that glues the two functions together when
optimization is enabled.

> 
>>
>> My understanding is __memsetzero would maybe yield some gain in the
>> store mask generation (some architecture might have a zero register
>> or some instruction to generate one), however it would require to
>> use the same strategy as memset to use specific architecture instruction
>> that optimize cache utilization (dc zva, dcbz).
>>
>> So it would mostly require a lot of arch-specific code to to share
>> the memset code with __memsetzero (to avoid increasing code size),
>> so I am not sure if this is really a gain in the long term.
> 
> It's worth noting that between the two `memset` is the cold function
> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> Python3.7.7 setting zero covers 99%+ cases.

This is very workload specific, and I think that with more advanced
compiler optimizations like LTO and PGO such calls could most likely be
optimized by the compiler itself (either by inlining or by creating a
synthetic function to handle it).

What worries me is that such symbols might end up like the AEABI memcpy
variants that were added as a way to optimize when the alignment is known
to be a multiple of the word size, but ended up not being implemented and
also not being generated by the compiler (at least not gcc).
H.J. Lu Feb. 14, 2022, 3:03 p.m. UTC | #24
On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 14/02/2022 09:41, Noah Goldstein wrote:
> > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 12/02/2022 20:46, Noah Goldstein wrote:
> >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> >>> <libc-alpha@sourceware.org> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> >>>>> Just as another point of information, Solaris libc implemented
> >>>>> bzero as moving arguments around appropriately then jumping to
> >>>>> memset. Noone noticed enough to file a complaint. Of course,
> >>>>> short fixed-length bzero was handled with in line stores of zero
> >>>>> by the compiler. For long vector bzeroing, the overhead was
> >>>>> negligible.
> >>>>>
> >>>>> When certain Sparc hardware implementations provided faster methods
> >>>>> for zeroing a cache line at a time on cache line boundaries,
> >>>>> memset added a single test for zero ifandonlyif the length of code
> >>>>> to memset was over a threshold that seemed likely to make it
> >>>>> worthwhile to use the faster method. The principal advantage
> >>>>> of the fast zeroing operation is that it did not require data
> >>>>> to move from memory to cache before writing zeros to memory,
> >>>>> protecting cache locality in the face of large block zeroing.
> >>>>> I was responsible for much of that optimization effort.
> >>>>> Whether that optimization was really worth it is open for debate
> >>>>> for a variety of reasons that I won't go into just now.
> >>>>
> >>>> Afaik this is pretty much what optimized memset implementations
> >>>> does, if architecture allows it. For instance, aarch64 uses
> >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> >>>> similar strategy.
> >>>>
> >>>>>
> >>>>> Apps still used bzero or memset(target,zero,length) according to
> >>>>> their preferences, but the code was unified under memset.
> >>>>>
> >>>>> I am inclined to agree with keeping bzero in the API for
> >>>>> compatibility with old code/old binaries/old programmers. :-)
> >>>>
> >>>> The main driver to remove the bzero internal implementation is just
> >>>> the *currently* gcc just do not generate bzero calls as default
> >>>> (I couldn't find a single binary that calls bzero in my system).
> >>>
> >>> Does it make sense then to add '__memsetzero' so that we can have
> >>> a function optimized for setting zero?
> >>
> >> Will it be really a huge gain instead of a microoptimization that will
> >> just a bunch of more ifunc variants along with the maintenance cost
> >> associated with this?
> > Is there any way it can be setup so that one C impl can cover all the
> > arch that want to just leave `__memsetzero` as an alias to `memset`?
> > I know they have incompatible interfaces that make it hard but would
> > a weak static inline in string.h work?
> >
> > For some of the shorter control flows (which are generally small sizes
> > and very hot) we saw reasonable benefits on x86_64.
> >
> > The most significant was the EVEX/AVX2 [32, 64] case where it net
> > us ~25% throughput. This is a pretty hot set value so it may be worth it.
>
> With different prototypes and semantics we won't be able to define an
> alias. What we used to do, but we move away in recent version, was to
> define static inline function that glue the two function if optimization
> is set.

I have

/* NB: bzero returns void and __memsetzero returns void *.  */
asm (".weak bzero");
asm ("bzero = __memsetzero");
asm (".global __bzero");
asm ("__bzero = __memsetzero");

> >
> >>
> >> My understanding is __memsetzero would maybe yield some gain in the
> >> store mask generation (some architecture might have a zero register
> >> or some instruction to generate one), however it would require to
> >> use the same strategy as memset to use specific architecture instruction
> >> that optimize cache utilization (dc zva, dcbz).
> >>
> >> So it would mostly require a lot of arch-specific code to to share
> >> the memset code with __memsetzero (to avoid increasing code size),
> >> so I am not sure if this is really a gain in the long term.
> >
> > It's worth noting that between the two `memset` is the cold function
> > and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> > Python3.7.7 setting zero covers 99%+ cases.
>
> This is very workload specific and I think with more advance compiler
> optimization like LTO and PGO such calls could most likely being
> optimized by the compiler itself (either by inline or by create a
> synthetic function to handle it).
>
> What I worried is such symbols might ended up as the AEBI memcpy variants
> that was added as way to optimize when alignment is know to be multiple
> of words, but it ended up not being implemented and also not being generated
> by the compiler (at least not gcc).
Sunil Pandey May 4, 2022, 6:35 a.m. UTC | #25
On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> >
> >
> > On 14/02/2022 09:41, Noah Goldstein wrote:
> > > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> > > <adhemerval.zanella@linaro.org> wrote:
> > >>
> > >>
> > >>
> > >> On 12/02/2022 20:46, Noah Goldstein wrote:
> > >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> > >>> <libc-alpha@sourceware.org> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> > >>>>> Just as another point of information, Solaris libc implemented
> > >>>>> bzero as moving arguments around appropriately then jumping to
> > >>>>> memset. Noone noticed enough to file a complaint. Of course,
> > >>>>> short fixed-length bzero was handled with in line stores of zero
> > >>>>> by the compiler. For long vector bzeroing, the overhead was
> > >>>>> negligible.
> > >>>>>
> > >>>>> When certain Sparc hardware implementations provided faster methods
> > >>>>> for zeroing a cache line at a time on cache line boundaries,
> > >>>>> memset added a single test for zero ifandonlyif the length of code
> > >>>>> to memset was over a threshold that seemed likely to make it
> > >>>>> worthwhile to use the faster method. The principal advantage
> > >>>>> of the fast zeroing operation is that it did not require data
> > >>>>> to move from memory to cache before writing zeros to memory,
> > >>>>> protecting cache locality in the face of large block zeroing.
> > >>>>> I was responsible for much of that optimization effort.
> > >>>>> Whether that optimization was really worth it is open for debate
> > >>>>> for a variety of reasons that I won't go into just now.
> > >>>>
> > >>>> Afaik this is pretty much what optimized memset implementations
> > >>>> does, if architecture allows it. For instance, aarch64 uses
> > >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> > >>>> similar strategy.
> > >>>>
> > >>>>>
> > >>>>> Apps still used bzero or memset(target,zero,length) according to
> > >>>>> their preferences, but the code was unified under memset.
> > >>>>>
> > >>>>> I am inclined to agree with keeping bzero in the API for
> > >>>>> compatibility with old code/old binaries/old programmers. :-)
> > >>>>
> > >>>> The main driver to remove the bzero internal implementation is just
> > >>>> the *currently* gcc just do not generate bzero calls as default
> > >>>> (I couldn't find a single binary that calls bzero in my system).
> > >>>
> > >>> Does it make sense then to add '__memsetzero' so that we can have
> > >>> a function optimized for setting zero?
> > >>
> > >> Will it be really a huge gain instead of a microoptimization that will
> > >> just a bunch of more ifunc variants along with the maintenance cost
> > >> associated with this?
> > > Is there any way it can be setup so that one C impl can cover all the
> > > arch that want to just leave `__memsetzero` as an alias to `memset`?
> > > I know they have incompatible interfaces that make it hard but would
> > > a weak static inline in string.h work?
> > >
> > > For some of the shorter control flows (which are generally small sizes
> > > and very hot) we saw reasonable benefits on x86_64.
> > >
> > > The most significant was the EVEX/AVX2 [32, 64] case where it net
> > > us ~25% throughput. This is a pretty hot set value so it may be worth it.
> >
> > With different prototypes and semantics we won't be able to define an
> > alias. What we used to do, but we move away in recent version, was to
> > define static inline function that glue the two function if optimization
> > is set.
>
> I have
>
> /* NB: bzero returns void and __memsetzero returns void *.  */
> asm (".weak bzero");
> asm ("bzero = __memsetzero");
> asm (".global __bzero");
> asm ("__bzero = __memsetzero");
>
> > >
> > >>
> > >> My understanding is __memsetzero would maybe yield some gain in the
> > >> store mask generation (some architecture might have a zero register
> > >> or some instruction to generate one), however it would require to
> > >> use the same strategy as memset to use specific architecture instruction
> > >> that optimize cache utilization (dc zva, dcbz).
> > >>
> > >> So it would mostly require a lot of arch-specific code to to share
> > >> the memset code with __memsetzero (to avoid increasing code size),
> > >> so I am not sure if this is really a gain in the long term.
> > >
> > > It's worth noting that between the two `memset` is the cold function
> > > and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> > > Python3.7.7 setting zero covers 99%+ cases.
> >
> > This is very workload specific and I think with more advance compiler
> > optimization like LTO and PGO such calls could most likely being
> > optimized by the compiler itself (either by inline or by create a
> > synthetic function to handle it).
> >
> > What I worried is such symbols might ended up as the AEBI memcpy variants
> > that was added as way to optimize when alignment is know to be multiple
> > of words, but it ended up not being implemented and also not being generated
> > by the compiler (at least not gcc).
>
>
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
Adhemerval Zanella Netto May 4, 2022, 12:52 p.m. UTC | #26
On 04/05/2022 03:35, Sunil Pandey wrote:
> I would like to backport this patch to release branches.
> Any comments or objections?

Nothing really against it, but as in previous discussions we had on this mailing list, optimizing
bzero does not yield much gain compared to memset (the compiler won't generate a bzero libcall
for loop transformations, among other shortcomings). My idea is to follow the other
architectures and just remove the x86_64 bzero optimizations altogether.
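
For reference, current gcc already folds bzero into memset by default, so a call
like the following ends up as a memset call rather than a bzero call:

#include <stddef.h>
#include <strings.h>

/* With gcc -O2 this typically compiles down to a call to memset (buf, 0, n)
   (or inline stores for a small constant size); no bzero call is emitted.  */
void
clear_buffer (void *buf, size_t n)
{
  bzero (buf, n);
}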
H.J. Lu May 4, 2022, 2:50 p.m. UTC | #27
On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 04/05/2022 03:35, Sunil Pandey wrote:
> > I would like to backport this patch to release branches.
> > Any comments or objections?
>
> Nothing really against it, but as in previous discussions we had on this mailing list, optimizing
> bzero does not yield much gain compared to memset (the compiler won't generate a bzero libcall
> for loop transformations, among other shortcomings). My idea is to follow the other
> architectures and just remove the x86_64 bzero optimizations altogether.

We'd like to reduce the differences between master and release branches to help
future backports to release branches.
Adhemerval Zanella Netto May 4, 2022, 2:54 p.m. UTC | #28
On 04/05/2022 11:50, H.J. Lu wrote:
> On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 04/05/2022 03:35, Sunil Pandey wrote:
>>> I would like to backport this patch to release branches.
>>> Any comments or objections?
>>
>> Nothing really against it, but as in previous discussions we had on this mailing list, optimizing
>> bzero does not yield much gain compared to memset (the compiler won't generate a bzero libcall
>> for loop transformations, among other shortcomings). My idea is to follow the other
>> architectures and just remove the x86_64 bzero optimizations altogether.
> 
> We'd like to reduce the differences between master and release branches to help
> future backports to release branches.
> 

Ok, fair enough.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 3f0517bbfc..af26e9cedc 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -35,6 +35,9 @@ 
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+# define BZERO_ZERO_VEC0() \
+  pxor %xmm0, %xmm0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0; \
@@ -53,6 +56,10 @@ 
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)	__bzero
+#endif
+
 #ifndef WMEMSET_SYMBOL
 # define WMEMSET_CHK_SYMBOL(p,s) p
 # define WMEMSET_SYMBOL(p,s)	__wmemset
@@ -63,6 +70,7 @@ 
 libc_hidden_builtin_def (memset)
 
 #if IS_IN (libc)
+weak_alias (__bzero, bzero)
 libc_hidden_def (__wmemset)
 weak_alias (__wmemset, wmemset)
 libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4274bfdd0d..e7b413edad 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,6 +1,7 @@ 
 ifeq ($(subdir),string)
 
 sysdep_routines += \
+  bzero \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
new file mode 100644
index 0000000000..58a14b2c33
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bzero.c
@@ -0,0 +1,106 @@ 
+/* Multiple versions of bzero.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __bzero __redirect___bzero
+# include <string.h>
+# undef __bzero
+
+# define SYMBOL_NAME __bzero
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
+  attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx512_unaligned_erms);
+
+	  return OPTIMIZE1 (avx512_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (evex_unaligned_erms);
+
+	  return OPTIMIZE1 (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE1 (avx2_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms);
+
+	  return OPTIMIZE1 (avx2_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    return OPTIMIZE1 (sse2_unaligned_erms);
+
+  return OPTIMIZE1 (sse2_unaligned);
+}
+
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
+
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 68a56797d4..a594f4176e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,48 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_avx512_no_vzeroupper)
 	     )
 
+  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned)
+	     )
+
   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..5a5ee6f672 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,6 +5,7 @@ 
 
 #define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
 #include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index c0bf2875d0..a093a2831f 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,6 +14,9 @@ 
   vmovd d, %xmm0; \
   movq r, %rax;
 
+# define BZERO_ZERO_VEC0() \
+  vpxor %xmm0, %xmm0, %xmm0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
 
@@ -29,6 +32,9 @@ 
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
+# ifndef BZERO_SYMBOL
+#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
+# endif
 # ifndef WMEMSET_SYMBOL
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 5241216a77..727c92133a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,9 @@ 
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 6370021506..5d8fa78f05 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,9 @@ 
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 8a6f0c561a..329c58ee46 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,6 +22,7 @@ 
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
 # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1b502b78e4..7c94fcdae1 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -26,6 +26,10 @@ 
 
 #include <sysdep.h>
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
+#endif
+
 #ifndef MEMSET_CHK_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
@@ -87,6 +91,18 @@ 
 # define XMM_SMALL	0
 #endif
 
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64	rcx
+# define SET_REG32	ecx
+# define SET_REG16	cx
+# define SET_REG8	cl
+#else
+# define SET_REG64	rsi
+# define SET_REG32	esi
+# define SET_REG16	si
+# define SET_REG8	sil
+#endif
+
 #define PAGE_SIZE 4096
 
 /* Macro to calculate size of small memset block for aligning
@@ -96,18 +112,6 @@ 
 
 #ifndef SECTION
 # error SECTION is not defined!
-#endif
-
-	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
-	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	xorl	%esi, %esi
-	pxor	%XMM0, %XMM0
-	jmp	L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
 #endif
 
 #if IS_IN (libc)
@@ -123,12 +127,37 @@  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 	WMEMSET_VDUP_TO_VEC0_LOW()
 	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec_no_vdup)
+	jb	L(less_vec_from_wmemset)
 	WMEMSET_VDUP_TO_VEC0_HIGH()
 	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
+#if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+#ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+#ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+#if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned))
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmp	%RDX_LP, %RCX_LP
@@ -142,7 +171,6 @@  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
-L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
 	MEMSET_VDUP_TO_VEC0_HIGH()
@@ -187,6 +215,31 @@  END (__memset_erms)
 END (MEMSET_SYMBOL (__memset, erms))
 # endif
 
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
+# if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+# ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+# ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+# if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
+
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -229,6 +282,7 @@  L(last_2x_vec):
 	.p2align 4,, 10
 L(less_vec):
 L(less_vec_no_vdup):
+L(less_vec_from_wmemset):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -374,8 +428,11 @@  L(less_vec):
 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
 	   xmm). This is only does anything for AVX2.  */
 	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
+#if VEC_SIZE > 16
 L(less_vec_no_vdup):
 #endif
+#endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
@@ -386,7 +443,10 @@  L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %rcx
+	MOVQ	%XMM0, %SET_REG64
+#endif
+#if VEC_SIZE <= 16
+L(less_vec_no_vdup):
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -395,7 +455,7 @@  L(cross_page):
 	cmpl	$1, %edx
 	jg	L(between_2_3)
 	jl	L(between_0_0)
-	movb	%sil, (%LESS_VEC_REG)
+	movb	%SET_REG8, (%LESS_VEC_REG)
 L(between_0_0):
 	ret
 
@@ -428,8 +488,8 @@  L(between_8_15):
 	MOVQ	%XMM0, (%rdi)
 	MOVQ	%XMM0, -8(%rdi, %rdx)
 #else
-	movq	%rcx, (%LESS_VEC_REG)
-	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+	movq	%SET_REG64, (%LESS_VEC_REG)
+	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 
@@ -442,8 +502,8 @@  L(between_4_7):
 	MOVD	%XMM0, (%rdi)
 	MOVD	%XMM0, -4(%rdi, %rdx)
 #else
-	movl	%ecx, (%LESS_VEC_REG)
-	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+	movl	%SET_REG32, (%LESS_VEC_REG)
+	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 
@@ -452,12 +512,12 @@  L(between_4_7):
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
 #ifdef USE_XMM_LESS_VEC
-	movb	%sil, (%rdi)
-	movb	%sil, 1(%rdi)
-	movb	%sil, -1(%rdi, %rdx)
+	movb	%SET_REG8, (%rdi)
+	movb	%SET_REG8, 1(%rdi)
+	movb	%SET_REG8, -1(%rdi, %rdx)
 #else
-	movw	%cx, (%LESS_VEC_REG)
-	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+	movw	%SET_REG16, (%LESS_VEC_REG)
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))