Message ID | 20220208224319.40271-1-hjl.tools@gmail.com
---|---
State | New
Series | [v2] x86-64: Optimize bzero
On Tue, Feb 8, 2022 at 4:43 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > Rebase against the current master branch. > > -- > memset with zero as the value to set is by far the majority value (99%+ > for Python3 and GCC). > > bzero can be slightly more optimized for this case by using a zero-idiom > xor for broadcasting the set value to a register (vector or GPR). > > Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/memset.S | 8 ++ > sysdeps/x86_64/multiarch/Makefile | 1 + > sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++++++++++ > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 +++++++ > .../memset-avx2-unaligned-erms-rtm.S | 1 + > .../multiarch/memset-avx2-unaligned-erms.S | 6 + > .../multiarch/memset-avx512-unaligned-erms.S | 3 + > .../multiarch/memset-evex-unaligned-erms.S | 3 + > .../multiarch/memset-sse2-unaligned-erms.S | 1 + > .../multiarch/memset-vec-unaligned-erms.S | 110 ++++++++++++++---- > 10 files changed, 256 insertions(+), 25 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/bzero.c > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S > index 3f0517bbfc..af26e9cedc 100644 > --- a/sysdeps/x86_64/memset.S > +++ b/sysdeps/x86_64/memset.S > @@ -35,6 +35,9 @@ > punpcklwd %xmm0, %xmm0; \ > pshufd $0, %xmm0, %xmm0 > > +# define BZERO_ZERO_VEC0() \ > + pxor %xmm0, %xmm0 > + > # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > movd d, %xmm0; \ > pshufd $0, %xmm0, %xmm0; \ > @@ -53,6 +56,10 @@ > # define MEMSET_SYMBOL(p,s) memset > #endif > > +#ifndef BZERO_SYMBOL > +# define BZERO_SYMBOL(p,s) __bzero > +#endif > + > #ifndef WMEMSET_SYMBOL > # define WMEMSET_CHK_SYMBOL(p,s) p > # define WMEMSET_SYMBOL(p,s) __wmemset > @@ -63,6 +70,7 @@ > libc_hidden_builtin_def (memset) > > #if IS_IN (libc) > +weak_alias (__bzero, bzero) > libc_hidden_def (__wmemset) > weak_alias (__wmemset, wmemset) > libc_hidden_weak (wmemset) > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 4274bfdd0d..e7b413edad 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -1,6 +1,7 @@ > ifeq ($(subdir),string) > > sysdep_routines += \ > + bzero \ > memchr-avx2 \ > memchr-avx2-rtm \ > memchr-evex \ > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c > new file mode 100644 > index 0000000000..58a14b2c33 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/bzero.c > @@ -0,0 +1,106 @@ > +/* Multiple versions of bzero. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __bzero __redirect___bzero > +# include <string.h> > +# undef __bzero > + > +# define SYMBOL_NAME __bzero > +# include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) > + attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) > + attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features* cpu_features = __get_cpu_features (); > + > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) > + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE1 (avx512_unaligned_erms); > + > + return OPTIMIZE1 (avx512_unaligned); > + } > + } > + > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE1 (evex_unaligned_erms); > + > + return OPTIMIZE1 (evex_unaligned); > + } > + > + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE1 (avx2_unaligned_erms_rtm); > + > + return OPTIMIZE1 (avx2_unaligned_rtm); > + } > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > + { > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE1 (avx2_unaligned_erms); > + > + return OPTIMIZE1 (avx2_unaligned); > + } > + } > + > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE1 (sse2_unaligned_erms); > + > + return OPTIMIZE1 (sse2_unaligned); > +} > + > +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); > + > +weak_alias (__bzero, bzero) > +#endif > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 68a56797d4..a594f4176e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memset_avx512_no_vzeroupper) > ) > > + /* Support sysdeps/x86_64/multiarch/bzero.c. 
*/ > + IFUNC_IMPL (i, name, bzero, > + IFUNC_IMPL_ADD (array, i, bzero, 1, > + __bzero_sse2_unaligned) > + IFUNC_IMPL_ADD (array, i, bzero, 1, > + __bzero_sse2_unaligned_erms) > + IFUNC_IMPL_ADD (array, i, bzero, > + CPU_FEATURE_USABLE (AVX2), > + __bzero_avx2_unaligned) > + IFUNC_IMPL_ADD (array, i, bzero, > + CPU_FEATURE_USABLE (AVX2), > + __bzero_avx2_unaligned_erms) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __bzero_avx2_unaligned_rtm) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __bzero_avx2_unaligned_erms_rtm) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __bzero_evex_unaligned) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __bzero_evex_unaligned_erms) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __bzero_avx512_unaligned_erms) > + IFUNC_IMPL_ADD (array, i, bzero, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __bzero_avx512_unaligned) > + ) > + > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ > IFUNC_IMPL (i, name, rawmemchr, > IFUNC_IMPL_ADD (array, i, rawmemchr, > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > index 8ac3e479bb..5a5ee6f672 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > @@ -5,6 +5,7 @@ > > #define SECTION(p) p##.avx.rtm > #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm > +#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm > #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm > > #include "memset-avx2-unaligned-erms.S" > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > index c0bf2875d0..a093a2831f 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > @@ -14,6 +14,9 @@ > vmovd d, %xmm0; \ > movq r, %rax; > > +# define BZERO_ZERO_VEC0() \ > + vpxor %xmm0, %xmm0, %xmm0 > + > # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > MEMSET_SET_VEC0_AND_SET_RETURN(d, r) > > @@ -29,6 +32,9 @@ > # ifndef MEMSET_SYMBOL > # define MEMSET_SYMBOL(p,s) p##_avx2_##s > # endif > +# ifndef BZERO_SYMBOL > +# define BZERO_SYMBOL(p,s) p##_avx2_##s > +# endif > # ifndef WMEMSET_SYMBOL > # define WMEMSET_SYMBOL(p,s) p##_avx2_##s > # endif > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > index 5241216a77..727c92133a 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > @@ -19,6 +19,9 @@ > vpbroadcastb d, %VEC0; \ > movq r, %rax > > +# define BZERO_ZERO_VEC0() \ > + vpxorq %XMM0, %XMM0, %XMM0 > + > # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > vpbroadcastd d, %VEC0; \ > movq r, %rax > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > index 6370021506..5d8fa78f05 100644 > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > +++ 
b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > @@ -19,6 +19,9 @@ > vpbroadcastb d, %VEC0; \ > movq r, %rax > > +# define BZERO_ZERO_VEC0() \ > + vpxorq %XMM0, %XMM0, %XMM0 > + > # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > vpbroadcastd d, %VEC0; \ > movq r, %rax > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > index 8a6f0c561a..329c58ee46 100644 > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > @@ -22,6 +22,7 @@ > > #if IS_IN (libc) > # define MEMSET_SYMBOL(p,s) p##_sse2_##s > +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) > # define WMEMSET_SYMBOL(p,s) p##_sse2_##s > > # ifdef SHARED > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > index 1b502b78e4..7c94fcdae1 100644 > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > @@ -26,6 +26,10 @@ > > #include <sysdep.h> > > +#ifndef BZERO_SYMBOL > +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) > +#endif > + > #ifndef MEMSET_CHK_SYMBOL > # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) > #endif > @@ -87,6 +91,18 @@ > # define XMM_SMALL 0 > #endif > > +#ifdef USE_LESS_VEC_MASK_STORE > +# define SET_REG64 rcx > +# define SET_REG32 ecx > +# define SET_REG16 cx > +# define SET_REG8 cl > +#else > +# define SET_REG64 rsi > +# define SET_REG32 esi > +# define SET_REG16 si > +# define SET_REG8 sil > +#endif > + > #define PAGE_SIZE 4096 > > /* Macro to calculate size of small memset block for aligning > @@ -96,18 +112,6 @@ > > #ifndef SECTION > # error SECTION is not defined! > -#endif > - > - .section SECTION(.text),"ax",@progbits > -#if VEC_SIZE == 16 && IS_IN (libc) > -ENTRY (__bzero) > - mov %RDI_LP, %RAX_LP /* Set return value. */ > - mov %RSI_LP, %RDX_LP /* Set n. */ > - xorl %esi, %esi > - pxor %XMM0, %XMM0 > - jmp L(entry_from_bzero) > -END (__bzero) > -weak_alias (__bzero, bzero) > #endif > > #if IS_IN (libc) > @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) > WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) > WMEMSET_VDUP_TO_VEC0_LOW() > cmpq $VEC_SIZE, %rdx > - jb L(less_vec_no_vdup) > + jb L(less_vec_from_wmemset) > WMEMSET_VDUP_TO_VEC0_HIGH() > jmp L(entry_from_wmemset) > END (WMEMSET_SYMBOL (__wmemset, unaligned)) > #endif > > +ENTRY (BZERO_SYMBOL(__bzero, unaligned)) > +#if VEC_SIZE > 16 > + BZERO_ZERO_VEC0 () > +#endif > + mov %RDI_LP, %RAX_LP > + mov %RSI_LP, %RDX_LP > +#ifndef USE_LESS_VEC_MASK_STORE > + xorl %esi, %esi > +#endif > + cmp $VEC_SIZE, %RDX_LP > + jb L(less_vec_no_vdup) > +#ifdef USE_LESS_VEC_MASK_STORE > + xorl %esi, %esi > +#endif > +#if VEC_SIZE <= 16 > + BZERO_ZERO_VEC0 () > +#endif > + cmp $(VEC_SIZE * 2), %RDX_LP > + ja L(more_2x_vec) > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > + VMOVU %VEC(0), (%rdi) > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > + VZEROUPPER_RETURN > +END (BZERO_SYMBOL(__bzero, unaligned)) > + > #if defined SHARED && IS_IN (libc) > ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) > cmp %RDX_LP, %RCX_LP > @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) > /* Clear the upper 32 bits. 
*/ > mov %edx, %edx > # endif > -L(entry_from_bzero): > cmpq $VEC_SIZE, %rdx > jb L(less_vec) > MEMSET_VDUP_TO_VEC0_HIGH() > @@ -187,6 +215,31 @@ END (__memset_erms) > END (MEMSET_SYMBOL (__memset, erms)) > # endif > > +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) > +# if VEC_SIZE > 16 > + BZERO_ZERO_VEC0 () > +# endif > + mov %RDI_LP, %RAX_LP > + mov %RSI_LP, %RDX_LP > +# ifndef USE_LESS_VEC_MASK_STORE > + xorl %esi, %esi > +# endif > + cmp $VEC_SIZE, %RDX_LP > + jb L(less_vec_no_vdup) > +# ifdef USE_LESS_VEC_MASK_STORE > + xorl %esi, %esi > +# endif > +# if VEC_SIZE <= 16 > + BZERO_ZERO_VEC0 () > +# endif > + cmp $(VEC_SIZE * 2), %RDX_LP > + ja L(stosb_more_2x_vec) > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > + VMOVU %VEC(0), (%rdi) > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > + VZEROUPPER_RETURN > +END (BZERO_SYMBOL(__bzero, unaligned_erms)) > + > # if defined SHARED && IS_IN (libc) > ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) > cmp %RDX_LP, %RCX_LP > @@ -229,6 +282,7 @@ L(last_2x_vec): > .p2align 4,, 10 > L(less_vec): > L(less_vec_no_vdup): > +L(less_vec_from_wmemset): > /* Less than 1 VEC. */ > # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 > # error Unsupported VEC_SIZE! > @@ -374,8 +428,11 @@ L(less_vec): > /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to > xmm). This is only does anything for AVX2. */ > MEMSET_VDUP_TO_VEC0_LOW () > +L(less_vec_from_wmemset): > +#if VEC_SIZE > 16 > L(less_vec_no_vdup): > #endif > +#endif > L(cross_page): > #if VEC_SIZE > 32 > cmpl $32, %edx > @@ -386,7 +443,10 @@ L(cross_page): > jge L(between_16_31) > #endif > #ifndef USE_XMM_LESS_VEC > - MOVQ %XMM0, %rcx > + MOVQ %XMM0, %SET_REG64 > +#endif > +#if VEC_SIZE <= 16 > +L(less_vec_no_vdup): > #endif > cmpl $8, %edx > jge L(between_8_15) > @@ -395,7 +455,7 @@ L(cross_page): > cmpl $1, %edx > jg L(between_2_3) > jl L(between_0_0) > - movb %sil, (%LESS_VEC_REG) > + movb %SET_REG8, (%LESS_VEC_REG) > L(between_0_0): > ret > > @@ -428,8 +488,8 @@ L(between_8_15): > MOVQ %XMM0, (%rdi) > MOVQ %XMM0, -8(%rdi, %rdx) > #else > - movq %rcx, (%LESS_VEC_REG) > - movq %rcx, -8(%LESS_VEC_REG, %rdx) > + movq %SET_REG64, (%LESS_VEC_REG) > + movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) > #endif > ret > > @@ -442,8 +502,8 @@ L(between_4_7): > MOVD %XMM0, (%rdi) > MOVD %XMM0, -4(%rdi, %rdx) > #else > - movl %ecx, (%LESS_VEC_REG) > - movl %ecx, -4(%LESS_VEC_REG, %rdx) > + movl %SET_REG32, (%LESS_VEC_REG) > + movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) > #endif > ret > > @@ -452,12 +512,12 @@ L(between_4_7): > L(between_2_3): > /* From 2 to 3. No branch when size == 2. */ > #ifdef USE_XMM_LESS_VEC > - movb %sil, (%rdi) > - movb %sil, 1(%rdi) > - movb %sil, -1(%rdi, %rdx) > + movb %SET_REG8, (%rdi) > + movb %SET_REG8, 1(%rdi) > + movb %SET_REG8, -1(%rdi, %rdx) > #else > - movw %cx, (%LESS_VEC_REG) > - movb %sil, -1(%LESS_VEC_REG, %rdx) > + movw %SET_REG16, (%LESS_VEC_REG) > + movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) > #endif > ret > END (MEMSET_SYMBOL (__memset, unaligned_erms)) > -- > 2.34.1 > LGTM.
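
The core of the patch quoted above is the BZERO_ZERO_VEC0 entry: because the fill value is known to be zero, the vector can be materialized with a single dependency-breaking xor instead of broadcasting a runtime byte (the MEMSET_SET_VEC0_AND_SET_RETURN plus VDUP steps). A rough C-intrinsics illustration of that difference, assuming AVX2; this is not the glibc entry code, which is hand-written assembly:

    #include <immintrin.h>

    /* memset-style entry: the byte value is only known at run time, so it
       must be broadcast across the vector (vmovd + vpbroadcastb, a
       lane-crossing operation on AVX2).  */
    __m256i fill_vec (int c)
    {
      return _mm256_set1_epi8 ((char) c);
    }

    /* bzero-style entry: the value is known to be zero, so a zero idiom
       (vpxor of a register with itself) produces the vector with no input
       dependency and no broadcast.  */
    __m256i zero_vec (void)
    {
      return _mm256_setzero_si256 ();
    }

    /* Either result is then written with unaligned vector stores, e.g.
       _mm256_storeu_si256 ((__m256i *) p, v).  Compile with -mavx2.  */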
On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> Rebase against the current master branch.
>
> --
> memset with zero as the value to set is by far the majority value (99%+
> for Python3 and GCC).
>
> bzero can be slightly more optimized for this case by using a zero-idiom
> xor for broadcasting the set value to a register (vector or GPR).
>
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>

Is it really worth it to resurrect bzero with these multiple ifunc variants?
Would Python3/GCC or any other programs start to replace memset with bzero
for the sake of this optimization?

I agree with Wilco that the gains are marginal in this case.
On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Is it really worth it to resurrect bzero with these multiple ifunc variants?
> Would Python3/GCC or any other programs start to replace memset with bzero
> for the sake of this optimization?
>
> I agree with Wilco that the gains are marginal in this case.

The cost is only 1 cache line, and it doesn't interfere with memset at all,
so it's unlikely to cause any problems.

The saving is in the lane-cross broadcast, which is on the critical path for
memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).

As well, for EVEX + AVX512, because it uses predicated execution for
[0, VEC_SIZE] there is a slight benefit there (although only in throughput,
because the critical path in mask construction is longer than the lane VEC
setup).

Agreed, it's not clear if it's worth it to start replacing memset calls with
bzero calls, but at the very least this will improve existing code that
uses bzero.
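
The [VEC_SIZE, 2 * VEC_SIZE] path mentioned above is the branch-free "one vector at the start, one vector ending at the end" pattern visible in the patch (the two VMOVU stores). A plain-C stand-in for VEC_SIZE == 32, just to illustrate the overlapping-store idea; it is not the glibc code:

    #include <string.h>
    #include <stddef.h>

    /* Zero n bytes, for 32 <= n <= 64, with two possibly overlapping
       32-byte stores: one at the start of the buffer and one ending
       exactly at p + n.  No branch on n is needed within that range.  */
    void zero_32_to_64 (unsigned char *p, size_t n)
    {
      static const unsigned char zeros[32];
      memcpy (p, zeros, 32);
      memcpy (p + n - 32, zeros, 32);
    }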
On 09/02/2022 19:14, Noah Goldstein wrote:
> On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>> Is it really worth it to resurrect bzero with these multiple ifunc variants?
>> Would Python3/GCC or any other programs start to replace memset with bzero
>> for the sake of this optimization?
>>
>> I agree with Wilco that the gains are marginal in this case.
>
> The cost is only 1 cache line, and it doesn't interfere with memset at all,
> so it's unlikely to cause any problems.
>
> The saving is in the lane-cross broadcast, which is on the critical path for
> memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>
> As well, for EVEX + AVX512, because it uses predicated execution for
> [0, VEC_SIZE] there is a slight benefit there (although only in throughput,
> because the critical path in mask construction is longer than the lane VEC
> setup).
>
> Agreed, it's not clear if it's worth it to start replacing memset calls with
> bzero calls, but at the very least this will improve existing code that
> uses bzero.

My point is that this is a lot of code and infrastructure for a symbol marked
as legacy in POSIX.1-2001 and removed in POSIX.1-2008, for the sake of
marginal gains in specific cases.
Hi,

>> The saving is in the lane-cross broadcast, which is on the critical path
>> for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).

What is the speedup in e.g. bench-memset? Generally the OoO engine will be
able to hide a small increase in latency, so I'd be surprised if it shows up
as a significant gain.

If you can show a good speedup in an important application (or a benchmark
like SPEC2017) then it may be worth pursuing. However, there are other
optimization opportunities that may be easier or give a larger benefit.

>> Agreed, it's not clear if it's worth it to start replacing memset calls
>> with bzero calls, but at the very least this will improve existing code
>> that uses bzero.

No code uses bzero, no compiler emits bzero. It died two decades ago...

> My point is that this is a lot of code and infrastructure for a symbol
> marked as legacy in POSIX.1-2001 and removed in POSIX.1-2008, for the sake
> of marginal gains in specific cases.

Indeed, what we really should discuss is how to remove the last traces of
bcopy and bcmp from glibc. Do we need to keep a compatibility symbol, or
could we just get rid of them altogether?

Cheers,
Wilco
On 10/02/2022 10:01, Wilco Dijkstra wrote:
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from glibc. Do we need to keep a compatibility symbol, or
> could we just get rid of them altogether?

We need to keep the symbols as-is, afaiu, since callers might still target
old POSIX where the symbols are defined as supported. We might add a compiler
or linker warning stating that the symbols are deprecated, but IMHO it would
just be better if we stop trying to micro-optimize them and just use the
generic interface (which calls memset).
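
The "generic interface" mentioned here boils down to bzero being a thin wrapper around memset, with no arch-specific ifunc machinery of its own. A simplified sketch of that shape (the function name is hypothetical, chosen to avoid clashing with the libc symbol; this is not the actual glibc source file):

    #include <string.h>

    /* Generic-style bzero: delegate everything to memset, which already
       has the per-architecture optimized implementations.  */
    void
    my_bzero (void *s, size_t n)
    {
      memset (s, 0, n);
    }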
On 10/02/2022 10:10, Adhemerval Zanella wrote:
> We need to keep the symbols as-is, afaiu, since callers might still target
> old POSIX where the symbols are defined as supported. We might add a compiler
> or linker warning stating that the symbols are deprecated, but IMHO it would
> just be better if we stop trying to micro-optimize them and just use the
> generic interface (which calls memset).

And it makes even more sense since GCC either generates a memset call or
expands bzero inline, so there is even less point in adding either an
optimized version of it or ifunc variants that call memset (to avoid the
intra-PLT call).

I will probably clean up all the bzero optimizations we have; it is
unfortunate that this patch got in without further discussion.
Hi, > We need to keep the symbols as-is afaiu, since callers might still target > old POSIX where the symbol is defined as supported. We might add either > compiler or linker warning stating the symbols is deprecated, but imho it > would just be better if we stop trying to microoptimize it and just use > the generic interface (which call memset).

Compilers have been doing that forever - if you use bzero, GCC and LLVM emit a call to memset. No new calls are emitted to these functions, so there is no point in having target-specific optimizations at all. However, we could disallow use of these functions altogether in future GLIBCs.

Cheers, Wilco
On 10/02/2022 10:17, Wilco Dijkstra wrote: > Hi, > >> We need to keep the symbols as-is afaiu, since callers might still target >> old POSIX where the symbol is defined as supported. We might add either >> compiler or linker warning stating the symbols is deprecated, but imho it >> would just be better if we stop trying to microoptimize it and just use >> the generic interface (which call memset). > > Compilers have been doing that forever - if you use bzero GCC and LLVM > emit a call to memset. No new calls are emitted to these functions, so there > is no point in having target-specific optimization at all. > > However we could disallow use of these functions at all in future GLIBCs.

Unfortunately we cannot stop providing such symbols unless we stop being compatible with legacy POSIX. And I think we already do not use these internally. We could also start to remove the other bstring functions as well, bcmp and bcopy.
Hi, On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote: > On 10/02/2022 10:10, Adhemerval Zanella wrote: >> On 10/02/2022 10:01, Wilco Dijkstra wrote: >>>>> Agreed it's not clear if it's worth it to start replacing memset calls with >>>>> bzero calls, but at the very least this will improve existing code that >>>>> uses bzero. >>> >>> No code uses bzero, no compiler emits bzero. It died 2 decades ago... Sorry to ruin your day, but there's a bzero(3) user here :) There are rational reasons to prefer bzero(3) due to it's better interface, and only one to prefer memset(3): standards people decided to remove bzero(3). See <https://stackoverflow.com/a/17097978/6872717>. Consider the following quote from "UNIX Network Programming" by Stevens et al., Section 1.2 (emphasis added): > `bzero` is not an ANSI C function. It is derived from early Berkely > networking code. Nevertheless, we use it throughout the text, instead > of the ANSI C `memset` function, because `bzero` is easier to remember > (with only two arguments) than `memset` (with three arguments). Almost > every vendor that supports the sockets API also provides `bzero`, and > if not, we provide a macro definition in our `unp.h` header. > > Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens 1996] made the mistake of swapping the second > and third arguments to `memset` in 10 occurrences in the first > printing**. A C compiler cannot catch this error because both arguments > are of the same type. (Actually, the second argument is an `int` and > the third argument is `size_t`, which is typically an `unsigned int`, > but the values specified, 0 and 16, respectively, are still acceptable > for the other type of argument.) The call to `memset` still worked, > because only a few of the socket functions actually require that the > final 8 bytes of an Internet socket address structure be set to 0. > Nevertheless, it was an error, and one that could be avoided by using > `bzero`, because swapping the two arguments to `bzero` will always be > caught by the C compiler if function prototypes are used. I consistently use bzero(3), and dislike the interface of memset(3) for zeroing memory. I checked how many memset(3) calls there are in a codebase of mine, and there is exactly 1 call to memset(3), for setting an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX, sizeof(arr)). And there are 41 calls to bzero(3). >>> >>>> My point is this is a lot of code and infrastructure for a symbol marked >>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of >>>> marginal gains in specific cases. >>> >>> Indeed, what we really should discuss is how to remove the last traces of >>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol >>> or could we just get rid of it altogether? I think it's sad that POSIX removed bzero(3). In the end, people need to zero memory, and there's no simpler interface than bzero(3) for that. memset(3) has a useless extra parameter. Even if you just do a simple wrapper as the following, which is no big overhead for glibc I guess, you would be improving (or not worsening) my and hopefully others' lives: static inline bzero(s, n) { memset(s, 0, n); } Is that really a pain to maintain? If libc ever removes bzero(3), it's not like I'm going to start using memset(3). I'll just define it myself. That's not an improvement, but the opposite, IMO. Ideally, POSIX would reconsider some day, and reincorporate bzero(3), but I don't expect that any soon :(. 
>> >> We need to keep the symbols as-is afaiu, since callers might still target >> old POSIX where the symbol is defined as supported. We might add either >> compiler or linker warning stating the symbols is deprecated, but imho it >> would just be better if we stop trying to microoptimize it and just use >> the generic interface (which call memset).

No, please: I know they are deprecated, and I explicitly want to use them. I don't need some extra spurious warning.

Other projects that I have touched (including nginx and shadow-utils) seem to share this opinion: they define memzero() or some other similar name, with an interface identical to bzero(3) (just a different name to avoid problems), wrapping either bzero(3) or memset(3), depending on what is available.

Thanks, Alex
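The argument-swapping hazard from the Stevens quote is easy to reproduce; a minimal sketch (an illustration only, assuming GCC on a POSIX system):

#include <string.h>
#include <strings.h>
#include <netinet/in.h>

int
main (void)
{
  struct sockaddr_in addr;

  /* Intended: zero the whole structure.  The value and length are
     swapped, so this zeroes nothing (the length is 0), yet both
     arguments convert silently.  GCC later grew
     -Wmemset-transposed-args for exactly this pattern.  */
  memset (&addr, sizeof (addr), 0);

  /* The two-argument interface cannot be misused this way: swapping
     the arguments is rejected by the prototype.  */
  bzero (&addr, sizeof (addr));

  return 0;
}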
On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > > Hi, > > >> The saving is in the lane-cross broadcast which is on the critical > >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64). > > What is the speedup in eg. bench-memset? Generally the OoO engine will > be able to hide a small increase in latency, so I'd be surprised it shows up > as a significant gain. > > If you can show a good speedup in an important application (or benchmark > like SPEC2017) then it may be worth pursuing. However there are other > optimization opportunities that may be easier or give a larger benefit. Very much so doubt any benefit on SPEC/other unless the compiler decided to build zeros with long dependency chains instead of xor. > > >> Agreed it's not clear if it's worth it to start replacing memset calls with > >> bzero calls, but at the very least this will improve existing code that > >> uses bzero. > > No code uses bzero, no compiler emits bzero. It died 2 decades ago... > > > My point is this is a lot of code and infrastructure for a symbol marked > > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of > > marginal gains in specific cases. > > Indeed, what we really should discuss is how to remove the last traces of > bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol > or could we just get rid of it altogether? > > Cheers, > Wilco
On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > > Hi, > > >> The saving is in the lane-cross broadcast which is on the critical > >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64). > > What is the speedup in eg. bench-memset? Generally the OoO engine will > be able to hide a small increase in latency, so I'd be surprised it shows up > as a significant gain. Well comparing the previous sse2 bzero against avx2/evex/avx512 versions there is obviously speedup. Comparing memset-${version} vs bzero-${version} it's ambiguous if there is any benefits. > > If you can show a good speedup in an important application (or benchmark > like SPEC2017) then it may be worth pursuing. However there are other > optimization opportunities that may be easier or give a larger benefit. > > >> Agreed it's not clear if it's worth it to start replacing memset calls with > >> bzero calls, but at the very least this will improve existing code that > >> uses bzero. > > No code uses bzero, no compiler emits bzero. It died 2 decades ago... > > > My point is this is a lot of code and infrastructure for a symbol marked > > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of > > marginal gains in specific cases. > > Indeed, what we really should discuss is how to remove the last traces of > bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol > or could we just get rid of it altogether? > > Cheers, > Wilco
Hi Alex,

Don't worry, you're not a bzero user - just check your binaries and I bet you will find zero calls to bzero! GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to avoid unnecessary proliferation of almost identical string functions where there is absolutely no performance benefit. In fact this duplication is typically a net loss, partly due to reducing optimization effort and partly due to the extra runtime overhead of having multiple copies of the same function in caches and predictors.

However, if you care about having a standardized call to zero memory, just propose it for the next C/C++ standard. Given we will not agree on naming, we could just add memzero, memclr, memset_zero and bzero as synonyms for memset.

Cheers, Wilco
On 10/02/2022 14:50, Alejandro Colomar (man-pages) wrote: > Hi, > > > On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote: >> On 10/02/2022 10:10, Adhemerval Zanella wrote: >>> On 10/02/2022 10:01, Wilco Dijkstra wrote: >>>>>> Agreed it's not clear if it's worth it to start replacing memset > calls with >>>>>> bzero calls, but at the very least this will improve existing code > that >>>>>> uses bzero. >>>> >>>> No code uses bzero, no compiler emits bzero. It died 2 decades ago... > > Sorry to ruin your day, but there's a bzero(3) user here :) > > There are rational reasons to prefer bzero(3) due to it's better > interface, and only one to prefer memset(3): standards people decided to > remove bzero(3). See <https://stackoverflow.com/a/17097978/6872717>. > > Consider the following quote from "UNIX Network Programming" by Stevens > et al., Section 1.2 (emphasis added): >> `bzero` is not an ANSI C function. It is derived from early Berkely >> networking code. Nevertheless, we use it throughout the text, instead >> of the ANSI C `memset` function, because `bzero` is easier to remember >> (with only two arguments) than `memset` (with three arguments). Almost >> every vendor that supports the sockets API also provides `bzero`, and >> if not, we provide a macro definition in our `unp.h` header. >> >> Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens > 1996] made the mistake of swapping the second >> and third arguments to `memset` in 10 occurrences in the first >> printing**. A C compiler cannot catch this error because both arguments >> are of the same type. (Actually, the second argument is an `int` and >> the third argument is `size_t`, which is typically an `unsigned int`, >> but the values specified, 0 and 16, respectively, are still acceptable >> for the other type of argument.) The call to `memset` still worked, >> because only a few of the socket functions actually require that the >> final 8 bytes of an Internet socket address structure be set to 0. >> Nevertheless, it was an error, and one that could be avoided by using >> `bzero`, because swapping the two arguments to `bzero` will always be >> caught by the C compiler if function prototypes are used. > > I consistently use bzero(3), and dislike the interface of memset(3) for > zeroing memory. I checked how many memset(3) calls there are in a > codebase of mine, and there is exactly 1 call to memset(3), for setting > an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX, > sizeof(arr)). And there are 41 calls to bzero(3). The bzero won't go away since glibc will continue to support old POSIX interfaces, but Wilco has a point: it is an interface that has been deprecated for more than 2 decades and usually only BSD-like code still keeps using it (explicit_bzero was initialized from openbsd). > >>>> >>>>> My point is this is a lot of code and infrastructure for a symbol > marked >>>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of >>>>> marginal gains in specific cases. >>>> >>>> Indeed, what we really should discuss is how to remove the last > traces of >>>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol >>>> or could we just get rid of it altogether? > > I think it's sad that POSIX removed bzero(3). In the end, people need > to zero memory, and there's no simpler interface than bzero(3) for that. > memset(3) has a useless extra parameter. 
Even if you just do a simple > wrapper as the following, which is no big overhead for glibc I guess, > you would be improving (or not worsening) my and hopefully others' lives: > > static inline bzero(s, n) > { > memset(s, 0, n); > } > > Is that really a pain to maintain?

Yes, this is *exactly* what we are trying to remove and I will oppose reintroducing it after all the cleanups we did on this front (for instance 18b10de7ced9e9).

Besides, your code is most likely calling memset under the hood: gcc by default converts bzero calls to memset. It is actually quite tricky to force gcc to emit a bzero call to libc; I could only manage it by not including string.h, adding an explicit prototype for bzero, and compiling with -fno-builtin:

$ cat bzero.c
#include <stddef.h>
extern void bzero (void *, size_t s);
int main (int argc, char *argv[])
{
  char b[1024];
  bzero (b, sizeof b);
  return 0;
}
$ gcc -Wall -fno-builtin bzero.c -o test && objdump -R test | grep -w bzero
0000000000003fd0 R_X86_64_JUMP_SLOT  bzero@GLIBC_2.2.5

> > If libc ever removes bzero(3), it's not like I'm going to start using > memset(3). I'll just define it myself. That's not an improvement, but > the opposite, IMO. > > Ideally, POSIX would reconsider some day, and reincorporate bzero(3), > but I don't expect that any soon :(.

I really think it is unlikely, but it is just my view.

> >>> >>> We need to keep the symbols as-is afaiu, since callers might still target >>> old POSIX where the symbol is defined as supported. We might add either >>> compiler or linker warning stating the symbols is deprecated, but imho it >>> would just be better if we stop trying to microoptimize it and just use >>> the generic interface (which call memset). > > No please, I know they are deprecated, and explicitly want to use it. > Don't need some extra spurious warning. > > Other projects that I have touched (including nginx and shadow-utils), > seem to have some similar opinion to me, and they define memzero() or > some other similar name, with an interface identical to bzero(3) (just a > different name to avoid problems), to wrap either bzero(3) or memset(3), > depending on what is available.

We are discussing different subjects here: what I want is to remove the glibc *internal* optimization for bzero, which is essentially an implementation detail. At first glance this would seem to change performance; however, gcc already does the hard job of replacing bzero/bcmp/bcopy with their str* counterparts, so it is highly unlikely that newer binaries will actually call bzero. I did an experiment on my system and indeed there are no calls to bzero.

$ cat count_bzero.sh
#!/bin/bash
files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
total=0
for file in $files; do
  bzero=`objdump -R $file 2>&1`
  if [ $? -eq 0 ]; then
    ncalls=`echo $bzero | grep -w bzero | wc -l`
    ((total=total+ncalls))
    if [ $ncalls -gt 0 ]; then
      echo "$file: $ncalls"
    fi
  fi
done
echo "TOTAL=$total"
$ ./count_bzero.sh
TOTAL=0
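As a complementary check (a sketch only, not verified here; it assumes gcc's default builtin handling on x86-64), the transformation can be seen going the other way: with builtins enabled, a bzero call never survives to the libc symbol.

#include <stddef.h>
#include <strings.h>

/* With default options (no -fno-builtin), gcc recognizes bzero as a
   builtin and lowers it to memset, so the resulting object carries a
   memset relocation and no bzero one; objdump -R, as in the script
   above, should then count zero bzero relocations.  */
void
clear (void *p, size_t n)
{
  bzero (p, n);
}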
Hi Wilco and Adhemerval, On 2/10/22 14:01, Wilco Dijkstra via Libc-alpha wrote: > No code uses bzero, no compiler emits bzero. It died 2 decades ago... > >> My point is this is a lot of code and infrastructure for a symbol marked >> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of >> marginal gains in specific cases. That was what triggered my fears. I as a user, not a compiler or libc writer, am not really interested in the implementation details, as long as the implementation does the magic for me. I want to be able to keep using bzero(3) in source code, but if the compiler transforms it into memset(3) (or a for loop), I don't even want to know. On 2/10/22 20:19, Wilco Dijkstra wrote: > Hi Alex, > > Don't worry, you're not a bzero user - just check your binaries and > I bet you will find zero calls to bzero! > :) > GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to > avoid unnecessary proliferation of almost identical string functions where > there is absolutely no performance benefit. In fact this duplication is typically > a net loss, partly due to reducing optimization effort and the extra runtime > overhead due to having multiple copies of the same function in caches and > predictors. > > However if you care about having a standardized call to zero memory, just > propose it for the next C/C++ standard. Given we will not agree on naming, > we could just add memzero, memclr, memset_zero and bcopy as synonyms > for memset. Hmm, I'll do so. Will try to press bzero(3) into C3x. On 2/10/22 20:42, Adhemerval Zanella wrote: > The bzero won't go away since glibc will continue to support old POSIX > interfaces, but Wilco has a point: it is an interface that has been > deprecated for more than 2 decades and usually only BSD-like code > still keeps using it (explicit_bzero was initialized from openbsd). > [...]>> >> static inline bzero(s, n) >> { >> memset(s, 0, n); >> } >> >> Is that really a pain to maintain? > > Yes, this is *exactly* what we are trying to remove and I will oppose to > reintroduce it after all the cleanups we did in this front (for instance > 18b10de7ced9e9). > > Besides your code most likely is calling memset under the hoods, gcc by > default converts bzero calls to memset. It seems to be really tricky to > actually force gcc emit a bzero call to libc, I would could make by not > including string.h, adding an explicit prototype to bzero, along with > -fno-builtin: I guess that it's GCC that defines that (or equivalent) code, instead of glibc. As a user-space programmer, that's fine by me :-). >> If libc ever removes bzero(3), it's not like I'm going to start using >> memset(3). I'll just define it myself. That's not an improvement, but >> the opposite, IMO. >> >> Ideally, POSIX would reconsider some day, and reincorporate bzero(3), >> but I don't expect that any soon :(. > > I really think it is unlikely, but it is just my view. > [...]>> Other projects that I have touched (including nginx and shadow-utils), >> seem to have some similar opinion to me, and they define memzero() or >> some other similar name, with an interface identical to bzero(3) (just a >> different name to avoid problems), to wrap either bzero(3) or memset(3), >> depending on what is available. > > We are discussing different subjects here: what I want is to remove the > glibc *internal* optimization for bzero, which is essentially an > implementation detail. 
In a first glance it would change performance, > however gcc does a hard job replacing bzero/bcmp/bcopy with their > str* counterparts, so it highly unlike that newer binaries will actually > call bzero. Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will continue supporting it. Just remember that some users keep writing and wanting to write bzero(3) instead of memset(3) in their .c files, so it's far from being dead in source code. Cheers, Alex
On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote: >> We are discussing different subjects here: what I want is to remove the >> glibc *internal* optimization for bzero, which is essentially an >> implementation detail. In a first glance it would change performance, >> however gcc does a hard job replacing bzero/bcmp/bcopy with their >> str* counterparts, so it highly unlike that newer binaries will actually >> call bzero. > > Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will > continue supporting it. Just remember that some users keep writing and > wanting to write bzero(3) instead of memset(3) in their .c files, so > it's far from being dead in source code. Again, I am not proposing to *remove* bzero, but rather the internal optimizations that currently only adds code complexity and maintenance burden. My patchset [1] will keep the ABI as-is, the difference is bcopy and bzero will use the default implementation on all architectures. [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
Just as another point of information, Solaris libc implemented bzero as moving arguments around appropriately and then jumping to memset. No one noticed enough to file a complaint. Of course, short fixed-length bzero was handled with inline stores of zero by the compiler. For long vector bzeroing, the overhead was negligible.

When certain Sparc hardware implementations provided faster methods for zeroing a cache line at a time on cache-line boundaries, memset added a single test for zero if and only if the length to memset was over a threshold that seemed likely to make it worthwhile to use the faster method. The principal advantage of the fast zeroing operation is that it did not require data to move from memory to cache before writing zeros to memory, protecting cache locality in the face of large block zeroing. I was responsible for much of that optimization effort. Whether that optimization was really worth it is open for debate for a variety of reasons that I won't go into just now.

Apps still used bzero or memset(target,zero,length) according to their preferences, but the code was unified under memset.

I am inclined to agree with keeping bzero in the API for compatibility with old code/old binaries/old programmers. :-)

Using shared memset code for the implementation of bzero is worthwhile for reducing future maintenance costs.

- Patrick McGehearty, former Sparc/Solaris performance tuning person

On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote: > > On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote: >>> We are discussing different subjects here: what I want is to remove the >>> glibc *internal* optimization for bzero, which is essentially an >>> implementation detail. In a first glance it would change performance, >>> however gcc does a hard job replacing bzero/bcmp/bcopy with their >>> str* counterparts, so it highly unlike that newer binaries will actually >>> call bzero. >> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will >> continue supporting it. Just remember that some users keep writing and >> wanting to write bzero(3) instead of memset(3) in their .c files, so >> it's far from being dead in source code. > Again, I am not proposing to *remove* bzero, but rather the internal > optimizations that currently only adds code complexity and maintenance > burden. My patchset [1] will keep the ABI as-is, the difference is > bcopy and bzero will use the default implementation on all architectures. > > [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
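For reference, the Solaris-style scheme described above is tiny when written in C; a sketch (an illustration only, using a made-up name so it does not clash with the libc symbol):

#include <stddef.h>
#include <string.h>

/* bzero as a pure argument shuffle in front of memset.  Optimizing
   compilers emit this as a tail call, roughly (x86-64 SysV):
       mov  %rsi, %rdx    ; length to third argument
       xor  %esi, %esi    ; value = 0
       jmp  memset
   so the per-call overhead is a couple of instructions.  */
void
my_bzero (void *s, size_t n)
{
  memset (s, 0, n);
}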
On 2/10/22 21:42, Adhemerval Zanella wrote: > Again, I am not proposing to *remove* bzero, but rather the internal > optimizations that currently only adds code complexity and maintenance > burden. My patchset [1] will keep the ABI as-is, the difference is > bcopy and bzero will use the default implementation on all architectures. That's good. Thanks, Alex
On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: > Just as another point of information, Solaris libc implemented > bzero as moving arguments around appropriately then jumping to > memset. Noone noticed enough to file a complaint. Of course, > short fixed-length bzero was handled with in line stores of zero > by the compiler. For long vector bzeroing, the overhead was > negligible. > > When certain Sparc hardware implementations provided faster methods > for zeroing a cache line at a time on cache line boundaries, > memset added a single test for zero ifandonlyif the length of code > to memset was over a threshold that seemed likely to make it > worthwhile to use the faster method. The principal advantage > of the fast zeroing operation is that it did not require data > to move from memory to cache before writing zeros to memory, > protecting cache locality in the face of large block zeroing. > I was responsible for much of that optimization effort. > Whether that optimization was really worth it is open for debate > for a variety of reasons that I won't go into just now. Afaik this is pretty much what optimized memset implementations does, if architecture allows it. For instance, aarch64 uses 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a similar strategy. > > Apps still used bzero or memset(target,zero,length) according to > their preferences, but the code was unified under memset. > > I am inclined to agree with keeping bzero in the API for > compatibility with old code/old binaries/old programmers. :-) The main driver to remove the bzero internal implementation is just the *currently* gcc just do not generate bzero calls as default (I couldn't find a single binary that calls bzero in my system). So to actually see any performance advantage from the optimized bzero, we will need to reevaluate the gcc optimization to transform it on memset (which will need to be applied per-architecture base) which I seem highly unlikely gcc maintainer will accept it. Some time ago LLVM tried to do something similar to bcmp, but in the end it was not a good idea to use an already define symbol and it ended up with __memcmp_eq instead. > > Using shared memset code for the implementation of bzero > is worthwhile for reducing future maintenance costs. > > - Patrick McGehearty > former Sparc/Solaris performance tuning person > > > > On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote: >> >> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote: >>>> We are discussing different subjects here: what I want is to remove the >>>> glibc *internal* optimization for bzero, which is essentially an >>>> implementation detail. In a first glance it would change performance, >>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their >>>> str* counterparts, so it highly unlike that newer binaries will actually >>>> call bzero. >>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will >>> continue supporting it. Just remember that some users keep writing and >>> wanting to write bzero(3) instead of memset(3) in their .c files, so >>> it's far from being dead in source code. >> Again, I am not proposing to *remove* bzero, but rather the internal >> optimizations that currently only adds code complexity and maintenance >> burden. My patchset [1] will keep the ABI as-is, the difference is >> bcopy and bzero will use the default implementation on all architectures. >> >> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243 >
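The cache-line zeroing strategy mentioned for aarch64 and powerpc has roughly this shape; a sketch with made-up names and an assumed 256-byte cut-over (the real implementations are in assembly and also align to the block size first):

#include <stddef.h>
#include <string.h>

#define ZERO_BLOCK_THRESHOLD 256   /* assumed cut-over, as in the aarch64 case */

/* Stand-in for a loop of dc zva / dcbz instructions, which zero whole
   cache lines without first pulling them into the cache.  It is only a
   placeholder here so the sketch compiles.  */
static void
zero_by_cache_lines (void *s, size_t n)
{
  memset (s, 0, n);
}

void
set_zero (void *s, size_t n)
{
  if (n >= ZERO_BLOCK_THRESHOLD)
    zero_by_cache_lines (s, n);   /* preserves cache locality for big blocks */
  else
    memset (s, 0, n);             /* ordinary stores for small sizes */
}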
On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> wrote: > > > > On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: > > Just as another point of information, Solaris libc implemented > > bzero as moving arguments around appropriately then jumping to > > memset. Noone noticed enough to file a complaint. Of course, > > short fixed-length bzero was handled with in line stores of zero > > by the compiler. For long vector bzeroing, the overhead was > > negligible. > > > > When certain Sparc hardware implementations provided faster methods > > for zeroing a cache line at a time on cache line boundaries, > > memset added a single test for zero ifandonlyif the length of code > > to memset was over a threshold that seemed likely to make it > > worthwhile to use the faster method. The principal advantage > > of the fast zeroing operation is that it did not require data > > to move from memory to cache before writing zeros to memory, > > protecting cache locality in the face of large block zeroing. > > I was responsible for much of that optimization effort. > > Whether that optimization was really worth it is open for debate > > for a variety of reasons that I won't go into just now. > > Afaik this is pretty much what optimized memset implementations > does, if architecture allows it. For instance, aarch64 uses > 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a > similar strategy. > > > > > Apps still used bzero or memset(target,zero,length) according to > > their preferences, but the code was unified under memset. > > > > I am inclined to agree with keeping bzero in the API for > > compatibility with old code/old binaries/old programmers. :-) > > The main driver to remove the bzero internal implementation is just > the *currently* gcc just do not generate bzero calls as default > (I couldn't find a single binary that calls bzero in my system). Does it make sense then to add '__memsetzero' so that we can have a function optimized for setting zero? > > So to actually see any performance advantage from the optimized > bzero, we will need to reevaluate the gcc optimization to transform > it on memset (which will need to be applied per-architecture base) > which I seem highly unlikely gcc maintainer will accept it. > > Some time ago LLVM tried to do something similar to bcmp, but in the > end it was not a good idea to use an already define symbol and it > ended up with __memcmp_eq instead. > > > > > Using shared memset code for the implementation of bzero > > is worthwhile for reducing future maintenance costs. > > > > - Patrick McGehearty > > former Sparc/Solaris performance tuning person > > > > > > > > On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote: > >> > >> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote: > >>>> We are discussing different subjects here: what I want is to remove the > >>>> glibc *internal* optimization for bzero, which is essentially an > >>>> implementation detail. In a first glance it would change performance, > >>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their > >>>> str* counterparts, so it highly unlike that newer binaries will actually > >>>> call bzero. > >>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will > >>> continue supporting it. Just remember that some users keep writing and > >>> wanting to write bzero(3) instead of memset(3) in their .c files, so > >>> it's far from being dead in source code. 
> >> Again, I am not proposing to *remove* bzero, but rather the internal > >> optimizations that currently only adds code complexity and maintenance > >> burden. My patchset [1] will keep the ABI as-is, the difference is > >> bcopy and bzero will use the default implementation on all architectures. > >> > >> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243 > >
On 12/02/2022 20:46, Noah Goldstein wrote: > On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha > <libc-alpha@sourceware.org> wrote: >> >> >> >> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: >>> Just as another point of information, Solaris libc implemented >>> bzero as moving arguments around appropriately then jumping to >>> memset. Noone noticed enough to file a complaint. Of course, >>> short fixed-length bzero was handled with in line stores of zero >>> by the compiler. For long vector bzeroing, the overhead was >>> negligible. >>> >>> When certain Sparc hardware implementations provided faster methods >>> for zeroing a cache line at a time on cache line boundaries, >>> memset added a single test for zero ifandonlyif the length of code >>> to memset was over a threshold that seemed likely to make it >>> worthwhile to use the faster method. The principal advantage >>> of the fast zeroing operation is that it did not require data >>> to move from memory to cache before writing zeros to memory, >>> protecting cache locality in the face of large block zeroing. >>> I was responsible for much of that optimization effort. >>> Whether that optimization was really worth it is open for debate >>> for a variety of reasons that I won't go into just now. >> >> Afaik this is pretty much what optimized memset implementations >> does, if architecture allows it. For instance, aarch64 uses >> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a >> similar strategy. >> >>> >>> Apps still used bzero or memset(target,zero,length) according to >>> their preferences, but the code was unified under memset. >>> >>> I am inclined to agree with keeping bzero in the API for >>> compatibility with old code/old binaries/old programmers. :-) >> >> The main driver to remove the bzero internal implementation is just >> the *currently* gcc just do not generate bzero calls as default >> (I couldn't find a single binary that calls bzero in my system). > > Does it make sense then to add '__memsetzero' so that we can have > a function optimized for setting zero? Will it be really a huge gain instead of a microoptimization that will just a bunch of more ifunc variants along with the maintenance cost associated with this? My understanding is __memsetzero would maybe yield some gain in the store mask generation (some architecture might have a zero register or some instruction to generate one), however it would require to use the same strategy as memset to use specific architecture instruction that optimize cache utilization (dc zva, dcbz). So it would mostly require a lot of arch-specific code to to share the memset code with __memsetzero (to avoid increasing code size), so I am not sure if this is really a gain in the long term.
On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > > > > On 12/02/2022 20:46, Noah Goldstein wrote: > > On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > >> > >> > >> > >> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: > >>> Just as another point of information, Solaris libc implemented > >>> bzero as moving arguments around appropriately then jumping to > >>> memset. Noone noticed enough to file a complaint. Of course, > >>> short fixed-length bzero was handled with in line stores of zero > >>> by the compiler. For long vector bzeroing, the overhead was > >>> negligible. > >>> > >>> When certain Sparc hardware implementations provided faster methods > >>> for zeroing a cache line at a time on cache line boundaries, > >>> memset added a single test for zero ifandonlyif the length of code > >>> to memset was over a threshold that seemed likely to make it > >>> worthwhile to use the faster method. The principal advantage > >>> of the fast zeroing operation is that it did not require data > >>> to move from memory to cache before writing zeros to memory, > >>> protecting cache locality in the face of large block zeroing. > >>> I was responsible for much of that optimization effort. > >>> Whether that optimization was really worth it is open for debate > >>> for a variety of reasons that I won't go into just now. > >> > >> Afaik this is pretty much what optimized memset implementations > >> does, if architecture allows it. For instance, aarch64 uses > >> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a > >> similar strategy. > >> > >>> > >>> Apps still used bzero or memset(target,zero,length) according to > >>> their preferences, but the code was unified under memset. > >>> > >>> I am inclined to agree with keeping bzero in the API for > >>> compatibility with old code/old binaries/old programmers. :-) > >> > >> The main driver to remove the bzero internal implementation is just > >> the *currently* gcc just do not generate bzero calls as default > >> (I couldn't find a single binary that calls bzero in my system). > > > > Does it make sense then to add '__memsetzero' so that we can have > > a function optimized for setting zero? > > Will it be really a huge gain instead of a microoptimization that will > just a bunch of more ifunc variants along with the maintenance cost > associated with this? Is there any way it can be setup so that one C impl can cover all the arch that want to just leave `__memsetzero` as an alias to `memset`? I know they have incompatible interfaces that make it hard but would a weak static inline in string.h work? For some of the shorter control flows (which are generally small sizes and very hot) we saw reasonable benefits on x86_64. The most significant was the EVEX/AVX2 [32, 64] case where it net us ~25% throughput. This is a pretty hot set value so it may be worth it. > > My understanding is __memsetzero would maybe yield some gain in the > store mask generation (some architecture might have a zero register > or some instruction to generate one), however it would require to > use the same strategy as memset to use specific architecture instruction > that optimize cache utilization (dc zva, dcbz). > > So it would mostly require a lot of arch-specific code to to share > the memset code with __memsetzero (to avoid increasing code size), > so I am not sure if this is really a gain in the long term. 
It's worth noting that, between the two, `memset` is the cold function and `__memsetzero` is the hot one. Based on profiles of GCC11 and Python3.7.7, setting zero covers 99%+ of cases.
On 14/02/2022 09:41, Noah Goldstein wrote: > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella > <adhemerval.zanella@linaro.org> wrote: >> >> >> >> On 12/02/2022 20:46, Noah Goldstein wrote: >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha >>> <libc-alpha@sourceware.org> wrote: >>>> >>>> >>>> >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: >>>>> Just as another point of information, Solaris libc implemented >>>>> bzero as moving arguments around appropriately then jumping to >>>>> memset. Noone noticed enough to file a complaint. Of course, >>>>> short fixed-length bzero was handled with in line stores of zero >>>>> by the compiler. For long vector bzeroing, the overhead was >>>>> negligible. >>>>> >>>>> When certain Sparc hardware implementations provided faster methods >>>>> for zeroing a cache line at a time on cache line boundaries, >>>>> memset added a single test for zero ifandonlyif the length of code >>>>> to memset was over a threshold that seemed likely to make it >>>>> worthwhile to use the faster method. The principal advantage >>>>> of the fast zeroing operation is that it did not require data >>>>> to move from memory to cache before writing zeros to memory, >>>>> protecting cache locality in the face of large block zeroing. >>>>> I was responsible for much of that optimization effort. >>>>> Whether that optimization was really worth it is open for debate >>>>> for a variety of reasons that I won't go into just now. >>>> >>>> Afaik this is pretty much what optimized memset implementations >>>> does, if architecture allows it. For instance, aarch64 uses >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a >>>> similar strategy. >>>> >>>>> >>>>> Apps still used bzero or memset(target,zero,length) according to >>>>> their preferences, but the code was unified under memset. >>>>> >>>>> I am inclined to agree with keeping bzero in the API for >>>>> compatibility with old code/old binaries/old programmers. :-) >>>> >>>> The main driver to remove the bzero internal implementation is just >>>> the *currently* gcc just do not generate bzero calls as default >>>> (I couldn't find a single binary that calls bzero in my system). >>> >>> Does it make sense then to add '__memsetzero' so that we can have >>> a function optimized for setting zero? >> >> Will it be really a huge gain instead of a microoptimization that will >> just a bunch of more ifunc variants along with the maintenance cost >> associated with this? > Is there any way it can be setup so that one C impl can cover all the > arch that want to just leave `__memsetzero` as an alias to `memset`? > I know they have incompatible interfaces that make it hard but would > a weak static inline in string.h work? > > For some of the shorter control flows (which are generally small sizes > and very hot) we saw reasonable benefits on x86_64. > > The most significant was the EVEX/AVX2 [32, 64] case where it net > us ~25% throughput. This is a pretty hot set value so it may be worth it. With different prototypes and semantics we won't be able to define an alias. What we used to do, but we move away in recent version, was to define static inline function that glue the two function if optimization is set. 
> >> >> My understanding is __memsetzero would maybe yield some gain in the >> store mask generation (some architecture might have a zero register >> or some instruction to generate one), however it would require to >> use the same strategy as memset to use specific architecture instruction >> that optimize cache utilization (dc zva, dcbz). >> >> So it would mostly require a lot of arch-specific code to to share >> the memset code with __memsetzero (to avoid increasing code size), >> so I am not sure if this is really a gain in the long term. > > It's worth noting that between the two `memset` is the cold function > and `__memsetzero` is the hot one. Based on profiles of GCC11 and > Python3.7.7 setting zero covers 99%+ cases.

This is very workload specific, and I think that with more advanced compiler optimizations like LTO and PGO such calls could most likely be optimized by the compiler itself (either by inlining or by creating a synthetic function to handle it).

What I worry about is that such symbols might end up like the ARM EABI memcpy variants, which were added as a way to optimize the case where the alignment is known to be a multiple of the word size, but ended up not being implemented and also not being generated by the compiler (at least not gcc).
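The header-level redirect being discussed could look something like the sketch below (every name here, including __memsetzero itself, is an assumption; glibc does not ship this). The idea is that the header, rather than an ifunc, routes constant-zero calls to the hot entry point.

#include <stddef.h>
#include <string.h>

/* Hypothetical hot entry point for the zero case.  */
extern void *__memsetzero (void *__s, size_t __n);

/* Redirect memset (s, 0, n) at compile time when the value is a
   literal zero; everything else keeps going to the normal memset.
   Assumes a GCC-compatible compiler for __builtin_constant_p.  */
#define memset(s, c, n)                                \
  (__builtin_constant_p (c) && (c) == 0                \
     ? __memsetzero ((s), (n))                         \
     : __builtin_memset ((s), (c), (n)))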
On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> wrote: > > > > On 14/02/2022 09:41, Noah Goldstein wrote: > > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella > > <adhemerval.zanella@linaro.org> wrote: > >> > >> > >> > >> On 12/02/2022 20:46, Noah Goldstein wrote: > >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha > >>> <libc-alpha@sourceware.org> wrote: > >>>> > >>>> > >>>> > >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: > >>>>> Just as another point of information, Solaris libc implemented > >>>>> bzero as moving arguments around appropriately then jumping to > >>>>> memset. Noone noticed enough to file a complaint. Of course, > >>>>> short fixed-length bzero was handled with in line stores of zero > >>>>> by the compiler. For long vector bzeroing, the overhead was > >>>>> negligible. > >>>>> > >>>>> When certain Sparc hardware implementations provided faster methods > >>>>> for zeroing a cache line at a time on cache line boundaries, > >>>>> memset added a single test for zero ifandonlyif the length of code > >>>>> to memset was over a threshold that seemed likely to make it > >>>>> worthwhile to use the faster method. The principal advantage > >>>>> of the fast zeroing operation is that it did not require data > >>>>> to move from memory to cache before writing zeros to memory, > >>>>> protecting cache locality in the face of large block zeroing. > >>>>> I was responsible for much of that optimization effort. > >>>>> Whether that optimization was really worth it is open for debate > >>>>> for a variety of reasons that I won't go into just now. > >>>> > >>>> Afaik this is pretty much what optimized memset implementations > >>>> does, if architecture allows it. For instance, aarch64 uses > >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a > >>>> similar strategy. > >>>> > >>>>> > >>>>> Apps still used bzero or memset(target,zero,length) according to > >>>>> their preferences, but the code was unified under memset. > >>>>> > >>>>> I am inclined to agree with keeping bzero in the API for > >>>>> compatibility with old code/old binaries/old programmers. :-) > >>>> > >>>> The main driver to remove the bzero internal implementation is just > >>>> the *currently* gcc just do not generate bzero calls as default > >>>> (I couldn't find a single binary that calls bzero in my system). > >>> > >>> Does it make sense then to add '__memsetzero' so that we can have > >>> a function optimized for setting zero? > >> > >> Will it be really a huge gain instead of a microoptimization that will > >> just a bunch of more ifunc variants along with the maintenance cost > >> associated with this? > > Is there any way it can be setup so that one C impl can cover all the > > arch that want to just leave `__memsetzero` as an alias to `memset`? > > I know they have incompatible interfaces that make it hard but would > > a weak static inline in string.h work? > > > > For some of the shorter control flows (which are generally small sizes > > and very hot) we saw reasonable benefits on x86_64. > > > > The most significant was the EVEX/AVX2 [32, 64] case where it net > > us ~25% throughput. This is a pretty hot set value so it may be worth it. > > With different prototypes and semantics we won't be able to define an > alias. What we used to do, but we move away in recent version, was to > define static inline function that glue the two function if optimization > is set. 
I have /* NB: bzero returns void and __memsetzero returns void *. */ asm (".weak bzero"); asm ("bzero = __memsetzero"); asm (".global __bzero"); asm ("__bzero = __memsetzero"); > > > >> > >> My understanding is __memsetzero would maybe yield some gain in the > >> store mask generation (some architecture might have a zero register > >> or some instruction to generate one), however it would require to > >> use the same strategy as memset to use specific architecture instruction > >> that optimize cache utilization (dc zva, dcbz). > >> > >> So it would mostly require a lot of arch-specific code to to share > >> the memset code with __memsetzero (to avoid increasing code size), > >> so I am not sure if this is really a gain in the long term. > > > > It's worth noting that between the two `memset` is the cold function > > and `__memsetzero` is the hot one. Based on profiles of GCC11 and > > Python3.7.7 setting zero covers 99%+ cases. > > This is very workload specific and I think with more advance compiler > optimization like LTO and PGO such calls could most likely being > optimized by the compiler itself (either by inline or by create a > synthetic function to handle it). > > What I worried is such symbols might ended up as the AEBI memcpy variants > that was added as way to optimize when alignment is know to be multiple > of words, but it ended up not being implemented and also not being generated > by the compiler (at least not gcc).
On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > > > > > On 14/02/2022 09:41, Noah Goldstein wrote: > > > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella > > > <adhemerval.zanella@linaro.org> wrote: > > >> > > >> > > >> > > >> On 12/02/2022 20:46, Noah Goldstein wrote: > > >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha > > >>> <libc-alpha@sourceware.org> wrote: > > >>>> > > >>>> > > >>>> > > >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: > > >>>>> Just as another point of information, Solaris libc implemented > > >>>>> bzero as moving arguments around appropriately then jumping to > > >>>>> memset. Noone noticed enough to file a complaint. Of course, > > >>>>> short fixed-length bzero was handled with in line stores of zero > > >>>>> by the compiler. For long vector bzeroing, the overhead was > > >>>>> negligible. > > >>>>> > > >>>>> When certain Sparc hardware implementations provided faster methods > > >>>>> for zeroing a cache line at a time on cache line boundaries, > > >>>>> memset added a single test for zero ifandonlyif the length of code > > >>>>> to memset was over a threshold that seemed likely to make it > > >>>>> worthwhile to use the faster method. The principal advantage > > >>>>> of the fast zeroing operation is that it did not require data > > >>>>> to move from memory to cache before writing zeros to memory, > > >>>>> protecting cache locality in the face of large block zeroing. > > >>>>> I was responsible for much of that optimization effort. > > >>>>> Whether that optimization was really worth it is open for debate > > >>>>> for a variety of reasons that I won't go into just now. > > >>>> > > >>>> Afaik this is pretty much what optimized memset implementations > > >>>> does, if architecture allows it. For instance, aarch64 uses > > >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a > > >>>> similar strategy. > > >>>> > > >>>>> > > >>>>> Apps still used bzero or memset(target,zero,length) according to > > >>>>> their preferences, but the code was unified under memset. > > >>>>> > > >>>>> I am inclined to agree with keeping bzero in the API for > > >>>>> compatibility with old code/old binaries/old programmers. :-) > > >>>> > > >>>> The main driver to remove the bzero internal implementation is just > > >>>> the *currently* gcc just do not generate bzero calls as default > > >>>> (I couldn't find a single binary that calls bzero in my system). > > >>> > > >>> Does it make sense then to add '__memsetzero' so that we can have > > >>> a function optimized for setting zero? > > >> > > >> Will it be really a huge gain instead of a microoptimization that will > > >> just a bunch of more ifunc variants along with the maintenance cost > > >> associated with this? > > > Is there any way it can be setup so that one C impl can cover all the > > > arch that want to just leave `__memsetzero` as an alias to `memset`? > > > I know they have incompatible interfaces that make it hard but would > > > a weak static inline in string.h work? > > > > > > For some of the shorter control flows (which are generally small sizes > > > and very hot) we saw reasonable benefits on x86_64. > > > > > > The most significant was the EVEX/AVX2 [32, 64] case where it net > > > us ~25% throughput. This is a pretty hot set value so it may be worth it. 
> > > > With different prototypes and semantics we won't be able to define an > > alias. What we used to do, but we move away in recent version, was to > > define static inline function that glue the two function if optimization > > is set. > > I have > > /* NB: bzero returns void and __memsetzero returns void *. */ > asm (".weak bzero"); > asm ("bzero = __memsetzero"); > asm (".global __bzero"); > asm ("__bzero = __memsetzero"); > > > > > > >> > > >> My understanding is __memsetzero would maybe yield some gain in the > > >> store mask generation (some architecture might have a zero register > > >> or some instruction to generate one), however it would require to > > >> use the same strategy as memset to use specific architecture instruction > > >> that optimize cache utilization (dc zva, dcbz). > > >> > > >> So it would mostly require a lot of arch-specific code to to share > > >> the memset code with __memsetzero (to avoid increasing code size), > > >> so I am not sure if this is really a gain in the long term. > > > > > > It's worth noting that between the two `memset` is the cold function > > > and `__memsetzero` is the hot one. Based on profiles of GCC11 and > > > Python3.7.7 setting zero covers 99%+ cases. > > > > This is very workload specific and I think with more advance compiler > > optimization like LTO and PGO such calls could most likely being > > optimized by the compiler itself (either by inline or by create a > > synthetic function to handle it). > > > > What I worried is such symbols might ended up as the AEBI memcpy variants > > that was added as way to optimize when alignment is know to be multiple > > of words, but it ended up not being implemented and also not being generated > > by the compiler (at least not gcc). > > > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
On 04/05/2022 03:35, Sunil Pandey wrote: > On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha > <libc-alpha@sourceware.org> wrote: >> >> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha >> <libc-alpha@sourceware.org> wrote: >>> >>> >>> >>> On 14/02/2022 09:41, Noah Goldstein wrote: >>>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella >>>> <adhemerval.zanella@linaro.org> wrote: >>>>> >>>>> >>>>> >>>>> On 12/02/2022 20:46, Noah Goldstein wrote: >>>>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha >>>>>> <libc-alpha@sourceware.org> wrote: >>>>>>> >>>>>>> >>>>>>> >>>>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote: >>>>>>>> Just as another point of information, Solaris libc implemented >>>>>>>> bzero as moving arguments around appropriately then jumping to >>>>>>>> memset. Noone noticed enough to file a complaint. Of course, >>>>>>>> short fixed-length bzero was handled with in line stores of zero >>>>>>>> by the compiler. For long vector bzeroing, the overhead was >>>>>>>> negligible. >>>>>>>> >>>>>>>> When certain Sparc hardware implementations provided faster methods >>>>>>>> for zeroing a cache line at a time on cache line boundaries, >>>>>>>> memset added a single test for zero ifandonlyif the length of code >>>>>>>> to memset was over a threshold that seemed likely to make it >>>>>>>> worthwhile to use the faster method. The principal advantage >>>>>>>> of the fast zeroing operation is that it did not require data >>>>>>>> to move from memory to cache before writing zeros to memory, >>>>>>>> protecting cache locality in the face of large block zeroing. >>>>>>>> I was responsible for much of that optimization effort. >>>>>>>> Whether that optimization was really worth it is open for debate >>>>>>>> for a variety of reasons that I won't go into just now. >>>>>>> >>>>>>> Afaik this is pretty much what optimized memset implementations >>>>>>> does, if architecture allows it. For instance, aarch64 uses >>>>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a >>>>>>> similar strategy. >>>>>>> >>>>>>>> >>>>>>>> Apps still used bzero or memset(target,zero,length) according to >>>>>>>> their preferences, but the code was unified under memset. >>>>>>>> >>>>>>>> I am inclined to agree with keeping bzero in the API for >>>>>>>> compatibility with old code/old binaries/old programmers. :-) >>>>>>> >>>>>>> The main driver to remove the bzero internal implementation is just >>>>>>> the *currently* gcc just do not generate bzero calls as default >>>>>>> (I couldn't find a single binary that calls bzero in my system). >>>>>> >>>>>> Does it make sense then to add '__memsetzero' so that we can have >>>>>> a function optimized for setting zero? >>>>> >>>>> Will it be really a huge gain instead of a microoptimization that will >>>>> just a bunch of more ifunc variants along with the maintenance cost >>>>> associated with this? >>>> Is there any way it can be setup so that one C impl can cover all the >>>> arch that want to just leave `__memsetzero` as an alias to `memset`? >>>> I know they have incompatible interfaces that make it hard but would >>>> a weak static inline in string.h work? >>>> >>>> For some of the shorter control flows (which are generally small sizes >>>> and very hot) we saw reasonable benefits on x86_64. >>>> >>>> The most significant was the EVEX/AVX2 [32, 64] case where it net >>>> us ~25% throughput. This is a pretty hot set value so it may be worth it. 
>>> >>> With different prototypes and semantics we won't be able to define an >>> alias. What we used to do, but we move away in recent version, was to >>> define static inline function that glue the two function if optimization >>> is set. >> >> I have >> >> /* NB: bzero returns void and __memsetzero returns void *. */ >> asm (".weak bzero"); >> asm ("bzero = __memsetzero"); >> asm (".global __bzero"); >> asm ("__bzero = __memsetzero"); >> >>>> >>>>> >>>>> My understanding is __memsetzero would maybe yield some gain in the >>>>> store mask generation (some architecture might have a zero register >>>>> or some instruction to generate one), however it would require to >>>>> use the same strategy as memset to use specific architecture instruction >>>>> that optimize cache utilization (dc zva, dcbz). >>>>> >>>>> So it would mostly require a lot of arch-specific code to to share >>>>> the memset code with __memsetzero (to avoid increasing code size), >>>>> so I am not sure if this is really a gain in the long term. >>>> >>>> It's worth noting that between the two `memset` is the cold function >>>> and `__memsetzero` is the hot one. Based on profiles of GCC11 and >>>> Python3.7.7 setting zero covers 99%+ cases. >>> >>> This is very workload specific and I think with more advance compiler >>> optimization like LTO and PGO such calls could most likely being >>> optimized by the compiler itself (either by inline or by create a >>> synthetic function to handle it). >>> >>> What I worried is such symbols might ended up as the AEBI memcpy variants >>> that was added as way to optimize when alignment is know to be multiple >>> of words, but it ended up not being implemented and also not being generated >>> by the compiler (at least not gcc). >> >> >> >> -- >> H.J. > > I would like to backport this patch to release branches. > Any comments or objections? Nothing really against, but as previous discussion we had on this maillist optimizing bzero does not yield much gain compared to memset (compiler won't generate libcall for loop transformation, among other shortcomings). My idea is to follow other architecture and just remove all x86_64 optimizations.
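The "loop transformation" point is worth spelling out: gcc's loop-distribution pass rewrites idiomatic zeroing loops into memset calls, never into bzero, so an optimized bzero is unreachable even from code that never mentions memset. A sketch (exact behaviour depends on the -O level and target):

#include <stddef.h>

void
clear_buffer (unsigned char *p, size_t n)
{
  /* At -O2/-O3 (-ftree-loop-distribute-patterns) gcc recognizes this
     idiom and emits a call to memset (p, 0, n).  */
  for (size_t i = 0; i < n; i++)
    p[i] = 0;
}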
On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> On 04/05/2022 03:35, Sunil Pandey wrote:
> > I would like to backport this patch to release branches.
> > Any comments or objections?
>
> Nothing really against, but as discussed previously on this mailing list, optimizing
> bzero does not yield much gain compared to memset (the compiler won't generate a
> libcall for loop transformations, among other shortcomings).  My idea is to follow
> other architectures and just remove all the x86_64 optimizations.

We'd like to reduce the differences between master and release branches to help
future backports to release branches.
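The "compiler won't generate a libcall for loop transformations" shortcoming quoted above can be seen with a plain zero-filling loop: compilers that recognize the pattern emit a call to memset, not bzero.  A small sketch, assuming gcc at -O2/-O3 with loop pattern distribution enabled; the function name is made up for illustration:

#include <stddef.h>

/* A zero-filling loop like this is typically rewritten by the compiler
   into a memset call, never into bzero, which is why an optimized bzero
   sees little traffic from compiled code.  */
void
clear_buffer (char *buf, size_t n)
{
  for (size_t i = 0; i < n; i++)
    buf[i] = 0;
}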
On 04/05/2022 11:50, H.J. Lu wrote:
> On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>> Nothing really against, but as discussed previously on this mailing list, optimizing
>> bzero does not yield much gain compared to memset (the compiler won't generate a
>> libcall for loop transformations, among other shortcomings).  My idea is to follow
>> other architectures and just remove all the x86_64 optimizations.
>
> We'd like to reduce the differences between master and release branches to help
> future backports to release branches.

Ok, fair enough.
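The patch below hinges on one observation: when the fill value is known to be zero, the vector register can be materialized with a self-XOR (a zero idiom with no input dependency) instead of broadcasting the byte, which is what the BZERO_ZERO_VEC0 macros expand to.  A rough intrinsics-level illustration of the difference, assuming SSE2; it is not the code path the patch actually adds:

#include <emmintrin.h>

/* memset must broadcast the (runtime) fill byte into the vector.  */
void
store_16_memset_style (void *dst, int c)
{
  __m128i v = _mm_set1_epi8 ((char) c);   /* broadcast the byte c */
  _mm_storeu_si128 ((__m128i *) dst, v);
}

/* bzero can use the cheaper zero idiom with no input dependency.  */
void
store_16_bzero_style (void *dst)
{
  __m128i v = _mm_setzero_si128 ();       /* typically a pxor zero idiom */
  _mm_storeu_si128 ((__m128i *) dst, v);
}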
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 3f0517bbfc..af26e9cedc 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -35,6 +35,9 @@ punpcklwd %xmm0, %xmm0; \ pshufd $0, %xmm0, %xmm0 +# define BZERO_ZERO_VEC0() \ + pxor %xmm0, %xmm0 + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ pshufd $0, %xmm0, %xmm0; \ @@ -53,6 +56,10 @@ # define MEMSET_SYMBOL(p,s) memset #endif +#ifndef BZERO_SYMBOL +# define BZERO_SYMBOL(p,s) __bzero +#endif + #ifndef WMEMSET_SYMBOL # define WMEMSET_CHK_SYMBOL(p,s) p # define WMEMSET_SYMBOL(p,s) __wmemset @@ -63,6 +70,7 @@ libc_hidden_builtin_def (memset) #if IS_IN (libc) +weak_alias (__bzero, bzero) libc_hidden_def (__wmemset) weak_alias (__wmemset, wmemset) libc_hidden_weak (wmemset) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 4274bfdd0d..e7b413edad 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -1,6 +1,7 @@ ifeq ($(subdir),string) sysdep_routines += \ + bzero \ memchr-avx2 \ memchr-avx2-rtm \ memchr-evex \ diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c new file mode 100644 index 0000000000..58a14b2c33 --- /dev/null +++ b/sysdeps/x86_64/multiarch/bzero.c @@ -0,0 +1,106 @@ +/* Multiple versions of bzero. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
*/ +#if IS_IN (libc) +# define __bzero __redirect___bzero +# include <string.h> +# undef __bzero + +# define SYMBOL_NAME __bzero +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) + attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE1 (avx512_unaligned_erms); + + return OPTIMIZE1 (avx512_unaligned); + } + } + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE1 (evex_unaligned_erms); + + return OPTIMIZE1 (evex_unaligned); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE1 (avx2_unaligned_erms_rtm); + + return OPTIMIZE1 (avx2_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE1 (avx2_unaligned_erms); + + return OPTIMIZE1 (avx2_unaligned); + } + } + + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE1 (sse2_unaligned_erms); + + return OPTIMIZE1 (sse2_unaligned); +} + +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); + +weak_alias (__bzero, bzero) +#endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 68a56797d4..a594f4176e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memset_avx512_no_vzeroupper) ) + /* Support sysdeps/x86_64/multiarch/bzero.c. 
*/ + IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, 1, + __bzero_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, bzero, 1, + __bzero_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, bzero, + CPU_FEATURE_USABLE (AVX2), + __bzero_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, bzero, + CPU_FEATURE_USABLE (AVX2), + __bzero_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __bzero_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __bzero_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __bzero_evex_unaligned) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __bzero_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __bzero_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, bzero, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __bzero_avx512_unaligned) + ) + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ IFUNC_IMPL (i, name, rawmemchr, IFUNC_IMPL_ADD (array, i, rawmemchr, diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S index 8ac3e479bb..5a5ee6f672 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -5,6 +5,7 @@ #define SECTION(p) p##.avx.rtm #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm #include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index c0bf2875d0..a093a2831f 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,6 +14,9 @@ vmovd d, %xmm0; \ movq r, %rax; +# define BZERO_ZERO_VEC0() \ + vpxor %xmm0, %xmm0, %xmm0 + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) @@ -29,6 +32,9 @@ # ifndef MEMSET_SYMBOL # define MEMSET_SYMBOL(p,s) p##_avx2_##s # endif +# ifndef BZERO_SYMBOL +# define BZERO_SYMBOL(p,s) p##_avx2_##s +# endif # ifndef WMEMSET_SYMBOL # define WMEMSET_SYMBOL(p,s) p##_avx2_##s # endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index 5241216a77..727c92133a 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -19,6 +19,9 @@ vpbroadcastb d, %VEC0; \ movq r, %rax +# define BZERO_ZERO_VEC0() \ + vpxorq %XMM0, %XMM0, %XMM0 + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vpbroadcastd d, %VEC0; \ movq r, %rax diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S index 6370021506..5d8fa78f05 100644 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -19,6 +19,9 @@ vpbroadcastb d, %VEC0; \ movq r, %rax +# define BZERO_ZERO_VEC0() \ + vpxorq %XMM0, %XMM0, %XMM0 + # define 
WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vpbroadcastd d, %VEC0; \ movq r, %rax diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S index 8a6f0c561a..329c58ee46 100644 --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S @@ -22,6 +22,7 @@ #if IS_IN (libc) # define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) # define WMEMSET_SYMBOL(p,s) p##_sse2_##s # ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 1b502b78e4..7c94fcdae1 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -26,6 +26,10 @@ #include <sysdep.h> +#ifndef BZERO_SYMBOL +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) +#endif + #ifndef MEMSET_CHK_SYMBOL # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) #endif @@ -87,6 +91,18 @@ # define XMM_SMALL 0 #endif +#ifdef USE_LESS_VEC_MASK_STORE +# define SET_REG64 rcx +# define SET_REG32 ecx +# define SET_REG16 cx +# define SET_REG8 cl +#else +# define SET_REG64 rsi +# define SET_REG32 esi +# define SET_REG16 si +# define SET_REG8 sil +#endif + #define PAGE_SIZE 4096 /* Macro to calculate size of small memset block for aligning @@ -96,18 +112,6 @@ #ifndef SECTION # error SECTION is not defined! -#endif - - .section SECTION(.text),"ax",@progbits -#if VEC_SIZE == 16 && IS_IN (libc) -ENTRY (__bzero) - mov %RDI_LP, %RAX_LP /* Set return value. */ - mov %RSI_LP, %RDX_LP /* Set n. */ - xorl %esi, %esi - pxor %XMM0, %XMM0 - jmp L(entry_from_bzero) -END (__bzero) -weak_alias (__bzero, bzero) #endif #if IS_IN (libc) @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) WMEMSET_VDUP_TO_VEC0_LOW() cmpq $VEC_SIZE, %rdx - jb L(less_vec_no_vdup) + jb L(less_vec_from_wmemset) WMEMSET_VDUP_TO_VEC0_HIGH() jmp L(entry_from_wmemset) END (WMEMSET_SYMBOL (__wmemset, unaligned)) #endif +ENTRY (BZERO_SYMBOL(__bzero, unaligned)) +#if VEC_SIZE > 16 + BZERO_ZERO_VEC0 () +#endif + mov %RDI_LP, %RAX_LP + mov %RSI_LP, %RDX_LP +#ifndef USE_LESS_VEC_MASK_STORE + xorl %esi, %esi +#endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec_no_vdup) +#ifdef USE_LESS_VEC_MASK_STORE + xorl %esi, %esi +#endif +#if VEC_SIZE <= 16 + BZERO_ZERO_VEC0 () +#endif + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN +END (BZERO_SYMBOL(__bzero, unaligned)) + #if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) cmp %RDX_LP, %RCX_LP @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) /* Clear the upper 32 bits. 
*/ mov %edx, %edx # endif -L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) MEMSET_VDUP_TO_VEC0_HIGH() @@ -187,6 +215,31 @@ END (__memset_erms) END (MEMSET_SYMBOL (__memset, erms)) # endif +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) +# if VEC_SIZE > 16 + BZERO_ZERO_VEC0 () +# endif + mov %RDI_LP, %RAX_LP + mov %RSI_LP, %RDX_LP +# ifndef USE_LESS_VEC_MASK_STORE + xorl %esi, %esi +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec_no_vdup) +# ifdef USE_LESS_VEC_MASK_STORE + xorl %esi, %esi +# endif +# if VEC_SIZE <= 16 + BZERO_ZERO_VEC0 () +# endif + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN +END (BZERO_SYMBOL(__bzero, unaligned_erms)) + # if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) cmp %RDX_LP, %RCX_LP @@ -229,6 +282,7 @@ L(last_2x_vec): .p2align 4,, 10 L(less_vec): L(less_vec_no_vdup): +L(less_vec_from_wmemset): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! @@ -374,8 +428,11 @@ L(less_vec): /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to xmm). This is only does anything for AVX2. */ MEMSET_VDUP_TO_VEC0_LOW () +L(less_vec_from_wmemset): +#if VEC_SIZE > 16 L(less_vec_no_vdup): #endif +#endif L(cross_page): #if VEC_SIZE > 32 cmpl $32, %edx @@ -386,7 +443,10 @@ L(cross_page): jge L(between_16_31) #endif #ifndef USE_XMM_LESS_VEC - MOVQ %XMM0, %rcx + MOVQ %XMM0, %SET_REG64 +#endif +#if VEC_SIZE <= 16 +L(less_vec_no_vdup): #endif cmpl $8, %edx jge L(between_8_15) @@ -395,7 +455,7 @@ L(cross_page): cmpl $1, %edx jg L(between_2_3) jl L(between_0_0) - movb %sil, (%LESS_VEC_REG) + movb %SET_REG8, (%LESS_VEC_REG) L(between_0_0): ret @@ -428,8 +488,8 @@ L(between_8_15): MOVQ %XMM0, (%rdi) MOVQ %XMM0, -8(%rdi, %rdx) #else - movq %rcx, (%LESS_VEC_REG) - movq %rcx, -8(%LESS_VEC_REG, %rdx) + movq %SET_REG64, (%LESS_VEC_REG) + movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) #endif ret @@ -442,8 +502,8 @@ L(between_4_7): MOVD %XMM0, (%rdi) MOVD %XMM0, -4(%rdi, %rdx) #else - movl %ecx, (%LESS_VEC_REG) - movl %ecx, -4(%LESS_VEC_REG, %rdx) + movl %SET_REG32, (%LESS_VEC_REG) + movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) #endif ret @@ -452,12 +512,12 @@ L(between_4_7): L(between_2_3): /* From 2 to 3. No branch when size == 2. */ #ifdef USE_XMM_LESS_VEC - movb %sil, (%rdi) - movb %sil, 1(%rdi) - movb %sil, -1(%rdi, %rdx) + movb %SET_REG8, (%rdi) + movb %SET_REG8, 1(%rdi) + movb %SET_REG8, -1(%rdi, %rdx) #else - movw %cx, (%LESS_VEC_REG) - movb %sil, -1(%LESS_VEC_REG, %rdx) + movw %SET_REG16, (%LESS_VEC_REG) + movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) #endif ret END (MEMSET_SYMBOL (__memset, unaligned_erms))
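As a quick sanity check when backporting (a standalone sketch, not part of the glibc test suite), one can verify that bzero and memset with a zero fill byte produce identical results across a range of lengths, including the small-size paths the patch special-cases:

#include <assert.h>
#include <string.h>
#include <strings.h>

int
main (void)
{
  unsigned char a[4096], b[4096];

  for (size_t n = 0; n <= sizeof (a); n++)
    {
      /* Poison both buffers, zero a prefix of each, then compare.  */
      memset (a, 0xa5, sizeof (a));
      memset (b, 0xa5, sizeof (b));
      bzero (a, n);
      memset (b, 0, n);
      assert (memcmp (a, b, sizeof (a)) == 0);
    }
  return 0;
}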