Message ID: 20220628152735.17863-2-goldstein.w.n@gmail.com
State:      New
Series:     [v1,1/3] x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Implementation wise:
>    1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
>       use the L(stosb) label that was previously defined.
>
>    2. Don't give the hotpath (fallthrough) to zero size.
>
> Code positioning wise:
>
> Move L(memset_{chk}_erms) to its own file.  Leaving it in between the

It is ENTRY, not L.  Did you mean to move them to the end of file?

> memset_{impl}_unaligned both adds unnecessary complexity to the
> file and wastes space in a relatively hot cache section.
> ---
>  .../multiarch/memset-vec-unaligned-erms.S     | 54 ++++++++-----------
>  1 file changed, 23 insertions(+), 31 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index abc12d9cda..d98c613651 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -156,37 +156,6 @@ L(entry_from_wmemset):
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMSET_SYMBOL (__memset, unaligned))
>
> -# if VEC_SIZE == 16
> -ENTRY (__memset_chk_erms)
> -	cmp	%RDX_LP, %RCX_LP
> -	jb	HIDDEN_JUMPTARGET (__chk_fail)
> -END (__memset_chk_erms)
> -
> -/* Only used to measure performance of REP STOSB.  */
> -ENTRY (__memset_erms)
> -	/* Skip zero length.  */
> -	test	%RDX_LP, %RDX_LP
> -	jnz	L(stosb)
> -	movq	%rdi, %rax
> -	ret
> -# else
> -/* Provide a hidden symbol to debugger.  */
> -	.hidden	MEMSET_SYMBOL (__memset, erms)
> -ENTRY (MEMSET_SYMBOL (__memset, erms))
> -# endif
> -L(stosb):
> -	mov	%RDX_LP, %RCX_LP
> -	movzbl	%sil, %eax
> -	mov	%RDI_LP, %RDX_LP
> -	rep	stosb
> -	mov	%RDX_LP, %RAX_LP
> -	VZEROUPPER_RETURN
> -# if VEC_SIZE == 16
> -END (__memset_erms)
> -# else
> -END (MEMSET_SYMBOL (__memset, erms))
> -# endif
> -
>  # if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>  	cmp	%RDX_LP, %RCX_LP
> @@ -461,3 +430,26 @@ L(between_2_3):
>  #endif
>  	ret
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> +
> +#if defined USE_MULTIARCH && IS_IN (libc) && VEC_SIZE == 16
> +ENTRY (__memset_chk_erms)
> +	cmp	%RDX_LP, %RCX_LP
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END (__memset_chk_erms)
> +
> +/* Only used to measure performance of REP STOSB.  */
> +ENTRY (__memset_erms)
> +	/* Skip zero length.  */
> +	test	%RDX_LP, %RDX_LP
> +	jz	L(stosb_return_zero)
> +	mov	%RDX_LP, %RCX_LP
> +	movzbl	%sil, %eax
> +	mov	%RDI_LP, %RDX_LP
> +	rep	stosb
> +	mov	%RDX_LP, %RAX_LP
> +	ret
> +L(stosb_return_zero):
> +	movq	%rdi, %rax
> +	ret
> +END (__memset_erms)
> +#endif
> --
> 2.34.1
>
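For readers skimming the thread, a minimal C sketch of what the relocated __memset_erms body computes may help; the function name memset_erms_sketch is ours for illustration, not a glibc symbol. `rep stosb` stores the byte in %al to the destination %rcx times, and after this patch the zero-length case branches out so that the nonzero (hot) case is the fallthrough path:

    #include <stddef.h>

    /* Illustrative C analogue of the new __memset_erms above (a sketch
       only; the real routine is the assembly in the patch).  */
    void *
    memset_erms_sketch (void *dst, int c, size_t len)
    {
      unsigned char *p = dst;

      /* test %RDX_LP, %RDX_LP; jz L(stosb_return_zero)  */
      if (len == 0)
        return dst;               /* movq %rdi, %rax; ret  */

      /* rep stosb: store the low byte of c, len times.  */
      while (len--)
        *p++ = (unsigned char) c;

      return dst;                 /* dst, saved in %rdx, returns in %rax  */
    }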
On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Implementation wise:
> >    1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
> >       use the L(stosb) label that was previously defined.
> >
> >    2. Don't give the hotpath (fallthrough) to zero size.
> >
> > Code positioning wise:
> >
> > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
>
> It is ENTRY, not L.  Did you mean to move them to the end of file?

Will fix L -> ENTRY for V2.

Yes, it should be moved to a new file in this patch; that was a rebase
mistake.  The file change is in the ISA-raising patch.  Will fix both
for v2.

> > memset_{impl}_unaligned both adds unnecessary complexity to the
> > file and wastes space in a relatively hot cache section.
> > [...]
>
> --
> H.J.
On Wed, Jun 29, 2022 at 12:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > [...]
> > > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
> >
> > It is ENTRY, not L.  Did you mean to move them to the end of file?
>
> Will fix L -> ENTRY for V2.

Fixed in V2.

> Yes, it should be moved to a new file in this patch; that was a rebase
> mistake.  The file change is in the ISA-raising patch.  Will fix both
> for v2.
> [...]
>
> > --
> > H.J.
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda..d98c613651 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -156,37 +156,6 @@ L(entry_from_wmemset):
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB.  */
-ENTRY (__memset_erms)
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jnz	L(stosb)
-	movq	%rdi, %rax
-	ret
-# else
-/* Provide a hidden symbol to debugger.  */
-	.hidden	MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
-	mov	%RDX_LP, %RCX_LP
-	movzbl	%sil, %eax
-	mov	%RDI_LP, %RDX_LP
-	rep	stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -461,3 +430,26 @@ L(between_2_3):
 #endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc) && VEC_SIZE == 16
+ENTRY (__memset_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	L(stosb_return_zero)
+	mov	%RDX_LP, %RCX_LP
+	movzbl	%sil, %eax
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	ret
+L(stosb_return_zero):
+	movq	%rdi, %rax
+	ret
+END (__memset_erms)
+#endif
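One closing note on the fortify stub: __memset_chk_erms receives the compiler-known destination size in %rcx and the requested length in %rdx, and `jb HIDDEN_JUMPTARGET (__chk_fail)` aborts when the store would overflow; otherwise control falls through into __memset_erms. A hedged C sketch of that contract, reusing the hypothetical memset_erms_sketch from earlier in the thread (memset_chk_sketch is likewise our name, not glibc's):

    #include <stddef.h>

    extern void __chk_fail (void) __attribute__ ((noreturn));
    extern void *memset_erms_sketch (void *dst, int c, size_t len);

    /* Sketch of the check in __memset_chk_erms: dstlen arrives in
       %rcx, len in %rdx.  */
    void *
    memset_chk_sketch (void *dst, int c, size_t len, size_t dstlen)
    {
      if (dstlen < len)     /* cmp %RDX_LP, %RCX_LP; jb __chk_fail  */
        __chk_fail ();
      return memset_erms_sketch (dst, c, len);   /* fallthrough  */
    }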