[v1,2/3] x86: Move and slightly improve memset_erms

Message ID 20220628152735.17863-2-goldstein.w.n@gmail.com
State New
Series [v1,1/3] x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list

Commit Message

Noah Goldstein June 28, 2022, 3:27 p.m. UTC
Implementation-wise:
    1. Remove the VZEROUPPER, as memset_{impl}_unaligned_erms no longer
       uses the previously defined L(stosb) label.

    2. Don't give the hot path (fallthrough) to the zero-size case.

Code-positioning-wise:

Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
memset_{impl}_unaligned variants both adds unnecessary complexity to the
file and wastes space in a relatively hot cache section.
---
 .../multiarch/memset-vec-unaligned-erms.S     | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)
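The second point is easiest to see as a before/after of the entry branch.
Previously every nonzero-size call paid a taken branch to reach L(stosb),
while the new layout falls through straight into the REP STOSB body and
branches only in the rare n == 0 case (excerpted from the diff below):

	/* Before: the hot path takes a branch; zero size falls through.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret

	/* After: the hot path falls through; zero size takes the branch.  */
	test	%RDX_LP, %RDX_LP
	jz	L(stosb_return_zero)
	mov	%RDX_LP, %RCX_LP
	...

The VZEROUPPER_RETURN likewise becomes a plain ret: the standalone erms
body touches only general-purpose registers, so there is no dirty
upper-YMM state left to clear.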

Comments

H.J. Lu June 29, 2022, 7:25 p.m. UTC | #1
On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Implementation wise:
>     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
>        use the L(stosb) label that was previously defined.
>
>     2. Don't give the hotpath (fallthrough) to zero size.
>
> Code positioning wise:
>
> Move L(memset_{chk}_erms) to its own file.  Leaving it in between the

It is ENTRY, not L.  Did you mean to move them to the end of the file?
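For readers outside glibc: L() and ENTRY are distinct macros in glibc's
sysdep headers.  L(name) expands to a purely file-local label, while ENTRY
emits a global, typed, aligned function symbol.  A simplified sketch; the
real ENTRY in sysdeps/x86/sysdep.h also emits CFI, CET, and profiling
hooks:

	#define L(name)	.L##name	/* local label: no symbol emitted */
	#define ENTRY(name)		\
		.globl name;		\
		.type name, @function;	\
		.p2align 4;		\
	name:

So the commit message should refer to the global entry points, e.g.
__memset_{chk}_erms, rather than to an L() label.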

> memset_{impl}_unaligned both adds unnecessary complexity to the
> file and wastes space in a relatively hot cache section.
> [quoted diff snipped; the full patch appears below]
Noah Goldstein June 29, 2022, 7:32 p.m. UTC | #2
On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Implementation wise:
> >     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
> >        use the L(stosb) label that was previously defined.
> >
> >     2. Don't give the hotpath (fallthrough) to zero size.
> >
> > Code positioning wise:
> >
> > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
>
>  It is ENTRY, not L.   Did you mean to move them to the end of file?

Will fix L -> ENTRY for V2.

Yes, it should be moved to a new file in this patch; that was a rebase
mistake. The file change ended up in the ISA-raising patch. Will fix both
for v2.
>
> > memset_{impl}_unaligned both adds unnecessary complexity to the
> > file and wastes space in a relatively hot cache section.
> [rest of quoted message snipped]
Noah Goldstein June 29, 2022, 10:11 p.m. UTC | #3
On Wed, Jun 29, 2022 at 12:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Implementation wise:
> > >     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
> > >        use the L(stosb) label that was previously defined.
> > >
> > >     2. Don't give the hotpath (fallthrough) to zero size.
> > >
> > > Code positioning wise:
> > >
> > > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
> >
> >  It is ENTRY, not L.   Did you mean to move them to the end of file?
>
> Will fix L -> ENTRY for V2.

Fixed in V2.
>
> Yes it should be moved to new file in this patch. Was rebase mistake. The file
> change is in the isa raising patch. Will fix both for v2.
> >
> > > memset_{impl}_unaligned both adds unnecessary complexity to the
> > > file and wastes space in a relatively hot cache section.
> [rest of quoted message snipped]

Patch

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda..d98c613651 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -156,37 +156,6 @@  L(entry_from_wmemset):
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB.  */
-ENTRY (__memset_erms)
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jnz	 L(stosb)
-	movq	%rdi, %rax
-	ret
-# else
-/* Provide a hidden symbol to debugger.  */
-	.hidden	MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
-	mov	%RDX_LP, %RCX_LP
-	movzbl	%sil, %eax
-	mov	%RDI_LP, %RDX_LP
-	rep stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -461,3 +430,26 @@  L(between_2_3):
 #endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc) && VEC_SIZE == 16
+ENTRY (__memset_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	 L(stosb_return_zero)
+	mov	%RDX_LP, %RCX_LP
+	movzbl	%sil, %eax
+	mov	%RDI_LP, %RDX_LP
+	rep stosb
+	mov	%RDX_LP, %RAX_LP
+	ret
+L(stosb_return_zero):
+	movq	%rdi, %rax
+	ret
+END (__memset_erms)
+#endif
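
For reference, here is a C-level sketch of what the two relocated entry
points compute.  This is an illustration only: the _sketch names are
hypothetical, and the real __memset_chk_erms falls through into
__memset_erms rather than making a call.

	#include <stddef.h>

	extern void __chk_fail (void);	/* glibc's fortify failure hook */

	/* Sketch of __memset_erms: REP STOSB stores AL into (%rdi),
	   %rcx times, post-incrementing %rdi.  */
	static void *
	memset_erms_sketch (void *dst, int c, size_t n)
	{
	  unsigned char *p = dst;
	  while (n--)			/* rep stosb */
	    *p++ = (unsigned char) c;	/* movzbl %sil, %eax sets AL */
	  return dst;			/* %rdi saved in %rdx, returned in %rax */
	}

	/* Sketch of __memset_chk_erms: fail hard if the destination
	   object is smaller than the requested length.  */
	static void *
	memset_chk_erms_sketch (void *dst, int c, size_t n, size_t dstlen)
	{
	  if (dstlen < n)		/* cmp %RDX_LP, %RCX_LP; jb __chk_fail */
	    __chk_fail ();
	  return memset_erms_sketch (dst, c, n);
	}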