[v2,2/2] x86: Cleanup pthread_spin_{try}lock.S

Message ID 20221001041327.1133757-2-goldstein.w.n@gmail.com
State New
Series [v2,1/2] Benchtests: Add bench for pthread_spin_{try}lock and mutex_trylock

Commit Message

Noah Goldstein Oct. 1, 2022, 4:13 a.m. UTC
Save a jmp on the lock path coming from an initial failure in
pthread_spin_lock.S.  This costs 4 bytes of code, but since the
function still fits in the same number of 16-byte blocks (default
function alignment) it does not affect the total binary size of
libc.so (unchanged after this commit).
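
For readers following along, this is roughly the control flow the
patched __pthread_spin_lock implements, written as a C sketch using
GCC's generic atomic builtins (the function and names here are
illustrative only; the actual code is the hand-written assembly in the
diff below):

  #include <immintrin.h>   /* _mm_pause */

  /* Sketch only: the lock word is 1 when unlocked; `lock decl` takes it
     to 0 on success, and a contended lock stays <= 0 until the holder
     stores 1 back in pthread_spin_unlock.  */
  static int
  spin_lock_sketch (volatile int *lock)
  {
    /* Fast path: one atomic decrement, mirroring the initial `lock decl`.  */
    if (__atomic_fetch_sub (lock, 1, __ATOMIC_ACQUIRE) == 1)
      return 0;

    for (;;)
      {
        /* Read-only spin with `pause` (`rep nop`), mirroring the 1: loop.  */
        do
          _mm_pause ();
        while (__atomic_load_n (lock, __ATOMIC_RELAXED) <= 0);

        /* Retry the decrement in place instead of jumping back to the
           entry; this is the duplicated `lock decl` the patch adds.  */
        if (__atomic_fetch_sub (lock, 1, __ATOMIC_ACQUIRE) == 1)
          return 0;
      }
  }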

pthread_spin_trylock was using a CAS, which is often more expensive,
when a simple xchg suffices.
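
The difference is easier to see in C.  A rough sketch of the two
approaches with GCC's generic atomic builtins (function names are
illustrative, not part of the patch):

  #include <errno.h>

  /* Old approach: compare-and-swap.  Only succeeds when the lock word
     is exactly 1 (unlocked), in which case 0 is stored.  */
  static int
  trylock_cas_sketch (volatile int *lock)
  {
    int expected = 1;
    if (__atomic_compare_exchange_n (lock, &expected, 0, 0,
                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
      return 0;
    return EBUSY;
  }

  /* New approach: unconditional exchange.  Store 0 and look at what was
     there before; if it was 1 the lock was free and is now held.  */
  static int
  trylock_xchg_sketch (volatile int *lock)
  {
    if (__atomic_exchange_n (lock, 0, __ATOMIC_ACQUIRE) == 1)
      return 0;
    return EBUSY;
  }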

Full check passes on x86-64.
---
 sysdeps/x86_64/nptl/pthread_spin_lock.S    | 23 +++++++++++++++-------
 sysdeps/x86_64/nptl/pthread_spin_trylock.S | 18 ++++++++++++-----
 2 files changed, 29 insertions(+), 12 deletions(-)

Comments

H.J. Lu Oct. 3, 2022, 5:08 p.m. UTC | #1
On Fri, Sep 30, 2022 at 9:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Save a jmp on the lock path coming from an initial failure in
> pthread_spin_lock.S.  This costs 4 bytes of code, but since the
> function still fits in the same number of 16-byte blocks (default
> function alignment) it does not affect the total binary size of
> libc.so (unchanged after this commit).
>
> pthread_spin_trylock was using a CAS, which is often more expensive,
> when a simple xchg suffices.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/nptl/pthread_spin_lock.S    | 23 +++++++++++++++-------
>  sysdeps/x86_64/nptl/pthread_spin_trylock.S | 18 ++++++++++++-----
>  2 files changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S
> index 44b837d9db..1e09e59b10 100644
> --- a/sysdeps/x86_64/nptl/pthread_spin_lock.S
> +++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S
> @@ -19,18 +19,27 @@
>  #include <shlib-compat.h>
>
>  ENTRY(__pthread_spin_lock)
> -1:     LOCK
> -       decl    0(%rdi)
> -       jne     2f
> +       /* Always return zero.  */
>         xor     %eax, %eax
> +       LOCK
> +       decl    0(%rdi)
> +       jne     1f
>         ret
>
>         .align  16
> -2:     rep
> +1:
> +       /* `rep nop` == `pause`.  */
> +       rep
>         nop
> -       cmpl    $0, 0(%rdi)
> -       jg      1b
> -       jmp     2b
> +       cmpl    %eax, 0(%rdi)
> +       jle     1b
> +       /* Just repeat the `lock decl` logic here.  The code size saved
> +          by jumping back to the entry doesn't change how many 16-byte
> +          chunks (default function alignment) the code fits in.  */
> +       LOCK
> +       decl    0(%rdi)
> +       jne     1b
> +       ret
>  END(__pthread_spin_lock)
>  versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34)
>
> diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> index fffdb27dd9..a1f97cb420 100644
> --- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> +++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> @@ -20,13 +20,21 @@
>  #include <shlib-compat.h>
>
>  ENTRY(__pthread_spin_trylock)
> -       movl    $1, %eax
>         xorl    %ecx, %ecx
> -       lock
> -       cmpxchgl %ecx, (%rdi)
> +       /* xchg has implicit LOCK prefix.  */
> +       xchgl   %ecx, (%rdi)
> +
> +       /* Branch on result.  The expectation is that users of trylock
> +          will branch on success/failure, so this branch can be used
> +          to predict the coming branch.  It has the benefit of
> +          breaking the likely expensive memory dependency on (%rdi).  */
> +       cmpl    $1, %ecx
> +       jnz     1f
> +       xorl    %eax, %eax
> +       ret
> +1:
>         movl    $EBUSY, %eax
> -       cmovel  %ecx, %eax
> -       retq
> +       ret
>  END(__pthread_spin_trylock)
>  versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock,
>                   GLIBC_2_34)
> --
> 2.34.1
>

LGTM.

Thanks.
Patch

diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S
index 44b837d9db..1e09e59b10 100644
--- a/sysdeps/x86_64/nptl/pthread_spin_lock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S
@@ -19,18 +19,27 @@ 
 #include <shlib-compat.h>
 
 ENTRY(__pthread_spin_lock)
-1:	LOCK
-	decl	0(%rdi)
-	jne	2f
+	/* Always return zero.  */
 	xor	%eax, %eax
+	LOCK
+	decl	0(%rdi)
+	jne	1f
 	ret
 
 	.align	16
-2:	rep
+1:
+	/* `rep nop` == `pause`.  */
+	rep
 	nop
-	cmpl	$0, 0(%rdi)
-	jg	1b
-	jmp	2b
+	cmpl	%eax, 0(%rdi)
+	jle	1b
+	/* Just repeat the `lock decl` logic here.  The code size saved
+	   by jumping back to the entry doesn't change how many 16-byte
+	   chunks (default function alignment) the code fits in.  */
+	LOCK
+	decl	0(%rdi)
+	jne	1b
+	ret
 END(__pthread_spin_lock)
 versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34)
 
diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
index fffdb27dd9..a1f97cb420 100644
--- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
@@ -20,13 +20,21 @@ 
 #include <shlib-compat.h>
 
 ENTRY(__pthread_spin_trylock)
-	movl	$1, %eax
 	xorl	%ecx, %ecx
-	lock
-	cmpxchgl %ecx, (%rdi)
+	/* xchg has implicit LOCK prefix.  */
+	xchgl	%ecx, (%rdi)
+
+	/* Branch on result.  The expectation is that users of trylock
+	   will branch on success/failure, so this branch can be used
+	   to predict the coming branch.  It has the benefit of
+	   breaking the likely expensive memory dependency on (%rdi).  */
+	cmpl	$1, %ecx
+	jnz	1f
+	xorl	%eax, %eax
+	ret
+1:
 	movl	$EBUSY, %eax
-	cmovel	%ecx, %eax
-	retq
+	ret
 END(__pthread_spin_trylock)
 versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock,
 		  GLIBC_2_34)
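
The branch-prediction comment in pthread_spin_trylock.S assumes a caller
pattern along these lines (a hypothetical example, not part of the patch):

  #include <pthread.h>

  static void
  do_work (pthread_spinlock_t *lock)
  {
    /* The caller's branch on the return value is the "coming branch"
       the new in-function branch is intended to help predict.  */
    if (pthread_spin_trylock (lock) == 0)
      {
        /* ... critical section ... */
        pthread_spin_unlock (lock);
      }
    else
      {
        /* Lock busy: back off or do other work instead of blocking.  */
      }
  }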