diff mbox series

[uclibc-ng-devel,1/2] linuxthreads/arm: fix ldrex/strex loop when built with O0

Message ID 20220718115724.13051-1-vladimir.murzin@arm.com
State Accepted
Headers show
Series [uclibc-ng-devel,1/2] linuxthreads/arm: fix ldrex/strex loop when built with O0 | expand

Commit Message

Vladimir Murzin July 18, 2022, 11:57 a.m. UTC
O0 build result in the following codegen

00000000 <ldrex>:
   0:	b480      	push	{r7}
   2:	b085      	sub	sp, #20
   4:	af00      	add	r7, sp, #0
   6:	6078      	str	r0, [r7, #4]
   8:	687b      	ldr	r3, [r7, #4]
   a:	e853 3f00 	ldrex	r3, [r3]
   e:	60fb      	str	r3, [r7, #12]
  10:	68fb      	ldr	r3, [r7, #12]
  12:	4618      	mov	r0, r3
  14:	3714      	adds	r7, #20
  16:	46bd      	mov	sp, r7
  18:	f85d 7b04 	ldr.w	r7, [sp], #4
  1c:	4770      	bx	lr

0000001e <strex>:
  1e:	b480      	push	{r7}
  20:	b085      	sub	sp, #20
  22:	af00      	add	r7, sp, #0
  24:	6078      	str	r0, [r7, #4]
  26:	6039      	str	r1, [r7, #0]
  28:	687b      	ldr	r3, [r7, #4]
  2a:	683a      	ldr	r2, [r7, #0]
  2c:	e842 3300 	strex	r3, r3, [r2]
  30:	60fb      	str	r3, [r7, #12]
  32:	68fb      	ldr	r3, [r7, #12]
  34:	4618      	mov	r0, r3
  36:	3714      	adds	r7, #20
  38:	46bd      	mov	sp, r7
  3a:	f85d 7b04 	ldr.w	r7, [sp], #4
  3e:	4770      	bx	lr

00000040 <testandset>:
  40:	b590      	push	{r4, r7, lr}
  42:	b083      	sub	sp, #12
  44:	af00      	add	r7, sp, #0
  46:	6078      	str	r0, [r7, #4]
  48:	6878      	ldr	r0, [r7, #4]
  4a:	f7ff fffe 	bl	0 <ldrex>
  4e:	4603      	mov	r3, r0
  50:	461c      	mov	r4, r3
  52:	6879      	ldr	r1, [r7, #4]
  54:	2001      	movs	r0, #1
  56:	f7ff fffe 	bl	1e <strex>
  5a:	4603      	mov	r3, r0
  5c:	2b00      	cmp	r3, #0
  5e:	d1f3      	bne.n	48 <testandset+0x8>
  60:	4623      	mov	r3, r4
  62:	4618      	mov	r0, r3
  64:	370c      	adds	r7, #12
  66:	46bd      	mov	sp, r7
  68:	bd90      	pop	{r4, r7, pc}

ARM ARM suggests that LoadExcl/StoreExcl loops are guaranteed to make
forward progress only if, for any LoadExcl/StoreExcl loop within a
single thread of execution, the software meets all of the following
conditions:

1 Between the Load-Exclusive and the Store-Exclusive, there are no
  explicit memory accesses, preloads, direct or indirect System
  register writes, address translation instructions, cache or TLB
  maintenance instructions, exception generating instructions,
  exception returns, or indirect branches.

...

Obviously condition is not met for O0 builds.

O2 build (which is highly likely the most common setting) able to do
the right thing resulting in

00000000 <ldrex>:
   0:	e850 0f00 	ldrex	r0, [r0]
   4:	4770      	bx	lr
   6:	bf00      	nop

00000008 <strex>:
   8:	e841 0000 	strex	r0, r0, [r1]
   c:	4770      	bx	lr
   e:	bf00      	nop

00000010 <testandset>:
  10:	2101      	movs	r1, #1
  12:	4603      	mov	r3, r0
  14:	e853 0f00 	ldrex	r0, [r3]
  18:	e843 1200 	strex	r2, r1, [r3]
  1c:	2a00      	cmp	r2, #0
  1e:	d1f9      	bne.n	14 <testandset+0x4>
  20:	4770      	bx	lr
  22:	bf00      	nop

Rather than depending on level of optimisation implement whole
ldrex/strex loop in inline assembly.

Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
---
 .../linuxthreads/sysdeps/arm/pt-machine.h     | 34 +++++--------------
 1 file changed, 9 insertions(+), 25 deletions(-)

Comments

Waldemar Brodkorb July 26, 2022, 7:58 a.m. UTC | #1
Hi Vladimir,

both patches applied and pushed,

best regards
 Waldemar

Vladimir Murzin wrote,

> O0 build result in the following codegen
> 
> 00000000 <ldrex>:
>    0:	b480      	push	{r7}
>    2:	b085      	sub	sp, #20
>    4:	af00      	add	r7, sp, #0
>    6:	6078      	str	r0, [r7, #4]
>    8:	687b      	ldr	r3, [r7, #4]
>    a:	e853 3f00 	ldrex	r3, [r3]
>    e:	60fb      	str	r3, [r7, #12]
>   10:	68fb      	ldr	r3, [r7, #12]
>   12:	4618      	mov	r0, r3
>   14:	3714      	adds	r7, #20
>   16:	46bd      	mov	sp, r7
>   18:	f85d 7b04 	ldr.w	r7, [sp], #4
>   1c:	4770      	bx	lr
> 
> 0000001e <strex>:
>   1e:	b480      	push	{r7}
>   20:	b085      	sub	sp, #20
>   22:	af00      	add	r7, sp, #0
>   24:	6078      	str	r0, [r7, #4]
>   26:	6039      	str	r1, [r7, #0]
>   28:	687b      	ldr	r3, [r7, #4]
>   2a:	683a      	ldr	r2, [r7, #0]
>   2c:	e842 3300 	strex	r3, r3, [r2]
>   30:	60fb      	str	r3, [r7, #12]
>   32:	68fb      	ldr	r3, [r7, #12]
>   34:	4618      	mov	r0, r3
>   36:	3714      	adds	r7, #20
>   38:	46bd      	mov	sp, r7
>   3a:	f85d 7b04 	ldr.w	r7, [sp], #4
>   3e:	4770      	bx	lr
> 
> 00000040 <testandset>:
>   40:	b590      	push	{r4, r7, lr}
>   42:	b083      	sub	sp, #12
>   44:	af00      	add	r7, sp, #0
>   46:	6078      	str	r0, [r7, #4]
>   48:	6878      	ldr	r0, [r7, #4]
>   4a:	f7ff fffe 	bl	0 <ldrex>
>   4e:	4603      	mov	r3, r0
>   50:	461c      	mov	r4, r3
>   52:	6879      	ldr	r1, [r7, #4]
>   54:	2001      	movs	r0, #1
>   56:	f7ff fffe 	bl	1e <strex>
>   5a:	4603      	mov	r3, r0
>   5c:	2b00      	cmp	r3, #0
>   5e:	d1f3      	bne.n	48 <testandset+0x8>
>   60:	4623      	mov	r3, r4
>   62:	4618      	mov	r0, r3
>   64:	370c      	adds	r7, #12
>   66:	46bd      	mov	sp, r7
>   68:	bd90      	pop	{r4, r7, pc}
> 
> ARM ARM suggests that LoadExcl/StoreExcl loops are guaranteed to make
> forward progress only if, for any LoadExcl/StoreExcl loop within a
> single thread of execution, the software meets all of the following
> conditions:
> 
> 1 Between the Load-Exclusive and the Store-Exclusive, there are no
>   explicit memory accesses, preloads, direct or indirect System
>   register writes, address translation instructions, cache or TLB
>   maintenance instructions, exception generating instructions,
>   exception returns, or indirect branches.
> 
> ...
> 
> Obviously condition is not met for O0 builds.
> 
> O2 build (which is highly likely the most common setting) able to do
> the right thing resulting in
> 
> 00000000 <ldrex>:
>    0:	e850 0f00 	ldrex	r0, [r0]
>    4:	4770      	bx	lr
>    6:	bf00      	nop
> 
> 00000008 <strex>:
>    8:	e841 0000 	strex	r0, r0, [r1]
>    c:	4770      	bx	lr
>    e:	bf00      	nop
> 
> 00000010 <testandset>:
>   10:	2101      	movs	r1, #1
>   12:	4603      	mov	r3, r0
>   14:	e853 0f00 	ldrex	r0, [r3]
>   18:	e843 1200 	strex	r2, r1, [r3]
>   1c:	2a00      	cmp	r2, #0
>   1e:	d1f9      	bne.n	14 <testandset+0x4>
>   20:	4770      	bx	lr
>   22:	bf00      	nop
> 
> Rather than depending on level of optimisation implement whole
> ldrex/strex loop in inline assembly.
> 
> Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
> ---
>  .../linuxthreads/sysdeps/arm/pt-machine.h     | 34 +++++--------------
>  1 file changed, 9 insertions(+), 25 deletions(-)
> 
> diff --git a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
> index fc17e9bc7..b00b10495 100644
> --- a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
> +++ b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
> @@ -29,35 +29,19 @@
>  #endif
>  
>  #if defined(__thumb2__)
> -PT_EI long int ldrex(int *spinlock)
> -{
> -	long int ret;
> -	__asm__ __volatile__(
> -		"ldrex %0, [%1]\n"
> -		: "=r"(ret)
> -		: "r"(spinlock) : "memory");
> -	return ret;
> -}
> -
> -PT_EI long int strex(int val, int *spinlock)
> -{
> -	long int ret;
> -	__asm__ __volatile__(
> -		"strex %0, %1, [%2]\n"
> -		: "=r"(ret)
> -		: "r" (val), "r"(spinlock) : "memory");
> -	return ret;
> -}
> -
>  /* Spinlock implementation; required.  */
>  PT_EI long int
>  testandset (int *spinlock)
>  {
> -  register unsigned int ret;
> -
> -  do {
> -	  ret = ldrex(spinlock);
> -  } while (strex(1, spinlock));
> +  unsigned int ret, tmp, val = 1;
> +
> +  __asm__ __volatile__ (
> +"0:    ldrex   %0, [%2]       \n"
> +"      strex   %1, %3, [%2]   \n"
> +"      cmp     %1, #0         \n"
> +"      bne     0b"
> +       : "=&r" (ret), "=&r" (tmp)
> +       : "r" (spinlock), "r" (val) : "memory", "cc");
>  
>    return ret;
>  }
> -- 
> 2.25.1
>
diff mbox series

Patch

diff --git a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
index fc17e9bc7..b00b10495 100644
--- a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
+++ b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h
@@ -29,35 +29,19 @@ 
 #endif
 
 #if defined(__thumb2__)
-PT_EI long int ldrex(int *spinlock)
-{
-	long int ret;
-	__asm__ __volatile__(
-		"ldrex %0, [%1]\n"
-		: "=r"(ret)
-		: "r"(spinlock) : "memory");
-	return ret;
-}
-
-PT_EI long int strex(int val, int *spinlock)
-{
-	long int ret;
-	__asm__ __volatile__(
-		"strex %0, %1, [%2]\n"
-		: "=r"(ret)
-		: "r" (val), "r"(spinlock) : "memory");
-	return ret;
-}
-
 /* Spinlock implementation; required.  */
 PT_EI long int
 testandset (int *spinlock)
 {
-  register unsigned int ret;
-
-  do {
-	  ret = ldrex(spinlock);
-  } while (strex(1, spinlock));
+  unsigned int ret, tmp, val = 1;
+
+  __asm__ __volatile__ (
+"0:    ldrex   %0, [%2]       \n"
+"      strex   %1, %3, [%2]   \n"
+"      cmp     %1, #0         \n"
+"      bne     0b"
+       : "=&r" (ret), "=&r" (tmp)
+       : "r" (spinlock), "r" (val) : "memory", "cc");
 
   return ret;
 }