Message ID | 20220718115724.13051-1-vladimir.murzin@arm.com |
---|---|
State | Accepted |
Headers | show |
Series | [uclibc-ng-devel,1/2] linuxthreads/arm: fix ldrex/strex loop when built with O0 | expand |
Hi Vladimir, both patches applied and pushed, best regards Waldemar Vladimir Murzin wrote, > O0 build result in the following codegen > > 00000000 <ldrex>: > 0: b480 push {r7} > 2: b085 sub sp, #20 > 4: af00 add r7, sp, #0 > 6: 6078 str r0, [r7, #4] > 8: 687b ldr r3, [r7, #4] > a: e853 3f00 ldrex r3, [r3] > e: 60fb str r3, [r7, #12] > 10: 68fb ldr r3, [r7, #12] > 12: 4618 mov r0, r3 > 14: 3714 adds r7, #20 > 16: 46bd mov sp, r7 > 18: f85d 7b04 ldr.w r7, [sp], #4 > 1c: 4770 bx lr > > 0000001e <strex>: > 1e: b480 push {r7} > 20: b085 sub sp, #20 > 22: af00 add r7, sp, #0 > 24: 6078 str r0, [r7, #4] > 26: 6039 str r1, [r7, #0] > 28: 687b ldr r3, [r7, #4] > 2a: 683a ldr r2, [r7, #0] > 2c: e842 3300 strex r3, r3, [r2] > 30: 60fb str r3, [r7, #12] > 32: 68fb ldr r3, [r7, #12] > 34: 4618 mov r0, r3 > 36: 3714 adds r7, #20 > 38: 46bd mov sp, r7 > 3a: f85d 7b04 ldr.w r7, [sp], #4 > 3e: 4770 bx lr > > 00000040 <testandset>: > 40: b590 push {r4, r7, lr} > 42: b083 sub sp, #12 > 44: af00 add r7, sp, #0 > 46: 6078 str r0, [r7, #4] > 48: 6878 ldr r0, [r7, #4] > 4a: f7ff fffe bl 0 <ldrex> > 4e: 4603 mov r3, r0 > 50: 461c mov r4, r3 > 52: 6879 ldr r1, [r7, #4] > 54: 2001 movs r0, #1 > 56: f7ff fffe bl 1e <strex> > 5a: 4603 mov r3, r0 > 5c: 2b00 cmp r3, #0 > 5e: d1f3 bne.n 48 <testandset+0x8> > 60: 4623 mov r3, r4 > 62: 4618 mov r0, r3 > 64: 370c adds r7, #12 > 66: 46bd mov sp, r7 > 68: bd90 pop {r4, r7, pc} > > ARM ARM suggests that LoadExcl/StoreExcl loops are guaranteed to make > forward progress only if, for any LoadExcl/StoreExcl loop within a > single thread of execution, the software meets all of the following > conditions: > > 1 Between the Load-Exclusive and the Store-Exclusive, there are no > explicit memory accesses, preloads, direct or indirect System > register writes, address translation instructions, cache or TLB > maintenance instructions, exception generating instructions, > exception returns, or indirect branches. > > ... > > Obviously condition is not met for O0 builds. > > O2 build (which is highly likely the most common setting) able to do > the right thing resulting in > > 00000000 <ldrex>: > 0: e850 0f00 ldrex r0, [r0] > 4: 4770 bx lr > 6: bf00 nop > > 00000008 <strex>: > 8: e841 0000 strex r0, r0, [r1] > c: 4770 bx lr > e: bf00 nop > > 00000010 <testandset>: > 10: 2101 movs r1, #1 > 12: 4603 mov r3, r0 > 14: e853 0f00 ldrex r0, [r3] > 18: e843 1200 strex r2, r1, [r3] > 1c: 2a00 cmp r2, #0 > 1e: d1f9 bne.n 14 <testandset+0x4> > 20: 4770 bx lr > 22: bf00 nop > > Rather than depending on level of optimisation implement whole > ldrex/strex loop in inline assembly. > > Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com> > --- > .../linuxthreads/sysdeps/arm/pt-machine.h | 34 +++++-------------- > 1 file changed, 9 insertions(+), 25 deletions(-) > > diff --git a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h > index fc17e9bc7..b00b10495 100644 > --- a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h > +++ b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h > @@ -29,35 +29,19 @@ > #endif > > #if defined(__thumb2__) > -PT_EI long int ldrex(int *spinlock) > -{ > - long int ret; > - __asm__ __volatile__( > - "ldrex %0, [%1]\n" > - : "=r"(ret) > - : "r"(spinlock) : "memory"); > - return ret; > -} > - > -PT_EI long int strex(int val, int *spinlock) > -{ > - long int ret; > - __asm__ __volatile__( > - "strex %0, %1, [%2]\n" > - : "=r"(ret) > - : "r" (val), "r"(spinlock) : "memory"); > - return ret; > -} > - > /* Spinlock implementation; required. */ > PT_EI long int > testandset (int *spinlock) > { > - register unsigned int ret; > - > - do { > - ret = ldrex(spinlock); > - } while (strex(1, spinlock)); > + unsigned int ret, tmp, val = 1; > + > + __asm__ __volatile__ ( > +"0: ldrex %0, [%2] \n" > +" strex %1, %3, [%2] \n" > +" cmp %1, #0 \n" > +" bne 0b" > + : "=&r" (ret), "=&r" (tmp) > + : "r" (spinlock), "r" (val) : "memory", "cc"); > > return ret; > } > -- > 2.25.1 >
diff --git a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h index fc17e9bc7..b00b10495 100644 --- a/libpthread/linuxthreads/sysdeps/arm/pt-machine.h +++ b/libpthread/linuxthreads/sysdeps/arm/pt-machine.h @@ -29,35 +29,19 @@ #endif #if defined(__thumb2__) -PT_EI long int ldrex(int *spinlock) -{ - long int ret; - __asm__ __volatile__( - "ldrex %0, [%1]\n" - : "=r"(ret) - : "r"(spinlock) : "memory"); - return ret; -} - -PT_EI long int strex(int val, int *spinlock) -{ - long int ret; - __asm__ __volatile__( - "strex %0, %1, [%2]\n" - : "=r"(ret) - : "r" (val), "r"(spinlock) : "memory"); - return ret; -} - /* Spinlock implementation; required. */ PT_EI long int testandset (int *spinlock) { - register unsigned int ret; - - do { - ret = ldrex(spinlock); - } while (strex(1, spinlock)); + unsigned int ret, tmp, val = 1; + + __asm__ __volatile__ ( +"0: ldrex %0, [%2] \n" +" strex %1, %3, [%2] \n" +" cmp %1, #0 \n" +" bne 0b" + : "=&r" (ret), "=&r" (tmp) + : "r" (spinlock), "r" (val) : "memory", "cc"); return ret; }
O0 build result in the following codegen 00000000 <ldrex>: 0: b480 push {r7} 2: b085 sub sp, #20 4: af00 add r7, sp, #0 6: 6078 str r0, [r7, #4] 8: 687b ldr r3, [r7, #4] a: e853 3f00 ldrex r3, [r3] e: 60fb str r3, [r7, #12] 10: 68fb ldr r3, [r7, #12] 12: 4618 mov r0, r3 14: 3714 adds r7, #20 16: 46bd mov sp, r7 18: f85d 7b04 ldr.w r7, [sp], #4 1c: 4770 bx lr 0000001e <strex>: 1e: b480 push {r7} 20: b085 sub sp, #20 22: af00 add r7, sp, #0 24: 6078 str r0, [r7, #4] 26: 6039 str r1, [r7, #0] 28: 687b ldr r3, [r7, #4] 2a: 683a ldr r2, [r7, #0] 2c: e842 3300 strex r3, r3, [r2] 30: 60fb str r3, [r7, #12] 32: 68fb ldr r3, [r7, #12] 34: 4618 mov r0, r3 36: 3714 adds r7, #20 38: 46bd mov sp, r7 3a: f85d 7b04 ldr.w r7, [sp], #4 3e: 4770 bx lr 00000040 <testandset>: 40: b590 push {r4, r7, lr} 42: b083 sub sp, #12 44: af00 add r7, sp, #0 46: 6078 str r0, [r7, #4] 48: 6878 ldr r0, [r7, #4] 4a: f7ff fffe bl 0 <ldrex> 4e: 4603 mov r3, r0 50: 461c mov r4, r3 52: 6879 ldr r1, [r7, #4] 54: 2001 movs r0, #1 56: f7ff fffe bl 1e <strex> 5a: 4603 mov r3, r0 5c: 2b00 cmp r3, #0 5e: d1f3 bne.n 48 <testandset+0x8> 60: 4623 mov r3, r4 62: 4618 mov r0, r3 64: 370c adds r7, #12 66: 46bd mov sp, r7 68: bd90 pop {r4, r7, pc} ARM ARM suggests that LoadExcl/StoreExcl loops are guaranteed to make forward progress only if, for any LoadExcl/StoreExcl loop within a single thread of execution, the software meets all of the following conditions: 1 Between the Load-Exclusive and the Store-Exclusive, there are no explicit memory accesses, preloads, direct or indirect System register writes, address translation instructions, cache or TLB maintenance instructions, exception generating instructions, exception returns, or indirect branches. ... Obviously condition is not met for O0 builds. O2 build (which is highly likely the most common setting) able to do the right thing resulting in 00000000 <ldrex>: 0: e850 0f00 ldrex r0, [r0] 4: 4770 bx lr 6: bf00 nop 00000008 <strex>: 8: e841 0000 strex r0, r0, [r1] c: 4770 bx lr e: bf00 nop 00000010 <testandset>: 10: 2101 movs r1, #1 12: 4603 mov r3, r0 14: e853 0f00 ldrex r0, [r3] 18: e843 1200 strex r2, r1, [r3] 1c: 2a00 cmp r2, #0 1e: d1f9 bne.n 14 <testandset+0x4> 20: 4770 bx lr 22: bf00 nop Rather than depending on level of optimisation implement whole ldrex/strex loop in inline assembly. Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com> --- .../linuxthreads/sysdeps/arm/pt-machine.h | 34 +++++-------------- 1 file changed, 9 insertions(+), 25 deletions(-)